diff --git a/Makefile b/Makefile
index b04e0edfdfaa4b1bdc71c078c7c76c8eb16b7b22..9e1c698f9105ec7dab4d7fcc26f15d425bc8ddc4 100644
--- a/Makefile
+++ b/Makefile
@@ -27,9 +27,10 @@ venv-create:
 
 dev-install:
 	$(PIP) install -r requirements.build.txt
-	$(PIP) install -r requirements.prereq.txt
-	$(PIP) install -r requirements.txt
-	$(PIP) install -r requirements.dev.txt
+	$(PIP) install \
+		-r requirements.prereq.txt \
+		-r requirements.txt \
+		-r requirements.dev.txt
 
 
 dev-nltk-download-models:
diff --git a/requirements.dev.txt b/requirements.dev.txt
index 42326685fcfb4c4e59086cbe9e97ebfefaabc261..bcf004cfd24935507af9e3fa076dbca6db4cd16c 100644
--- a/requirements.dev.txt
+++ b/requirements.dev.txt
@@ -1,6 +1,6 @@
 # astroid and pylint raising false positives:
 # https://github.com/PyCQA/pylint/issues/3139
-pylint==2.8.3
+pylint==2.10.2
 flake8==3.9.2
 nose==1.3.7
 pytest==6.2.4
diff --git a/requirements.txt b/requirements.txt
index d8b44c749b243876ccbbc1205291c022250f3a4d..0e162729815801f193159cf2b4fa73cb97bfe507 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,13 +4,15 @@ gevent==21.8.0
 gunicorn==20.1.0
 fsspec==2021.5.0
 gcsfs==2021.5.0
+google-cloud-bigquery<=2.24.0
 futures
 fuzzywuzzy==0.18.0
 lxml==4.6.3
 matplotlib==2.2.5
-numpy==1.21.2
+numpy==1.18.5
 nltk==3.6.2
 Pillow>=3.4.1
+pyarrow>=0.17.1
 python-crfsuite==0.9.7
 Pyqtree==1.0.0
 requests==2.26.0
diff --git a/sciencebeam_gym/preprocess/annotation/matching_annotator.py b/sciencebeam_gym/preprocess/annotation/matching_annotator.py
index f2a33d62304811ea3c40e1a2f1f7118d370828fd..e355a4a099726ff0208ff540dbdac8cdc2f84566 100644
--- a/sciencebeam_gym/preprocess/annotation/matching_annotator.py
+++ b/sciencebeam_gym/preprocess/annotation/matching_annotator.py
@@ -676,7 +676,7 @@ class MatchingAnnotator(AbstractAnnotator):
 
         conditional_match: Optional[dict] = None
 
-        matched_choices_map = dict()
+        matched_choices_map = {}
         for target_annotation in self.target_annotations:
             get_logger().debug('target annotation: %s', target_annotation)
             target_value = normalise_and_remove_junk_str_or_list(target_annotation.value)
diff --git a/sciencebeam_gym/preprocess/annotation/target_annotation.py b/sciencebeam_gym/preprocess/annotation/target_annotation.py
index 892d4c3ed68f4d6b4831d546eadb0eb1ddef40ee..b91529c6f1e671394dd5081acf4cd5a4577de5b0 100644
--- a/sciencebeam_gym/preprocess/annotation/target_annotation.py
+++ b/sciencebeam_gym/preprocess/annotation/target_annotation.py
@@ -67,7 +67,7 @@ class TargetAnnotation(object):
 
 
 def parse_xml_mapping(xml_mapping_filename):
-    with open(xml_mapping_filename, 'r') as f:
+    with open(xml_mapping_filename, 'r', encoding='utf-8') as f:
         config = ConfigParser()
         config.read_file(f)
         return {
diff --git a/sciencebeam_gym/preprocess/color_map.py b/sciencebeam_gym/preprocess/color_map.py
index a4fee41b683529f824a43e9744f91200592ace92..b05613611ca28280ef6f470b9b4fba2276070369 100644
--- a/sciencebeam_gym/preprocess/color_map.py
+++ b/sciencebeam_gym/preprocess/color_map.py
@@ -19,7 +19,7 @@ def parse_color_map_from_configparser(color_map_config):
                 return (int(m.group(1)), int(m.group(2)), int(m.group(3)))
         raise Exception('invalid color value: {}'.format(s))
 
-    color_map = dict()
+    color_map = {}
     for k, v in color_map_config.items('color_map'):
         color_map[k] = parse_color(v)
     return color_map
@@ -28,7 +28,7 @@ def parse_color_map_from_configparser(color_map_config):
 def parse_color_map_from_file(f):
     color_map_config = ConfigParser()
     if isinstance(f, str):
-        with open(f, 'r') as fp:
+        with open(f, 'r', encoding='utf-8') as fp:
             color_map_config.read_file(fp)
     else:
         color_map_config.read_file(f)
diff --git a/sciencebeam_gym/preprocess/preprocessing_transforms.py b/sciencebeam_gym/preprocess/preprocessing_transforms.py
index 1fea02fa971f67c0b715f5937cab028d1eae9fcd..b61d58edb53db507b5b392402bcd59e4af4cccf6 100644
--- a/sciencebeam_gym/preprocess/preprocessing_transforms.py
+++ b/sciencebeam_gym/preprocess/preprocessing_transforms.py
@@ -26,9 +26,9 @@ class WritePropsToTFRecord(beam.PTransform):
             raise RuntimeError('TensorFlow required for this transform')
         LOGGER.debug('tfrecords output file: %r', self.file_path + self.file_name_suffix)
 
-    def expand(self, pcoll):  # pylint: disable=W0221
+    def expand(self, input_or_inputs):  # pylint: disable=W0221
         return (
-            pcoll |
+            input_or_inputs |
             'ConvertToTfExamples' >> beam.FlatMap(lambda v: (
                 dict_to_example(props)
                 for props in self.extract_props(v)
diff --git a/sciencebeam_gym/tools/colorize_image.py b/sciencebeam_gym/tools/colorize_image.py
index d13900dbcd3af2cd739ea605e7ad901720333893..0b72f53bee3beacd2e2faf0498761291bdcd971a 100644
--- a/sciencebeam_gym/tools/colorize_image.py
+++ b/sciencebeam_gym/tools/colorize_image.py
@@ -53,7 +53,7 @@ def parse_color_map_from_configparser(color_map_config):
                 return (int(m.group(1)), int(m.group(2)), int(m.group(3)))
         raise Exception('invalid color value: {}'.format(s))
 
-    color_map = dict()
+    color_map = {}
     for k, v in color_map_config.items('color_map'):
         color_map[parse_color(k)] = parse_color(v)
     return color_map
diff --git a/sciencebeam_gym/tools/vocabulary/extract_embeddings_vocabulary.py b/sciencebeam_gym/tools/vocabulary/extract_embeddings_vocabulary.py
index 5b57915af095c0cac77b7fe590498295b48829b1..310c988709a03c416e3f18965fd367d9deaa8f78 100644
--- a/sciencebeam_gym/tools/vocabulary/extract_embeddings_vocabulary.py
+++ b/sciencebeam_gym/tools/vocabulary/extract_embeddings_vocabulary.py
@@ -48,7 +48,7 @@ def run(args: argparse.Namespace):
     tokens_iterable = iter_tokens_from_embeddings_file(
         args.input_file
     )
-    with open(args.output_vocabulary_file, 'wt') as out_fp:
+    with open(args.output_vocabulary_file, 'wt', encoding='utf-8') as out_fp:
         out_fp.writelines((
             token + '\n'
             for token in tokens_iterable
diff --git a/sciencebeam_gym/trainer/models/pix2pix/pix2pix_model.py b/sciencebeam_gym/trainer/models/pix2pix/pix2pix_model.py
index 7c8eb82f8f45467ec5b9a3349159e5cfb1cfd66a..606ef29647dfca7025dc7a7bcba6b2c14ce348cc 100644
--- a/sciencebeam_gym/trainer/models/pix2pix/pix2pix_model.py
+++ b/sciencebeam_gym/trainer/models/pix2pix/pix2pix_model.py
@@ -72,7 +72,7 @@ class GraphReferences(object):
 
     def __init__(self):
         self.is_training = None
-        self.inputs = dict()
+        self.inputs = {}
         self.examples = None
         self.train = None
         self.global_step = None
@@ -504,14 +504,14 @@ class Model(object):
         batched_tensors: dict = tf.train.batch(
             remove_none_from_dict({
                 k: getattr(tensors, k)
-                for k in {
+                for k in [
                     'input_uri',
                     'annotation_uri',
                     'image_tensor',
                     'annotation_tensor',
                     'separate_channel_annotation_tensor',
                     'pos_weight'
-                }
+                ]
             }),
             batch_size=batch_size
         )
diff --git a/setup.py b/setup.py
index 3db2bc01fe5d38725b97f165502de1e57655b822..5f60d81773471297434f0f120df4e7fa04204541 100644
--- a/setup.py
+++ b/setup.py
@@ -24,7 +24,7 @@ CUSTOM_COMMANDS = [
     ]
 ]
 
-with open(os.path.join('requirements.txt'), 'r') as f:
+with open(os.path.join('requirements.txt'), 'r', encoding='utf-8') as f:
     REQUIRED_PACKAGES = f.readlines()
 
 packages = find_packages()
diff --git a/tests/models/crf/crfsuite_training_pipeline_test.py b/tests/models/crf/crfsuite_training_pipeline_test.py
index 70d0e9ca2dfd00256dc7f4e87debed4bda090163..6c0cabd484c348b40a6f50828298802243539d60 100644
--- a/tests/models/crf/crfsuite_training_pipeline_test.py
+++ b/tests/models/crf/crfsuite_training_pipeline_test.py
@@ -16,7 +16,7 @@ from sciencebeam_gym.models.text.feature_extractor import (
     CV_TAG_SCOPE
 )
 
-import sciencebeam_gym.models.text.crf.crfsuite_training_pipeline as crfsuite_training_pipeline
+from sciencebeam_gym.models.text.crf import crfsuite_training_pipeline
 from sciencebeam_gym.models.text.crf.crfsuite_training_pipeline import (
     load_and_convert_to_token_props,
     load_token_props_list_by_document,
diff --git a/tests/pdf/pdf_to_png_test.py b/tests/pdf/pdf_to_png_test.py
index 0cfbd96c2c732ef34164b49d9c051e63cd06fa6a..ceca30e0c718c2ea9f4bb2708f140442fc7b2949 100644
--- a/tests/pdf/pdf_to_png_test.py
+++ b/tests/pdf/pdf_to_png_test.py
@@ -6,7 +6,7 @@ from sciencebeam_gym.pdf.pdf_to_png import (
     PdfToPng
 )
 
-import sciencebeam_gym.pdf.pdf_to_png as pdf_to_png
+from sciencebeam_gym.pdf import pdf_to_png
 
 
 TEMP_DIR = '/tmp/1'
diff --git a/tests/preprocess/lxml_to_svg_test.py b/tests/preprocess/lxml_to_svg_test.py
index 8ae3c0e4ebb58626f9a93481862c2ea206b3c575..f0ae89102db336984b55a946ee77de026a53a464 100644
--- a/tests/preprocess/lxml_to_svg_test.py
+++ b/tests/preprocess/lxml_to_svg_test.py
@@ -62,7 +62,7 @@ COMMON_LXML_TOKEN_ATTRIBS = {
 
 
 def dict_extend(*dicts):
-    d = dict()
+    d = {}
     for x in dicts:
         d.update(x)
     return d
diff --git a/tests/structured_document/structured_document_loader_test.py b/tests/structured_document/structured_document_loader_test.py
index 3c51f8ab636fe48d60b0ca7f23b774a77b271042..d65f7148eba36a3f4a16d2853cbcb1ce84dab873 100644
--- a/tests/structured_document/structured_document_loader_test.py
+++ b/tests/structured_document/structured_document_loader_test.py
@@ -7,7 +7,7 @@ from unittest.mock import patch
 from lxml import etree
 from lxml.builder import E
 
-import sciencebeam_gym.structured_document.structured_document_loader as structured_document_loader
+from sciencebeam_gym.structured_document import structured_document_loader
 
 from sciencebeam_gym.structured_document.structured_document_loader import (
     StructuredDocumentType,
diff --git a/tests/structured_document/structured_document_saver_test.py b/tests/structured_document/structured_document_saver_test.py
index 5291ad64ffee1422895046393a7861027b1305d2..a919cff1b59f9535ef9ec0366632c627c2f4317f 100644
--- a/tests/structured_document/structured_document_saver_test.py
+++ b/tests/structured_document/structured_document_saver_test.py
@@ -12,7 +12,7 @@ from sciencebeam_gym.structured_document.svg import (
     SvgStructuredDocument
 )
 
-import sciencebeam_gym.structured_document.structured_document_saver as structured_document_saver
+from sciencebeam_gym.structured_document import structured_document_saver
 from sciencebeam_gym.structured_document.structured_document_saver import (
     save_lxml_structured_document,
     save_svg_structured_document,
diff --git a/tests/trainer/evaluator_test.py b/tests/trainer/evaluator_test.py
index aa63d982d2301dc15fdaf6eb0337de76d433616e..7106ee43c770ad7f862e85d41519a32feb78fbc2 100644
--- a/tests/trainer/evaluator_test.py
+++ b/tests/trainer/evaluator_test.py
@@ -75,7 +75,7 @@ class ExampleModel(object):
         self.examples = examples
 
     def build_graph(self, data_paths, batch_size, graph_mode):  # pylint: disable=unused-argument
-        tensors = dict()
+        tensors = {}
         tensors['is_training'] = tf.placeholder(tf.bool)
         map_keys_tracker = MapKeysTracker()
         dataset = example_dataset(map_keys_tracker, self.examples)
@@ -86,8 +86,8 @@ class ExampleModel(object):
         tensors['metric_values'] = []
         tensors['metric_updates'] = []
         tensors['global_step'] = tf.constant(100, tf.int32)
-        tensors['summaries'] = dict()
-        tensors['image_tensors'] = dict()
+        tensors['summaries'] = {}
+        tensors['image_tensors'] = {}
         tensors['evaluation_result'] = None
         image_shape = (10, 10, 3)
         pre_batch_tensors = {
diff --git a/tests/trainer/models/pix2pix/pix2pix_core_test.py b/tests/trainer/models/pix2pix/pix2pix_core_test.py
index 5afd1099ae5acfebd34640db75c2466e6067fed7..caa527da2f3010cc555872bbfcfac3272806b9fc 100644
--- a/tests/trainer/models/pix2pix/pix2pix_core_test.py
+++ b/tests/trainer/models/pix2pix/pix2pix_core_test.py
@@ -15,7 +15,7 @@ from sciencebeam_utils.utils.collection import (
     extend_dict
 )
 
-import sciencebeam_gym.trainer.models.pix2pix.pix2pix_core as pix2pix_core
+from sciencebeam_gym.trainer.models.pix2pix import pix2pix_core
 
 from sciencebeam_gym.trainer.models.pix2pix.pix2pix_core import (
     create_encoder_decoder,
diff --git a/tests/trainer/models/pix2pix/pix2pix_model_test.py b/tests/trainer/models/pix2pix/pix2pix_model_test.py
index d34d4860d76bcb7454d338595d95c48c39e16dec..59bc5044a95638d11149b22c4fac321ea83bb34c 100644
--- a/tests/trainer/models/pix2pix/pix2pix_model_test.py
+++ b/tests/trainer/models/pix2pix/pix2pix_model_test.py
@@ -15,7 +15,7 @@ from sciencebeam_gym.trainer.models.pix2pix.pix2pix_core import (
     ALL_BASE_LOSS
 )
 
-import sciencebeam_gym.trainer.models.pix2pix.pix2pix_model as pix2pix_model
+from sciencebeam_gym.trainer.models.pix2pix import pix2pix_model
 
 from sciencebeam_gym.trainer.models.pix2pix.pix2pix_model import (
     parse_color_map,