From bd100f4d3b80ef27c1e7e2c7c50f71af736e5c8e Mon Sep 17 00:00:00 2001
From: Daniel Ecer <de-code@users.noreply.github.com>
Date: Fri, 24 Aug 2018 20:36:26 +0100
Subject: [PATCH] Use sciencebeam-utils and sciencebeam-alignment (#35)

* added sciencebeam-utils and sciencebeam-alignment dependencies

* replaced local alignment module with sciencebeam-alignment

* replaced local beam_utils with sciencebeam-utils

* replaced local utils with sciencebeam-utils for modules that have moved

* removed tools that have been moved to sciencebeam-utils

* removed utility functions that have moved to sciencebeam-utils

* updated README
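
Migration note: call sites move from the removed in-repo modules to the
published packages. A minimal sketch of the import change, assuming
sciencebeam-alignment and sciencebeam-utils expose the modules under the
same names (not verified here):

```python
# Before (removed by this patch):
# from sciencebeam_gym.alignment.align import LocalSequenceMatcher, SimpleScoring
# from sciencebeam_gym.utils.csv import csv_delimiter_by_filename

# After (assumed drop-in equivalents from the external packages):
from sciencebeam_alignment.align import LocalSequenceMatcher, SimpleScoring
from sciencebeam_utils.utils.csv import csv_delimiter_by_filename
```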
---
 README.md                                     |  12 +-
 requirements.txt                              |   2 +
 sciencebeam_gym/alignment/SequenceMatcher.py  |  14 -
 .../alignment/WordSequenceMatcher.py          |  42 ---
 .../alignment/WordSequenceMatcher_test.py     |  46 ---
 sciencebeam_gym/alignment/__init__.py         |   0
 sciencebeam_gym/alignment/align.py            | 338 ------------------
 .../alignment/align_fast_utils.pyx            | 147 --------
 .../alignment/align_performance.py            |  85 -----
 sciencebeam_gym/alignment/align_test.py       | 143 --------
 sciencebeam_gym/beam_utils/__init__.py        |   0
 sciencebeam_gym/beam_utils/csv.py             | 180 ----------
 sciencebeam_gym/beam_utils/csv_test.py        | 126 -------
 sciencebeam_gym/beam_utils/files.py           |  49 ---
 sciencebeam_gym/beam_utils/files_test.py      |  77 ----
 sciencebeam_gym/beam_utils/io.py              |  46 ---
 sciencebeam_gym/beam_utils/main.py            | 136 -------
 sciencebeam_gym/beam_utils/testing.py         | 242 -------------
 sciencebeam_gym/beam_utils/utils.py           | 107 ------
 sciencebeam_gym/beam_utils/utils_test.py      | 152 --------
 .../convert/conversion_pipeline.py            |  27 +-
 .../convert/conversion_pipeline_test.py       |   2 +-
 .../convert/grobid/grobid_service_wrapper.py  |   4 +-
 .../inference_model/__init___test.py          |   2 +-
 .../annotate_using_predictions.py             |   2 +-
 .../inference_model/extract_to_xml.py         |   2 +-
 .../inference_model/extract_to_xml_test.py    |   2 +-
 .../text/crf/crfsuite_training_pipeline.py    |  12 +-
 .../crf/crfsuite_training_pipeline_test.py    |   2 +-
 sciencebeam_gym/pdf/pdf_to_lxml_wrapper.py    |   4 +-
 .../annotation/annotation_evaluation.py       |   2 +-
 .../annotation/find_line_numbers_test.py      |   2 +-
 .../preprocess/annotation/fuzzy_match.py      |   6 +-
 .../annotation/matching_annotator.py          |   8 +-
 .../annotation/matching_annotator_test.py     |   8 +-
 .../annotation/target_annotation.py           |   8 +-
 sciencebeam_gym/preprocess/check_file_list.py |  85 -----
 .../preprocess/check_file_list_test.py        |  58 ---
 sciencebeam_gym/preprocess/find_file_pairs.py |  93 -----
 .../preprocess/find_file_pairs_test.py        | 152 --------
 .../preprocess/get_output_files.py            | 153 --------
 .../preprocess/get_output_files_test.py       | 181 ----------
 sciencebeam_gym/preprocess/lxml_to_svg.py     |  10 +-
 .../preprocess/preprocessing_pipeline.py      |  35 +-
 .../preprocess/preprocessing_pipeline_test.py |  12 +-
 .../preprocessing_transforms_test.py          |  12 +-
 .../preprocess/preprocessing_utils.py         | 144 +-------
 .../preprocess/preprocessing_utils_test.py    | 127 -------
 .../preprocess/split_csv_dataset.py           | 142 --------
 .../preprocess/split_csv_dataset_test.py      |  63 ----
 sciencebeam_gym/structured_document/lxml.py   |   8 +-
 .../structured_document_saver.py              |   2 +-
 sciencebeam_gym/structured_document/svg.py    |   8 +-
 .../tools/calculate_class_weights_test.py     |   2 +-
 sciencebeam_gym/trainer/data/examples.py      |   2 +-
 sciencebeam_gym/trainer/data/examples_test.py |   2 +-
 sciencebeam_gym/trainer/evaluator_test.py     |   2 +-
 .../trainer/models/pix2pix/evaluate_test.py   |   2 +-
 .../trainer/models/pix2pix/loss_test.py       |   2 +-
 .../models/pix2pix/pix2pix_core_test.py       |   4 +-
 .../models/pix2pix/pix2pix_model_test.py      |   2 +-
 .../trainer/models/pix2pix/tf_utils_test.py   |   2 +-
 sciencebeam_gym/utils/collection.py           |  63 ----
 sciencebeam_gym/utils/compat.py               |  14 -
 sciencebeam_gym/utils/compat_test.py          |  57 ---
 sciencebeam_gym/utils/csv.py                  |  56 ---
 sciencebeam_gym/utils/file_list.py            |  88 -----
 sciencebeam_gym/utils/file_list_test.py       | 202 -----------
 sciencebeam_gym/utils/file_path.py            |  21 --
 sciencebeam_gym/utils/file_path_test.py       |  24 --
 sciencebeam_gym/utils/io.py                   |  16 -
 sciencebeam_gym/utils/num.py                  |  21 --
 sciencebeam_gym/utils/pages_zip.py            |   2 +-
 sciencebeam_gym/utils/stopwatch.py            |  46 ---
 sciencebeam_gym/utils/string.py               |   9 -
 sciencebeam_gym/utils/xml.py                  |  41 ---
 sciencebeam_gym/utils/xml_test.py             |  62 ----
 sciencebeam_gym/utils/zip.py                  |  25 --
 78 files changed, 127 insertions(+), 3962 deletions(-)
 delete mode 100644 sciencebeam_gym/alignment/SequenceMatcher.py
 delete mode 100644 sciencebeam_gym/alignment/WordSequenceMatcher.py
 delete mode 100644 sciencebeam_gym/alignment/WordSequenceMatcher_test.py
 delete mode 100644 sciencebeam_gym/alignment/__init__.py
 delete mode 100644 sciencebeam_gym/alignment/align.py
 delete mode 100644 sciencebeam_gym/alignment/align_fast_utils.pyx
 delete mode 100644 sciencebeam_gym/alignment/align_performance.py
 delete mode 100644 sciencebeam_gym/alignment/align_test.py
 delete mode 100644 sciencebeam_gym/beam_utils/__init__.py
 delete mode 100644 sciencebeam_gym/beam_utils/csv.py
 delete mode 100644 sciencebeam_gym/beam_utils/csv_test.py
 delete mode 100644 sciencebeam_gym/beam_utils/files.py
 delete mode 100644 sciencebeam_gym/beam_utils/files_test.py
 delete mode 100644 sciencebeam_gym/beam_utils/io.py
 delete mode 100644 sciencebeam_gym/beam_utils/main.py
 delete mode 100644 sciencebeam_gym/beam_utils/testing.py
 delete mode 100644 sciencebeam_gym/beam_utils/utils.py
 delete mode 100644 sciencebeam_gym/beam_utils/utils_test.py
 delete mode 100644 sciencebeam_gym/preprocess/check_file_list.py
 delete mode 100644 sciencebeam_gym/preprocess/check_file_list_test.py
 delete mode 100644 sciencebeam_gym/preprocess/find_file_pairs.py
 delete mode 100644 sciencebeam_gym/preprocess/find_file_pairs_test.py
 delete mode 100644 sciencebeam_gym/preprocess/get_output_files.py
 delete mode 100644 sciencebeam_gym/preprocess/get_output_files_test.py
 delete mode 100644 sciencebeam_gym/preprocess/split_csv_dataset.py
 delete mode 100644 sciencebeam_gym/preprocess/split_csv_dataset_test.py
 delete mode 100644 sciencebeam_gym/utils/collection.py
 delete mode 100644 sciencebeam_gym/utils/compat.py
 delete mode 100644 sciencebeam_gym/utils/compat_test.py
 delete mode 100644 sciencebeam_gym/utils/csv.py
 delete mode 100644 sciencebeam_gym/utils/file_list.py
 delete mode 100644 sciencebeam_gym/utils/file_list_test.py
 delete mode 100644 sciencebeam_gym/utils/file_path.py
 delete mode 100644 sciencebeam_gym/utils/file_path_test.py
 delete mode 100644 sciencebeam_gym/utils/io.py
 delete mode 100644 sciencebeam_gym/utils/num.py
 delete mode 100644 sciencebeam_gym/utils/stopwatch.py
 delete mode 100644 sciencebeam_gym/utils/string.py
 delete mode 100644 sciencebeam_gym/utils/xml.py
 delete mode 100644 sciencebeam_gym/utils/xml_test.py
 delete mode 100644 sciencebeam_gym/utils/zip.py

diff --git a/README.md b/README.md
index a8d0a71..5a73a6f 100644
--- a/README.md
+++ b/README.md
@@ -107,7 +107,7 @@ The parent directory per manuscript is optional. If that is not the case then th
 Run:
 
 ```bash
-python -m sciencebeam_gym.preprocess.find_file_pairs \
+python -m sciencebeam_utils.tools.find_file_pairs \
 --data-path <source directory> \
 --source-pattern *.pdf.gz --xml-pattern *.nxml.gz \
 --out <output file list csv/tsv>
@@ -116,7 +116,7 @@ python -m sciencebeam_gym.preprocess.find_file_pairs \
 e.g.:
 
 ```bash
-python -m sciencebeam_gym.preprocess.find_file_pairs \
+python -m sciencebeam_utils.tools.find_file_pairs \
 --data-path gs://some-bucket/some-dataset \
 --source-pattern *.pdf.gz --xml-pattern *.nxml.gz \
 --out gs://some-bucket/some-dataset/file-list.tsv
@@ -134,7 +134,7 @@ That file could also be generated using any other preferred method.
 To separate the file list into a _training_, _validation_ and _test_ dataset, the following script can be used:
 
 ```bash
-python -m sciencebeam_gym.preprocess.split_csv_dataset \
+python -m sciencebeam_utils.tools.split_csv_dataset \
 --input <csv/tsv file list> \
 --train 0.5 --validation 0.2 --test 0.3 --random --fill
 ```
@@ -142,7 +142,7 @@ python -m sciencebeam_gym.preprocess.split_csv_dataset \
 e.g.:
 
 ```bash
-python -m sciencebeam_gym.preprocess.split_csv_dataset \
+python -m sciencebeam_utils.tools.split_csv_dataset \
 --input gs://some-bucket/some-dataset/file-list.tsv \
 --train 0.5 --validation 0.2 --test 0.3 --random --fill
 ```
@@ -261,7 +261,7 @@ A (Linear Chain) [CRF](https://en.wikipedia.org/wiki/Conditional_random_field) m
 When running the (CV) preprocessing pipeline with the parameter `--save-svg`, files with the file ext `.svg.zip` will be written to the _output path_. To get a list of those one can use the following command:
 
 ```bash
-python -m sciencebeam_gym.preprocess.get_output_files \
+python -m sciencebeam_utils.tools.get_output_files \
   --source-file-list path/to/file-list-train.tsv --source-file-column=pdf-url \
   --output-file-suffix=.svg.zip --output-file-list path/to/file-list-train-svg.tsv
 ```
@@ -273,7 +273,7 @@ The CRF Model can also be trained using the CV output as an additional input.
 When running the CV conversion pipeline with the parameter `--save-annot-lxml`, files with the ext `.cv.lxml.gz` will be written to the _output path_. To get a list of those one can use the following command:
 
 ```bash
-python -m sciencebeam_gym.preprocess.get_output_files \
+python -m sciencebeam_utils.tools.get_output_files \
   --source-file-list path/to/file-list-train.tsv --source-file-column=pdf-url \
   --output-file-suffix=.cv.lxml.gz --output-file-list path/to/file-list-train-cv-lxml.tsv
 ```
diff --git a/requirements.txt b/requirements.txt
index 7120aaa..4802b98 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,6 +9,8 @@ protobuf==3.5.2.post1
 python-crfsuite==0.9.5
 Pyqtree==0.24
 requests==2.18.4
+sciencebeam-alignment==0.0.2
+sciencebeam-utils==0.0.1
 sklearn-crfsuite==0.3.6
 six==1.11.0
 tensorflow-transform==0.6.0
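
A quick way to confirm the two new pins resolve in the current environment
(a sketch using pkg_resources from setuptools):

```python
import pkg_resources

# Raises DistributionNotFound if either dependency is missing:
for name in ('sciencebeam-alignment', 'sciencebeam-utils'):
    print(name, pkg_resources.get_distribution(name).version)
```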
diff --git a/sciencebeam_gym/alignment/SequenceMatcher.py b/sciencebeam_gym/alignment/SequenceMatcher.py
deleted file mode 100644
index 0f11012..0000000
--- a/sciencebeam_gym/alignment/SequenceMatcher.py
+++ /dev/null
@@ -1,14 +0,0 @@
-import platform
-import warnings
-
-try:
-  from fuzzywuzzy.StringMatcher import StringMatcher as SequenceMatcher
-except ImportError:
-  if platform.python_implementation() != "PyPy":
-    warnings.warn(
-      'Using slow pure-python SequenceMatcher.'
-      ' Install python-Levenshtein (and fuzzywuzzy) to remove this warning'
-    )
-  from difflib import SequenceMatcher
-
-assert SequenceMatcher is not None
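
For reference, a minimal usage sketch of the fallback pattern the removed
shim implemented (prefer the C-backed matcher, fall back to difflib):

```python
try:
    # C-backed implementation via python-Levenshtein (through fuzzywuzzy)
    from fuzzywuzzy.StringMatcher import StringMatcher as SequenceMatcher
except ImportError:
    from difflib import SequenceMatcher  # slower pure-Python fallback

print(SequenceMatcher(None, 'abc123xyz', 'abc987xyz').ratio())
```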
diff --git a/sciencebeam_gym/alignment/WordSequenceMatcher.py b/sciencebeam_gym/alignment/WordSequenceMatcher.py
deleted file mode 100644
index 75ee892..0000000
--- a/sciencebeam_gym/alignment/WordSequenceMatcher.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import logging
-from difflib import SequenceMatcher
-
-DEFAULT_SEPARATORS = ' ,'
-
-def get_logger():
-  return logging.getLogger(__name__)
-
-def split_with_offset(s, sep):
-  previous_start = 0
-  tokens = []
-  for i, c in enumerate(s):
-    if c in sep:
-      if previous_start < i:
-        tokens.append((previous_start, s[previous_start:i]))
-      previous_start = i + 1
-  if previous_start < len(s):
-    tokens.append((previous_start, s[previous_start:]))
-  return tokens
-
-class WordSequenceMatcher(object):
-  def __init__(self, isjunk=None, a=None, b=None, sep=None):
-    if isjunk:
-      raise ValueError('isjunk not supported')
-    self.a = a
-    self.b = b
-    self.sep = sep or DEFAULT_SEPARATORS
-
-  def get_matching_blocks(self):
-    a_words_with_offsets = split_with_offset(self.a, self.sep)
-    b_words_with_offsets = split_with_offset(self.b, self.sep)
-    a_words = [w for _, w in a_words_with_offsets]
-    b_words = [w for _, w in b_words_with_offsets]
-    a_indices = [i for i, _ in a_words_with_offsets]
-    b_indices = [i for i, _ in b_words_with_offsets]
-    sm = SequenceMatcher(None, a_words, b_words)
-    matching_blocks = [
-      (a_indices[ai], b_indices[bi], len(a_words[ai]))
-      for ai, bi, size in sm.get_matching_blocks()
-      if size
-    ]
-    return matching_blocks
diff --git a/sciencebeam_gym/alignment/WordSequenceMatcher_test.py b/sciencebeam_gym/alignment/WordSequenceMatcher_test.py
deleted file mode 100644
index 56a52d7..0000000
--- a/sciencebeam_gym/alignment/WordSequenceMatcher_test.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from sciencebeam_gym.alignment.WordSequenceMatcher import (
-  WordSequenceMatcher
-)
-
-WORD_1 = 'word1'
-WORD_2 = 'word2'
-WORD_3 = 'word3'
-
-class TestWordSequenceMatcher(object):
-  def test_should_not_match_different_words(self):
-    sm = WordSequenceMatcher(None, WORD_1, WORD_2)
-    matching_blocks = sm.get_matching_blocks()
-    assert len(matching_blocks) == 0
-
-  def test_should_match_same_words_standalone(self):
-    sm = WordSequenceMatcher(None, WORD_1, WORD_1)
-    matching_blocks = sm.get_matching_blocks()
-    assert matching_blocks == [(
-      0,
-      0,
-      len(WORD_1)
-    )]
-
-  def test_should_match_same_words_within_other_words(self):
-    a_words = ['pre_a__', WORD_1, 'post_a']
-    b_words = ['pre_b', WORD_1, 'post_b']
-    sm = WordSequenceMatcher(
-      None,
-      ' '.join(a_words),
-      ' '.join(b_words)
-    )
-    matching_blocks = sm.get_matching_blocks()
-    assert matching_blocks == [(
-      len(a_words[0]) + 1,
-      len(b_words[0]) + 1,
-      len(WORD_1)
-    )]
-
-  def test_should_match_same_words_standalone_ignore_comma_after_word(self):
-    sm = WordSequenceMatcher(None, WORD_1 + ',', WORD_1)
-    matching_blocks = sm.get_matching_blocks()
-    assert matching_blocks == [(
-      0,
-      0,
-      len(WORD_1)
-    )]
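
A small sketch of the removed word-level matcher's semantics (imports
reference the pre-patch module; the sciencebeam-alignment equivalent is
assumed to behave the same): tokens keep their character offsets, and
matches are whole words.

```python
from sciencebeam_gym.alignment.WordSequenceMatcher import (
    WordSequenceMatcher, split_with_offset
)

# Tokenise on separator characters, keeping each token's character offset:
print(split_with_offset('ab, cd', ' ,'))  # [(0, 'ab'), (4, 'cd')]

# Blocks are (a_offset, b_offset, match_length) in character terms:
sm = WordSequenceMatcher(None, 'pre word1 post', 'word1')
print(sm.get_matching_blocks())  # [(4, 0, 5)]
```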
diff --git a/sciencebeam_gym/alignment/__init__.py b/sciencebeam_gym/alignment/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/sciencebeam_gym/alignment/align.py b/sciencebeam_gym/alignment/align.py
deleted file mode 100644
index c3094c2..0000000
--- a/sciencebeam_gym/alignment/align.py
+++ /dev/null
@@ -1,338 +0,0 @@
-import logging
-from collections import deque
-from itertools import islice
-from abc import ABCMeta, abstractmethod
-from contextlib import contextmanager
-
-import numpy as np
-
-from six import (
-  with_metaclass,
-  string_types,
-  binary_type
-)
-
-try:
-  from sciencebeam_gym.alignment.align_fast_utils import ( # pylint: disable=E0611
-    native_compute_inner_alignment_matrix_simple_scoring_int,
-    native_compute_inner_alignment_matrix_simple_scoring_any,
-    native_compute_inner_alignment_matrix_scoring_fn_any,
-    native_alignment_matrix_single_path_traceback
-  )
-  native_enabled = True
-except ImportError:
-  logging.getLogger(__name__).warning('fast implementation not available')
-  native_enabled = False
-
-MIN_INT = -2147483647
-
-def get_logger():
-  return logging.getLogger(__name__)
-
-@contextmanager
-def require_native(required=True):
-  global native_enabled # pylint: disable=W0603
-  was_enabled = native_enabled
-  native_enabled = required
-  yield
-  native_enabled = was_enabled
-
-def _is_array_of_type(a, dtype):
-  return np.issubdtype(a.dtype, dtype)
-
-# fallback implementation
-def compute_inner_alignment_matrix_simple_scoring_py(
-  scoring_matrix, a, b, match_score, mismatch_score, gap_score, min_score):
-  m = len(a) + 1
-  n = len(b) + 1
-  for i in range(1, m):
-    for j in range(1, n):
-      scoring_matrix[i, j] = max(
-        min_score,
-
-        # Match elements.
-        scoring_matrix[i - 1, j - 1] +
-        (match_score if a[i - 1] == b[j - 1] else mismatch_score),
-
-        # Gap on sequenceA.
-        scoring_matrix[i, j - 1] + gap_score,
-
-        # Gap on sequenceB.
-        scoring_matrix[i - 1, j] + gap_score
-      )
-
-# fallback implementation
-def compute_inner_alignment_matrix_scoring_fn_py(
-  scoring_matrix, a, b, scoring_fn, gap_score, min_score):
-  m = len(a) + 1
-  n = len(b) + 1
-  for i in range(1, m):
-    for j in range(1, n):
-      scoring_matrix[i, j] = max(
-        min_score,
-
-        # Match elements.
-        scoring_matrix[i - 1, j - 1] +
-        scoring_fn(a[i - 1], b[j - 1]),
-
-        # Gap on sequenceA.
-        scoring_matrix[i, j - 1] + gap_score,
-
-        # Gap on sequenceB.
-        scoring_matrix[i - 1, j] + gap_score
-      )
-
-def compute_inner_alignment_matrix_simple_scoring(
-  scoring_matrix, a, b, match_score, mismatch_score, gap_score, min_score):
-  try:
-    if (
-      native_enabled and
-      _is_array_of_type(a, np.int32) and _is_array_of_type(b, np.int32)
-    ):
-      native_compute_inner_alignment_matrix_simple_scoring_int(
-        scoring_matrix, a, b,
-        match_score, mismatch_score, gap_score, min_score
-      )
-      return
-    elif native_enabled:
-      native_compute_inner_alignment_matrix_simple_scoring_any(
-        scoring_matrix, a, b,
-        match_score, mismatch_score, gap_score, min_score
-      )
-      return
-  except AttributeError:
-    pass
-  compute_inner_alignment_matrix_simple_scoring_py(
-    scoring_matrix, a, b,
-    match_score, mismatch_score, gap_score, min_score
-  )
-
-def compute_inner_alignment_matrix_custom_scoring(
-  scoring_matrix, a, b, scoring_fn, gap_score, min_score):
-
-  if native_enabled:
-    native_compute_inner_alignment_matrix_scoring_fn_any(
-      scoring_matrix, a, b,
-      scoring_fn, gap_score, min_score
-    )
-  else:
-    compute_inner_alignment_matrix_scoring_fn_py(
-      scoring_matrix, a, b,
-      scoring_fn, gap_score, min_score
-    )
-
-def compute_inner_alignment_matrix(
-  scoring_matrix, a, b, scoring, min_score):
-  if isinstance(scoring, CustomScoring):
-    compute_inner_alignment_matrix_custom_scoring(
-      scoring_matrix, a, b,
-      scoring.scoring_fn, scoring.gap_score, min_score
-    )
-  else:
-    compute_inner_alignment_matrix_simple_scoring(
-      scoring_matrix, a, b,
-      scoring.match_score, scoring.mismatch_score, scoring.gap_score,
-      min_score
-    )
-
-def _next_locs(score_matrix, i, j, is_local):
-  diag_score = score_matrix[i - 1][j - 1] if (i != 0 and j != 0) else MIN_INT
-  up_score = score_matrix[i - 1][j] if i != 0 else MIN_INT
-  left_score = score_matrix[i][j - 1] if j != 0 else MIN_INT
-  max_score = max(diag_score, up_score, left_score)
-  if max_score == MIN_INT:
-    return []
-  if (max_score == 0 or diag_score == 0) and (is_local or (i == 1 and j == 1)):
-    return []
-  if diag_score == max_score:
-    get_logger().debug('diag_score: %s (%s)', diag_score, max_score)
-    return [(i - 1, j - 1)]
-  locs = []
-  if up_score == max_score:
-    locs.append((i - 1, j))
-  if left_score == max_score:
-    locs.append((i, j - 1))
-  return locs
-
-def alignment_matrix_traceback_py(score_matrix, start_locs, is_local):
-  # Using LinkedListNode to cheaply branch off to multiple paths
-  pending_roots = deque([
-    LinkedListNode(tuple(loc))
-    for loc in start_locs
-  ])
-  while len(pending_roots) > 0:
-    n = pending_roots.pop()
-    i, j = n.data
-    next_locs = _next_locs(score_matrix, i, j, is_local)
-    get_logger().debug('next_locs: %s', next_locs)
-    if len(next_locs) == 0:
-      yield n
-    else:
-      pending_roots.extend([
-        LinkedListNode(next_loc, n)
-        for next_loc in next_locs
-      ])
-
-def alignment_matrix_traceback(score_matrix, start_locs, is_local, limit):
-  if native_enabled and limit == 1:
-    yield native_alignment_matrix_single_path_traceback(
-      score_matrix, start_locs[0], 1 if is_local else 0
-    )
-  else:
-    paths = alignment_matrix_traceback_py(
-      score_matrix, reversed(start_locs), is_local
-    )
-    if limit:
-      paths = islice(paths, limit)
-    for path in paths:
-      yield path
-
-class SimpleScoring(object):
-  def __init__(self, match_score, mismatch_score, gap_score):
-    self.match_score = match_score
-    self.mismatch_score = mismatch_score
-    self.gap_score = gap_score
-
-class CustomScoring(object):
-  def __init__(self, scoring_fn, gap_score):
-    self.scoring_fn = scoring_fn
-    self.gap_score = gap_score
-
-class LinkedListNode(object):
-  def __init__(self, data, next_node=None):
-    self.data = data
-    self.next_node = next_node
-
-  def __str__(self):
-    if self.next_node is not None:
-      return str(self.data) + ' -> ' + str(self.next_node)
-    else:
-      return str(self.data)
-
-  def __iter__(self):
-    yield self.data
-    next_node = self.next_node
-    while next_node is not None:
-      yield next_node.data
-      next_node = next_node.next_node
-
-def _path_to_matching_blocks(path, a, b):
-  block_ai = 0
-  block_bi = 0
-  block_size = 0
-  for ai, bi in ((ai_ - 1, bi_ - 1) for ai_, bi_ in path):
-    if a[ai] == b[bi]:
-      if block_size and block_ai + block_size == ai and block_bi + block_size == bi:
-        block_size += 1
-      else:
-        if block_size:
-          yield (block_ai, block_bi, block_size)
-        block_ai = ai
-        block_bi = bi
-        block_size = 1
-  if block_size:
-    yield (block_ai, block_bi, block_size)
-
-def _as_np_array(s):
-  if isinstance(s, binary_type):
-    return np.frombuffer(s, dtype=np.uint8).astype(np.int32)
-  if isinstance(s, string_types):
-    return np.array([ord(c) for c in s], dtype=np.int32)
-  return np.asarray(s)
-
-wrap_sequence = _as_np_array
-
-class AbstractSequenceMatcher(object, with_metaclass(ABCMeta)):
-  def __init__(self, a, b, scoring):
-    self.a = a
-    self.b = b
-    self.scoring = scoring
-    self._alignment_matrix = None
-    self._a = _as_np_array(a)
-    self._b = _as_np_array(b)
-
-  @abstractmethod
-  def _computer_alignment_matrix(self):
-    pass
-
-  def _get_alignment_matrix(self):
-    if self._alignment_matrix is None:
-      self._alignment_matrix = self._computer_alignment_matrix()
-    return self._alignment_matrix
-
-  @abstractmethod
-  def get_multiple_matching_blocks(self, limit=None):
-    pass
-
-  def get_matching_blocks(self):
-    for matching_blocks in self.get_multiple_matching_blocks(limit=1):
-      return list(matching_blocks) + [(len(self.a), len(self.b), 0)]
-    return [(len(self.a), len(self.b), 0)]
-
-class LocalSequenceMatcher(AbstractSequenceMatcher):
-  """
-  Local sequence matcher using Smith-Waterman algorithm
-  """
-
-  def _computer_alignment_matrix(self):
-    m = len(self._a) + 1
-    n = len(self._b) + 1
-    scoring_matrix = np.empty((m, n), dtype=int)
-    scoring_matrix[:, 0] = 0
-    scoring_matrix[0, :] = 0
-    min_score = 0
-    compute_inner_alignment_matrix(
-      scoring_matrix,
-      self._a, self._b,
-      self.scoring,
-      min_score
-    )
-    return scoring_matrix
-
-  def get_multiple_matching_blocks(self, limit=None):
-    score_matrix = self._get_alignment_matrix()
-    max_score = score_matrix.max()
-
-    max_score_loc = np.argwhere(score_matrix == max_score)
-    get_logger().debug('max_score_loc: %s', max_score_loc)
-    is_local = True
-    paths = alignment_matrix_traceback(score_matrix, max_score_loc, is_local, limit=limit or 0)
-    return (
-      list(_path_to_matching_blocks(path, self.a, self.b))
-      for path in paths
-    )
-
-class GlobalSequenceMatcher(AbstractSequenceMatcher):
-  """
-  Global sequence matcher using Needleman-Wunsch algorithm
-  """
-
-  def _computer_alignment_matrix(self):
-    m = len(self._a) + 1
-    n = len(self._b) + 1
-    scoring_matrix = np.empty((m, n), dtype=int)
-    for i in range(m):
-      scoring_matrix[i, 0] = self.scoring.gap_score * i
-    for j in range(n):
-      scoring_matrix[0, j] = self.scoring.gap_score * j
-    min_score = MIN_INT
-    compute_inner_alignment_matrix(
-      scoring_matrix,
-      self._a, self._b,
-      self.scoring,
-      min_score
-    )
-    return scoring_matrix
-
-  def get_multiple_matching_blocks(self, limit=None):
-    score_matrix = self._get_alignment_matrix()
-
-    m = len(self._a) + 1
-    n = len(self._b) + 1
-    start_locs = [(m - 1, n - 1)]
-    is_local = False
-    paths = alignment_matrix_traceback(score_matrix, start_locs, is_local, limit=limit or 0)
-    return (
-      list(_path_to_matching_blocks(path, self.a, self.b))
-      for path in paths
-    )
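
Usage sketch for the two matchers above, with values taken from the
accompanying tests; after this patch the same API is assumed to be
importable from sciencebeam_alignment.align:

```python
from sciencebeam_gym.alignment.align import (
    LocalSequenceMatcher, GlobalSequenceMatcher, SimpleScoring
)

SCORING = SimpleScoring(match_score=3, mismatch_score=-1, gap_score=-2)

# Local (Smith-Waterman) drops the second block after a large gap;
# the trailing zero-size tuple is the difflib-style sentinel:
print(LocalSequenceMatcher('abcxyz', 'abc123456xyz', SCORING).get_matching_blocks())
# [(0, 0, 3), (6, 12, 0)]

# Global (Needleman-Wunsch) keeps both blocks across the gap:
print(GlobalSequenceMatcher('abcxyz', 'abc123456xyz', SCORING).get_matching_blocks())
# [(0, 0, 3), (3, 9, 3), (6, 12, 0)]
```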
diff --git a/sciencebeam_gym/alignment/align_fast_utils.pyx b/sciencebeam_gym/alignment/align_fast_utils.pyx
deleted file mode 100644
index bc263ad..0000000
--- a/sciencebeam_gym/alignment/align_fast_utils.pyx
+++ /dev/null
@@ -1,147 +0,0 @@
-from cpython cimport array
-cimport cython
-
-import logging
-
-import numpy as np
-cimport numpy as np
-
-DEF MIN_INT = -2147483647
-
-def get_logger():
-  return logging.getLogger(__name__)
-
-ctypedef np.int_t int_t
-ctypedef int_t[:, :] score_matrix_t
-ctypedef bint bool_t
-
-cdef inline int imax2(int a, int b):
-  if a >= b:
-    return a
-  else:
-    return b
-
-cdef inline int imax3(int a, int b, int c):
-  if a >= b:
-    return imax2(a, c)
-  else:
-    return imax2(b, c)
-
-cdef inline int imax4(int a, int b, int c, int d):
-  if a >= b:
-    return imax3(a, c, d)
-  else:
-    return imax3(b, c, d)
-
-def native_compute_inner_alignment_matrix_simple_scoring_int(
-  score_matrix_t scoring_matrix,
-  int[:] a,
-  int[:] b,
-  int match_score, int mismatch_score, int gap_score, int min_score):
-  cdef int m = len(a) + 1
-  cdef int n = len(b) + 1
-  cdef int i, j
-  for i in range(1, m):
-    for j in range(1, n):
-      scoring_matrix[i, j] = imax4(
-        min_score,
-
-        # Match elements.
-        scoring_matrix[i - 1, j - 1] +
-        (match_score if a[i - 1] == b[j - 1] else mismatch_score),
-
-        # Gap on sequenceA.
-        scoring_matrix[i, j - 1] + gap_score,
-
-        # Gap on sequenceB.
-        scoring_matrix[i - 1, j] + gap_score
-      )
-
-def native_compute_inner_alignment_matrix_simple_scoring_any(
-  score_matrix_t scoring_matrix,
-  a,
-  b,
-  int match_score, int mismatch_score, int gap_score, int min_score):
-  cdef list ca = list(a)
-  cdef list cb = list(b)
-  cdef int m = len(ca) + 1
-  cdef int n = len(cb) + 1
-  cdef int i, j
-  for i in range(1, m):
-    for j in range(1, n):
-      scoring_matrix[i, j] = imax4(
-        min_score,
-
-        # Match elements.
-        scoring_matrix[i - 1, j - 1] +
-        (match_score if ca[i - 1] == cb[j - 1] else mismatch_score),
-
-        # Gap on sequenceA.
-        scoring_matrix[i, j - 1] + gap_score,
-
-        # Gap on sequenceB.
-        scoring_matrix[i - 1, j] + gap_score
-      )
-
-def native_compute_inner_alignment_matrix_scoring_fn_any(
-  score_matrix_t scoring_matrix,
-  a,
-  b,
-  scoring_fn, int gap_score, int min_score):
-  cdef list ca = list(a)
-  cdef list cb = list(b)
-  cdef int m = len(ca) + 1
-  cdef int n = len(cb) + 1
-  cdef int i, j
-  for i in range(1, m):
-    for j in range(1, n):
-      scoring_matrix[i, j] = imax4(
-        min_score,
-
-        # Match elements.
-        scoring_matrix[i - 1, j - 1] +
-        scoring_fn(ca[i - 1], cb[j - 1]),
-
-        # Gap on sequenceA.
-        scoring_matrix[i, j - 1] + gap_score,
-
-        # Gap on sequenceB.
-        scoring_matrix[i - 1, j] + gap_score
-      )
-
-cdef inline _next_loc(
-  score_matrix_t score_matrix, int i, int j, bool_t is_local):
-
-  diag_score = score_matrix[i - 1][j - 1] if (i != 0 and j != 0) else MIN_INT
-  up_score = score_matrix[i - 1][j] if i != 0 else MIN_INT
-  left_score = score_matrix[i][j - 1] if j != 0 else MIN_INT
-  max_score = imax3(diag_score, up_score, left_score)
-  if max_score == MIN_INT:
-    return None
-  if (max_score == 0 or diag_score == 0) and (is_local or (i == 1 and j == 1)):
-    # stop at local match, or end
-    return None
-  if diag_score == max_score:
-    return (i - 1, j - 1)
-  if up_score == max_score:
-    return (i - 1, j)
-  if left_score == max_score:
-    return (i, j - 1)
-  return None
-
-def native_alignment_matrix_single_path_traceback(
-  score_matrix_t score_matrix,
-  start_loc, bool_t is_local):
-
-  cdef int[2] cur_loc = (int(start_loc[0]), int(start_loc[1]))
-  cdef list path = [cur_loc]
-  cdef int i, j
-  cdef tuple next_loc
-  while True:
-    i, j = cur_loc
-    next_loc = _next_loc(score_matrix, i, j, is_local)
-    if not next_loc:
-      return path
-    else:
-      cur_loc = next_loc
-      path.insert(0, cur_loc)
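
The .pyx module above is compiled as part of the package build; for ad-hoc
experiments it can also be compiled on import (a sketch assuming Cython and
the numpy headers are installed):

```python
import numpy as np
import pyximport

# numpy include path is needed because the module cimports numpy:
pyximport.install(setup_args={'include_dirs': np.get_include()})
```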
diff --git a/sciencebeam_gym/alignment/align_performance.py b/sciencebeam_gym/alignment/align_performance.py
deleted file mode 100644
index 0901f58..0000000
--- a/sciencebeam_gym/alignment/align_performance.py
+++ /dev/null
@@ -1,85 +0,0 @@
-from __future__ import absolute_import, print_function
-
-import logging
-import timeit
-
-import numpy as np
-
-from sciencebeam_gym.alignment.align import (
-  SimpleScoring,
-  CustomScoring,
-  LocalSequenceMatcher,
-  require_native
-)
-
-DEFAULT_MATCH_SCORE = 2
-DEFAULT_MISMATCH_SCORE = -1
-DEFAULT_GAP_SCORE = -3
-
-DEFAULT_SCORING = SimpleScoring(
-  DEFAULT_MATCH_SCORE, DEFAULT_MISMATCH_SCORE, DEFAULT_GAP_SCORE
-)
-
-CUSTOM_SCORING = CustomScoring(
-  lambda a, b: DEFAULT_MATCH_SCORE if a == b else DEFAULT_MISMATCH_SCORE,
-  DEFAULT_GAP_SCORE
-)
-
-SHORT_STRING1 = 'abc'
-SHORT_STRING2 = 'def'
-
-LONG_STRING1 = 'abcefghijk' * 100
-LONG_STRING2 = ''.join(list(reversed(LONG_STRING1)))
-
-def encode_str(s):
-  return np.array([int(ord(x)) for x in s], dtype=np.int32)
-
-LONG_ENCODED1 = encode_str(LONG_STRING1)
-LONG_ENCODED2 = encode_str(LONG_STRING2)
-
-def test_align_with_scoring_fn_py():
-  with require_native(False):
-    LocalSequenceMatcher(LONG_STRING1, LONG_STRING2, CUSTOM_SCORING).get_matching_blocks()
-
-def test_align_with_scoring_fn():
-  with require_native(True):
-    LocalSequenceMatcher(LONG_STRING1, LONG_STRING2, CUSTOM_SCORING).get_matching_blocks()
-
-def test_align_with_simple_scoring():
-  with require_native(True):
-    LocalSequenceMatcher(LONG_STRING1, LONG_STRING2, DEFAULT_SCORING).get_matching_blocks()
-
-def test_align_with_simple_scoring_int():
-  with require_native(True):
-    LocalSequenceMatcher(LONG_ENCODED1, LONG_ENCODED2, DEFAULT_SCORING).get_matching_blocks()
-
-def test_align_with_simple_scoring_str():
-  with require_native(True):
-    LocalSequenceMatcher(LONG_STRING1, LONG_STRING2, DEFAULT_SCORING).get_matching_blocks()
-
-def report_timing(fn, number=1):
-  timeit_result_ms = timeit.timeit(
-    fn + "()",
-    setup="from __main__ import " + fn,
-    number=number
-  ) * 1000
-  print("{} ({}x):\n{:f} ms / it ({:f} ms total)\n".format(
-    fn,
-    number,
-    timeit_result_ms / number,
-    timeit_result_ms
-  ))
-
-def main():
-  print("len LONG_STRING1: {}\n".format(len(LONG_STRING1)))
-  print("len LONG_ENCODED1: {}\n".format(len(LONG_ENCODED1)))
-  report_timing("test_align_with_scoring_fn_py")
-  report_timing("test_align_with_scoring_fn", 3)
-  report_timing("test_align_with_simple_scoring", 3)
-  report_timing("test_align_with_simple_scoring_int", 3)
-  report_timing("test_align_with_simple_scoring_str", 3)
-
-if __name__ == "__main__":
-  logging.basicConfig(level='INFO')
-
-  main()
diff --git a/sciencebeam_gym/alignment/align_test.py b/sciencebeam_gym/alignment/align_test.py
deleted file mode 100644
index 3e62b4c..0000000
--- a/sciencebeam_gym/alignment/align_test.py
+++ /dev/null
@@ -1,143 +0,0 @@
-from abc import ABCMeta, abstractmethod
-from contextlib import contextmanager
-
-from six import (
-  with_metaclass,
-  u as as_u,
-  b as as_b
-)
-
-import numpy as np
-
-from sciencebeam_gym.alignment.align import (
-  LocalSequenceMatcher,
-  GlobalSequenceMatcher,
-  SimpleScoring,
-  CustomScoring,
-  require_native
-)
-
-DEFAULT_SCORING = SimpleScoring(
-  match_score=3,
-  mismatch_score=-1,
-  gap_score=-2
-)
-
-DEFAULT_CUSTOM_SCORING = CustomScoring(
-  scoring_fn=lambda a, b: (
-    DEFAULT_SCORING.match_score if a == b else DEFAULT_SCORING.mismatch_score
-  ),
-  gap_score=-2
-)
-
-def _non_zero(matching_blocks):
-  return [(ai, bi, size) for ai, bi, size in matching_blocks if size]
-
-class CharWrapper(object):
-  def __init__(self, c):
-    self.c = c
-
-  def __eq__(self, b):
-    return self.c == b.c
-
-class AbstractTestCommonSequenceMatcher(object, with_metaclass(ABCMeta)):
-  @abstractmethod
-  def _convert(self, x):
-    pass
-
-  @contextmanager
-  def _wrap_test(self):
-    yield
-
-  def test_should_not_return_non_zero_blocks_for_no(self):
-    with self._wrap_test():
-      sm = self._matcher(a='a', b='b')
-      assert _non_zero(sm.get_matching_blocks()) == []
-
-  def test_should_add_zero_block_with_final_indices(self):
-    with self._wrap_test():
-      sm = self._matcher(a='a', b='b')
-      assert sm.get_matching_blocks() == [(1, 1, 0)]
-
-  def test_should_return_single_character_match(self):
-    with self._wrap_test():
-      sm = self._matcher(a='a', b='a')
-      assert _non_zero(sm.get_matching_blocks()) == [(0, 0, 1)]
-
-  def test_should_return_combine_character_match_block(self):
-    with self._wrap_test():
-      sm = self._matcher(a='abc', b='abc')
-      assert _non_zero(sm.get_matching_blocks()) == [(0, 0, 3)]
-
-  def test_should_return_combine_character_match_blocks_with_gaps(self):
-    with self._wrap_test():
-      sm = self._matcher(a='abc123xyz', b='abc987xyz')
-      assert _non_zero(sm.get_matching_blocks()) == [(0, 0, 3), (6, 6, 3)]
-
-  def test_should_align_using_custom_scoring_fn(self):
-    with self._wrap_test():
-      sm = self._matcher(a='a', b='a', scoring=DEFAULT_CUSTOM_SCORING)
-      assert _non_zero(sm.get_matching_blocks()) == [(0, 0, 1)]
-
-class AbstractTestLocalSequenceMatcher(AbstractTestCommonSequenceMatcher):
-  def _matcher(self, a, b, scoring=None):
-    return LocalSequenceMatcher(
-      a=self._convert(a),
-      b=self._convert(b),
-      scoring=scoring or DEFAULT_SCORING
-    )
-
-  def test_should_not_match_block_after_big_gap(self):
-    with self._wrap_test():
-      sm = self._matcher(a='abcxyz', b='abc123456xyz')
-      assert _non_zero(sm.get_matching_blocks()) == [(0, 0, 3)]
-
-class TestLocalSequenceMatcherWithUnicode(AbstractTestLocalSequenceMatcher):
-  def _convert(self, x):
-    return as_u(x)
-
-class TestLocalSequenceMatcherWithBytes(AbstractTestLocalSequenceMatcher):
-  def _convert(self, x):
-    return as_b(x)
-
-class TestLocalSequenceMatcherWithIntList(AbstractTestLocalSequenceMatcher):
-  def _convert(self, x):
-    return [ord(c) for c in x]
-
-class TestLocalSequenceMatcherWithNumpyInt32Array(AbstractTestLocalSequenceMatcher):
-  def _convert(self, x):
-    return np.array([ord(c) for c in x], dtype=np.int32)
-
-class TestLocalSequenceMatcherWithNumpyInt64Array(AbstractTestLocalSequenceMatcher):
-  def _convert(self, x):
-    return np.array([ord(c) for c in x], dtype=np.int64)
-
-class TestLocalSequenceMatcherWithCustomObjectList(AbstractTestLocalSequenceMatcher):
-  def _convert(self, x):
-    return [CharWrapper(c) for c in x]
-
-class TestLocalSequenceMatcherWithNumpyInt32ArrayWithoutNative(
-  TestLocalSequenceMatcherWithNumpyInt32Array):
-
-  @contextmanager
-  def _wrap_test(self):
-    with require_native(False):
-      yield
-
-class AbstractTestGlobalSequenceMatcher(AbstractTestCommonSequenceMatcher):
-  def _matcher(self, a, b, scoring=None):
-    with require_native(False):
-      return GlobalSequenceMatcher(
-        a=self._convert(a),
-        b=self._convert(b),
-        scoring=scoring or DEFAULT_SCORING
-      )
-
-  def test_should_prefer_match_block_after_big_gap(self):
-    with self._wrap_test():
-      sm = self._matcher(a='abcxyz', b='abc123456xyz')
-      assert _non_zero(sm.get_matching_blocks()) == [(0, 0, 3), (3, 9, 3)]
-
-class TestGlobalSequenceMatcherWithUnicode(AbstractTestGlobalSequenceMatcher):
-  def _convert(self, x):
-    return as_u(x)
diff --git a/sciencebeam_gym/beam_utils/__init__.py b/sciencebeam_gym/beam_utils/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/sciencebeam_gym/beam_utils/csv.py b/sciencebeam_gym/beam_utils/csv.py
deleted file mode 100644
index b8a51e7..0000000
--- a/sciencebeam_gym/beam_utils/csv.py
+++ /dev/null
@@ -1,180 +0,0 @@
-from __future__ import absolute_import
-
-import logging
-import csv
-from io import BytesIO
-
-from six import string_types
-
-import apache_beam as beam
-from apache_beam.io.textio import WriteToText, ReadFromText
-from apache_beam.io.filesystem import CompressionTypes
-from apache_beam.io.filebasedsource import FileBasedSource
-
-from sciencebeam_gym.beam_utils.utils import (
-  TransformAndLog
-)
-
-from sciencebeam_gym.utils.csv import (
-  csv_delimiter_by_filename
-)
-
-def get_logger():
-  return logging.getLogger(__name__)
-
-def DictToList(fields):
-  def wrapper(x):
-    get_logger().debug('DictToList: %s -> %s', fields, x)
-    return [x.get(field) for field in fields]
-  return wrapper
-
-def format_csv_rows(rows, delimiter=','):
-  get_logger().debug('format_csv_rows, rows: %s', rows)
-  out = BytesIO()
-  writer = csv.writer(out, delimiter=delimiter)
-  writer.writerows([
-    [
-      x.encode('utf-8') if isinstance(x, string_types) else x
-      for x in row
-    ]
-    for row in rows
-  ])
-  result = out.getvalue().decode('utf-8').rstrip('\r\n')
-  get_logger().debug('format_csv_rows, result: %s', result)
-  return result
-
-class WriteDictCsv(beam.PTransform):
-  def __init__(self, path, columns, file_name_suffix=None):
-    super(WriteDictCsv, self).__init__()
-    self.path = path
-    self.columns = columns
-    self.file_name_suffix = file_name_suffix
-    self.delimiter = csv_delimiter_by_filename(path + file_name_suffix)
-
-  def expand(self, pcoll):
-    return (
-      pcoll |
-      "ToList" >> beam.Map(DictToList(self.columns)) |
-      "Format" >> TransformAndLog(
-        beam.Map(lambda x: format_csv_rows([x], delimiter=self.delimiter)),
-        log_prefix='formatted csv: ',
-        log_level='debug'
-      ) |
-      "Utf8Encode" >> beam.Map(lambda x: x.encode('utf-8')) |
-      "Write" >> WriteToText(
-        self.path,
-        file_name_suffix=self.file_name_suffix,
-        header=format_csv_rows([self.columns], delimiter=self.delimiter).encode('utf-8')
-      )
-    )
-
-def _strip_quotes(s):
-  return s[1:-1] if len(s) >= 2 and s[0] == '"' and s[-1] == '"' else s
-
-# copied and modified from https://github.com/pabloem/beam_utils
-# (move back if still active)
-
-class ReadLineIterator(object):
-  def __init__(self, obj):
-    self._obj = obj
-
-  def __iter__(self):
-    return self
-
-  def next(self):
-    line = self._obj.readline()
-    if line == None or line == '':
-      raise StopIteration
-    return line
-
-class CsvFileSource(FileBasedSource):
-  """ A source for a GCS or local comma-separated-file
-  Parses a text file assuming newline-delimited lines,
-  and comma-delimited fields. Assumes UTF-8 encoding.
-  """
-
-  def __init__(
-    self, file_pattern,
-    compression_type=CompressionTypes.AUTO,
-    delimiter=',', header=True, dictionary_output=True,
-    validate=True, limit=None):
-    """ Initialize a CsvFileSource.
-    Args:
-      delimiter: The delimiter character in the CSV file.
-      header: Whether the input file has a header or not.
-        Default: True
-      dictionary_output: The kind of records that the CsvFileSource outputs.
-        If True, then it will output dict()'s, if False it will output list()'s.
-        Default: True
-    Raises:
-      ValueError: If the input arguments are not consistent.
-    """
-    super(CsvFileSource, self).__init__(
-      file_pattern,
-      compression_type=compression_type,
-      validate=validate,
-      splittable=False # Can't just split anywhere
-    )
-    self.delimiter = delimiter
-    self.header = header
-    self.dictionary_output = dictionary_output
-    self.limit = limit
-    self._file = None
-
-    if not self.header and dictionary_output:
-      raise ValueError(
-        'header is required for the CSV reader to provide dictionary output'
-      )
-
-  def read_records(self, file_name, range_tracker):
-    # If a multi-file pattern was specified as a source then make sure the
-    # start/end offsets use the default values for reading the entire file.
-    headers = None
-    self._file = self.open_file(file_name)
-
-    reader = csv.reader(ReadLineIterator(self._file), delimiter=self.delimiter)
-
-    line_no = 0
-    for i, row in enumerate(reader):
-      if self.header and i == 0:
-        headers = row
-        continue
-
-      if self.limit and line_no >= self.limit:
-        break
-
-      line_no += 1
-      if self.dictionary_output:
-        yield {
-          header: value
-          for header, value in zip(headers, row)
-        }
-      else:
-        yield row
-
-
-class ReadDictCsv(beam.PTransform):
-  """
-  Simplified CSV parser, which does not support:
-  * multi-line values
-  * delimiter within value
-  """
-  def __init__(self, filename, header=True, limit=None):
-    super(ReadDictCsv, self).__init__()
-    if not header:
-      raise RuntimeError('header required')
-    self.filename = filename
-    self.columns = None
-    self.delimiter = csv_delimiter_by_filename(filename)
-    self.limit = limit
-    self.row_num = 0
-
-  def expand(self, pcoll):
-    return (
-      pcoll |
-      beam.io.Read(CsvFileSource(
-        self.filename,
-        delimiter=self.delimiter,
-        limit=self.limit
-      ))
-    )
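
A minimal pipeline sketch combining the two transforms above (the tests
that follow exercise the same behaviour); post-patch they are assumed to
live unchanged in sciencebeam-utils:

```python
import apache_beam as beam

from sciencebeam_gym.beam_utils.csv import ReadDictCsv, WriteDictCsv

with beam.Pipeline() as p:
    _ = (
        p |
        ReadDictCsv('input.tsv') |  # delimiter inferred from the extension
        'Write' >> WriteDictCsv('.temp/output', columns=['a', 'b'], file_name_suffix='.tsv')
    )
```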
diff --git a/sciencebeam_gym/beam_utils/csv_test.py b/sciencebeam_gym/beam_utils/csv_test.py
deleted file mode 100644
index f4c5c4c..0000000
--- a/sciencebeam_gym/beam_utils/csv_test.py
+++ /dev/null
@@ -1,126 +0,0 @@
-from __future__ import absolute_import
-
-from contextlib import contextmanager
-from mock import patch
-
-import pytest
-
-import apache_beam as beam
-from apache_beam.testing.util import (
-  assert_that,
-  equal_to
-)
-
-from sciencebeam_gym.beam_utils.testing import (
-  TestPipeline,
-  BeamTest,
-  MockWriteToText,
-  patch_beam_io
-)
-
-from sciencebeam_gym.beam_utils.csv import (
-  WriteDictCsv,
-  ReadDictCsv,
-  format_csv_rows
-)
-
-MODULE_UNDER_TEST = 'sciencebeam_gym.beam_utils.csv'
-
-@contextmanager
-def patch_module_under_test(**kwargs):
-  with patch.multiple(
-    MODULE_UNDER_TEST,
-    **kwargs
-  ) as mocks:
-    yield mocks
-
-def to_csv(rows, delimiter):
-  return format_csv_rows(rows, delimiter).encode('utf-8').replace('\r\n', '\n') + '\n'
-
-@pytest.mark.slow
-class TestWriteDictCsv(BeamTest):
-  def test_should_write_tsv_with_header(self, test_context):
-    with patch_module_under_test(WriteToText=MockWriteToText):
-      with TestPipeline() as p:
-        _ = (
-          p |
-          beam.Create([{
-            'a': 'a1',
-            'b': 'b1'
-          }]) |
-          WriteDictCsv(
-            '.temp/dummy',
-            ['a', 'b'],
-            '.tsv'
-          )
-        )
-      assert test_context.get_file_content('.temp/dummy.tsv') == to_csv([
-        ['a', 'b'],
-        ['a1', 'b1']
-      ], '\t')
-
-@pytest.mark.slow
-class TestReadDictCsv(BeamTest):
-  def test_should_read_rows_as_dict(self, test_context):
-    with patch_beam_io():
-      test_context.set_file_content('.temp/dummy.tsv', to_csv([
-        ['a', 'b'],
-        ['a1', 'b1']
-      ], '\t'))
-
-      with TestPipeline() as p:
-        result = (
-          p |
-          ReadDictCsv('.temp/dummy.tsv')
-        )
-        assert_that(result, equal_to([{
-          'a': 'a1',
-          'b': 'b1'
-        }]))
-
-  def test_should_read_multiple(self, test_context):
-    with patch_beam_io():
-      test_context.set_file_content('.temp/dummy.tsv', to_csv([
-        ['a', 'b'],
-        ['a1', 'b1'],
-        ['a2', 'b2'],
-        ['a3', 'b3']
-      ], '\t'))
-
-      with TestPipeline() as p:
-        result = (
-          p |
-          ReadDictCsv('.temp/dummy.tsv')
-        )
-        assert_that(result, equal_to([{
-          'a': 'a1',
-          'b': 'b1'
-        }, {
-          'a': 'a2',
-          'b': 'b2'
-        }, {
-          'a': 'a3',
-          'b': 'b3'
-        }]))
-
-  def test_should_limit_number_of_rows(self, test_context):
-    with patch_beam_io():
-      test_context.set_file_content('.temp/dummy.tsv', to_csv([
-        ['a', 'b'],
-        ['a1', 'b1'],
-        ['a2', 'b2'],
-        ['a3', 'b3']
-      ], '\t'))
-
-      with TestPipeline() as p:
-        result = (
-          p |
-          ReadDictCsv('.temp/dummy.tsv', limit=2)
-        )
-        assert_that(result, equal_to([{
-          'a': 'a1',
-          'b': 'b1'
-        }, {
-          'a': 'a2',
-          'b': 'b2'
-        }]))
diff --git a/sciencebeam_gym/beam_utils/files.py b/sciencebeam_gym/beam_utils/files.py
deleted file mode 100644
index 9a3227c..0000000
--- a/sciencebeam_gym/beam_utils/files.py
+++ /dev/null
@@ -1,49 +0,0 @@
-from itertools import islice
-
-import apache_beam as beam
-
-from sciencebeam_gym.beam_utils.csv import (
-  ReadDictCsv
-)
-
-from sciencebeam_gym.beam_utils.io import (
-  find_matching_filenames
-)
-
-from sciencebeam_gym.beam_utils.utils import (
-  GroupTransforms
-)
-
-from sciencebeam_gym.utils.file_list import (
-  load_file_list
-)
-
-def find_matching_filenames_with_limit(pattern, limit=None):
-  return islice(
-    find_matching_filenames(pattern),
-    limit
-  )
-
-def ReadFileList(file_list_path, column, limit=None):
-  file_list = load_file_list(file_list_path, column=column, limit=limit)
-  return beam.Create(file_list)
-
-def DeferredReadFileList(file_list_path, column, limit=None):
-  return GroupTransforms(lambda p: (
-    p |
-    "ReadFileUrls" >> ReadDictCsv(file_list_path, limit=limit) |
-    "TranslateFileUrls" >> beam.Map(lambda row: row[column])
-  ))
-
-def FindFiles(file_pattern, limit=None):
-  file_list = list(find_matching_filenames_with_limit(file_pattern, limit=limit))
-  return beam.Create(file_list)
-
-def DeferredFindFiles(file_pattern, limit=None):
-  return GroupTransforms(lambda p: (
-    p |
-    beam.Create([file_pattern]) |
-    "FindFiles" >> beam.FlatMap(
-      lambda pattern: find_matching_filenames_with_limit(pattern, limit)
-    )
-  ))
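
Usage sketch: ReadFileList loads the list eagerly at pipeline-construction
time, while DeferredReadFileList defers the read into the pipeline (column
name as used by the tests below):

```python
import apache_beam as beam

from sciencebeam_gym.beam_utils.files import ReadFileList

with beam.Pipeline() as p:
    _ = (
        p |
        ReadFileList('file-list.tsv', column='url', limit=10) |
        'Process' >> beam.Map(lambda file_url: file_url)  # placeholder step
    )
```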
diff --git a/sciencebeam_gym/beam_utils/files_test.py b/sciencebeam_gym/beam_utils/files_test.py
deleted file mode 100644
index c11b5dc..0000000
--- a/sciencebeam_gym/beam_utils/files_test.py
+++ /dev/null
@@ -1,77 +0,0 @@
-from mock import patch
-
-import apache_beam as beam
-from apache_beam.testing.util import (
-  assert_that,
-  equal_to
-)
-
-from sciencebeam_gym.beam_utils.testing import (
-  BeamTest,
-  TestPipeline
-)
-
-import sciencebeam_gym.beam_utils.files as files_module
-from sciencebeam_gym.beam_utils.files import (
-  ReadFileList,
-  DeferredReadFileList,
-  FindFiles,
-  DeferredFindFiles
-)
-
-FILE_1 = 'file1.pdf'
-FILE_2 = 'file2.pdf'
-
-FILE_LIST_PATH = 'file-list.lst'
-COLUMN = 'url'
-LIMIT = 10
-
-class TestReadFileList(BeamTest):
-  def test_should_use_load_file_list(self):
-    with patch.object(files_module, 'load_file_list') as load_file_list:
-      load_file_list.return_value = [FILE_1, FILE_2]
-      with TestPipeline() as p:
-        result = p | ReadFileList(FILE_LIST_PATH, column=COLUMN, limit=LIMIT)
-        assert_that(result, equal_to([FILE_1, FILE_2]))
-      load_file_list.assert_called_with(FILE_LIST_PATH, column=COLUMN, limit=LIMIT)
-
-class TestDeferredReadFileList(BeamTest):
-  def test_should_use_read_dict_csv(self):
-    with patch.object(files_module, 'ReadDictCsv') as ReadDictCsv:
-      ReadDictCsv.return_value = beam.Create([{COLUMN: FILE_1}, {COLUMN: FILE_2}])
-      with TestPipeline() as p:
-        result = p | DeferredReadFileList(FILE_LIST_PATH, column=COLUMN, limit=LIMIT)
-        assert_that(result, equal_to([FILE_1, FILE_2]))
-      ReadDictCsv.assert_called_with(FILE_LIST_PATH, limit=LIMIT)
-
-class TestFindFiles(BeamTest):
-  def test_should_use_find_matching_filenames(self):
-    with patch.object(files_module, 'find_matching_filenames') as find_matching_filenames:
-      find_matching_filenames.return_value = [FILE_1, FILE_2]
-      with TestPipeline() as p:
-        result = p | FindFiles(FILE_LIST_PATH, limit=LIMIT)
-        assert_that(result, equal_to([FILE_1, FILE_2]))
-      find_matching_filenames.assert_called_with(FILE_LIST_PATH)
-
-  def test_should_apply_limit(self):
-    with patch.object(files_module, 'find_matching_filenames') as find_matching_filenames:
-      find_matching_filenames.return_value = [FILE_1, FILE_2]
-      with TestPipeline() as p:
-        result = p | FindFiles(FILE_LIST_PATH, limit=1)
-        assert_that(result, equal_to([FILE_1]))
-
-class TestDeferredFindFiles(BeamTest):
-  def test_should_use_find_matching_filenames(self):
-    with patch.object(files_module, 'find_matching_filenames') as find_matching_filenames:
-      find_matching_filenames.return_value = [FILE_1, FILE_2]
-      with TestPipeline() as p:
-        result = p | DeferredFindFiles(FILE_LIST_PATH, limit=LIMIT)
-        assert_that(result, equal_to([FILE_1, FILE_2]))
-      find_matching_filenames.assert_called_with(FILE_LIST_PATH)
-
-  def test_should_apply_limit(self):
-    with patch.object(files_module, 'find_matching_filenames') as find_matching_filenames:
-      find_matching_filenames.return_value = [FILE_1, FILE_2]
-      with TestPipeline() as p:
-        result = p | DeferredFindFiles(FILE_LIST_PATH, limit=1)
-        assert_that(result, equal_to([FILE_1]))
diff --git a/sciencebeam_gym/beam_utils/io.py b/sciencebeam_gym/beam_utils/io.py
deleted file mode 100644
index 0e2ec91..0000000
--- a/sciencebeam_gym/beam_utils/io.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from __future__ import absolute_import
-
-from io import BytesIO
-import logging
-
-from apache_beam.io.filesystems import FileSystems
-
-DEFAULT_BUFFER_SIZE = 4096 * 1024
-
-def get_logger():
-  return logging.getLogger(__name__)
-
-def read_all_from_path(path, buffer_size=DEFAULT_BUFFER_SIZE):
-  with FileSystems.open(path) as f:
-    out = BytesIO()
-    while True:
-      buf = f.read(buffer_size)
-      if not buf:
-        break
-      out.write(buf)
-    return out.getvalue()
-
-def dirname(path):
-  return FileSystems.split(path)[0]
-
-def basename(path):
-  return FileSystems.split(path)[1]
-
-def find_matching_filenames(pattern):
-  return (x.path for x in FileSystems.match([pattern])[0].metadata_list)
-
-def mkdirs_if_not_exists(path):
-  if not FileSystems.exists(path):
-    try:
-      get_logger().info('attempting to create directory: %s', path)
-      FileSystems.mkdirs(path)
-    except IOError:
-      if not FileSystems.exists(path):
-        raise
-
-def save_file_content(output_filename, data):
-  mkdirs_if_not_exists(dirname(output_filename))
-  # Note: FileSystems.create transparently handles compression based on the file extension
-  with FileSystems.create(output_filename) as f:
-    f.write(data)
-  return output_filename
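
Round-trip sketch for the helpers above; FileSystems compresses and
decompresses transparently based on the .gz extension:

```python
from sciencebeam_gym.beam_utils.io import (
    read_all_from_path, save_file_content
)

path = save_file_content('.temp/example.txt.gz', b'hello world')
assert read_all_from_path(path) == b'hello world'
```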
diff --git a/sciencebeam_gym/beam_utils/main.py b/sciencebeam_gym/beam_utils/main.py
deleted file mode 100644
index e5bb44f..0000000
--- a/sciencebeam_gym/beam_utils/main.py
+++ /dev/null
@@ -1,136 +0,0 @@
-import errno
-import logging
-import os
-import subprocess
-
-def get_logger():
-  return logging.getLogger(__name__)
-
-def create_fn_api_runner():
-  from apache_beam.runners.portability.fn_api_runner import FnApiRunner
-  return FnApiRunner()
-
-def get_cloud_project():
-  cmd = [
-    'gcloud', '-q', 'config', 'list', 'project',
-    '--format=value(core.project)'
-  ]
-  with open(os.devnull, 'w') as dev_null:
-    try:
-      res = subprocess.check_output(cmd, stderr=dev_null).strip()
-      if not res:
-        raise Exception(
-          '--cloud specified but no Google Cloud Platform '
-          'project found.\n'
-          'Please specify your project name with the --project '
-          'flag or set a default project: '
-          'gcloud config set project YOUR_PROJECT_NAME'
-        )
-      return res
-    except OSError as e:
-      if e.errno == errno.ENOENT:
-        raise Exception(
-          'gcloud is not installed. The Google Cloud SDK is '
-          'necessary to communicate with the Cloud ML service. '
-          'Please install and set up gcloud.'
-        )
-      raise
-
-def get_default_job_name(name, suffix=''):
-  from getpass import getuser
-  from time import gmtime, strftime
-  timestamp_str = strftime("%Y%m%d-%H%M%S", gmtime())
-  return '%s-%s%s-%s' % (name or 'beamapp', getuser(), suffix or '', timestamp_str)
-
-def get_or_create_sciencebeam_gym_dist_path():
-  import sys
-  import pkg_resources
-
-  dist = pkg_resources.get_distribution("sciencebeam_gym")
-  sciencebeam_gym_path = dist.location
-  sciencebeam_gym_version = dist.version
-  subprocess.call([
-    'python', 'setup.py', 'sdist'
-  ], cwd=sciencebeam_gym_path, stdout=sys.stdout, stderr=sys.stderr)
-  sciencebeam_gym_dist_path = os.path.join(
-    sciencebeam_gym_path,
-    'dist/sciencebeam_gym-%s.tar.gz' % sciencebeam_gym_version
-  )
-  return sciencebeam_gym_dist_path
-
-def process_sciencebeam_gym_dep_args(args):
-  """
-  If in cloud mode, add local sciencebeam-gym dependency and build distribution.
-  That way we don't need to keep an updated public package available.
-  (the project may be re-structured by then)
-  """
-  if args.cloud:
-    sciencebeam_gym_dist_path = get_or_create_sciencebeam_gym_dist_path()
-    get_logger().info('sciencebeam_gym_dist_path: %s', sciencebeam_gym_dist_path)
-    args.extra_package = sciencebeam_gym_dist_path
-
-def add_cloud_args(parser):
-  parser.add_argument(
-    '--cloud',
-    default=False,
-    action='store_true'
-  )
-  parser.add_argument(
-    '--runner',
-    required=False,
-    default=None,
-    help='Runner.'
-  )
-  parser.add_argument(
-    '--project',
-    type=str,
-    help='The cloud project name to be used for running this pipeline'
-  )
-  parser.add_argument(
-    '--num_workers',
-    default=1,
-    type=int,
-    help='The number of workers.'
-  )
-  parser.add_argument(
-    '--job_name', type=str, required=False,
-    help='The name of the cloud job'
-  )
-  parser.add_argument(
-    '--job-name-suffix', type=str, required=False,
-    help='A suffix appended to the job name'
-  )
-
-def process_cloud_args(parsed_args, output_path, name=None):
-  if parsed_args.num_workers:
-    parsed_args.autoscaling_algorithm = 'NONE'
-    parsed_args.max_num_workers = parsed_args.num_workers
-  parsed_args.setup_file = './setup.py'
-
-  if parsed_args.cloud:
-    # Flags which need to be set for cloud runs.
-    default_values = {
-      'project':
-        get_cloud_project(),
-      'temp_location':
-        os.path.join(os.path.dirname(output_path), 'temp'),
-      'runner':
-        'DataflowRunner',
-      'save_main_session':
-        True,
-    }
-    if not parsed_args.job_name:
-      parsed_args.job_name = get_default_job_name(name, parsed_args.job_name_suffix)
-  else:
-    # Flags which need to be set for local runs.
-    default_values = {
-      'runner': 'DirectRunner',
-    }
-
-  get_logger().info('default_values: %s', default_values)
-  for kk, vv in default_values.iteritems():
-    if kk not in parsed_args or not vars(parsed_args)[kk]:
-      vars(parsed_args)[kk] = vv
-
-  if parsed_args.runner == 'FnApiRunner':
-    parsed_args.runner = create_fn_api_runner()
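
Sketch of wiring the CLI helpers above into an argparse parser (a local,
non --cloud run shown):

```python
import argparse

from sciencebeam_gym.beam_utils.main import (
    add_cloud_args, process_cloud_args
)

parser = argparse.ArgumentParser()
add_cloud_args(parser)
args = parser.parse_args(['--num_workers', '2'])

# Fills in runner/project/job-name defaults depending on --cloud:
process_cloud_args(args, output_path='gs://some-bucket/output', name='example')
print(args.runner)  # 'DirectRunner' for local runs
```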
diff --git a/sciencebeam_gym/beam_utils/testing.py b/sciencebeam_gym/beam_utils/testing.py
deleted file mode 100644
index 0e0307b..0000000
--- a/sciencebeam_gym/beam_utils/testing.py
+++ /dev/null
@@ -1,242 +0,0 @@
-from __future__ import absolute_import
-
-import logging
-from contextlib import contextmanager
-from io import BytesIO
-from mock import patch, Mock, MagicMock
-from mock.mock import MagicProxy
-from abc import ABCMeta, abstractmethod
-
-import pytest
-
-from six import with_metaclass
-
-import apache_beam as beam
-from apache_beam.coders.coders import ToStringCoder, StrUtf8Coder
-from apache_beam.testing.test_pipeline import TestPipeline as _TestPipeline
-from apache_beam.io.filesystem import FileMetadata, MatchResult, CompressionTypes
-from apache_beam.io.filesystems import FileSystems
-from apache_beam.metrics.metric import MetricsFilter
-
-
-class TestPipeline(_TestPipeline):
-  __test__ = False
-
-  def __init__(self, *args, **kwargs):
-    super(TestPipeline, self).__init__(*args, **kwargs)
-    self._pipeline_result = None
-
-  def run(self):
-    # Make sure we're only running the pipeline once
-    if not self._pipeline_result:
-      self._pipeline_result = super(TestPipeline, self).run()
-    return self._pipeline_result
-
-
-_local = {}
-
-def get_logger():
-  return logging.getLogger(__name__)
-
-class TestContext(object):
-  def __init__(self):
-    self.file_content_map = dict()
-    self.object_map = dict()
-
-  def set_file_content(self, name, content):
-    get_logger().debug('set_file_content: %s (size: %d)', name, len(content))
-    self.file_content_map[name] = content
-
-  def get_file_content(self, name):
-    return self.file_content_map.get(name)
-
-def get_current_test_context():
-  return _local['test_context']
-
-# Apache Beam serialises everything; pretend that Mocks can be serialised
-def unpickle_mock(state):
-  # get_logger().debug('unpickle mock: state=%s', state)
-  obj_id = state[0] if isinstance(state, tuple) else state
-  obj = get_current_test_context().object_map[obj_id]
-  return obj
-
-unpickle_mock.__safe_for_unpickling__ = True
-
-def mock_reduce(obj):
-  obj_id = id(obj)
-  # get_logger().debug('pickle mock, obj_id: %s', obj_id)
-  get_current_test_context().object_map[obj_id] = obj
-  return unpickle_mock, (obj_id,)
-
-for c in [Mock, MagicMock, MagicProxy]:
-  c.__reduce__ = mock_reduce
-
-@pytest.mark.filterwarnings('ignore::DeprecationWarning')
-@pytest.mark.filterwarnings('ignore::UserWarning')
-class BeamTest(object):
-  @pytest.fixture(name='test_context', autouse=True)
-  def init_test_context(self):
-    get_logger().debug('setting up test context')
-    test_context = TestContext()
-    _local['test_context'] = test_context
-    yield test_context
-    get_logger().debug('clearing test context')
-    del _local['test_context']
-
-class MockWriteToText(beam.PTransform):
-  class WriteDoFn(beam.DoFn):
-    def __init__(self, file_path_prefix,
-      file_name_suffix='',
-      coder=ToStringCoder(),
-      header=None):
-
-      self.filename = file_path_prefix + file_name_suffix
-      self.file_obj = None
-      self.coder = coder
-      self.header = header
-
-    def start_bundle(self):
-      assert self.filename
-      self.file_obj = BytesIO()
-      if self.header:
-        self.file_obj.write(self.coder.encode(self.header) + '\n')
-
-    def process(self, element):
-      assert self.file_obj
-      self.file_obj.write(self.coder.encode(element) + '\n')
-
-    def finish_bundle(self):
-      assert self.file_obj
-      self.file_obj.flush()
-      file_content = self.file_obj.getvalue()
-      get_logger().debug('file content: %s: %s', self.filename, file_content)
-      test_context = get_current_test_context()
-      test_context.set_file_content(self.filename, file_content)
-      self.file_obj.close()
-
-  def __init__(self, *args, **kwargs):
-    self._sink = MockWriteToText.WriteDoFn(*args, **kwargs)
-
-  def expand(self, pcoll):
-    return pcoll | 'MockWriteToText' >> beam.ParDo(self._sink)
-
-def MockReadFromText(
-  file_pattern=None,
-  coder=StrUtf8Coder(),
-  skip_header_lines=0):
-
-  file_content = get_current_test_context().get_file_content(file_pattern)
-  if file_content is None:
-    raise RuntimeError('no file content set for %s' % file_pattern)
-  lines = file_content.replace('\r\n', '\n').split('\n')
-  if skip_header_lines:
-    lines = lines[skip_header_lines:]
-  return 'MockReadFromText' >> beam.Create(
-    [
-      coder.decode(line)
-      for line in lines
-    ]
-  )
-
-class MockFileBasedSource(beam.io.filebasedsource.FileBasedSource):
-  def open_file(self, file_name):
-    file_content = get_current_test_context().get_file_content(file_name)
-    if file_content is None:
-      raise RuntimeError('no file content set for %s' % file_name)
-    return BytesIO(file_content)
-
-class AbstractFileSystem(with_metaclass(ABCMeta, object)):
-  @abstractmethod
-  def open(
-    self, path, mime_type='application/octet-stream',
-    compression_type=CompressionTypes.AUTO):
-    pass
-
-  @abstractmethod
-  def create(
-    self, path, mime_type='application/octet-stream',
-    compression_type=CompressionTypes.AUTO):
-    pass
-
-class MockFileSystem(AbstractFileSystem):
-  @classmethod
-  def scheme(cls):
-    return 'mock'
-
-  def match(self, patterns, limits=None):
-    test_context = get_current_test_context()
-    file_content_map = test_context.file_content_map
-    all_files = file_content_map.keys()
-    if limits is None:
-      limits = [None] * len(patterns)
-    results = []
-    for pattern, limit in zip(patterns, limits):
-      files = all_files[:limit]
-      metadata = [
-        FileMetadata(f, len(file_content_map[f]))
-        for f in files
-      ]
-      results.append(MatchResult(pattern, metadata))
-    return results
-
-  def open(
-    self, path, mime_type='application/octet-stream',
-    compression_type=CompressionTypes.AUTO):
-
-    file_content = get_current_test_context().get_file_content(path)
-    if file_content is None:
-      raise RuntimeError('no file content set for %s' % path)
-    return BytesIO(file_content)
-
-  def create(
-    self, path, mime_type='application/octet-stream',
-    compression_type=CompressionTypes.AUTO):
-
-    out = BytesIO()
-    out.close = lambda: (
-      get_current_test_context()
-      .set_file_content(path, out.getvalue())
-    )
-    return out
-
-  def rename(self, source_file_names, destination_file_names):
-    test_context = get_current_test_context()
-    file_content_map = test_context.file_content_map
-    for source_file_name, destination_file_name in zip(source_file_names, destination_file_names):
-      get_logger().debug('renaming %s to %s', source_file_name, destination_file_name)
-      if source_file_name not in file_content_map:
-        raise IOError('mock file does not exist: %s' % source_file_name)
-      if destination_file_name in file_content_map:
-        raise IOError('mock file already exists: %s' % destination_file_name)
-      file_content_map[destination_file_name] = file_content_map[source_file_name]
-      del file_content_map[source_file_name]
-
-  def mkdirs(self, path):
-    get_logger().debug('mkdirs: %s (no-op)', path)
-
-def mock_get_filesystem(*_):
-  return MockFileSystem()
-
-@contextmanager
-def patch_beam_io():
-  with patch.object(FileSystems, 'get_filesystem', classmethod(mock_get_filesystem)):
-    yield
-
-def get_counter_values(pipeline_result, names, wait_until_finish=True):
-  if wait_until_finish:
-    pipeline_result.wait_until_finish()
-  counter_values = dict()
-  for name in names:
-    counter = pipeline_result.metrics().query(
-      MetricsFilter().with_name(name)
-    )['counters']
-    assert len(counter) <= 1
-    if len(counter) == 1:
-      counter_values[name] = counter[0].committed
-  return counter_values
-
-def get_counter_value(pipeline_result, name, default_value=None, wait_until_finish=True):
-  counter_values = get_counter_values(
-    pipeline_result, [name], wait_until_finish=wait_until_finish
-  )
-  return counter_values.get(name, default_value)
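
A hedged sketch of a Beam test using the relocated helpers. BeamTest, TestPipeline and get_current_test_context are confirmed imports elsewhere in this patch; MockReadFromText is not, so treat its availability under sciencebeam_utils.beam_utils.testing as an assumption.

from apache_beam.testing.util import assert_that, equal_to

from sciencebeam_utils.beam_utils.testing import (
  BeamTest,
  TestPipeline,
  MockReadFromText,  # assumed to be exported from the moved module
  get_current_test_context
)

class TestMockRead(BeamTest):
  def test_should_read_mocked_file_content(self):
    # BeamTest's autouse fixture provides the test context backing the mocks
    get_current_test_context().set_file_content('input.txt', 'line1\nline2')
    with TestPipeline() as p:
      result = p | MockReadFromText('input.txt')
      assert_that(result, equal_to(['line1', 'line2']))
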
diff --git a/sciencebeam_gym/beam_utils/utils.py b/sciencebeam_gym/beam_utils/utils.py
deleted file mode 100644
index 5b16af0..0000000
--- a/sciencebeam_gym/beam_utils/utils.py
+++ /dev/null
@@ -1,107 +0,0 @@
-import logging
-from random import getrandbits
-
-import apache_beam as beam
-from apache_beam.metrics.metric import Metrics
-
-def get_logger():
-  return logging.getLogger(__name__)
-
-def Spy(f):
-  def spy_wrapper(x):
-    f(x)
-    return x
-  return spy_wrapper
-
-def MapSpy(f):
-  return beam.Map(Spy(f))
-
-def MapOrLog(fn, log_fn=None, error_count=None):
-  if log_fn is None:
-    log_fn = lambda e, x: (
-      get_logger().warning(
-        'caught exception (ignoring item): %s, input: %.100s...',
-        e, x, exc_info=e
-      )
-    )
-  error_counter = (
-    Metrics.counter('MapOrLog', error_count)
-    if error_count
-    else None
-  )
-  def wrapper(x):
-    try:
-      yield fn(x)
-    except Exception as e:
-      if error_counter:
-        error_counter.inc()
-      log_fn(e, x)
-  return beam.FlatMap(wrapper)
-
-LEVEL_MAP = {
-  'info': logging.INFO,
-  'debug': logging.DEBUG
-}
-
-def Count(name, counter_value_fn):
-  counter = Metrics.counter('Count', name)
-  def wrapper(x):
-    counter.inc(counter_value_fn(x) if counter_value_fn else 1)
-    return x
-  return name >> beam.Map(wrapper)
-
-class GroupTransforms(beam.PTransform):
-  """
-  Convenience method to allow a PTransform for grouping purpose
-  to be defined using a lambda function.
-  (Completely unrelated to GroupBy transforms)
-  """
-  def __init__(self, expand_fn):
-    super(GroupTransforms, self).__init__()
-    self.expand_fn = expand_fn
-
-  def expand(self, pcoll): # pylint: disable=W0221
-    return self.expand_fn(pcoll)
-
-def TransformAndCount(transform, counter_name, counter_value_fn=None):
-  return GroupTransforms(lambda pcoll: (
-    pcoll |
-    transform |
-    "Count" >> Count(counter_name, counter_value_fn)
-  ))
-
-def TransformAndLog(transform, log_fn=None, log_prefix='', log_value_fn=None, log_level='info'):
-  if log_fn is None:
-    if log_value_fn is None:
-      log_value_fn = lambda x: x
-    log_level = LEVEL_MAP.get(log_level, log_level)
-    log_fn = lambda x: get_logger().log(
-      log_level, '%s%.50s...', log_prefix, log_value_fn(x)
-    )
-
-  return GroupTransforms(lambda pcoll: (
-    pcoll |
-    transform |
-    "Log" >> MapSpy(log_fn)
-  ))
-
-def random_key():
-  return getrandbits(32)
-
-def PreventFusion(key_fn=None, name="PreventFusion"):
-  """
-  Prevents fusion to allow better distribution across workers.
-
-  See:
-  https://cloud.google.com/dataflow/service/dataflow-service-desc#preventing-fusion
-
-  TODO Replace by: https://github.com/apache/beam/pull/4040
-  """
-  if key_fn is None:
-    key_fn = lambda _: random_key()
-  return name >> GroupTransforms(lambda pcoll: (
-    pcoll |
-    "AddKey" >> beam.Map(lambda x: (key_fn(x), x)) |
-    "GroupByKey" >> beam.GroupByKey() |
-    "Ungroup" >> beam.FlatMap(lambda element: element[1])
-  ))
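
A small sketch showing how the moved combinators compose in a pipeline; all three names are confirmed imports from sciencebeam_utils.beam_utils.utils elsewhere in this patch, and the sketch assumes the moved versions keep the behaviour shown above.

import apache_beam as beam

from sciencebeam_utils.beam_utils.utils import (
  MapOrLog,
  TransformAndCount,
  PreventFusion
)

def build_example(p):
  return (
    p |
    beam.Create(['a', 'b']) |
    PreventFusion() |  # shuffles via random keys so workers share the load
    "Process" >> MapOrLog(
      lambda x: x.upper(),  # items raising exceptions are logged and skipped
      error_count='process_error_count'
    ) |
    TransformAndCount(beam.Map(lambda x: x), 'processed_count')
  )
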
diff --git a/sciencebeam_gym/beam_utils/utils_test.py b/sciencebeam_gym/beam_utils/utils_test.py
deleted file mode 100644
index f5804a5..0000000
--- a/sciencebeam_gym/beam_utils/utils_test.py
+++ /dev/null
@@ -1,152 +0,0 @@
-import logging
-
-import pytest
-
-import apache_beam as beam
-from apache_beam.testing.util import (
-  assert_that,
-  equal_to
-)
-
-from sciencebeam_gym.beam_utils.testing import (
-  BeamTest,
-  TestPipeline,
-  get_counter_value
-)
-
-from sciencebeam_gym.beam_utils.utils import (
-  MapOrLog,
-  TransformAndLog,
-  TransformAndCount,
-  PreventFusion
-)
-
-SOME_VALUE_1 = 'value 1'
-SOME_VALUE_2 = 'value 2'
-SOME_VALUE_CAUSING_EXCEPTION = 1
-
-SOME_FN = lambda x: x.upper()
-def FN_RAISING_EXCEPTION(_):
-  raise RuntimeError('oh dear')
-
-ERROR_COUNT_METRIC_NAME = 'error_count'
-COUNT_METRIC_NAME_1 = 'count_1'
-
-def get_logger():
-  return logging.getLogger(__name__)
-
-def setup_module():
-  logging.basicConfig(level='DEBUG')
-
-@pytest.mark.slow
-class TestMapOrLog(BeamTest):
-  def test_should_pass_through_return_value_if_no_exception_was_raised(self):
-    with TestPipeline() as p:
-      result = (
-        p |
-        beam.Create([SOME_VALUE_1]) |
-        MapOrLog(SOME_FN)
-      )
-      assert_that(result, equal_to([SOME_FN(SOME_VALUE_1)]))
-
-  def test_should_skip_entries_that_cause_an_exception(self):
-    with TestPipeline() as p:
-      result = (
-        p |
-        beam.Create([SOME_VALUE_1]) |
-        MapOrLog(FN_RAISING_EXCEPTION)
-      )
-      assert_that(result, equal_to([]))
-
-  def test_should_not_increase_error_metric_counter_if_no_exception_raised(self):
-    with TestPipeline() as p:
-      _ = (
-        p |
-        beam.Create([SOME_VALUE_1]) |
-        MapOrLog(SOME_FN, error_count=ERROR_COUNT_METRIC_NAME)
-      )
-      assert get_counter_value(p.run(), ERROR_COUNT_METRIC_NAME) is None
-
-  def test_should_increase_error_metric_counter_if_exception_was_raised(self):
-    with TestPipeline() as p:
-      _ = (
-        p |
-        beam.Create([SOME_VALUE_1]) |
-        MapOrLog(FN_RAISING_EXCEPTION, error_count=ERROR_COUNT_METRIC_NAME)
-      )
-      assert get_counter_value(p.run(), ERROR_COUNT_METRIC_NAME) == 1
-
-@pytest.mark.slow
-class TestTransformAndCount(BeamTest):
-  def test_should_not_change_result(self):
-    with TestPipeline() as p:
-      result = (
-        p |
-        beam.Create([SOME_VALUE_1.lower()]) |
-        TransformAndCount(
-          beam.Map(lambda x: x.upper()),
-          COUNT_METRIC_NAME_1
-        )
-      )
-      assert_that(result, equal_to([SOME_VALUE_1.upper()]))
-
-  def test_should_increase_count_per_item(self):
-    with TestPipeline() as p:
-      _ = (
-        p |
-        beam.Create([SOME_VALUE_1, SOME_VALUE_2]) |
-        TransformAndCount(
-          beam.Map(lambda x: x),
-          COUNT_METRIC_NAME_1
-        )
-      )
-      assert get_counter_value(p.run(), COUNT_METRIC_NAME_1) == 2
-
-  def test_should_increase_count_per_item_using_function(self):
-    with TestPipeline() as p:
-      _ = (
-        p |
-        beam.Create([SOME_VALUE_1, SOME_VALUE_2]) |
-        TransformAndCount(
-          beam.Map(lambda x: x),
-          COUNT_METRIC_NAME_1,
-          lambda x: len(x)
-        )
-      )
-      assert get_counter_value(p.run(), COUNT_METRIC_NAME_1) == (
-        len(SOME_VALUE_1) + len(SOME_VALUE_2)
-      )
-
-@pytest.mark.slow
-class TestTransformAndLog(BeamTest):
-  def test_should_not_change_result(self):
-    with TestPipeline() as p:
-      result = (
-        p |
-        beam.Create([SOME_VALUE_1.lower()]) |
-        TransformAndLog(
-          beam.Map(lambda x: x.upper())
-        )
-      )
-      assert_that(result, equal_to([SOME_VALUE_1.upper()]))
-
-@pytest.mark.slow
-class TestPreventFusion(BeamTest):
-  def test_should_not_change_result_with_default_random_key(self):
-    with TestPipeline() as p:
-      result = (
-        p |
-        beam.Create([SOME_VALUE_1, SOME_VALUE_2]) |
-        PreventFusion()
-      )
-      assert_that(result, equal_to([SOME_VALUE_1, SOME_VALUE_2]))
-
-  def test_should_not_change_result_with_constant_key(self):
-    with TestPipeline() as p:
-      result = (
-        p |
-        beam.Create([SOME_VALUE_1, SOME_VALUE_2]) |
-        PreventFusion(lambda _: 1)
-      )
-      assert_that(result, equal_to([SOME_VALUE_1, SOME_VALUE_2]))
diff --git a/sciencebeam_gym/convert/conversion_pipeline.py b/sciencebeam_gym/convert/conversion_pipeline.py
index ddafb5f..3a47ef3 100644
--- a/sciencebeam_gym/convert/conversion_pipeline.py
+++ b/sciencebeam_gym/convert/conversion_pipeline.py
@@ -12,34 +12,39 @@ from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions
 
 from lxml import etree
 
-from sciencebeam_gym.utils.collection import (
-  extend_dict,
-  remove_keys_from_dict
-)
-
-from sciencebeam_gym.beam_utils.utils import (
+from sciencebeam_utils.beam_utils.utils import (
   TransformAndCount,
   TransformAndLog,
   MapOrLog,
   PreventFusion
 )
 
-from sciencebeam_gym.beam_utils.files import (
+from sciencebeam_utils.beam_utils.files import (
   ReadFileList,
   FindFiles
 )
 
-from sciencebeam_gym.beam_utils.io import (
+from sciencebeam_utils.beam_utils.io import (
   read_all_from_path,
   save_file_content
 )
 
-from sciencebeam_gym.beam_utils.main import (
+from sciencebeam_utils.beam_utils.main import (
   add_cloud_args,
   process_cloud_args,
   process_sciencebeam_gym_dep_args
 )
 
+from sciencebeam_utils.utils.collection import (
+  extend_dict,
+  remove_keys_from_dict
+)
+
+from sciencebeam_utils.utils.file_path import (
+  join_if_relative_path,
+  get_output_file
+)
+
 from sciencebeam_gym.structured_document.structured_document_loader import (
   load_structured_document
 )
@@ -49,12 +54,10 @@ from sciencebeam_gym.structured_document.lxml import (
 )
 
 from sciencebeam_gym.preprocess.preprocessing_utils import (
-  join_if_relative_path,
   convert_pdf_bytes_to_lxml,
   parse_page_range,
   save_pages,
-  pdf_bytes_to_png_pages,
-  get_output_file
+  pdf_bytes_to_png_pages
 )
 
 from sciencebeam_gym.inference_model.extract_to_xml import (
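
Illustrative only: join_if_relative_path and get_output_file are now imported from sciencebeam_utils.utils.file_path, and the expected value below follows the implementation removed from preprocessing_utils.py later in this patch.

from sciencebeam_utils.utils.file_path import (
  join_if_relative_path,
  get_output_file
)

# source path is re-based onto the output base path with the new suffix
assert get_output_file(
  '/source/sub/file1.pdf', '/source', '/output', '.lxml.gz'
) == '/output/sub/file1.lxml.gz'
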
diff --git a/sciencebeam_gym/convert/conversion_pipeline_test.py b/sciencebeam_gym/convert/conversion_pipeline_test.py
index aec25b1..4bf728d 100644
--- a/sciencebeam_gym/convert/conversion_pipeline_test.py
+++ b/sciencebeam_gym/convert/conversion_pipeline_test.py
@@ -5,7 +5,7 @@ import pytest
 
 import apache_beam as beam
 
-from sciencebeam_gym.beam_utils.testing import (
+from sciencebeam_utils.beam_utils.testing import (
   BeamTest,
   TestPipeline
 )
diff --git a/sciencebeam_gym/convert/grobid/grobid_service_wrapper.py b/sciencebeam_gym/convert/grobid/grobid_service_wrapper.py
index f9e0c8e..3f47f37 100644
--- a/sciencebeam_gym/convert/grobid/grobid_service_wrapper.py
+++ b/sciencebeam_gym/convert/grobid/grobid_service_wrapper.py
@@ -10,8 +10,8 @@ from zipfile import ZipFile
 from shutil import rmtree
 from urllib import URLopener
 
-from sciencebeam_gym.utils.io import makedirs
-from sciencebeam_gym.utils.zip import extract_all_with_executable_permission
+from sciencebeam_utils.utils.io import makedirs
+from sciencebeam_utils.utils.zip import extract_all_with_executable_permission
 
 def get_logger():
   return logging.getLogger(__name__)
diff --git a/sciencebeam_gym/inference_model/__init___test.py b/sciencebeam_gym/inference_model/__init___test.py
index 873d5e0..9eb117f 100644
--- a/sciencebeam_gym/inference_model/__init___test.py
+++ b/sciencebeam_gym/inference_model/__init___test.py
@@ -5,7 +5,7 @@ from shutil import rmtree
 import tensorflow as tf
 import numpy as np
 
-from sciencebeam_gym.utils.num import (
+from sciencebeam_utils.utils.num import (
   assert_all_close
 )
 
diff --git a/sciencebeam_gym/inference_model/annotate_using_predictions.py b/sciencebeam_gym/inference_model/annotate_using_predictions.py
index f2f0702..d3af139 100644
--- a/sciencebeam_gym/inference_model/annotate_using_predictions.py
+++ b/sciencebeam_gym/inference_model/annotate_using_predictions.py
@@ -7,7 +7,7 @@ from io import BytesIO
 import numpy as np
 from PIL import Image
 
-from sciencebeam_gym.beam_utils.io import (
+from sciencebeam_utils.beam_utils.io import (
   read_all_from_path
 )
 
diff --git a/sciencebeam_gym/inference_model/extract_to_xml.py b/sciencebeam_gym/inference_model/extract_to_xml.py
index 7436434..fba57f7 100644
--- a/sciencebeam_gym/inference_model/extract_to_xml.py
+++ b/sciencebeam_gym/inference_model/extract_to_xml.py
@@ -4,7 +4,7 @@ import logging
 from lxml import etree
 from lxml.builder import E
 
-from sciencebeam_gym.beam_utils.io import (
+from sciencebeam_utils.beam_utils.io import (
   save_file_content
 )
 
diff --git a/sciencebeam_gym/inference_model/extract_to_xml_test.py b/sciencebeam_gym/inference_model/extract_to_xml_test.py
index 223a905..dad436a 100644
--- a/sciencebeam_gym/inference_model/extract_to_xml_test.py
+++ b/sciencebeam_gym/inference_model/extract_to_xml_test.py
@@ -5,7 +5,7 @@ from backports.tempfile import TemporaryDirectory
 from lxml import etree
 from lxml.builder import E
 
-from sciencebeam_gym.utils.xml import (
+from sciencebeam_utils.utils.xml import (
   get_text_content,
   get_text_content_list
 )
diff --git a/sciencebeam_gym/models/text/crf/crfsuite_training_pipeline.py b/sciencebeam_gym/models/text/crf/crfsuite_training_pipeline.py
index bbe3307..50d7c80 100644
--- a/sciencebeam_gym/models/text/crf/crfsuite_training_pipeline.py
+++ b/sciencebeam_gym/models/text/crf/crfsuite_training_pipeline.py
@@ -8,11 +8,15 @@ from six import raise_from
 
 from tqdm import tqdm
 
-from sciencebeam_gym.utils.stopwatch import (
+from sciencebeam_utils.beam_utils.io import (
+  save_file_content
+)
+
+from sciencebeam_utils.utils.stopwatch import (
   StopWatchRecorder
 )
 
-from sciencebeam_gym.utils.file_list import (
+from sciencebeam_utils.utils.file_list import (
   load_file_list
 )
 
@@ -36,10 +40,6 @@ from sciencebeam_gym.models.text.crf.crfsuite_model import (
   CrfSuiteModel
 )
 
-from sciencebeam_gym.beam_utils.io import (
-  save_file_content
-)
-
 def get_logger():
   return logging.getLogger(__name__)
 
diff --git a/sciencebeam_gym/models/text/crf/crfsuite_training_pipeline_test.py b/sciencebeam_gym/models/text/crf/crfsuite_training_pipeline_test.py
index 79e7e19..24cd23e 100644
--- a/sciencebeam_gym/models/text/crf/crfsuite_training_pipeline_test.py
+++ b/sciencebeam_gym/models/text/crf/crfsuite_training_pipeline_test.py
@@ -2,7 +2,7 @@ from mock import patch, Mock, ANY
 
 import pytest
 
-from sciencebeam_gym.utils.collection import (
+from sciencebeam_utils.utils.collection import (
   to_namedtuple
 )
 
diff --git a/sciencebeam_gym/pdf/pdf_to_lxml_wrapper.py b/sciencebeam_gym/pdf/pdf_to_lxml_wrapper.py
index 14bd082..0c6fadb 100644
--- a/sciencebeam_gym/pdf/pdf_to_lxml_wrapper.py
+++ b/sciencebeam_gym/pdf/pdf_to_lxml_wrapper.py
@@ -7,8 +7,8 @@ from shutil import rmtree
 from urllib import URLopener
 from tempfile import NamedTemporaryFile
 
-from sciencebeam_gym.utils.io import makedirs
-from sciencebeam_gym.utils.zip import extract_all_with_executable_permission
+from sciencebeam_utils.utils.io import makedirs
+from sciencebeam_utils.utils.zip import extract_all_with_executable_permission
 
 def get_logger():
   return logging.getLogger(__name__)
diff --git a/sciencebeam_gym/preprocess/annotation/annotation_evaluation.py b/sciencebeam_gym/preprocess/annotation/annotation_evaluation.py
index eccb7d2..5ba16ce 100644
--- a/sciencebeam_gym/preprocess/annotation/annotation_evaluation.py
+++ b/sciencebeam_gym/preprocess/annotation/annotation_evaluation.py
@@ -4,7 +4,7 @@ from collections import Counter
 
 from six import iteritems
 
-from sciencebeam_gym.utils.collection import (
+from sciencebeam_utils.utils.collection import (
   flatten
 )
 
diff --git a/sciencebeam_gym/preprocess/annotation/find_line_numbers_test.py b/sciencebeam_gym/preprocess/annotation/find_line_numbers_test.py
index 32688bf..367ed0b 100644
--- a/sciencebeam_gym/preprocess/annotation/find_line_numbers_test.py
+++ b/sciencebeam_gym/preprocess/annotation/find_line_numbers_test.py
@@ -1,4 +1,4 @@
-from sciencebeam_gym.utils.collection import (
+from sciencebeam_utils.utils.collection import (
   flatten
 )
 
diff --git a/sciencebeam_gym/preprocess/annotation/fuzzy_match.py b/sciencebeam_gym/preprocess/annotation/fuzzy_match.py
index 7f4f595..aa80315 100644
--- a/sciencebeam_gym/preprocess/annotation/fuzzy_match.py
+++ b/sciencebeam_gym/preprocess/annotation/fuzzy_match.py
@@ -2,16 +2,16 @@ from __future__ import division
 
 import logging
 
-from sciencebeam_gym.utils.string import (
+from sciencebeam_utils.utils.string import (
   LazyStr
 )
 
-from sciencebeam_gym.alignment.align import (
+from sciencebeam_alignment.align import (
   LocalSequenceMatcher,
   SimpleScoring
 )
 
-from sciencebeam_gym.alignment.WordSequenceMatcher import (
+from sciencebeam_alignment.word_sequence_matcher import (
   WordSequenceMatcher
 )
 
diff --git a/sciencebeam_gym/preprocess/annotation/matching_annotator.py b/sciencebeam_gym/preprocess/annotation/matching_annotator.py
index b148828..564d45c 100644
--- a/sciencebeam_gym/preprocess/annotation/matching_annotator.py
+++ b/sciencebeam_gym/preprocess/annotation/matching_annotator.py
@@ -7,20 +7,20 @@ from itertools import tee, islice
 
 from six.moves import zip_longest
 
-from sciencebeam_gym.utils.compat import (
+from sciencebeam_utils.utils.compat import (
   python_2_unicode_compatible
 )
 
-from sciencebeam_gym.utils.csv import (
+from sciencebeam_utils.utils.csv import (
   csv_delimiter_by_filename,
   write_csv_row
 )
 
-from sciencebeam_gym.utils.string import (
+from sciencebeam_utils.utils.string import (
   LazyStr
 )
 
-from sciencebeam_gym.utils.collection import (
+from sciencebeam_utils.utils.collection import (
   iter_flatten,
   extract_from_dict
 )
diff --git a/sciencebeam_gym/preprocess/annotation/matching_annotator_test.py b/sciencebeam_gym/preprocess/annotation/matching_annotator_test.py
index 0bfb597..76fb765 100644
--- a/sciencebeam_gym/preprocess/annotation/matching_annotator_test.py
+++ b/sciencebeam_gym/preprocess/annotation/matching_annotator_test.py
@@ -2,6 +2,10 @@ from __future__ import division
 
 import logging
 
+from sciencebeam_utils.utils.collection import (
+  flatten
+)
+
 from sciencebeam_gym.structured_document import (
   SimpleStructuredDocument,
   SimpleLine,
@@ -21,10 +25,6 @@ from sciencebeam_gym.preprocess.annotation.matching_annotator import (
   EM_DASH
 )
 
-from sciencebeam_gym.utils.collection import (
-  flatten
-)
-
 TAG1 = 'tag1'
 TAG2 = 'tag2'
 
diff --git a/sciencebeam_gym/preprocess/annotation/target_annotation.py b/sciencebeam_gym/preprocess/annotation/target_annotation.py
index 073358a..b1ff43d 100644
--- a/sciencebeam_gym/preprocess/annotation/target_annotation.py
+++ b/sciencebeam_gym/preprocess/annotation/target_annotation.py
@@ -8,20 +8,20 @@ from six.moves.configparser import ConfigParser # pylint: disable=E0401
 
 from lxml import etree
 
-from sciencebeam_gym.utils.compat import (
+from sciencebeam_utils.utils.compat import (
   python_2_unicode_compatible
 )
 
-from sciencebeam_gym.utils.string import (
+from sciencebeam_utils.utils.string import (
   LazyStr
 )
 
-from sciencebeam_gym.utils.xml import (
+from sciencebeam_utils.utils.xml import (
   get_text_content,
   get_immediate_text
 )
 
-from sciencebeam_gym.utils.collection import (
+from sciencebeam_utils.utils.collection import (
   filter_truthy,
   strip_all
 )
diff --git a/sciencebeam_gym/preprocess/check_file_list.py b/sciencebeam_gym/preprocess/check_file_list.py
deleted file mode 100644
index 79e21be..0000000
--- a/sciencebeam_gym/preprocess/check_file_list.py
+++ /dev/null
@@ -1,85 +0,0 @@
-from __future__ import division
-
-import argparse
-import logging
-from concurrent.futures import ThreadPoolExecutor
-
-from apache_beam.io.filesystems import FileSystems
-
-from sciencebeam_gym.utils.file_list import (
-  load_file_list
-)
-
-def get_logger():
-  return logging.getLogger(__name__)
-
-def parse_args(argv=None):
-  parser = argparse.ArgumentParser(
-    'Check file list'
-  )
-
-  source = parser.add_argument_group('source')
-  source.add_argument(
-    '--file-list', type=str, required=True,
-    help='path to source file list (tsv/csv/lst)'
-  )
-  source.add_argument(
-    '--file-column', type=str, required=False,
-    default='url',
-    help='csv/tsv column (ignored for plain file list)'
-  )
-
-  parser.add_argument(
-    '--limit', type=int, required=False,
-    help='limit the files to process'
-  )
-
-  parser.add_argument(
-    '--debug', action='store_true', default=False,
-    help='enable debug output'
-  )
-  return parser.parse_args(argv)
-
-def map_file_list_to_file_exists(file_list):
-  with ThreadPoolExecutor(max_workers=50) as executor:
-    return list(executor.map(FileSystems.exists, file_list))
-
-def format_file_exists_results(file_exists):
-  if not file_exists:
-    return 'empty file list'
-  file_exists_count = sum(file_exists)
-  file_missing_count = len(file_exists) - file_exists_count
-  return (
-    'files exist: %d (%.0f%%), files missing: %d (%.0f%%)' %
-    (
-      file_exists_count, 100.0 * file_exists_count / len(file_exists),
-      file_missing_count, 100.0 * file_missing_count / len(file_exists)
-    )
-  )
-
-def check_files_and_report_result(file_list):
-  file_exists = map_file_list_to_file_exists(file_list)
-  get_logger().info('%s', format_file_exists_results(file_exists))
-  assert sum(file_exists) > 0
-
-def run(opt):
-  file_list = load_file_list(
-    opt.file_list,
-    column=opt.file_column,
-    limit=opt.limit
-  )
-  check_files_and_report_result(file_list)
-
-def main(argv=None):
-  args = parse_args(argv)
-
-  if args.debug:
-    logging.getLogger().setLevel('DEBUG')
-
-  run(args)
-
-if __name__ == '__main__':
-  logging.basicConfig(level='INFO')
-  logging.getLogger('oauth2client').setLevel('WARNING')
-
-  main()
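
The removed tool lives on in sciencebeam-utils; the module path below is an assumption (not confirmed by this patch), while the flags mirror parse_args above.

from sciencebeam_utils.tools.check_file_list import main  # assumed location

main([
  '--file-list=file-list.tsv',
  '--file-column=source_url',
  '--limit=100'
])
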
diff --git a/sciencebeam_gym/preprocess/check_file_list_test.py b/sciencebeam_gym/preprocess/check_file_list_test.py
deleted file mode 100644
index 775769b..0000000
--- a/sciencebeam_gym/preprocess/check_file_list_test.py
+++ /dev/null
@@ -1,58 +0,0 @@
-from mock import patch
-
-import pytest
-
-import sciencebeam_gym.preprocess.check_file_list as check_file_list_module
-from sciencebeam_gym.preprocess.check_file_list import (
-  map_file_list_to_file_exists,
-  format_file_exists_results,
-  check_files_and_report_result
-)
-
-FILE_1 = 'file1'
-FILE_2 = 'file2'
-
-class TestMapFileListToFileExists(object):
-  def test_should_return_single_file_exists(self):
-    m = check_file_list_module
-    with patch.object(m, 'FileSystems') as FileSystems:
-      assert map_file_list_to_file_exists(
-        [FILE_1]
-      ) == [FileSystems.exists.return_value]
-      FileSystems.exists.assert_called_with(FILE_1)
-
-class TestFormatFileExistsResults(object):
-  def test_should_format_no_files(self):
-    assert (
-      format_file_exists_results([]) ==
-      'empty file list'
-    )
-
-  def test_should_format_all_files_exist(self):
-    assert (
-      format_file_exists_results([True, True]) ==
-      'files exist: 2 (100%), files missing: 0 (0%)'
-    )
-
-  def test_should_format_files_partially_exist(self):
-    assert (
-      format_file_exists_results([True, False]) ==
-      'files exist: 1 (50%), files missing: 1 (50%)'
-    )
-
-class TestCheckFileListAndReportResults(object):
-  def test_should_pass_file_list_to_format(self):
-    m = check_file_list_module
-    with patch.object(m, 'map_file_list_to_file_exists') as map_file_list_to_file_exists_mock:
-      with patch.object(m, 'format_file_exists_results') as format_file_exists_results_mock:
-        map_file_list_to_file_exists_mock.return_value = [True, True]
-        check_files_and_report_result([FILE_1, FILE_2])
-        map_file_list_to_file_exists_mock.assert_called_with([FILE_1, FILE_2])
-        format_file_exists_results_mock.assert_called_with([True, True])
-
-  def test_should_raise_error_if_none_of_the_files_were_found(self):
-    m = check_file_list_module
-    with patch.object(m, 'map_file_list_to_file_exists') as map_file_list_to_file_exists_mock:
-      with pytest.raises(AssertionError):
-        map_file_list_to_file_exists_mock.return_value = [False, False]
-        check_files_and_report_result([FILE_1, FILE_2])
diff --git a/sciencebeam_gym/preprocess/find_file_pairs.py b/sciencebeam_gym/preprocess/find_file_pairs.py
deleted file mode 100644
index ff727d1..0000000
--- a/sciencebeam_gym/preprocess/find_file_pairs.py
+++ /dev/null
@@ -1,93 +0,0 @@
-import argparse
-import csv
-import logging
-
-from apache_beam.io.filesystems import FileSystems
-
-from sciencebeam_gym.utils.csv import (
-  csv_delimiter_by_filename,
-  write_csv_rows
-)
-
-from sciencebeam_gym.beam_utils.io import (
-  dirname,
-  mkdirs_if_not_exists
-)
-
-from sciencebeam_gym.utils.file_path import (
-  join_if_relative_path,
-  relative_path
-)
-
-from sciencebeam_gym.preprocess.preprocessing_utils import (
-  find_file_pairs_grouped_by_parent_directory_or_name
-)
-
-def get_logger():
-  return logging.getLogger(__name__)
-
-def parse_args(argv=None):
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-    '--data-path', type=str, required=True,
-    help='base data path'
-  )
-  parser.add_argument(
-    '--source-pattern', type=str, required=True,
-    help='source pattern'
-  )
-  parser.add_argument(
-    '--xml-pattern', type=str, required=True,
-    help='xml pattern'
-  )
-  parser.add_argument(
-    '--out', type=str, required=True,
-    help='output csv/tsv file'
-  )
-
-  parser.add_argument(
-    '--use-relative-paths', action='store_true',
-    help='create a file list with relative paths (relative to the data path)'
-  )
-
-  return parser.parse_args(argv)
-
-
-def save_file_pairs_to_csv(output_path, source_xml_pairs):
-  mkdirs_if_not_exists(dirname(output_path))
-  delimiter = csv_delimiter_by_filename(output_path)
-  mime_type = 'text/tsv' if delimiter == '\t' else 'text/csv'
-  with FileSystems.create(output_path, mime_type=mime_type) as f:
-    writer = csv.writer(f, delimiter=delimiter)
-    write_csv_rows(writer, [['source_url', 'xml_url']])
-    write_csv_rows(writer, source_xml_pairs)
-  get_logger().info('written results to %s', output_path)
-
-def to_relative_file_pairs(base_path, file_pairs):
-  return (
-    (relative_path(base_path, source_url), relative_path(base_path, xml_url))
-    for source_url, xml_url in file_pairs
-  )
-
-def run(args):
-  get_logger().info('finding file pairs')
-  source_xml_pairs = find_file_pairs_grouped_by_parent_directory_or_name([
-    join_if_relative_path(args.data_path, args.source_pattern),
-    join_if_relative_path(args.data_path, args.xml_pattern)
-  ])
-
-  if args.use_relative_paths:
-    source_xml_pairs = to_relative_file_pairs(args.data_path, source_xml_pairs)
-
-  source_xml_pairs = list(source_xml_pairs)
-
-  save_file_pairs_to_csv(args.out, source_xml_pairs)
-
-def main(argv=None):
-  args = parse_args(argv)
-  run(args)
-
-if __name__ == '__main__':
-  logging.basicConfig(level='INFO')
-
-  main()
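
A programmatic equivalent of the removed CLI, using the flags defined by parse_args above; the sciencebeam-utils module path is an assumption.

from sciencebeam_utils.tools.find_file_pairs import main  # assumed location

main([
  '--data-path=/data',
  '--source-pattern=*.pdf',
  '--xml-pattern=*.xml',
  '--out=/data/file-list.tsv',
  '--use-relative-paths'
])
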
diff --git a/sciencebeam_gym/preprocess/find_file_pairs_test.py b/sciencebeam_gym/preprocess/find_file_pairs_test.py
deleted file mode 100644
index 4aaee56..0000000
--- a/sciencebeam_gym/preprocess/find_file_pairs_test.py
+++ /dev/null
@@ -1,152 +0,0 @@
-import logging
-import os
-from mock import patch
-
-import pytest
-
-import sciencebeam_gym.preprocess.find_file_pairs as find_file_pairs
-from sciencebeam_gym.preprocess.find_file_pairs import (
-  to_relative_file_pairs,
-  run,
-  parse_args,
-  main
-)
-
-
-LOGGER = logging.getLogger(__name__)
-
-BASE_SOURCE_PATH = '/source'
-
-PDF_FILE_1 = BASE_SOURCE_PATH + '/file1.pdf'
-XML_FILE_1 = BASE_SOURCE_PATH + '/file1.xml'
-PDF_FILE_2 = BASE_SOURCE_PATH + '/file2.pdf'
-XML_FILE_2 = BASE_SOURCE_PATH + '/file2.xml'
-
-SOURCE_PATTERN = '*.pdf'
-XML_PATTERN = '*.xml'
-OUTPUT_FILE = 'file-list.tsv'
-
-SOME_ARGV = [
-  '--data-path=%s' % BASE_SOURCE_PATH,
-  '--source-pattern=%s' % SOURCE_PATTERN,
-  '--xml-pattern=%s' % XML_PATTERN,
-  '--out=%s' % OUTPUT_FILE
-]
-
-@pytest.fixture(name='to_relative_file_pairs_mock')
-def _to_relative_file_pairs():
-  with patch.object(find_file_pairs, 'to_relative_file_pairs') as m:
-    yield m
-
-@pytest.fixture(name='find_file_pairs_grouped_by_parent_directory_or_name_mock')
-def _find_file_pairs_grouped_by_parent_directory_or_name():
-  with patch.object(find_file_pairs, 'find_file_pairs_grouped_by_parent_directory_or_name') as m:
-    m.return_value = [
-      (PDF_FILE_1, XML_FILE_1),
-      (PDF_FILE_2, XML_FILE_2)
-    ]
-    yield m
-
-@pytest.fixture(name='save_file_pairs_to_csv_mock')
-def _save_file_pairs_to_csv():
-  with patch.object(find_file_pairs, 'save_file_pairs_to_csv') as m:
-    yield m
-
-@pytest.fixture(name='parse_args_mock')
-def _parse_args():
-  with patch.object(find_file_pairs, 'parse_args') as m:
-    yield m
-
-@pytest.fixture(name='run_mock')
-def _run():
-  with patch.object(find_file_pairs, 'run') as m:
-    yield m
-
-def _touch(path):
-  path.write(b'', ensure=True)
-  return path
-
-@pytest.fixture(name='pdf_file_1')
-def _pdf_file_1(tmpdir):
-  return _touch(tmpdir.join(PDF_FILE_1))
-
-@pytest.fixture(name='xml_file_1')
-def _xml_file_1(tmpdir):
-  return _touch(tmpdir.join(XML_FILE_1))
-
-@pytest.fixture(name='data_path')
-def _data_path(tmpdir):
-  return tmpdir.join(BASE_SOURCE_PATH)
-
-@pytest.fixture(name='out_file')
-def _out_file(tmpdir):
-  return tmpdir.join(OUTPUT_FILE)
-
-class TestToRelativeFilePairs(object):
-  def test_should_make_paths_relative(self):
-    assert list(to_relative_file_pairs(
-      '/parent',
-      [('/parent/sub/file1', '/parent/sub/file2')]
-    )) == [('sub/file1', 'sub/file2')]
-
-class TestRun(object):
-  def test_should_pass_around_parameters(
-    self,
-    find_file_pairs_grouped_by_parent_directory_or_name_mock,
-    save_file_pairs_to_csv_mock):
-
-    opt = parse_args(SOME_ARGV)
-    run(opt)
-    find_file_pairs_grouped_by_parent_directory_or_name_mock.assert_called_with([
-      os.path.join(BASE_SOURCE_PATH, SOURCE_PATTERN),
-      os.path.join(BASE_SOURCE_PATH, XML_PATTERN)
-    ])
-    save_file_pairs_to_csv_mock.assert_called_with(
-      opt.out,
-      find_file_pairs_grouped_by_parent_directory_or_name_mock.return_value
-    )
-
-  def test_should_use_relative_paths_if_enabled(
-    self,
-    find_file_pairs_grouped_by_parent_directory_or_name_mock,
-    to_relative_file_pairs_mock,
-    save_file_pairs_to_csv_mock):
-
-    opt = parse_args(SOME_ARGV)
-    opt.use_relative_paths = True
-
-    to_relative_file_pairs_mock.return_value = [('file1.pdf', 'file1.xml')]
-
-    run(opt)
-    to_relative_file_pairs_mock.assert_called_with(
-      BASE_SOURCE_PATH,
-      find_file_pairs_grouped_by_parent_directory_or_name_mock.return_value
-    )
-    save_file_pairs_to_csv_mock.assert_called_with(
-      opt.out,
-      to_relative_file_pairs_mock.return_value
-    )
-
-  def test_should_generate_file_list(self, data_path, pdf_file_1, xml_file_1, out_file):
-    LOGGER.debug('pdf_file_1: %s, xml_file: %s', pdf_file_1, xml_file_1)
-    opt = parse_args(SOME_ARGV)
-    opt.data_path = str(data_path)
-    opt.out = str(out_file)
-    run(opt)
-    out_lines = [s.strip() for s in out_file.read().strip().split('\n')]
-    LOGGER.debug('out_lines: %s', out_lines)
-    assert out_lines == [
-      'source_url\txml_url',
-      '%s\t%s' % (pdf_file_1, xml_file_1)
-    ]
-
-class TestMain(object):
-  def test_should_parse_args_and_call_run(self, parse_args_mock, run_mock):
-    main(SOME_ARGV)
-    parse_args_mock.assert_called_with(SOME_ARGV)
-    run_mock.assert_called_with(parse_args_mock.return_value)
diff --git a/sciencebeam_gym/preprocess/get_output_files.py b/sciencebeam_gym/preprocess/get_output_files.py
deleted file mode 100644
index 73eae63..0000000
--- a/sciencebeam_gym/preprocess/get_output_files.py
+++ /dev/null
@@ -1,153 +0,0 @@
-import argparse
-import logging
-
-from sciencebeam_gym.utils.file_list import (
-  load_file_list,
-  save_file_list,
-  to_relative_file_list
-)
-
-from sciencebeam_gym.utils.file_path import (
-  join_if_relative_path
-)
-
-from sciencebeam_gym.preprocess.preprocessing_utils import (
-  get_or_validate_base_path,
-  get_output_file
-)
-
-from sciencebeam_gym.preprocess.check_file_list import (
-  check_files_and_report_result
-)
-
-def get_logger():
-  return logging.getLogger(__name__)
-
-def parse_args(argv=None):
-  parser = argparse.ArgumentParser(
-    'Get output files based on source files and suffix.'
-  )
-
-  source = parser.add_argument_group('source')
-  source.add_argument(
-    '--source-file-list', type=str, required=True,
-    help='path to source file list (tsv/csv/lst)'
-  )
-  source.add_argument(
-    '--source-file-column', type=str, required=False,
-    default='url',
-    help='csv/tsv column (ignored for plain file list)'
-  )
-  source.add_argument(
-    '--source-base-path', type=str, required=False,
-    help='base data path for source file urls'
-  )
-
-  output = parser.add_argument_group('output')
-  output.add_argument(
-    '--output-file-list', type=str, required=True,
-    help='path to output file list (tsv/csv/lst)'
-  )
-  output.add_argument(
-    '--output-file-column', type=str, required=False,
-    default='url',
-    help='csv/tsv column (ignored for plain file list)'
-  )
-  output.add_argument(
-    '--output-file-suffix', type=str, required=False,
-    help='file suffix (will be added to source urls after removing ext)'
-  )
-  output.add_argument(
-    '--output-base-path', type=str, required=False,
-    help='base output path (by default source base path with "-results" suffix)'
-  )
-  output.add_argument(
-    '--use-relative-paths', action='store_true',
-    help='create a file list with relative paths (relative to the output data path)'
-  )
-
-  parser.add_argument(
-    '--limit', type=int, required=False,
-    help='limit the files to process'
-  )
-
-  parser.add_argument(
-    '--check', action='store_true', default=False,
-    help='check whether the output files exist'
-  )
-  parser.add_argument(
-    '--check-limit', type=int, required=False,
-    help='limit the files to check'
-  )
-
-  parser.add_argument(
-    '--debug', action='store_true', default=False,
-    help='enable debug output'
-  )
-  return parser.parse_args(argv)
-
-def get_output_file_list(file_list, source_base_path, output_base_path, output_file_suffix):
-  return [
-    get_output_file(filename, source_base_path, output_base_path, output_file_suffix)
-    for filename in file_list
-  ]
-
-def run(opt):
-  source_file_list = load_file_list(
-    join_if_relative_path(
-      opt.source_base_path,
-      opt.source_file_list
-    ),
-    column=opt.source_file_column,
-    limit=opt.limit
-  )
-  source_base_path = get_or_validate_base_path(
-    source_file_list, opt.source_base_path
-  )
-
-  target_file_list = get_output_file_list(
-    source_file_list, source_base_path, opt.output_base_path, opt.output_file_suffix
-  )
-
-  if opt.check:
-    check_file_list = (
-      target_file_list[:opt.check_limit] if opt.check_limit
-      else target_file_list
-    )
-    get_logger().info(
-      'checking %d (out of %d) files...',
-      len(check_file_list), len(target_file_list)
-    )
-    check_files_and_report_result(check_file_list)
-
-  if opt.use_relative_paths:
-    target_file_list = to_relative_file_list(opt.output_base_path, target_file_list)
-
-  get_logger().info(
-    'saving file list (with %d files) to: %s',
-    len(target_file_list), opt.output_file_list
-  )
-  save_file_list(
-    opt.output_file_list,
-    target_file_list,
-    column=opt.output_file_column
-  )
-
-def process_args(args):
-  if not args.output_base_path:
-    args.output_base_path = args.source_base_path + '-results'
-
-def main(argv=None):
-  args = parse_args(argv)
-  process_args(args)
-
-  if args.debug:
-    logging.getLogger().setLevel('DEBUG')
-
-  run(args)
-
-if __name__ == '__main__':
-  logging.basicConfig(level='INFO')
-  logging.getLogger('oauth2client').setLevel('WARNING')
-
-  main()
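
The expected behaviour of get_output_file_list is pinned down by the test file that follows: each source path is re-based onto the output base path and given the new suffix. The sciencebeam-utils module path below is an assumption.

from sciencebeam_utils.tools.get_output_files import get_output_file_list

assert get_output_file_list(
  ['/source/path/file.pdf'], '/source', '/output', '.xml'
) == ['/output/path/file.xml']
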
diff --git a/sciencebeam_gym/preprocess/get_output_files_test.py b/sciencebeam_gym/preprocess/get_output_files_test.py
deleted file mode 100644
index 41f665a..0000000
--- a/sciencebeam_gym/preprocess/get_output_files_test.py
+++ /dev/null
@@ -1,181 +0,0 @@
-import os
-from mock import patch, ANY
-
-import pytest
-
-import sciencebeam_gym.preprocess.get_output_files as get_output_files
-from sciencebeam_gym.preprocess.get_output_files import (
-  get_output_file_list,
-  run,
-  parse_args,
-  main
-)
-
-SOME_ARGV = [
-  '--source-file-list=source.csv',
-  '--output-file-list=output.csv',
-  '--limit=10'
-]
-
-BASE_SOURCE_PATH = '/source'
-
-FILE_1 = BASE_SOURCE_PATH + '/file1'
-FILE_2 = BASE_SOURCE_PATH + '/file2'
-
-
-@pytest.fixture(name='load_file_list_mock')
-def _load_file_list():
-  with patch.object(get_output_files, 'load_file_list') as m:
-    m.return_value = [FILE_1, FILE_2]
-    yield m
-
-@pytest.fixture(name='get_output_file_list_mock')
-def _get_output_file_list():
-  with patch.object(get_output_files, 'get_output_file_list') as m:
-    yield m
-
-@pytest.fixture(name='save_file_list_mock')
-def _save_file_list():
-  with patch.object(get_output_files, 'save_file_list') as m:
-    yield m
-
-@pytest.fixture(name='check_files_and_report_result_mock')
-def _check_files_and_report_result():
-  with patch.object(get_output_files, 'check_files_and_report_result') as m:
-    yield m
-
-@pytest.fixture(name='to_relative_file_list_mock')
-def _to_relative_file_list():
-  with patch.object(get_output_files, 'to_relative_file_list') as m:
-    yield m
-
-class TestGetOutputFileList(object):
-  def test_should_return_output_file_with_path_and_change_ext(self):
-    assert get_output_file_list(
-      ['/source/path/file.pdf'],
-      '/source',
-      '/output',
-      '.xml'
-    ) == ['/output/path/file.xml']
-
-@pytest.mark.usefixtures(
-  "load_file_list_mock", "get_output_file_list_mock", "save_file_list_mock",
-  "to_relative_file_list_mock"
-)
-class TestRun(object):
-  def test_should_pass_around_parameters(
-    self,
-    load_file_list_mock,
-    get_output_file_list_mock,
-    save_file_list_mock):
-
-    load_file_list_mock.return_value = [FILE_1, FILE_2]
-    opt = parse_args(SOME_ARGV)
-    run(opt)
-    load_file_list_mock.assert_called_with(
-      opt.source_file_list,
-      column=opt.source_file_column,
-      limit=opt.limit
-    )
-    get_output_file_list_mock.assert_called_with(
-      load_file_list_mock.return_value,
-      BASE_SOURCE_PATH,
-      opt.output_base_path,
-      opt.output_file_suffix
-    )
-    save_file_list_mock.assert_called_with(
-      opt.output_file_list,
-      get_output_file_list_mock.return_value,
-      column=opt.source_file_column
-    )
-
-  def test_should_make_file_list_absolute_if_it_is_relative(
-    self,
-    load_file_list_mock):
-
-    opt = parse_args(SOME_ARGV)
-    opt.source_base_path = BASE_SOURCE_PATH
-    opt.source_file_list = 'source.tsv'
-    run(opt)
-    load_file_list_mock.assert_called_with(
-      os.path.join(opt.source_base_path, opt.source_file_list),
-      column=opt.source_file_column,
-      limit=opt.limit
-    )
-
-  def test_should_raise_error_if_source_path_is_invalid(self):
-    opt = parse_args(SOME_ARGV)
-    opt.source_base_path = '/other/path'
-    with pytest.raises(AssertionError):
-      run(opt)
-
-  def test_should_use_passed_in_source_path_if_valid(
-    self,
-    get_output_file_list_mock,
-    load_file_list_mock):
-
-    opt = parse_args(SOME_ARGV)
-    opt.source_base_path = '/base'
-    load_file_list_mock.return_value = ['/base/source/file1', '/base/source/file2']
-    run(opt)
-    get_output_file_list_mock.assert_called_with(
-      ANY,
-      opt.source_base_path,
-      ANY,
-      ANY
-    )
-
-  def test_should_check_file_list_if_enabled(
-    self,
-    get_output_file_list_mock,
-    check_files_and_report_result_mock):
-
-    opt = parse_args(SOME_ARGV)
-    opt.check = True
-    run(opt)
-    check_files_and_report_result_mock.assert_called_with(
-      get_output_file_list_mock.return_value
-    )
-
-  def test_should_limit_files_to_check(
-    self,
-    load_file_list_mock,
-    get_output_file_list_mock,
-    check_files_and_report_result_mock):
-
-    opt = parse_args(SOME_ARGV)
-    opt.check = True
-    opt.check_limit = 1
-    load_file_list_mock.return_value = [FILE_1, FILE_2]
-    run(opt)
-    check_files_and_report_result_mock.assert_called_with(
-      get_output_file_list_mock.return_value[:opt.check_limit]
-    )
-
-  def test_should_save_relative_paths_if_enabled(
-    self,
-    get_output_file_list_mock,
-    to_relative_file_list_mock,
-    save_file_list_mock):
-
-    opt = parse_args(SOME_ARGV)
-    opt.use_relative_paths = True
-    run(opt)
-    to_relative_file_list_mock.assert_called_with(
-      opt.output_base_path,
-      get_output_file_list_mock.return_value,
-    )
-    save_file_list_mock.assert_called_with(
-      opt.output_file_list,
-      to_relative_file_list_mock.return_value,
-      column=opt.source_file_column
-    )
-
-class TestMain(object):
-  def test_should_parse_args_and_call_run(self):
-    m = get_output_files
-    with patch.object(m, 'parse_args') as parse_args_mock:
-      with patch.object(m, 'run') as run_mock:
-        main(SOME_ARGV)
-        parse_args_mock.assert_called_with(SOME_ARGV)
-        run_mock.assert_called_with(parse_args_mock.return_value)
diff --git a/sciencebeam_gym/preprocess/lxml_to_svg.py b/sciencebeam_gym/preprocess/lxml_to_svg.py
index 5cd417b..8186739 100644
--- a/sciencebeam_gym/preprocess/lxml_to_svg.py
+++ b/sciencebeam_gym/preprocess/lxml_to_svg.py
@@ -4,15 +4,15 @@ import os
 
 from lxml import etree
 
-from sciencebeam_gym.utils.bounding_box import (
-  BoundingBox
-)
-
-from sciencebeam_gym.utils.csv import (
+from sciencebeam_utils.utils.csv import (
   open_csv_output,
   write_dict_csv
 )
 
+from sciencebeam_gym.utils.bounding_box import (
+  BoundingBox
+)
+
 from sciencebeam_gym.preprocess.annotation.annotator import (
   Annotator,
   DEFAULT_ANNOTATORS
diff --git a/sciencebeam_gym/preprocess/preprocessing_pipeline.py b/sciencebeam_gym/preprocess/preprocessing_pipeline.py
index e1ce7ae..9f36244 100644
--- a/sciencebeam_gym/preprocess/preprocessing_pipeline.py
+++ b/sciencebeam_gym/preprocess/preprocessing_pipeline.py
@@ -9,39 +9,44 @@ import apache_beam as beam
 from apache_beam.io.filesystems import FileSystems
 from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions
 
-from sciencebeam_gym.utils.collection import (
-  extend_dict,
-  remove_keys_from_dict
-)
-
-from sciencebeam_gym.utils.file_path import (
-  relative_path,
-  join_if_relative_path
-)
-
-from sciencebeam_gym.beam_utils.utils import (
+from sciencebeam_utils.beam_utils.utils import (
   TransformAndCount,
   TransformAndLog,
   MapOrLog,
   PreventFusion
 )
 
-from sciencebeam_gym.beam_utils.csv import (
+from sciencebeam_utils.beam_utils.csv import (
   WriteDictCsv,
   ReadDictCsv
 )
 
-from sciencebeam_gym.beam_utils.io import (
+from sciencebeam_utils.beam_utils.io import (
   read_all_from_path,
   basename,
   save_file_content
 )
 
-from sciencebeam_gym.beam_utils.main import (
+from sciencebeam_utils.beam_utils.main import (
   add_cloud_args,
   process_cloud_args
 )
 
+from sciencebeam_utils.utils.collection import (
+  extend_dict,
+  remove_keys_from_dict
+)
+
+from sciencebeam_utils.utils.file_path import (
+  change_ext,
+  relative_path,
+  join_if_relative_path
+)
+
+from sciencebeam_utils.utils.file_pairs import (
+  find_file_pairs_grouped_by_parent_directory_or_name,
+)
+
 from sciencebeam_gym.structured_document.svg import (
   SvgStructuredDocument
 )
@@ -61,8 +66,6 @@ from sciencebeam_gym.preprocess.annotation.annotation_evaluation import (
 )
 
 from sciencebeam_gym.preprocess.preprocessing_utils import (
-  change_ext,
-  find_file_pairs_grouped_by_parent_directory_or_name,
   convert_pdf_bytes_to_lxml,
   convert_and_annotate_lxml_content,
   pdf_bytes_to_png_pages,
diff --git a/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py b/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py
index e42cf0b..f444ce7 100644
--- a/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py
+++ b/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py
@@ -6,21 +6,21 @@ import pytest
 
 import apache_beam as beam
 
-from sciencebeam_gym.utils.collection import (
-  extend_dict
-)
-
-from sciencebeam_gym.beam_utils.utils import (
+from sciencebeam_utils.beam_utils.utils import (
   TransformAndLog
 )
 
-from sciencebeam_gym.beam_utils.testing import (
+from sciencebeam_utils.beam_utils.testing import (
   BeamTest,
   TestPipeline,
   get_current_test_context,
   get_counter_value
 )
 
+from sciencebeam_utils.utils.collection import (
+  extend_dict
+)
+
 from sciencebeam_gym.preprocess.preprocessing_pipeline import (
   parse_args,
   configure_pipeline,
diff --git a/sciencebeam_gym/preprocess/preprocessing_transforms_test.py b/sciencebeam_gym/preprocess/preprocessing_transforms_test.py
index c821ff6..732700c 100644
--- a/sciencebeam_gym/preprocess/preprocessing_transforms_test.py
+++ b/sciencebeam_gym/preprocess/preprocessing_transforms_test.py
@@ -3,19 +3,19 @@ import pytest
 import apache_beam as beam
 from apache_beam.io.filesystems import FileSystems
 
-from sciencebeam_gym.beam_utils.io import (
+from sciencebeam_utils.beam_utils.io import (
   find_matching_filenames
 )
 
-from sciencebeam_gym.utils.tfrecord import (
-  iter_read_tfrecord_file_as_dict_list
-)
-
-from sciencebeam_gym.beam_utils.testing import (
+from sciencebeam_utils.beam_utils.testing import (
   BeamTest,
   TestPipeline
 )
 
+from sciencebeam_gym.utils.tfrecord import (
+  iter_read_tfrecord_file_as_dict_list
+)
+
 from sciencebeam_gym.preprocess.preprocessing_transforms import (
   WritePropsToTFRecord
 )
diff --git a/sciencebeam_gym/preprocess/preprocessing_utils.py b/sciencebeam_gym/preprocess/preprocessing_utils.py
index 96d6ff6..62d7be8 100644
--- a/sciencebeam_gym/preprocess/preprocessing_utils.py
+++ b/sciencebeam_gym/preprocess/preprocessing_utils.py
@@ -11,29 +11,33 @@ from lxml import etree
 
 from apache_beam.io.filesystems import FileSystems
 
-from sciencebeam_gym.utils.xml import (
+from sciencebeam_alignment.align import (
+  native_enabled as align_native_enabled
+)
+
+from sciencebeam_utils.beam_utils.io import (
+  find_matching_filenames
+)
+
+from sciencebeam_utils.utils.xml import (
   xml_from_string_with_recover
 )
 
-from sciencebeam_gym.utils.stopwatch import (
+from sciencebeam_utils.utils.stopwatch import (
   StopWatchRecorder
 )
 
-from sciencebeam_gym.utils.collection import (
+from sciencebeam_utils.utils.collection import (
   groupby_to_dict,
   sort_and_groupby_to_dict
 )
 
-from sciencebeam_gym.utils.pages_zip import (
-  save_pages
-)
-
-from sciencebeam_gym.beam_utils.io import (
-  find_matching_filenames
+from sciencebeam_utils.utils.file_path import (
+  relative_path
 )
 
-from sciencebeam_gym.utils.file_path import (
-  relative_path
+from sciencebeam_gym.utils.pages_zip import (
+  save_pages
 )
 
 from sciencebeam_gym.preprocess.lxml_to_svg import (
@@ -49,10 +53,6 @@ from sciencebeam_gym.preprocess.annotation.annotator import (
   DEFAULT_ANNOTATORS
 )
 
-from sciencebeam_gym.alignment.align import (
-  native_enabled as align_native_enabled
-)
-
 from sciencebeam_gym.preprocess.annotation.matching_annotator import (
   MatchingAnnotator
 )
@@ -77,85 +77,10 @@ from sciencebeam_gym.pdf import (
   PdfToPng
 )
 
-# deprecated, moved to sciencebeam_gym.utils.file_path
-# pylint: disable=wrong-import-position, unused-import
-from sciencebeam_gym.utils.file_path import (
-  join_if_relative_path,
-)
-# pylint: enable=wrong-import-position, unused-import
-
 
 def get_logger():
   return logging.getLogger(__name__)
 
-def group_files_by_parent_directory(filenames):
-  return groupby_to_dict(sorted(filenames), lambda x: os.path.dirname(x))
-
-def get_ext(filename):
-  name, ext = os.path.splitext(filename)
-  if ext == '.gz':
-    ext = get_ext(name) + ext
-  return ext
-
-def strip_ext(filename):
-  # strip off .gz, assuming there will be another extension before .gz
-  if filename.endswith('.gz'):
-    filename = filename[:-3]
-  return os.path.splitext(filename)[0]
-
-def group_files_by_name_excl_ext(filenames):
-  return sort_and_groupby_to_dict(filenames, strip_ext)
-
-def zip_by_keys(*dict_list):
-  keys = reduce(lambda agg, v: agg | set(v.keys()), dict_list, set())
-  return (
-    [d.get(k) for d in dict_list]
-    for k in sorted(keys)
-  )
-
-def group_file_pairs_by_parent_directory_or_name(files_by_type):
-  grouped_files_by_pattern = [
-    group_files_by_parent_directory(files) for files in files_by_type
-  ]
-  for files_in_group_by_pattern in zip_by_keys(*grouped_files_by_pattern):
-    if all(len(files or []) == 1 for files in files_in_group_by_pattern):
-      yield tuple([files[0] for files in files_in_group_by_pattern])
-    else:
-      grouped_by_name = [
-        group_files_by_name_excl_ext(files or [])
-        for files in files_in_group_by_pattern
-      ]
-      for files_by_name in zip_by_keys(*grouped_by_name):
-        if all(len(files or []) == 1 for files in files_by_name):
-          yield tuple([files[0] for files in files_by_name])
-        else:
-          get_logger().info(
-            'no exclusively matching files found: %s',
-            list(files_by_name)
-          )
-
-def find_file_pairs_grouped_by_parent_directory_or_name(patterns, limit=None):
-  matching_files_by_pattern = [
-    list(find_matching_filenames(pattern)) for pattern in patterns
-  ]
-  get_logger().info(
-    'found number of files %s',
-    ', '.join(
-      '%s: %d' % (pattern, len(files))
-      for pattern, files in zip(patterns, matching_files_by_pattern)
-    )
-  )
-  patterns_without_files = [
-    pattern
-    for pattern, files in zip(patterns, matching_files_by_pattern)
-    if len(files) == 0
-  ]
-  if patterns_without_files:
-    raise RuntimeError('no files found for: %s' % patterns_without_files)
-  return group_file_pairs_by_parent_directory_or_name(
-    matching_files_by_pattern
-  )
-
 def convert_pdf_bytes_to_lxml(pdf_content, path=None, page_range=None):
   stop_watch_recorder = StopWatchRecorder()
 
@@ -222,45 +147,6 @@ def convert_and_annotate_lxml_content(lxml_content, xml_content, xml_mapping, na
 
   return svg_roots
 
-def change_ext(path, old_ext, new_ext):
-  if old_ext is None:
-    old_ext = os.path.splitext(path)[1]
-    if old_ext == '.gz':
-      path = path[:-len(old_ext)]
-      old_ext = os.path.splitext(path)[1]
-  if old_ext and path.endswith(old_ext):
-    return path[:-len(old_ext)] + new_ext
-  else:
-    return path + new_ext
-
-def base_path_for_file_list(file_list):
-  common_prefix = os.path.commonprefix(file_list)
-  i = max(common_prefix.rfind('/'), common_prefix.rfind('\\'))
-  if i >= 0:
-    return common_prefix[:i]
-  else:
-    return ''
-
-def get_or_validate_base_path(file_list, base_path):
-  common_path = base_path_for_file_list(file_list)
-  if base_path:
-    if not common_path.startswith(base_path):
-      raise AssertionError(
-        "invalid base path '%s', common path is: '%s'" % (base_path, common_path)
-      )
-    return base_path
-  else:
-    return common_path
-
-def get_output_file(filename, source_base_path, output_base_path, output_file_suffix):
-  return FileSystems.join(
-    output_base_path,
-    change_ext(
-      relative_path(source_base_path, filename),
-      None, output_file_suffix
-    )
-  )
-
 def save_svg_roots(output_filename, svg_pages):
   return save_pages(output_filename, '.svg', (
     etree.tostring(svg_page)
diff --git a/sciencebeam_gym/preprocess/preprocessing_utils_test.py b/sciencebeam_gym/preprocess/preprocessing_utils_test.py
index 12b20f7..a287dd3 100644
--- a/sciencebeam_gym/preprocess/preprocessing_utils_test.py
+++ b/sciencebeam_gym/preprocess/preprocessing_utils_test.py
@@ -1,7 +1,5 @@
 from mock import patch, MagicMock, DEFAULT
 
-import pytest
-
 from lxml import etree
 
 from sciencebeam_gym.structured_document.svg import (
@@ -10,12 +8,7 @@ from sciencebeam_gym.structured_document.svg import (
 
 from sciencebeam_gym.preprocess.preprocessing_utils import (
   svg_page_to_blockified_png_bytes,
-  group_file_pairs_by_parent_directory_or_name,
   convert_pdf_bytes_to_lxml,
-  change_ext,
-  base_path_for_file_list,
-  get_or_validate_base_path,
-  get_output_file,
   parse_page_range,
 )
 
@@ -36,58 +29,6 @@ class TestSvgPageToBlockifiedPngBytes(object):
       kwargs = call_args[1]
       assert (kwargs.get('width'), kwargs.get('height')) == (100.1, 200.9)
 
-class TestGroupFilePairsByParentDirectoryOrName(object):
-  def test_should_return_empty_list_with_empty_input_file_lists(self):
-    assert list(group_file_pairs_by_parent_directory_or_name([
-      [],
-      []
-    ])) == []
-
-  def test_should_group_single_file(self):
-    assert list(group_file_pairs_by_parent_directory_or_name([
-      ['parent1/file.x'],
-      ['parent1/file.y']
-    ])) == [('parent1/file.x', 'parent1/file.y')]
-
-  def test_should_group_single_file_in_directory_with_different_names(self):
-    assert list(group_file_pairs_by_parent_directory_or_name([
-      ['parent1/file1.x'],
-      ['parent1/file2.y']
-    ])) == [('parent1/file1.x', 'parent1/file2.y')]
-
-  def test_should_ignore_files_in_different_directories(self):
-    assert list(group_file_pairs_by_parent_directory_or_name([
-      ['parent1/file.x'],
-      ['parent2/file.y']
-    ])) == []
-
-  def test_should_group_multiple_files_in_separate_parent_directories(self):
-    assert list(group_file_pairs_by_parent_directory_or_name([
-      ['parent1/file.x', 'parent2/file.x'],
-      ['parent1/file.y', 'parent2/file.y']
-    ])) == [
-      ('parent1/file.x', 'parent1/file.y'),
-      ('parent2/file.x', 'parent2/file.y')
-    ]
-
-  def test_should_group_multiple_files_in_same_parent_directory_with_same_name(self):
-    assert list(group_file_pairs_by_parent_directory_or_name([
-      ['parent1/file1.x', 'parent1/file2.x'],
-      ['parent1/file1.y', 'parent1/file2.y']
-    ])) == [
-      ('parent1/file1.x', 'parent1/file1.y'),
-      ('parent1/file2.x', 'parent1/file2.y')
-    ]
-
-  def test_should_group_multiple_files_in_same_parent_directory_with_same_name_gzipped(self):
-    assert list(group_file_pairs_by_parent_directory_or_name([
-      ['parent1/file1.x.gz', 'parent1/file2.x.gz'],
-      ['parent1/file1.y.gz', 'parent1/file2.y.gz']
-    ])) == [
-      ('parent1/file1.x.gz', 'parent1/file1.y.gz'),
-      ('parent1/file2.x.gz', 'parent1/file2.y.gz')
-    ]
-
 DEFAULT_PDF_TO_LXML_ARGS = ['-blocks', '-noImageInline', '-noImage', '-fullFontName']
 
 LXML_CONTENT_1 = b'lxml content 1'
@@ -115,74 +56,6 @@ class TestConvertPdfBytesToLxml(object):
       )
       assert lxml_content == LXML_CONTENT_1
 
-class TestChangeExt(object):
-  def test_should_replace_simple_ext_with_simple_ext(self):
-    assert change_ext('file.pdf', None, '.xml') == 'file.xml'
-
-  def test_should_replace_simple_ext_with_combined_ext(self):
-    assert change_ext('file.pdf', None, '.svg.zip') == 'file.svg.zip'
-
-  def test_should_remove_gz_ext_before_replacing_ext(self):
-    assert change_ext('file.pdf.gz', None, '.svg.zip') == 'file.svg.zip'
-
-class TestBasePathForFileList(object):
-  def test_should_return_empty_string_if_file_list_is_empty(self):
-    assert base_path_for_file_list([]) == ''
-
-  def test_should_return_empty_string_if_filename_is_empty(self):
-    assert base_path_for_file_list(['']) == ''
-
-  def test_should_return_parent_directory_of_single_file(self):
-    assert base_path_for_file_list(['/base/path/1/file']) == '/base/path/1'
-
-  def test_should_return_common_path_of_two_files(self):
-    assert base_path_for_file_list(['/base/path/1/file', '/base/path/2/file']) == '/base/path'
-
-  def test_should_return_common_path_of_two_files_using_protocol(self):
-    assert base_path_for_file_list([
-      'a://base/path/1/file', 'a://base/path/2/file'
-    ]) == 'a://base/path'
-
-  def test_should_return_common_path_of_two_files_using_back_slash(self):
-    assert base_path_for_file_list([
-      '\\base\\path\\1\\file', '\\base\\path\\2\\file'
-    ]) == '\\base\\path'
-
-  def test_should_return_empty_string_if_no_common_path_was_found(self):
-    assert base_path_for_file_list(['a://base/path/1/file', 'b://base/path/2/file']) == ''
-
-  def test_should_return_common_path_ignoring_partial_name_match(self):
-    assert base_path_for_file_list(['/base/path/file1', '/base/path/file2']) == '/base/path'
-
-class TestGetOrValidateBasePath(object):
-  def test_should_return_base_path_of_two_files_if_no_base_path_was_provided(self):
-    assert get_or_validate_base_path(
-      ['/base/path/1/file', '/base/path/2/file'],
-      None
-    ) == '/base/path'
-
-  def test_should_return_passed_in_base_path_if_valid(self):
-    assert get_or_validate_base_path(
-      ['/base/path/1/file', '/base/path/2/file'],
-      '/base'
-    ) == '/base'
-
-  def test_should_raise_error_if_passed_in_base_path_is_invalid(self):
-    with pytest.raises(AssertionError):
-      get_or_validate_base_path(
-        ['/base/path/1/file', '/base/path/2/file'],
-        '/base/other'
-      )
-
-class TestGetOutputFile(object):
-  def test_should_return_output_file_with_path_and_change_ext(self):
-    assert get_output_file(
-      '/source/path/file.pdf',
-      '/source',
-      '/output',
-      '.xml'
-    ) == '/output/path/file.xml'
-
 class TestPageRange(object):
   def test_should_parse_single_page_number_as_range(self):
     assert parse_page_range('1') == (1, 1)
diff --git a/sciencebeam_gym/preprocess/split_csv_dataset.py b/sciencebeam_gym/preprocess/split_csv_dataset.py
deleted file mode 100644
index 9035fba..0000000
--- a/sciencebeam_gym/preprocess/split_csv_dataset.py
+++ /dev/null
@@ -1,142 +0,0 @@
-import argparse
-import csv
-import logging
-from math import trunc
-from random import shuffle
-
-from apache_beam.io.filesystems import FileSystems
-
-from sciencebeam_gym.utils.csv import (
-  csv_delimiter_by_filename,
-  write_csv_rows
-)
-
-from sciencebeam_gym.preprocess.preprocessing_utils import (
-  strip_ext,
-  get_ext
-)
-
-def get_logger():
-  return logging.getLogger(__name__)
-
-def extract_proportions_from_args(args):
-  digits = 3
-  proportions = [
-    (name, round(p, digits))
-    for name, p in [
-      ('train', args.train),
-      ('test', args.test),
-      ('validation', args.validation)
-    ]
-    if p > 0
-  ]
-  if sum(p for _, p in proportions) > 1.0:
-    raise ValueError('proportions add up to more than 1.0')
-  if not args.test:
-    proportions.append(('test', 1.0 - sum(p for _, p in proportions)))
-  elif not args.validation:
-    proportions.append(('validation', round(1.0 - sum(p for _, p in proportions), digits)))
-  proportions = [(name, p) for name, p in proportions if p > 0]
-  return proportions
-
-def split_rows(rows, percentages, fill=False):
-  size = len(rows)
-  chunk_size_list = [int(trunc(p * size)) for p in percentages]
-  if fill:
-    chunk_size_list[-1] = size - sum(chunk_size_list[:-1])
-  chunk_offset_list = [0]
-  for chunk_size in chunk_size_list[0:-1]:
-    chunk_offset_list.append(chunk_offset_list[-1] + chunk_size)
-  get_logger().debug('chunk_offset_list: %s', chunk_offset_list)
-  get_logger().debug('chunk_size_list: %s', chunk_size_list)
-  return [
-    rows[chunk_offset:chunk_offset + chunk_size]
-    for chunk_offset, chunk_size in zip(chunk_offset_list, chunk_size_list)
-  ]
-
-def output_filenames_for_names(names, prefix, ext):
-  return [
-    prefix + ('' if prefix.endswith('/') else '-') + name + ext
-    for name in names
-  ]
-
-def parse_args(argv=None):
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-    '--input', type=str, required=True,
-    help='input csv/tsv file'
-  )
-  parser.add_argument(
-    '--train', type=float, required=True,
-    help='Train dataset proportion'
-  )
-  parser.add_argument(
-    '--test', type=float, required=False,
-    help='Test dataset proportion (if not specified it is assumed to be the remaining percentage)'
-  )
-  parser.add_argument(
-    '--validation', type=float, required=False,
-    help='Validation dataset proportion (requires test-proportion)'
-  )
-  parser.add_argument(
-    '--random', action='store_true', default=False,
-    help='randomise samples before doing the split'
-  )
-  parser.add_argument(
-    '--fill', action='store_true', default=False,
-    help='use up all of the remaining data rows for the last set'
-  )
-  parser.add_argument(
-    '--no-header', action='store_true', default=False,
-    help='input file does not contain a header'
-  )
-  parser.add_argument(
-    '--out', type=str, required=False,
-    help='output csv/tsv file prefix or directory (if ending with slash)'
-    ' will use input file name by default'
-  )
-  return parser.parse_args(argv)
-
-def process_args(args):
-  if not args.out:
-    args.out = strip_ext(args.input)
-
-def main(argv=None):
-  args = parse_args(argv)
-  process_args(args)
-  ext = get_ext(args.input)
-  proportions = extract_proportions_from_args(args)
-  output_filenames = output_filenames_for_names(
-    [name for name, _ in proportions],
-    args.out,
-    ext
-  )
-  get_logger().info('proportions: %s', proportions)
-  get_logger().info('output_filenames: %s', output_filenames)
-  delimiter = csv_delimiter_by_filename(args.input)
-  with FileSystems.open(args.input) as f:
-    reader = csv.reader(f, delimiter=delimiter)
-    header_row = None if args.no_header else next(reader)
-    data_rows = list(reader)
-  get_logger().info('number of rows: %d', len(data_rows))
-  if args.random:
-    shuffle(data_rows)
-  data_rows_by_set = split_rows(
-    data_rows,
-    [p for _, p in proportions],
-    fill=args.fill
-  )
-
-  mime_type = 'text/tsv' if delimiter == '\t' else 'text/csv'
-  for output_filename, set_data_rows in zip(output_filenames, data_rows_by_set):
-    get_logger().info('set size: %d (%s)', len(set_data_rows), output_filename)
-    with FileSystems.create(output_filename, mime_type=mime_type) as f:
-      writer = csv.writer(f, delimiter=delimiter)
-      if header_row:
-        write_csv_rows(writer, [header_row])
-      write_csv_rows(writer, set_data_rows)
-
-if __name__ == '__main__':
-  logging.basicConfig(level='INFO')
-
-  main()
diff --git a/sciencebeam_gym/preprocess/split_csv_dataset_test.py b/sciencebeam_gym/preprocess/split_csv_dataset_test.py
deleted file mode 100644
index a3c470a..0000000
--- a/sciencebeam_gym/preprocess/split_csv_dataset_test.py
+++ /dev/null
@@ -1,63 +0,0 @@
-from collections import namedtuple
-
-from sciencebeam_gym.preprocess.split_csv_dataset import (
-  extract_proportions_from_args,
-  split_rows,
-  output_filenames_for_names
-)
-
-def create_args(**kwargs):
-  return namedtuple('args', kwargs.keys())(**kwargs)
-
-class TestExtractProportionsFromArgs(object):
-  def test_should_create_train_test_split_with_only_train_specified(self):
-    assert extract_proportions_from_args(
-      create_args(train=0.6, test=None, validation=None)
-    ) == [('train', 0.6), ('test', 0.4)]
-
-  def test_should_create_train_test_validation_split_with_train_and_test_specified(self):
-    assert extract_proportions_from_args(
-      create_args(train=0.6, test=0.3, validation=None)
-    ) == [('train', 0.6), ('test', 0.3), ('validation', 0.1)]
-
-  def test_should_not_add_validation_if_remaining_percentage_is_zero(self):
-    assert extract_proportions_from_args(
-      create_args(train=0.6, test=0.4, validation=None)
-    ) == [('train', 0.6), ('test', 0.4)]
-
-class TestSplitRows(object):
-  def test_should_split_train_test(self):
-    assert split_rows(list(range(10)), [0.6, 0.4]) == [
-      list(range(6)),
-      list(range(6, 10))
-    ]
-
-  def test_should_split_train_test_validation(self):
-    assert split_rows(list(range(10)), [0.6, 0.3, 0.1]) == [
-      list(range(6)),
-      list(range(6, 9)),
-      list(range(9, 10))
-    ]
-
-  def test_should_round_down(self):
-    assert split_rows(list(range(11)), [0.6, 0.4]) == [
-      list(range(6)),
-      list(range(6, 10))
-    ]
-
-  def test_should_fill_last_chunk_if_enabled(self):
-    assert split_rows(list(range(11)), [0.6, 0.4], fill=True) == [
-      list(range(6)),
-      list(range(6, 11))
-    ]
-
-class TestGetOutputFilenamesForNames(object):
-  def test_should_add_name_and_ext_with_path_sep_if_out_ends_with_slash(self):
-    assert output_filenames_for_names(
-      ['train', 'test'], 'out/', '.tsv'
-    ) == ['out/train.tsv', 'out/test.tsv']
-
-  def test_should_add_name_and_ext_with_hyphen_if_out_does_not_end_with_slash(self):
-    assert output_filenames_for_names(
-      ['train', 'test'], 'out', '.tsv'
-    ) == ['out-train.tsv', 'out-test.tsv']
diff --git a/sciencebeam_gym/structured_document/lxml.py b/sciencebeam_gym/structured_document/lxml.py
index 6cb374b..5219529 100644
--- a/sciencebeam_gym/structured_document/lxml.py
+++ b/sciencebeam_gym/structured_document/lxml.py
@@ -1,9 +1,9 @@
-from sciencebeam_gym.utils.bounding_box import (
-  BoundingBox
+from sciencebeam_utils.utils.xml import (
+  set_or_remove_attrib
 )
 
-from sciencebeam_gym.utils.xml import (
-  set_or_remove_attrib
+from sciencebeam_gym.utils.bounding_box import (
+  BoundingBox
 )
 
 from sciencebeam_gym.structured_document import (
diff --git a/sciencebeam_gym/structured_document/structured_document_saver.py b/sciencebeam_gym/structured_document/structured_document_saver.py
index 97d170a..5ca17bd 100644
--- a/sciencebeam_gym/structured_document/structured_document_saver.py
+++ b/sciencebeam_gym/structured_document/structured_document_saver.py
@@ -2,7 +2,7 @@ from __future__ import absolute_import
 
 from lxml import etree
 
-from sciencebeam_gym.beam_utils.io import (
+from sciencebeam_utils.beam_utils.io import (
   save_file_content
 )
 
diff --git a/sciencebeam_gym/structured_document/svg.py b/sciencebeam_gym/structured_document/svg.py
index 27655c2..194ccf9 100644
--- a/sciencebeam_gym/structured_document/svg.py
+++ b/sciencebeam_gym/structured_document/svg.py
@@ -1,9 +1,9 @@
-from sciencebeam_gym.utils.bounding_box import (
-  BoundingBox
+from sciencebeam_utils.utils.xml import (
+  set_or_remove_attrib
 )
 
-from sciencebeam_gym.utils.xml import (
-  set_or_remove_attrib
+from sciencebeam_gym.utils.bounding_box import (
+  BoundingBox
 )
 
 from sciencebeam_gym.structured_document import (
diff --git a/sciencebeam_gym/tools/calculate_class_weights_test.py b/sciencebeam_gym/tools/calculate_class_weights_test.py
index bd9053f..5dd9935 100644
--- a/sciencebeam_gym/tools/calculate_class_weights_test.py
+++ b/sciencebeam_gym/tools/calculate_class_weights_test.py
@@ -6,7 +6,7 @@ from io import BytesIO
 
 from backports.tempfile import TemporaryDirectory
 
-from sciencebeam_gym.utils.num import (
+from sciencebeam_utils.utils.num import (
   assert_close,
   assert_all_close
 )
diff --git a/sciencebeam_gym/trainer/data/examples.py b/sciencebeam_gym/trainer/data/examples.py
index bc2b313..0c13b04 100644
--- a/sciencebeam_gym/trainer/data/examples.py
+++ b/sciencebeam_gym/trainer/data/examples.py
@@ -4,7 +4,7 @@ from functools import partial
 import tensorflow as tf
 from tensorflow.python.lib.io import file_io # pylint: disable=E0611
 
-from sciencebeam_gym.utils.collection import (
+from sciencebeam_utils.utils.collection import (
   extend_dict
 )
 
diff --git a/sciencebeam_gym/trainer/data/examples_test.py b/sciencebeam_gym/trainer/data/examples_test.py
index 970dac4..7a2886a 100644
--- a/sciencebeam_gym/trainer/data/examples_test.py
+++ b/sciencebeam_gym/trainer/data/examples_test.py
@@ -5,7 +5,7 @@ import pytest
 
 import tensorflow as tf
 
-from sciencebeam_gym.utils.collection import (
+from sciencebeam_utils.utils.collection import (
   extend_dict
 )
 
diff --git a/sciencebeam_gym/trainer/evaluator_test.py b/sciencebeam_gym/trainer/evaluator_test.py
index 9c1927a..3f257f0 100644
--- a/sciencebeam_gym/trainer/evaluator_test.py
+++ b/sciencebeam_gym/trainer/evaluator_test.py
@@ -3,7 +3,7 @@ import logging
 import pytest
 import tensorflow as tf
 
-from sciencebeam_gym.utils.collection import (
+from sciencebeam_utils.utils.collection import (
   to_namedtuple
 )
 
diff --git a/sciencebeam_gym/trainer/models/pix2pix/evaluate_test.py b/sciencebeam_gym/trainer/models/pix2pix/evaluate_test.py
index caf7f49..99e9c6f 100644
--- a/sciencebeam_gym/trainer/models/pix2pix/evaluate_test.py
+++ b/sciencebeam_gym/trainer/models/pix2pix/evaluate_test.py
@@ -6,7 +6,7 @@ import pytest
 import tensorflow as tf
 import numpy as np
 
-from sciencebeam_gym.utils.num import (
+from sciencebeam_utils.utils.num import (
   assert_close
 )
 
diff --git a/sciencebeam_gym/trainer/models/pix2pix/loss_test.py b/sciencebeam_gym/trainer/models/pix2pix/loss_test.py
index be5c89f..25916b3 100644
--- a/sciencebeam_gym/trainer/models/pix2pix/loss_test.py
+++ b/sciencebeam_gym/trainer/models/pix2pix/loss_test.py
@@ -5,7 +5,7 @@ from six import raise_from
 import tensorflow as tf
 import numpy as np
 
-from sciencebeam_gym.utils.num import (
+from sciencebeam_utils.utils.num import (
   assert_close
 )
 
diff --git a/sciencebeam_gym/trainer/models/pix2pix/pix2pix_core_test.py b/sciencebeam_gym/trainer/models/pix2pix/pix2pix_core_test.py
index 488efe6..b514198 100644
--- a/sciencebeam_gym/trainer/models/pix2pix/pix2pix_core_test.py
+++ b/sciencebeam_gym/trainer/models/pix2pix/pix2pix_core_test.py
@@ -6,12 +6,12 @@ import tensorflow as tf
 import numpy as np
 import pytest
 
-from sciencebeam_gym.utils.num import (
+from sciencebeam_utils.utils.num import (
   assert_all_close,
   assert_all_not_close
 )
 
-from sciencebeam_gym.utils.collection import (
+from sciencebeam_utils.utils.collection import (
   extend_dict
 )
 
diff --git a/sciencebeam_gym/trainer/models/pix2pix/pix2pix_model_test.py b/sciencebeam_gym/trainer/models/pix2pix/pix2pix_model_test.py
index ce921ec..7377812 100644
--- a/sciencebeam_gym/trainer/models/pix2pix/pix2pix_model_test.py
+++ b/sciencebeam_gym/trainer/models/pix2pix/pix2pix_model_test.py
@@ -6,7 +6,7 @@ from pytest import raises
 
 import tensorflow as tf
 
-from sciencebeam_gym.utils.collection import (
+from sciencebeam_utils.utils.collection import (
   extend_dict
 )
 
diff --git a/sciencebeam_gym/trainer/models/pix2pix/tf_utils_test.py b/sciencebeam_gym/trainer/models/pix2pix/tf_utils_test.py
index abb269f..22cfc0d 100644
--- a/sciencebeam_gym/trainer/models/pix2pix/tf_utils_test.py
+++ b/sciencebeam_gym/trainer/models/pix2pix/tf_utils_test.py
@@ -4,7 +4,7 @@ from __future__ import division
 import tensorflow as tf
 import numpy as np
 
-from sciencebeam_gym.utils.num import (
+from sciencebeam_utils.utils.num import (
   assert_all_close
 )
 
diff --git a/sciencebeam_gym/utils/collection.py b/sciencebeam_gym/utils/collection.py
deleted file mode 100644
index c790410..0000000
--- a/sciencebeam_gym/utils/collection.py
+++ /dev/null
@@ -1,63 +0,0 @@
-from __future__ import absolute_import
-
-from collections import namedtuple
-from itertools import groupby
-
-from six import iteritems
-
-flatten = lambda l: [item for sublist in l for item in sublist]
-
-iter_flatten = lambda l: (item for sublist in l for item in sublist)
-
-def filter_truthy(list_of_something):
-  return [l for l in list_of_something if l]
-
-def strip_all(list_of_strings):
-  return [(s or '').strip() for s in list_of_strings if s]
-
-def remove_key_from_dict(d, key):
-  return {k: v for k, v in iteritems(d) if k != key}
-
-def remove_keys_from_dict(d, keys_to_remove):
-  if not keys_to_remove:
-    return d
-  return {
-    k: v
-    for k, v in iteritems(d)
-    if k not in keys_to_remove
-  }
-
-def extract_from_dict(d, key, default_value=None):
-  return d.get(key, default_value), remove_key_from_dict(d, key)
-
-def extend_dict(d, *other_dicts, **kwargs):
-  """
-  example:
-
-  extend_dict(d1, d2)
-
-  is equivalent to Python 3 syntax:
-  {
-    **d1,
-    **d2
-  }
-  """
-  d = d.copy()
-  for other_dict in other_dicts:
-    d.update(other_dict)
-  d.update(kwargs)
-  return d
-
-def groupby_to_dict(iterable, key):
-  return {
-    k: list(v)
-    for k, v in groupby(iterable, key=key)
-  }
-
-def sort_and_groupby_to_dict(iterable, key):
-  return groupby_to_dict(sorted(iterable, key=key), key)
-
-def to_namedtuple(*args, **kwargs):
-  name = kwargs.pop('name', 'Tuple')
-  d = extend_dict(*list(args) + [kwargs])
-  return namedtuple(name, d.keys())(**d)
diff --git a/sciencebeam_gym/utils/compat.py b/sciencebeam_gym/utils/compat.py
deleted file mode 100644
index a52ca94..0000000
--- a/sciencebeam_gym/utils/compat.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from six import PY3
-
-def python_2_unicode_compatible(cls):
-  """
-  Same as future.utils.python_2_unicode_compatible but with support for __repr__
-  """
-  if not PY3:
-    if cls.__repr__ is not object.__repr__:
-      unicode_repr = cls.__repr__
-      cls.__repr__ = lambda self: unicode_repr(self).encode('utf-8')
-    if cls.__str__ is not object.__str__:
-      cls.__unicode__ = cls.__str__
-      cls.__str__ = lambda self: self.__unicode__().encode('utf-8')
-  return cls
diff --git a/sciencebeam_gym/utils/compat_test.py b/sciencebeam_gym/utils/compat_test.py
deleted file mode 100644
index 8c7d380..0000000
--- a/sciencebeam_gym/utils/compat_test.py
+++ /dev/null
@@ -1,57 +0,0 @@
-from six import text_type
-
-from sciencebeam_gym.utils.compat import (
-  python_2_unicode_compatible
-)
-
-ASCII_VALUE = 'abc'
-UNICODE_VALUE = u'a\u1234b'
-
-@python_2_unicode_compatible
-class ReprWrapper(object):
-  def __init__(self, value):
-    self.value = value
-
-  def __repr__(self):
-    return self.value
-
-@python_2_unicode_compatible
-class StrWrapper(object):
-  def __init__(self, value):
-    self.value = value
-
-  def __str__(self):
-    return self.value
-
-@python_2_unicode_compatible
-class ReprStrWrapper(object):
-  def __init__(self, value):
-    self.value = value
-
-  def __repr__(self):
-    return self.value
-
-  def __str__(self):
-    return self.value
-
-class TestPython2UnicodeCompatible(object):
-  def test_should_return_repr_ascii_value(self):
-    assert repr(ReprWrapper(text_type(ASCII_VALUE))) == ASCII_VALUE
-
-  def test_should_encode_repr_unicode_value_without_str(self):
-    assert repr(ReprWrapper(UNICODE_VALUE)) == UNICODE_VALUE.encode('utf-8')
-
-  def test_should_encode_repr_unicode_value_with_str(self):
-    assert repr(ReprStrWrapper(UNICODE_VALUE)) == UNICODE_VALUE.encode('utf-8')
-
-  def test_should_return_str_ascii_value(self):
-    assert str(StrWrapper(text_type(ASCII_VALUE))) == ASCII_VALUE
-
-  def test_should_encode_str_unicode_value_without_repr(self):
-    assert str(StrWrapper(UNICODE_VALUE)) == UNICODE_VALUE.encode('utf-8')
-
-  def test_should_encode_str_unicode_value_with_repr(self):
-    assert str(ReprStrWrapper(UNICODE_VALUE)) == UNICODE_VALUE.encode('utf-8')
-
-  def test_should_encode_str_unicode_value_with_repr_but_without_str(self):
-    assert str(ReprWrapper(UNICODE_VALUE)) == UNICODE_VALUE.encode('utf-8')
diff --git a/sciencebeam_gym/utils/csv.py b/sciencebeam_gym/utils/csv.py
deleted file mode 100644
index a4d7f99..0000000
--- a/sciencebeam_gym/utils/csv.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import os
-import csv
-
-import six
-
-TEMP_FILE_SUFFIX = '.part'
-
-def csv_delimiter_by_filename(filename):
-  if '.tsv' in filename:
-    return '\t'
-  else:
-    return ','
-
-def open_csv_output(filename):
-  return open(filename, 'w')
-
-def write_csv_rows(writer, iterable):
-  if six.PY2:
-    for row in iterable:
-      writer.writerow([
-        x.encode('utf-8') if isinstance(x, six.text_type) else x
-        for x in row
-      ])
-  else:
-    for row in iterable:
-      writer.writerow(row)
-
-def write_csv_row(writer, row):
-  write_csv_rows(writer, [row])
-
-def write_csv(filename, columns, iterable, delimiter=None):
-  if delimiter is None:
-    delimiter = csv_delimiter_by_filename(filename)
-  is_stdout = filename in {'stdout', '/dev/stdout'}
-  temp_filename = (
-    filename + TEMP_FILE_SUFFIX
-    if not is_stdout
-    else filename
-  )
-  if not is_stdout and os.path.isfile(filename):
-    os.remove(filename)
-  with open_csv_output(temp_filename) as csv_f:
-    writer = csv.writer(csv_f, delimiter=delimiter)
-    write_csv_rows(writer, [columns])
-    write_csv_rows(writer, iterable)
-  if not is_stdout:
-    os.rename(temp_filename, filename)
-
-def iter_dict_to_list(iterable, fields):
-  return (
-    [item.get(field) for field in fields]
-    for item in iterable
-  )
-
-def write_dict_csv(filename, columns, iterable, delimiter=None):
-  write_csv(filename, columns, iter_dict_to_list(iterable, columns), delimiter=delimiter)
diff --git a/sciencebeam_gym/utils/file_list.py b/sciencebeam_gym/utils/file_list.py
deleted file mode 100644
index b062236..0000000
--- a/sciencebeam_gym/utils/file_list.py
+++ /dev/null
@@ -1,88 +0,0 @@
-from __future__ import absolute_import
-
-import codecs
-import csv
-import os
-from itertools import islice
-
-from apache_beam.io.filesystems import FileSystems
-
-from sciencebeam_gym.utils.csv import (
-  csv_delimiter_by_filename
-)
-
-from .file_path import (
-  relative_path,
-  join_if_relative_path
-)
-
-
-def is_csv_or_tsv_file_list(file_list_path):
-  return '.csv' in file_list_path or '.tsv' in file_list_path
-
-def load_plain_file_list(file_list_path, limit=None):
-  with FileSystems.open(file_list_path) as f:
-    lines = (x.rstrip() for x in codecs.getreader('utf-8')(f))
-    if limit:
-      lines = islice(lines, 0, limit)
-    return list(lines)
-
-def load_csv_or_tsv_file_list(file_list_path, column, header=True, limit=None):
-  delimiter = csv_delimiter_by_filename(file_list_path)
-  with FileSystems.open(file_list_path) as f:
-    reader = csv.reader(f, delimiter=delimiter)
-    if not header:
-      assert isinstance(column, int)
-      column_index = column
-    else:
-      header_row = next(reader)
-      if isinstance(column, int):
-        column_index = column
-      else:
-        try:
-          column_index = header_row.index(column)
-        except ValueError:
-          raise ValueError(
-            'column %s not found, available columns: %s' %
-            (column, header_row)
-          )
-    lines = (x[column_index].decode('utf-8') for x in reader)
-    if limit:
-      lines = islice(lines, 0, limit)
-    return list(lines)
-
-def to_absolute_file_list(base_path, file_list):
-  return [join_if_relative_path(base_path, s) for s in file_list]
-
-def to_relative_file_list(base_path, file_list):
-  return [relative_path(base_path, s) for s in file_list]
-
-def load_file_list(file_list_path, column, header=True, limit=None, to_absolute=True):
-  if is_csv_or_tsv_file_list(file_list_path):
-    file_list = load_csv_or_tsv_file_list(
-      file_list_path, column=column, header=header, limit=limit
-    )
-  else:
-    file_list = load_plain_file_list(file_list_path, limit=limit)
-  if to_absolute:
-    file_list = to_absolute_file_list(
-      os.path.dirname(file_list_path), file_list
-    )
-  return file_list
-
-def save_plain_file_list(file_list_path, file_list):
-  with FileSystems.create(file_list_path) as f:
-    f.write('\n'.join(file_list).encode('utf-8'))
-
-def save_csv_or_tsv_file_list(file_list_path, file_list, column, header=True):
-  if header:
-    file_list = [column] + file_list
-  save_plain_file_list(file_list_path, file_list)
-
-def save_file_list(file_list_path, file_list, column, header=True):
-  if is_csv_or_tsv_file_list(file_list_path):
-    return save_csv_or_tsv_file_list(
-      file_list_path, file_list, column=column, header=header
-    )
-  else:
-    return save_plain_file_list(file_list_path, file_list)
diff --git a/sciencebeam_gym/utils/file_list_test.py b/sciencebeam_gym/utils/file_list_test.py
deleted file mode 100644
index efeff9a..0000000
--- a/sciencebeam_gym/utils/file_list_test.py
+++ /dev/null
@@ -1,202 +0,0 @@
-import os
-from tempfile import NamedTemporaryFile
-from mock import patch
-from backports.tempfile import TemporaryDirectory
-
-import pytest
-
-import sciencebeam_gym.utils.file_list as file_list_loader
-from sciencebeam_gym.utils.file_list import (
-  is_csv_or_tsv_file_list,
-  load_plain_file_list,
-  load_csv_or_tsv_file_list,
-  to_absolute_file_list,
-  to_relative_file_list,
-  load_file_list,
-  save_plain_file_list,
-  save_csv_or_tsv_file_list,
-  save_file_list
-)
-
-FILE_1 = 'file1.pdf'
-FILE_2 = 'file2.pdf'
-UNICODE_FILE_1 = u'file1\u1234.pdf'
-FILE_LIST = [FILE_1, FILE_2]
-
-@pytest.fixture(name='load_plain_file_list_mock')
-def _load_plain_file_list():
-  with patch.object(file_list_loader, 'load_plain_file_list') as mock:
-    yield mock
-
-@pytest.fixture(name='load_csv_or_tsv_file_list_mock')
-def _load_csv_or_tsv_file_list():
-  with patch.object(file_list_loader, 'load_csv_or_tsv_file_list') as mock:
-    yield mock
-
-@pytest.fixture(name='to_absolute_file_list_mock')
-def _to_absolute_file_list():
-  with patch.object(file_list_loader, 'to_absolute_file_list') as mock:
-    yield mock
-
-class TestIsCsvOrTsvFileList(object):
-  def test_should_return_true_if_file_ext_is_csv(self):
-    assert is_csv_or_tsv_file_list('files.csv')
-
-  def test_should_return_true_if_file_ext_is_csv_gz(self):
-    assert is_csv_or_tsv_file_list('files.csv.gz')
-
-  def test_should_return_true_if_file_ext_is_tsv(self):
-    assert is_csv_or_tsv_file_list('files.tsv')
-
-  def test_should_return_true_if_file_ext_is_tsv_gz(self):
-    assert is_csv_or_tsv_file_list('files.tsv.gz')
-
-  def test_should_return_false_if_file_ext_is_lst(self):
-    assert not is_csv_or_tsv_file_list('files.lst')
-
-  def test_should_return_false_if_file_ext_is_lst_gz(self):
-    assert not is_csv_or_tsv_file_list('files.lst.gz')
-
-class TestLoadPlainFileList(object):
-  def test_should_read_multiple_file_paths_from_file(self):
-    with NamedTemporaryFile() as f:
-      f.write('\n'.join([FILE_1, FILE_2]))
-      f.flush()
-      assert load_plain_file_list(f.name) == [FILE_1, FILE_2]
-
-  def test_should_read_unicode_file(self):
-    with NamedTemporaryFile() as f:
-      f.write('\n'.join([UNICODE_FILE_1.encode('utf-8')]))
-      f.flush()
-      assert load_plain_file_list(f.name) == [UNICODE_FILE_1]
-
-  def test_should_apply_limit(self):
-    with NamedTemporaryFile() as f:
-      f.write('\n'.join([FILE_1, FILE_2]))
-      f.flush()
-      assert load_plain_file_list(f.name, limit=1) == [FILE_1]
-
-class TestLoadCsvOrTsvFileList(object):
-  def test_should_read_multiple_file_paths_from_file_with_header_using_column_name(self):
-    with NamedTemporaryFile() as f:
-      f.write('\n'.join(['url', FILE_1, FILE_2]))
-      f.flush()
-      assert load_csv_or_tsv_file_list(f.name, 'url') == [FILE_1, FILE_2]
-
-  def test_should_read_multiple_file_paths_from_file_with_header_using_column_index(self):
-    with NamedTemporaryFile() as f:
-      f.write('\n'.join(['url', FILE_1, FILE_2]))
-      f.flush()
-      assert load_csv_or_tsv_file_list(f.name, 0) == [FILE_1, FILE_2]
-
-  def test_should_read_multiple_file_paths_from_file_without_header(self):
-    with NamedTemporaryFile() as f:
-      f.write('\n'.join([FILE_1, FILE_2]))
-      f.flush()
-      assert load_csv_or_tsv_file_list(f.name, 0, header=False) == [FILE_1, FILE_2]
-
-  def test_should_read_unicode_file(self):
-    with NamedTemporaryFile() as f:
-      f.write('\n'.join(['url', UNICODE_FILE_1.encode('utf-8')]))
-      f.flush()
-      assert load_csv_or_tsv_file_list(f.name, 'url') == [UNICODE_FILE_1]
-
-  def test_should_raise_exception_if_column_name_is_invalid(self):
-    with pytest.raises(ValueError):
-      with NamedTemporaryFile() as f:
-        f.write('\n'.join(['url', FILE_1, FILE_2]))
-        f.flush()
-        assert load_csv_or_tsv_file_list(f.name, 'xyz') == [FILE_1, FILE_2]
-
-  def test_should_raise_exception_if_column_index_is_invalid(self):
-    with pytest.raises(IndexError):
-      with NamedTemporaryFile() as f:
-        f.write('\n'.join(['url', FILE_1, FILE_2]))
-        f.flush()
-        assert load_csv_or_tsv_file_list(f.name, 1) == [FILE_1, FILE_2]
-
-  def test_should_apply_limit(self):
-    with NamedTemporaryFile() as f:
-      f.write('\n'.join(['url', FILE_1, FILE_2]))
-      f.flush()
-      assert load_csv_or_tsv_file_list(f.name, 'url', limit=1) == [FILE_1]
-
-class TestToAbsoluteFileList(object):
-  def test_should_make_path_absolute(self):
-    assert to_absolute_file_list('/base/path', ['sub/file1']) == ['/base/path/sub/file1']
-
-  def test_should_not_change_absolute_paths(self):
-    assert to_absolute_file_list('/base/path', ['/other/file1']) == ['/other/file1']
-
-class TestToRelativeFileList(object):
-  def test_should_make_path_relative(self):
-    assert to_relative_file_list('/base/path', ['/base/path/sub/file1']) == ['sub/file1']
-
-  def test_should_not_change_path_outside_base_path(self):
-    assert to_relative_file_list('/base/path', ['/other/file1']) == ['/other/file1']
-
-@pytest.mark.usefixtures(
-  'load_plain_file_list_mock', 'load_csv_or_tsv_file_list_mock', 'to_absolute_file_list_mock'
-)
-class TestLoadFileList(object):
-  def test_should_call_load_plain_file_list(self, load_plain_file_list_mock):
-    result = load_file_list(
-      'file-list.lst', column='url', header=True, limit=1, to_absolute=False
-    )
-    load_plain_file_list_mock.assert_called_with('file-list.lst', limit=1)
-    assert result == load_plain_file_list_mock.return_value
-
-  def test_should_call_load_csv_or_tsv_file_list(self, load_csv_or_tsv_file_list_mock):
-    result = load_file_list(
-      'file-list.csv', column='url', header=True, limit=1, to_absolute=False
-    )
-    load_csv_or_tsv_file_list_mock.assert_called_with(
-      'file-list.csv', column='url', header=True, limit=1
-    )
-    assert result == load_csv_or_tsv_file_list_mock.return_value
-
-  def test_should_make_file_list_absolute(
-    self, load_plain_file_list_mock, to_absolute_file_list_mock):
-
-    result = load_file_list('/base/path/file-list.lst', column='url', to_absolute=True)
-    to_absolute_file_list_mock.assert_called_with(
-      '/base/path', load_plain_file_list_mock.return_value
-    )
-    assert result == to_absolute_file_list_mock.return_value
-
-class TestSavePlainFileList(object):
-  def test_should_write_multiple_file_paths(self):
-    with TemporaryDirectory() as path:
-      file_list_path = os.path.join(path, 'out.lst')
-      save_plain_file_list(file_list_path, [FILE_1, FILE_2])
-      assert load_plain_file_list(file_list_path) == [FILE_1, FILE_2]
-
-  def test_should_write_unicode_file(self):
-    with TemporaryDirectory() as path:
-      file_list_path = os.path.join(path, 'out.lst')
-      save_plain_file_list(file_list_path, [UNICODE_FILE_1])
-      assert load_plain_file_list(file_list_path) == [UNICODE_FILE_1]
-
-class TestSaveCsvOrTsvFileList(object):
-  def test_should_write_multiple_file_paths(self):
-    with TemporaryDirectory() as path:
-      file_list_path = os.path.join(path, 'out.csv')
-      save_csv_or_tsv_file_list(file_list_path, [FILE_1, FILE_2], column='url')
-      assert load_csv_or_tsv_file_list(file_list_path, column='url') == [FILE_1, FILE_2]
-
-  def test_should_write_unicode_file(self):
-    with TemporaryDirectory() as path:
-      file_list_path = os.path.join(path, 'out.lst')
-      save_csv_or_tsv_file_list(file_list_path, [UNICODE_FILE_1], column='url')
-      assert load_csv_or_tsv_file_list(file_list_path, column='url') == [UNICODE_FILE_1]
-
-class TestSaveFileList(object):
-  def test_should_call_save_plain_file_list(self):
-    with patch.object(file_list_loader, 'save_plain_file_list') as mock:
-      save_file_list('file-list.lst', FILE_LIST, column='url', header=True)
-      mock.assert_called_with('file-list.lst', FILE_LIST)
-
-  def test_should_call_save_csv_or_tsv_file_list(self):
-    with patch.object(file_list_loader, 'save_csv_or_tsv_file_list') as mock:
-      save_file_list('file-list.csv', FILE_LIST, column='url', header=True)
-      mock.assert_called_with('file-list.csv', FILE_LIST, column='url', header=True)
diff --git a/sciencebeam_gym/utils/file_path.py b/sciencebeam_gym/utils/file_path.py
deleted file mode 100644
index cc798c1..0000000
--- a/sciencebeam_gym/utils/file_path.py
+++ /dev/null
@@ -1,21 +0,0 @@
-
-from __future__ import absolute_import
-
-from apache_beam.io.filesystems import FileSystems
-
-def relative_path(base_path, path):
-  if not base_path:
-    return path
-  if not base_path.endswith('/'):
-    base_path += '/'
-  return path[len(base_path):] if path.startswith(base_path) else path
-
-def is_relative_path(path):
-  return not path.startswith('/') and '://' not in path
-
-def join_if_relative_path(base_path, path):
-  return (
-    FileSystems.join(base_path, path)
-    if base_path and is_relative_path(path)
-    else path
-  )
diff --git a/sciencebeam_gym/utils/file_path_test.py b/sciencebeam_gym/utils/file_path_test.py
deleted file mode 100644
index 9ef1680..0000000
--- a/sciencebeam_gym/utils/file_path_test.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from .file_path import (
-  relative_path,
-  join_if_relative_path
-)
-
-class TestRelativePath(object):
-  def test_should_return_path_if_base_path_is_none(self):
-    assert relative_path(None, 'file') == 'file'
-
-  def test_should_return_path_if_path_outside_base_path(self):
-    assert relative_path('/parent', '/other/file') == '/other/file'
-
-  def test_should_return_absolute_path_if_base_path_matches(self):
-    assert relative_path('/parent', '/parent/file') == 'file'
-
-class TestJoinIfRelativePath(object):
-  def test_should_return_path_if_base_path_is_none(self):
-    assert join_if_relative_path(None, 'file') == 'file'
-
-  def test_should_return_path_if_not_relative(self):
-    assert join_if_relative_path('/parent', '/other/file') == '/other/file'
-
-  def test_should_return_joined_path_if_relative(self):
-    assert join_if_relative_path('/parent', 'file') == '/parent/file'
diff --git a/sciencebeam_gym/utils/io.py b/sciencebeam_gym/utils/io.py
deleted file mode 100644
index ab9644c..0000000
--- a/sciencebeam_gym/utils/io.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import os
-import errno
-
-def makedirs(path, exists_ok=False):
-  try:
-    # Python 3
-    os.makedirs(path, exist_ok=exists_ok)
-  except TypeError:
-    # Python 2
-    try:
-      os.makedirs(path)
-    except OSError as e:
-      if e.errno == errno.EEXIST and os.path.isdir(path) and exists_ok:
-        pass
-      else:
-        raise
diff --git a/sciencebeam_gym/utils/num.py b/sciencebeam_gym/utils/num.py
deleted file mode 100644
index 2e77f18..0000000
--- a/sciencebeam_gym/utils/num.py
+++ /dev/null
@@ -1,21 +0,0 @@
-from six import raise_from
-
-import numpy as np
-
-def assert_close(a, b, atol=1.e-8):
-  try:
-    assert np.allclose([a], [b], atol=atol)
-  except AssertionError as e:
-    raise_from(AssertionError('expected %s to be close to %s (atol=%s)' % (a, b, atol)), e)
-
-def assert_all_close(a, b, atol=1.e-8):
-  try:
-    assert np.allclose(a, b, atol=atol)
-  except AssertionError as e:
-    raise_from(AssertionError('expected %s to be close to %s (atol=%s)' % (a, b, atol)), e)
-
-def assert_all_not_close(a, b, atol=1.e-8):
-  try:
-    assert not np.allclose(a, b, atol=atol)
-  except AssertionError as e:
-    raise_from(AssertionError('expected %s not to be close to %s (atol=%s)' % (a, b, atol)), e)
diff --git a/sciencebeam_gym/utils/pages_zip.py b/sciencebeam_gym/utils/pages_zip.py
index 59d421d..f992400 100644
--- a/sciencebeam_gym/utils/pages_zip.py
+++ b/sciencebeam_gym/utils/pages_zip.py
@@ -3,7 +3,7 @@ from zipfile import ZipFile, ZIP_DEFLATED
 
 from apache_beam.io.filesystems import FileSystems
 
-from sciencebeam_gym.beam_utils.io import (
+from sciencebeam_utils.beam_utils.io import (
   dirname,
   mkdirs_if_not_exists
 )
diff --git a/sciencebeam_gym/utils/stopwatch.py b/sciencebeam_gym/utils/stopwatch.py
deleted file mode 100644
index bc750b4..0000000
--- a/sciencebeam_gym/utils/stopwatch.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import sys
-import time
-
-try:
-  perf_counter = time.perf_counter
-except AttributeError:
-  # as per original timeit source (before perf_counter)
-  if sys.platform == "win32":
-    # On Windows, the best timer is time.clock()
-    perf_counter = time.clock
-  else:
-    # On most other platforms the best timer is time.time()
-    perf_counter = time.time
-
-class StopWatch(object):
-  def __init__(self):
-    self.start = perf_counter()
-
-  def get_elapsed_seconds(self, reset=False):
-    end = perf_counter()
-    elapsed = end - self.start
-    if reset:
-      self.start = end
-    return elapsed
-
-class StopWatchRecorder(object):
-  def __init__(self):
-    self.stop_watch = StopWatch()
-    self.recorded_timings = []
-    self.started = None
-
-  def stop(self):
-    self.start(None)
-
-  def start(self, name):
-    elapsed = self.stop_watch.get_elapsed_seconds(reset=True)
-    if self.started:
-      self.recorded_timings.append((self.started, elapsed))
-    self.started = name
-
-  def __str__(self):
-    total = ('total', sum(elapsed for _, elapsed in self.recorded_timings))
-    return ', '.join(
-      '%s: %.6fs' % (name, elapsed)
-      for name, elapsed in self.recorded_timings + [total]
-    )
diff --git a/sciencebeam_gym/utils/string.py b/sciencebeam_gym/utils/string.py
deleted file mode 100644
index 00aa95d..0000000
--- a/sciencebeam_gym/utils/string.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from future.utils import python_2_unicode_compatible
-
-@python_2_unicode_compatible
-class LazyStr(object):
-  def __init__(self, fn):
-    self.fn = fn
-
-  def __str__(self):
-    return self.fn()
diff --git a/sciencebeam_gym/utils/xml.py b/sciencebeam_gym/utils/xml.py
deleted file mode 100644
index 764c6a7..0000000
--- a/sciencebeam_gym/utils/xml.py
+++ /dev/null
@@ -1,41 +0,0 @@
-from lxml import etree
-
-def _get_text_content_and_exclude(node, exclude):
-  result = ''
-  if node.text is not None:
-    result += node.text
-  result += ''.join([
-    (
-      _get_text_content_and_exclude(c, exclude)
-      if c not in exclude
-      else ''
-    ) +
-    (c.tail if c.tail is not None else '')
-    for c in node.iterchildren()
-  ])
-  return result
-
-def get_text_content(node, exclude=None):
-  '''
-  Strip tags and return text content
-  '''
-  if not exclude:
-    return ''.join(node.itertext())
-  return _get_text_content_and_exclude(node, exclude)
-
-def get_immediate_text(node):
-  return node.xpath('text()')
-
-def get_text_content_list(nodes, exclude=None):
-  return [get_text_content(node, exclude=exclude) for node in nodes]
-
-def xml_from_string_with_recover(s):
-  parser = etree.XMLParser(recover=True)
-  return etree.fromstring(s, parser=parser)
-
-def set_or_remove_attrib(attrib, name, value):
-  if value is None:
-    if name in attrib:
-      del attrib[name]
-  else:
-    attrib[name] = value
diff --git a/sciencebeam_gym/utils/xml_test.py b/sciencebeam_gym/utils/xml_test.py
deleted file mode 100644
index cbfbefa..0000000
--- a/sciencebeam_gym/utils/xml_test.py
+++ /dev/null
@@ -1,62 +0,0 @@
-from lxml.builder import E
-
-from sciencebeam_gym.utils.xml import (
-  get_text_content,
-  get_immediate_text,
-  xml_from_string_with_recover
-)
-
-SOME_VALUE_1 = 'some value1'
-SOME_VALUE_2 = 'some value2'
-
-class TestGetTextContent(object):
-  def test_should_return_simple_text(self):
-    node = E.parent(SOME_VALUE_1)
-    assert get_text_content(node) == SOME_VALUE_1
-
-  def test_should_return_text_of_child_element(self):
-    node = E.parent(E.child(SOME_VALUE_1))
-    assert get_text_content(node) == SOME_VALUE_1
-
-  def test_should_return_text_of_child_element_and_preceding_text(self):
-    node = E.parent(SOME_VALUE_1, E.child(SOME_VALUE_2))
-    assert get_text_content(node) == SOME_VALUE_1 + SOME_VALUE_2
-
-  def test_should_return_text_of_child_element_and_trailing_text(self):
-    node = E.parent(E.child(SOME_VALUE_1), SOME_VALUE_2)
-    assert get_text_content(node) == SOME_VALUE_1 + SOME_VALUE_2
-
-  def test_should_return_text_of_parent_excluding_children_to_exclude(self):
-    child = E.child(SOME_VALUE_1)
-    node = E.parent(child, SOME_VALUE_2)
-    assert get_text_content(node, exclude=[child]) == SOME_VALUE_2
-
-class TestGetImmediateText(object):
-  def test_should_return_simple_text(self):
-    node = E.parent(SOME_VALUE_1)
-    assert get_immediate_text(node) == [SOME_VALUE_1]
-
-  def test_should_not_return_text_of_child_element(self):
-    node = E.parent(E.child(SOME_VALUE_1))
-    assert get_immediate_text(node) == []
-
-class TestXmlFromStringWithRecover(object):
-  def test_should_parse_clean_xml(self):
-    root = xml_from_string_with_recover('<root><child1>%s</child1></root>' % SOME_VALUE_1)
-    node = root.find('child1')
-    assert node is not None
-    assert node.text == SOME_VALUE_1
-
-  def test_should_parse_xml_with_unencoded_ampersand(self):
-    value = 'A & B'
-    root = xml_from_string_with_recover('<root><child1>%s</child1></root>' % value)
-    node = root.find('child1')
-    assert node is not None
-    assert node.text == 'A  B'
-
-  def test_should_parse_xml_with_unencoded_unknown_entity(self):
-    value = 'A &unknown; B'
-    root = xml_from_string_with_recover('<root><child1>%s</child1></root>' % value)
-    node = root.find('child1')
-    assert node is not None
-    assert node.text == 'A  B'
diff --git a/sciencebeam_gym/utils/zip.py b/sciencebeam_gym/utils/zip.py
deleted file mode 100644
index 1155a99..0000000
--- a/sciencebeam_gym/utils/zip.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import os
-from stat import S_IXUSR
-
-ZIP_UNIX_SYSTEM = 3
-
-def make_executable(path):
-  os.chmod(path, os.stat(path).st_mode | S_IXUSR)
-
-def extract_all_with_permission(zf, target_dir):
-  for info in zf.infolist():
-    extracted_path = zf.extract(info, target_dir)
-
-    if info.create_system == ZIP_UNIX_SYSTEM:
-      unix_attributes = info.external_attr >> 16
-      if unix_attributes:
-        os.chmod(extracted_path, unix_attributes)
-
-def extract_all_with_executable_permission(zf, target_dir):
-  for info in zf.infolist():
-    extracted_path = zf.extract(info, target_dir)
-
-    if info.create_system == ZIP_UNIX_SYSTEM and os.path.isfile(extracted_path):
-      unix_attributes = info.external_attr >> 16
-      if unix_attributes & S_IXUSR:
-        make_executable(extracted_path)
-- 
GitLab