From bd100f4d3b80ef27c1e7e2c7c50f71af736e5c8e Mon Sep 17 00:00:00 2001 From: Daniel Ecer <de-code@users.noreply.github.com> Date: Fri, 24 Aug 2018 20:36:26 +0100 Subject: [PATCH] Use sciencebeam-utils and sciencebeam-alignment (#35) * added sciencebeam-utils and sciencebeam-alignment dependency * replaced local alignment module with sciencebeam-alignment * replaced local beam_utils with sciencebeam-utils * replaced local utils with sciencebeam-utils for modules that have moved * removed tools that have been moved to sciencebeam-utils * removed utility functions that have moved to sciencebeam-utils * updated readme --- README.md | 12 +- requirements.txt | 2 + sciencebeam_gym/alignment/SequenceMatcher.py | 14 - .../alignment/WordSequenceMatcher.py | 42 --- .../alignment/WordSequenceMatcher_test.py | 46 --- sciencebeam_gym/alignment/__init__.py | 0 sciencebeam_gym/alignment/align.py | 338 ------------------ .../alignment/align_fast_utils.pyx | 147 -------- .../alignment/align_performance.py | 85 ----- sciencebeam_gym/alignment/align_test.py | 143 -------- sciencebeam_gym/beam_utils/__init__.py | 0 sciencebeam_gym/beam_utils/csv.py | 180 ---------- sciencebeam_gym/beam_utils/csv_test.py | 126 ------- sciencebeam_gym/beam_utils/files.py | 49 --- sciencebeam_gym/beam_utils/files_test.py | 77 ---- sciencebeam_gym/beam_utils/io.py | 46 --- sciencebeam_gym/beam_utils/main.py | 136 ------- sciencebeam_gym/beam_utils/testing.py | 242 ------------- sciencebeam_gym/beam_utils/utils.py | 107 ------ sciencebeam_gym/beam_utils/utils_test.py | 152 -------- .../convert/conversion_pipeline.py | 27 +- .../convert/conversion_pipeline_test.py | 2 +- .../convert/grobid/grobid_service_wrapper.py | 4 +- .../inference_model/__init___test.py | 2 +- .../annotate_using_predictions.py | 2 +- .../inference_model/extract_to_xml.py | 2 +- .../inference_model/extract_to_xml_test.py | 2 +- .../text/crf/crfsuite_training_pipeline.py | 12 +- .../crf/crfsuite_training_pipeline_test.py | 2 +- sciencebeam_gym/pdf/pdf_to_lxml_wrapper.py | 4 +- .../annotation/annotation_evaluation.py | 2 +- .../annotation/find_line_numbers_test.py | 2 +- .../preprocess/annotation/fuzzy_match.py | 6 +- .../annotation/matching_annotator.py | 8 +- .../annotation/matching_annotator_test.py | 8 +- .../annotation/target_annotation.py | 8 +- sciencebeam_gym/preprocess/check_file_list.py | 85 ----- .../preprocess/check_file_list_test.py | 58 --- sciencebeam_gym/preprocess/find_file_pairs.py | 93 ----- .../preprocess/find_file_pairs_test.py | 152 -------- .../preprocess/get_output_files.py | 153 -------- .../preprocess/get_output_files_test.py | 181 ---------- sciencebeam_gym/preprocess/lxml_to_svg.py | 10 +- .../preprocess/preprocessing_pipeline.py | 35 +- .../preprocess/preprocessing_pipeline_test.py | 12 +- .../preprocessing_transforms_test.py | 12 +- .../preprocess/preprocessing_utils.py | 144 +------- .../preprocess/preprocessing_utils_test.py | 127 ------- .../preprocess/split_csv_dataset.py | 142 -------- .../preprocess/split_csv_dataset_test.py | 63 ---- sciencebeam_gym/structured_document/lxml.py | 8 +- .../structured_document_saver.py | 2 +- sciencebeam_gym/structured_document/svg.py | 8 +- .../tools/calculate_class_weights_test.py | 2 +- sciencebeam_gym/trainer/data/examples.py | 2 +- sciencebeam_gym/trainer/data/examples_test.py | 2 +- sciencebeam_gym/trainer/evaluator_test.py | 2 +- .../trainer/models/pix2pix/evaluate_test.py | 2 +- .../trainer/models/pix2pix/loss_test.py | 2 +- .../models/pix2pix/pix2pix_core_test.py | 4 +- .../models/pix2pix/pix2pix_model_test.py | 2 +- .../trainer/models/pix2pix/tf_utils_test.py | 2 +- sciencebeam_gym/utils/collection.py | 63 ---- sciencebeam_gym/utils/compat.py | 14 - sciencebeam_gym/utils/compat_test.py | 57 --- sciencebeam_gym/utils/csv.py | 56 --- sciencebeam_gym/utils/file_list.py | 88 ----- sciencebeam_gym/utils/file_list_test.py | 202 ----------- sciencebeam_gym/utils/file_path.py | 21 -- sciencebeam_gym/utils/file_path_test.py | 24 -- sciencebeam_gym/utils/io.py | 16 - sciencebeam_gym/utils/num.py | 21 -- sciencebeam_gym/utils/pages_zip.py | 2 +- sciencebeam_gym/utils/stopwatch.py | 46 --- sciencebeam_gym/utils/string.py | 9 - sciencebeam_gym/utils/xml.py | 41 --- sciencebeam_gym/utils/xml_test.py | 62 ---- sciencebeam_gym/utils/zip.py | 25 -- 78 files changed, 127 insertions(+), 3962 deletions(-) delete mode 100644 sciencebeam_gym/alignment/SequenceMatcher.py delete mode 100644 sciencebeam_gym/alignment/WordSequenceMatcher.py delete mode 100644 sciencebeam_gym/alignment/WordSequenceMatcher_test.py delete mode 100644 sciencebeam_gym/alignment/__init__.py delete mode 100644 sciencebeam_gym/alignment/align.py delete mode 100644 sciencebeam_gym/alignment/align_fast_utils.pyx delete mode 100644 sciencebeam_gym/alignment/align_performance.py delete mode 100644 sciencebeam_gym/alignment/align_test.py delete mode 100644 sciencebeam_gym/beam_utils/__init__.py delete mode 100644 sciencebeam_gym/beam_utils/csv.py delete mode 100644 sciencebeam_gym/beam_utils/csv_test.py delete mode 100644 sciencebeam_gym/beam_utils/files.py delete mode 100644 sciencebeam_gym/beam_utils/files_test.py delete mode 100644 sciencebeam_gym/beam_utils/io.py delete mode 100644 sciencebeam_gym/beam_utils/main.py delete mode 100644 sciencebeam_gym/beam_utils/testing.py delete mode 100644 sciencebeam_gym/beam_utils/utils.py delete mode 100644 sciencebeam_gym/beam_utils/utils_test.py delete mode 100644 sciencebeam_gym/preprocess/check_file_list.py delete mode 100644 sciencebeam_gym/preprocess/check_file_list_test.py delete mode 100644 sciencebeam_gym/preprocess/find_file_pairs.py delete mode 100644 sciencebeam_gym/preprocess/find_file_pairs_test.py delete mode 100644 sciencebeam_gym/preprocess/get_output_files.py delete mode 100644 sciencebeam_gym/preprocess/get_output_files_test.py delete mode 100644 sciencebeam_gym/preprocess/split_csv_dataset.py delete mode 100644 sciencebeam_gym/preprocess/split_csv_dataset_test.py delete mode 100644 sciencebeam_gym/utils/collection.py delete mode 100644 sciencebeam_gym/utils/compat.py delete mode 100644 sciencebeam_gym/utils/compat_test.py delete mode 100644 sciencebeam_gym/utils/csv.py delete mode 100644 sciencebeam_gym/utils/file_list.py delete mode 100644 sciencebeam_gym/utils/file_list_test.py delete mode 100644 sciencebeam_gym/utils/file_path.py delete mode 100644 sciencebeam_gym/utils/file_path_test.py delete mode 100644 sciencebeam_gym/utils/io.py delete mode 100644 sciencebeam_gym/utils/num.py delete mode 100644 sciencebeam_gym/utils/stopwatch.py delete mode 100644 sciencebeam_gym/utils/string.py delete mode 100644 sciencebeam_gym/utils/xml.py delete mode 100644 sciencebeam_gym/utils/xml_test.py delete mode 100644 sciencebeam_gym/utils/zip.py diff --git a/README.md b/README.md index a8d0a71..5a73a6f 100644 --- a/README.md +++ b/README.md @@ -107,7 +107,7 @@ The parent directory per manuscript is optional. If that is not the case then th Run: ```bash -python -m sciencebeam_gym.preprocess.find_file_pairs \ +python -m sciencebeam_utils.tools.find_file_pairs \ --data-path <source directory> \ --source-pattern *.pdf.gz --xml-pattern *.nxml.gz \ --out <output file list csv/tsv> @@ -116,7 +116,7 @@ python -m sciencebeam_gym.preprocess.find_file_pairs \ e.g.: ```bash -python -m sciencebeam_gym.preprocess.find_file_pairs \ +python -m sciencebeam_utils.tools.find_file_pairs \ --data-path gs://some-bucket/some-dataset \ --source-pattern *.pdf.gz --xml-pattern *.nxml.gz \ --out gs://some-bucket/some-dataset/file-list.tsv @@ -134,7 +134,7 @@ That file could also be generated using any other preferred method. To separate the file list into a _training_, _validation_ and _test_ dataset, the following script can be used: ```bash -python -m sciencebeam_gym.preprocess.split_csv_dataset \ +python -m sciencebeam_utils.tools.split_csv_dataset \ --input <csv/tsv file list> \ --train 0.5 --validation 0.2 --test 0.3 --random --fill ``` @@ -142,7 +142,7 @@ python -m sciencebeam_gym.preprocess.split_csv_dataset \ e.g.: ```bash -python -m sciencebeam_gym.preprocess.split_csv_dataset \ +python -m sciencebeam_utils.tools.split_csv_dataset \ --input gs://some-bucket/some-dataset/file-list.tsv \ --train 0.5 --validation 0.2 --test 0.3 --random --fill ``` @@ -261,7 +261,7 @@ A (Linear Chain) [CRF](https://en.wikipedia.org/wiki/Conditional_random_field) m When running the (CV) preprocessing pipeline with the parameter `--save-svg`, files with the file ext `.svg.zip` will be written to the _output path_. To get a list of those one can use the following command: ```bash -python -m sciencebeam_gym.preprocess.get_output_files \ +python -m sciencebeam_utils.tools.get_output_files \ --source-file-list path/to/file-list-train.tsv --source-file-column=pdf-url \ --output-file-suffix=.svg.zip --output-file-list path/to/file-list-train-svg.tsv ``` @@ -273,7 +273,7 @@ The CRF Model can also be trained using the CV output as an additional input. When running the CV conversion pipeline with the parameter `--save-annot-lxml`, files with the ext `.cv.lxml.gz` will be written to the _output path_. To get a list of those one can use the following command: ```bash -python -m sciencebeam_gym.preprocess.get_output_files \ +python -m sciencebeam_utils.tools.get_output_files \ --source-file-list path/to/file-list-train.tsv --source-file-column=pdf-url \ --output-file-suffix=.cv.lxml.gz --output-file-list path/to/file-list-train-cv-lxml.tsv ``` diff --git a/requirements.txt b/requirements.txt index 7120aaa..4802b98 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,8 @@ protobuf==3.5.2.post1 python-crfsuite==0.9.5 Pyqtree==0.24 requests==2.18.4 +sciencebeam-alignment==0.0.2 +sciencebeam-utils==0.0.1 sklearn-crfsuite==0.3.6 six==1.11.0 tensorflow-transform==0.6.0 diff --git a/sciencebeam_gym/alignment/SequenceMatcher.py b/sciencebeam_gym/alignment/SequenceMatcher.py deleted file mode 100644 index 0f11012..0000000 --- a/sciencebeam_gym/alignment/SequenceMatcher.py +++ /dev/null @@ -1,14 +0,0 @@ -import platform -import warnings - -try: - from fuzzywuzzy.StringMatcher import StringMatcher as SequenceMatcher -except ImportError: - if platform.python_implementation() != "PyPy": - warnings.warn( - 'Using slow pure-python SequenceMatcher.' - ' Install python-Levenshtein (and fuzzywuzzy) to remove this warning' - ) - from difflib import SequenceMatcher - -assert SequenceMatcher is not None diff --git a/sciencebeam_gym/alignment/WordSequenceMatcher.py b/sciencebeam_gym/alignment/WordSequenceMatcher.py deleted file mode 100644 index 75ee892..0000000 --- a/sciencebeam_gym/alignment/WordSequenceMatcher.py +++ /dev/null @@ -1,42 +0,0 @@ -import logging -from difflib import SequenceMatcher - -DEFAULT_SEPARATORS = ' ,' - -def get_logger(): - return logging.getLogger(__name__) - -def split_with_offset(s, sep): - previous_start = 0 - tokens = [] - for i, c in enumerate(s): - if c in sep: - if previous_start < i: - tokens.append((previous_start, s[previous_start:i])) - previous_start = i + 1 - if previous_start < len(s): - tokens.append((previous_start, s[previous_start:])) - return tokens - -class WordSequenceMatcher(object): - def __init__(self, isjunk=None, a=None, b=None, sep=None): - if isjunk: - raise ValueError('isjunk not supported') - self.a = a - self.b = b - self.sep = sep or DEFAULT_SEPARATORS - - def get_matching_blocks(self): - a_words_with_offsets = split_with_offset(self.a, self.sep) - b_words_with_offsets = split_with_offset(self.b, self.sep) - a_words = [w for _, w in a_words_with_offsets] - b_words = [w for _, w in b_words_with_offsets] - a_indices = [i for i, _ in a_words_with_offsets] - b_indices = [i for i, _ in b_words_with_offsets] - sm = SequenceMatcher(None, a_words, b_words) - matching_blocks = [ - (a_indices[ai], b_indices[bi], len(a_words[ai])) - for ai, bi, size in sm.get_matching_blocks() - if size - ] - return matching_blocks diff --git a/sciencebeam_gym/alignment/WordSequenceMatcher_test.py b/sciencebeam_gym/alignment/WordSequenceMatcher_test.py deleted file mode 100644 index 56a52d7..0000000 --- a/sciencebeam_gym/alignment/WordSequenceMatcher_test.py +++ /dev/null @@ -1,46 +0,0 @@ -from sciencebeam_gym.alignment.WordSequenceMatcher import ( - WordSequenceMatcher -) - -WORD_1 = 'word1' -WORD_2 = 'word2' -WORD_3 = 'word3' - -class TestWordSequenceMatcher(object): - def test_should_not_match_different_words(self): - sm = WordSequenceMatcher(None, WORD_1, WORD_2) - matching_blocks = sm.get_matching_blocks() - assert len(matching_blocks) == 0 - - def test_should_match_same_words_standalone(self): - sm = WordSequenceMatcher(None, WORD_1, WORD_1) - matching_blocks = sm.get_matching_blocks() - assert matching_blocks == [( - 0, - 0, - len(WORD_1) - )] - - def test_should_match_same_words_within_other_words(self): - a_words = ['pre_a__', WORD_1, 'post_a'] - b_words = ['pre_b', WORD_1, 'post_b'] - sm = WordSequenceMatcher( - None, - ' '.join(a_words), - ' '.join(b_words) - ) - matching_blocks = sm.get_matching_blocks() - assert matching_blocks == [( - len(a_words[0]) + 1, - len(b_words[0]) + 1, - len(WORD_1) - )] - - def test_should_match_same_words_standalone_ignore_comma_after_word(self): - sm = WordSequenceMatcher(None, WORD_1 + ',', WORD_1) - matching_blocks = sm.get_matching_blocks() - assert matching_blocks == [( - 0, - 0, - len(WORD_1) - )] diff --git a/sciencebeam_gym/alignment/__init__.py b/sciencebeam_gym/alignment/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/sciencebeam_gym/alignment/align.py b/sciencebeam_gym/alignment/align.py deleted file mode 100644 index c3094c2..0000000 --- a/sciencebeam_gym/alignment/align.py +++ /dev/null @@ -1,338 +0,0 @@ -import logging -from collections import deque -from itertools import islice -from abc import ABCMeta, abstractmethod -from contextlib import contextmanager - -import numpy as np - -from six import ( - with_metaclass, - string_types, - binary_type -) - -try: - from sciencebeam_gym.alignment.align_fast_utils import ( # pylint: disable=E0611 - native_compute_inner_alignment_matrix_simple_scoring_int, - native_compute_inner_alignment_matrix_simple_scoring_any, - native_compute_inner_alignment_matrix_scoring_fn_any, - native_alignment_matrix_single_path_traceback - ) - native_enabled = True -except ImportError: - logging.getLogger(__name__).warning('fast implementation not available') - native_enabled = False - -MIN_INT = -2147483647 - -def get_logger(): - return logging.getLogger(__name__) - -@contextmanager -def require_native(required=True): - global native_enabled # pylint: disable=W0603 - was_enabled = native_enabled - native_enabled = required - yield - native_enabled = was_enabled - -def _is_array_of_type(a, dtype): - return np.issubdtype(a.dtype, dtype) - -# fallback implementation -def compute_inner_alignment_matrix_simple_scoring_py( - scoring_matrix, a, b, match_score, mismatch_score, gap_score, min_score): - m = len(a) + 1 - n = len(b) + 1 - for i in range(1, m): - for j in range(1, n): - scoring_matrix[i, j] = max( - min_score, - - # Match elements. - scoring_matrix[i - 1, j - 1] + - (match_score if a[i - 1] == b[j - 1] else mismatch_score), - - # Gap on sequenceA. - scoring_matrix[i, j - 1] + gap_score, - - # Gap on sequenceB. - scoring_matrix[i - 1, j] + gap_score - ) - -# fallback implementation -def compute_inner_alignment_matrix_scoring_fn_py( - scoring_matrix, a, b, scoring_fn, gap_score, min_score): - m = len(a) + 1 - n = len(b) + 1 - for i in range(1, m): - for j in range(1, n): - scoring_matrix[i, j] = max( - min_score, - - # Match elements. - scoring_matrix[i - 1, j - 1] + - scoring_fn(a[i - 1], b[j - 1]), - - # Gap on sequenceA. - scoring_matrix[i, j - 1] + gap_score, - - # Gap on sequenceB. - scoring_matrix[i - 1, j] + gap_score - ) - -def compute_inner_alignment_matrix_simple_scoring( - scoring_matrix, a, b, match_score, mismatch_score, gap_score, min_score): - try: - if ( - native_enabled and - _is_array_of_type(a, np.int32) and _is_array_of_type(b, np.int32) - ): - native_compute_inner_alignment_matrix_simple_scoring_int( - scoring_matrix, a, b, - match_score, mismatch_score, gap_score, min_score - ) - return - elif native_enabled: - native_compute_inner_alignment_matrix_simple_scoring_any( - scoring_matrix, a, b, - match_score, mismatch_score, gap_score, min_score - ) - return - except AttributeError: - pass - compute_inner_alignment_matrix_simple_scoring_py( - scoring_matrix, a, b, - match_score, mismatch_score, gap_score, min_score - ) - -def compute_inner_alignment_matrix_custom_scoring( - scoring_matrix, a, b, scoring_fn, gap_score, min_score): - - if native_enabled: - native_compute_inner_alignment_matrix_scoring_fn_any( - scoring_matrix, a, b, - scoring_fn, gap_score, min_score - ) - else: - compute_inner_alignment_matrix_scoring_fn_py( - scoring_matrix, a, b, - scoring_fn, gap_score, min_score - ) - -def compute_inner_alignment_matrix( - scoring_matrix, a, b, scoring, min_score): - if isinstance(scoring, CustomScoring): - compute_inner_alignment_matrix_custom_scoring( - scoring_matrix, a, b, - scoring.scoring_fn, scoring.gap_score, min_score - ) - else: - compute_inner_alignment_matrix_simple_scoring( - scoring_matrix, a, b, - scoring.match_score, scoring.mismatch_score, scoring.gap_score, - min_score - ) - -def _next_locs(score_matrix, i, j, is_local): - diag_score = score_matrix[i - 1][j - 1] if (i != 0 and j != 0) else MIN_INT - up_score = score_matrix[i - 1][j] if i != 0 else MIN_INT - left_score = score_matrix[i][j - 1] if j != 0 else MIN_INT - max_score = max(diag_score, up_score, left_score) - if max_score == MIN_INT: - return [] - if (max_score == 0 or diag_score == 0) and (is_local or (i == 1 and j == 1)): - return [] - if diag_score == max_score: - get_logger().debug('diag_score: %s (%s)', diag_score, max_score) - return [(i - 1, j - 1)] - locs = [] - if up_score == max_score: - locs.append((i - 1, j)) - if left_score == max_score: - locs.append((i, j - 1)) - return locs - -def alignment_matrix_traceback_py(score_matrix, start_locs, is_local): - # Using LinkedListNode to cheaply branch off to multiple paths - pending_roots = deque([ - LinkedListNode(tuple(loc)) - for loc in start_locs - ]) - while len(pending_roots) > 0: - n = pending_roots.pop() - i, j = n.data - next_locs = _next_locs(score_matrix, i, j, is_local) - get_logger().debug('next_locs: %s', next_locs) - if len(next_locs) == 0: - yield n - else: - pending_roots.extend([ - LinkedListNode(next_loc, n) - for next_loc in next_locs - ]) - -def alignment_matrix_traceback(score_matrix, start_locs, is_local, limit): - if native_enabled and limit == 1: - yield native_alignment_matrix_single_path_traceback( - score_matrix, start_locs[0], 1 if is_local else 0 - ) - else: - paths = alignment_matrix_traceback_py( - score_matrix, reversed(start_locs), is_local - ) - if limit: - paths = islice(paths, limit) - for path in paths: - yield path - -class SimpleScoring(object): - def __init__(self, match_score, mismatch_score, gap_score): - self.match_score = match_score - self.mismatch_score = mismatch_score - self.gap_score = gap_score - -class CustomScoring(object): - def __init__(self, scoring_fn, gap_score): - self.scoring_fn = scoring_fn - self.gap_score = gap_score - -class LinkedListNode(object): - def __init__(self, data, next_node=None): - self.data = data - self.next_node = next_node - - def __str__(self): - if self.next_node is not None: - return str(self.data) + ' -> ' + str(self.next_node) - else: - return str(self.data) - - def __iter__(self): - yield self.data - next_node = self.next_node - while next_node is not None: - yield next_node.data - next_node = next_node.next_node - -def _path_to_matching_blocks(path, a, b): - block_ai = 0 - block_bi = 0 - block_size = 0 - for ai, bi in ((ai_ - 1, bi_ - 1) for ai_, bi_ in path): - if a[ai] == b[bi]: - if block_size and block_ai + block_size == ai and block_bi + block_size == bi: - block_size += 1 - else: - if block_size: - yield (block_ai, block_bi, block_size) - block_ai = ai - block_bi = bi - block_size = 1 - if block_size: - yield (block_ai, block_bi, block_size) - -def _as_np_array(s): - if isinstance(s, binary_type): - return np.frombuffer(s, dtype=np.uint8).astype(np.int32) - if isinstance(s, string_types): - return np.array([ord(c) for c in s], dtype=np.int32) - return np.asarray(s) - -wrap_sequence = _as_np_array - -class AbstractSequenceMatcher(object, with_metaclass(ABCMeta)): - def __init__(self, a, b, scoring): - self.a = a - self.b = b - self.scoring = scoring - self._alignment_matrix = None - self._a = _as_np_array(a) - self._b = _as_np_array(b) - - @abstractmethod - def _computer_alignment_matrix(self): - pass - - def _get_alignment_matrix(self): - if self._alignment_matrix is None: - self._alignment_matrix = self._computer_alignment_matrix() - return self._alignment_matrix - - @abstractmethod - def get_multiple_matching_blocks(self, limit=None): - pass - - def get_matching_blocks(self): - for matching_blocks in self.get_multiple_matching_blocks(limit=1): - return list(matching_blocks) + [(len(self.a), len(self.b), 0)] - return [(len(self.a), len(self.b), 0)] - -class LocalSequenceMatcher(AbstractSequenceMatcher): - """ - Local sequence matcher using Smith-Waterman algorithm - """ - - def _computer_alignment_matrix(self): - m = len(self._a) + 1 - n = len(self._b) + 1 - scoring_matrix = np.empty((m, n), dtype=int) - scoring_matrix[:, 0] = 0 - scoring_matrix[0, :] = 0 - min_score = 0 - compute_inner_alignment_matrix( - scoring_matrix, - self._a, self._b, - self.scoring, - min_score - ) - return scoring_matrix - - def get_multiple_matching_blocks(self, limit=None): - score_matrix = self._get_alignment_matrix() - max_score = score_matrix.max() - - max_score_loc = np.argwhere(score_matrix == max_score) - get_logger().debug('max_score_loc: %s', max_score_loc) - is_local = True - paths = alignment_matrix_traceback(score_matrix, max_score_loc, is_local, limit=limit or 0) - return ( - list(_path_to_matching_blocks(path, self.a, self.b)) - for path in paths - ) - -class GlobalSequenceMatcher(AbstractSequenceMatcher): - """ - Global sequence matcher using Needleman-Wunsch algorithm - """ - - def _computer_alignment_matrix(self): - m = len(self._a) + 1 - n = len(self._b) + 1 - scoring_matrix = np.empty((m, n), dtype=int) - for i in range(m): - scoring_matrix[i, 0] = self.scoring.gap_score * i - for j in range(n): - scoring_matrix[0, j] = self.scoring.gap_score * j - min_score = MIN_INT - compute_inner_alignment_matrix( - scoring_matrix, - self._a, self._b, - self.scoring, - min_score - ) - return scoring_matrix - - def get_multiple_matching_blocks(self, limit=None): - score_matrix = self._get_alignment_matrix() - - m = len(self._a) + 1 - n = len(self._b) + 1 - start_locs = [(m - 1, n - 1)] - is_local = False - paths = alignment_matrix_traceback(score_matrix, start_locs, is_local, limit=limit or 0) - return ( - list(_path_to_matching_blocks(path, self.a, self.b)) - for path in paths - ) diff --git a/sciencebeam_gym/alignment/align_fast_utils.pyx b/sciencebeam_gym/alignment/align_fast_utils.pyx deleted file mode 100644 index bc263ad..0000000 --- a/sciencebeam_gym/alignment/align_fast_utils.pyx +++ /dev/null @@ -1,147 +0,0 @@ -from cpython cimport array -cimport cython - -import logging - -import numpy as np -cimport numpy as np - -DEF MIN_INT = -2147483647 - -def get_logger(): - return logging.getLogger(__name__) - -ctypedef np.int_t int_t -ctypedef int_t[:, :] score_matrix_t -ctypedef bint bool_t - -cdef inline int imax2(int a, int b): - if a >= b: - return a - else: - return b - -cdef inline int imax3(int a, int b, int c): - if a >= b: - return imax2(a, c) - else: - return imax2(b, c) - -cdef inline int imax4(int a, int b, int c, int d): - if a >= b: - return imax3(a, c, d) - else: - return imax3(b, c, d) - -def native_compute_inner_alignment_matrix_simple_scoring_int( - score_matrix_t scoring_matrix, - int[:] a, - int[:] b, - int match_score, int mismatch_score, int gap_score, int min_score): - cdef int m = len(a) + 1 - cdef int n = len(b) + 1 - cdef int i, j - for i in range(1, m): - for j in range(1, n): - scoring_matrix[i, j] = imax4( - min_score, - - # Match elements. - scoring_matrix[i - 1, j - 1] + - (match_score if a[i - 1] == b[j - 1] else mismatch_score), - - # Gap on sequenceA. - scoring_matrix[i, j - 1] + gap_score, - - # Gap on sequenceB. - scoring_matrix[i - 1, j] + gap_score - ) - -def native_compute_inner_alignment_matrix_simple_scoring_any( - score_matrix_t scoring_matrix, - a, - b, - int match_score, int mismatch_score, int gap_score, int min_score): - cdef list ca = list(a) - cdef list cb = list(b) - cdef int m = len(ca) + 1 - cdef int n = len(cb) + 1 - cdef int i, j - for i in range(1, m): - for j in range(1, n): - scoring_matrix[i, j] = imax4( - min_score, - - # Match elements. - scoring_matrix[i - 1, j - 1] + - (match_score if ca[i - 1] == cb[j - 1] else mismatch_score), - - # Gap on sequenceA. - scoring_matrix[i, j - 1] + gap_score, - - # Gap on sequenceB. - scoring_matrix[i - 1, j] + gap_score - ) - -def native_compute_inner_alignment_matrix_scoring_fn_any( - score_matrix_t scoring_matrix, - a, - b, - scoring_fn, int gap_score, int min_score): - cdef list ca = list(a) - cdef list cb = list(b) - cdef int m = len(ca) + 1 - cdef int n = len(cb) + 1 - cdef int i, j - for i in range(1, m): - for j in range(1, n): - scoring_matrix[i, j] = imax4( - min_score, - - # Match elements. - scoring_matrix[i - 1, j - 1] + - scoring_fn(ca[i - 1], cb[j - 1]), - - # Gap on sequenceA. - scoring_matrix[i, j - 1] + gap_score, - - # Gap on sequenceB. - scoring_matrix[i - 1, j] + gap_score - ) - -cdef inline _next_loc( - score_matrix_t score_matrix, int i, int j, bool_t is_local): - - diag_score = score_matrix[i - 1][j - 1] if (i != 0 and j != 0) else MIN_INT - up_score = score_matrix[i - 1][j] if i != 0 else MIN_INT - left_score = score_matrix[i][j - 1] if j != 0 else MIN_INT - max_score = imax3(diag_score, up_score, left_score) - if max_score == MIN_INT: - return None - if (max_score == 0 or diag_score == 0) and (is_local or (i == 1 and j == 1)): - # stop at local match, or end - return None - if diag_score == max_score: - return (i - 1, j - 1) - if up_score == max_score: - return (i - 1, j) - if left_score == max_score: - return (i, j - 1) - return None - -def native_alignment_matrix_single_path_traceback( - score_matrix_t score_matrix, - start_loc, bool_t is_local): - - cdef int[2] cur_loc = (int(start_loc[0]), int(start_loc[1])) - cdef list path = [cur_loc] - cdef int i, j - cdef tuple next_loc - while True: - i, j = cur_loc - next_loc = _next_loc(score_matrix, i, j, is_local) - if not next_loc: - return path - else: - cur_loc = next_loc - path.insert(0, cur_loc) diff --git a/sciencebeam_gym/alignment/align_performance.py b/sciencebeam_gym/alignment/align_performance.py deleted file mode 100644 index 0901f58..0000000 --- a/sciencebeam_gym/alignment/align_performance.py +++ /dev/null @@ -1,85 +0,0 @@ -from __future__ import absolute_import, print_function - -import logging -import timeit - -import numpy as np - -from sciencebeam_gym.alignment.align import ( - SimpleScoring, - CustomScoring, - LocalSequenceMatcher, - require_native -) - -DEFAULT_MATCH_SCORE = 2 -DEFAULT_MISMATCH_SCORE = -1 -DEFAULT_GAP_SCORE = -3 - -DEFAULT_SCORING = SimpleScoring( - DEFAULT_MATCH_SCORE, DEFAULT_MISMATCH_SCORE, DEFAULT_GAP_SCORE -) - -CUSTOM_SCORING = CustomScoring( - lambda a, b: DEFAULT_MATCH_SCORE if a == b else DEFAULT_MISMATCH_SCORE, - DEFAULT_GAP_SCORE -) - -SHORT_STRING1 = 'abc' -SHORT_STRING2 = 'def' - -LONG_STRING1 = 'abcefghijk' * 100 -LONG_STRING2 = ''.join(list(reversed(LONG_STRING1))) - -def encode_str(s): - return np.array([int(ord(x)) for x in s], dtype=np.int32) - -LONG_ENCODED1 = encode_str(LONG_STRING1) -LONG_ENCODED2 = encode_str(LONG_STRING2) - -def test_align_with_scoring_fn_py(): - with require_native(False): - LocalSequenceMatcher(LONG_STRING1, LONG_STRING2, CUSTOM_SCORING).get_matching_blocks() - -def test_align_with_scoring_fn(): - with require_native(True): - LocalSequenceMatcher(LONG_STRING1, LONG_STRING2, CUSTOM_SCORING).get_matching_blocks() - -def test_align_with_simple_scoring(): - with require_native(True): - LocalSequenceMatcher(LONG_STRING1, LONG_STRING2, DEFAULT_SCORING).get_matching_blocks() - -def test_align_with_simple_scoring_int(): - with require_native(True): - LocalSequenceMatcher(LONG_ENCODED1, LONG_ENCODED2, DEFAULT_SCORING).get_matching_blocks() - -def test_align_with_simple_scoring_str(): - with require_native(True): - LocalSequenceMatcher(LONG_STRING1, LONG_STRING2, DEFAULT_SCORING).get_matching_blocks() - -def report_timing(fn, number=1): - timeit_result_ms = timeit.timeit( - fn + "()", - setup="from __main__ import " + fn, - number=number - ) * 1000 - print("{} ({}x):\n{:f} ms / it ({:f} ms total)\n".format( - fn, - number, - timeit_result_ms / number, - timeit_result_ms - )) - -def main(): - print("len LONG_STRING1: {}\n".format(len(LONG_STRING1))) - print("len LONG_ENCODED1: {}\n".format(len(LONG_ENCODED1))) - report_timing("test_align_with_scoring_fn_py") - report_timing("test_align_with_scoring_fn", 3) - report_timing("test_align_with_simple_scoring", 3) - report_timing("test_align_with_simple_scoring_int", 3) - report_timing("test_align_with_simple_scoring_str", 3) - -if __name__ == "__main__": - logging.basicConfig(level='INFO') - - main() diff --git a/sciencebeam_gym/alignment/align_test.py b/sciencebeam_gym/alignment/align_test.py deleted file mode 100644 index 3e62b4c..0000000 --- a/sciencebeam_gym/alignment/align_test.py +++ /dev/null @@ -1,143 +0,0 @@ -from abc import ABCMeta, abstractmethod -from contextlib import contextmanager - -from six import ( - with_metaclass, - u as as_u, - b as as_b -) - -import numpy as np - -from sciencebeam_gym.alignment.align import ( - LocalSequenceMatcher, - GlobalSequenceMatcher, - SimpleScoring, - CustomScoring, - require_native -) - -DEFAULT_SCORING = SimpleScoring( - match_score=3, - mismatch_score=-1, - gap_score=-2 -) - -DEFAULT_CUSTOM_SCORING = CustomScoring( - scoring_fn=lambda a, b: ( - DEFAULT_SCORING.match_score if a == b else DEFAULT_SCORING.mismatch_score - ), - gap_score=-2 -) - -def _non_zero(matching_blocks): - return [(ai, bi, size) for ai, bi, size in matching_blocks if size] - -class CharWrapper(object): - def __init__(self, c): - self.c = c - - def __eq__(self, b): - return self.c == b.c - -class AbstractTestCommonSequenceMatcher(object, with_metaclass(ABCMeta)): - @abstractmethod - def _convert(self, x): - pass - - @contextmanager - def _wrap_test(self): - yield - - def test_should_not_return_non_zero_blocks_for_no(self): - with self._wrap_test(): - sm = self._matcher(a='a', b='b') - assert _non_zero(sm.get_matching_blocks()) == [] - - def test_should_add_zero_block_with_final_indices(self): - with self._wrap_test(): - sm = self._matcher(a='a', b='b') - assert sm.get_matching_blocks() == [(1, 1, 0)] - - def test_should_return_single_character_match(self): - with self._wrap_test(): - sm = self._matcher(a='a', b='a') - assert _non_zero(sm.get_matching_blocks()) == [(0, 0, 1)] - - def test_should_return_combine_character_match_block(self): - with self._wrap_test(): - sm = self._matcher(a='abc', b='abc') - assert _non_zero(sm.get_matching_blocks()) == [(0, 0, 3)] - - def test_should_return_combine_character_match_blocks_with_gaps(self): - with self._wrap_test(): - sm = self._matcher(a='abc123xyz', b='abc987xyz') - assert _non_zero(sm.get_matching_blocks()) == [(0, 0, 3), (6, 6, 3)] - - def test_should_align_using_custom_scoring_fn(self): - with self._wrap_test(): - sm = self._matcher(a='a', b='a', scoring=DEFAULT_CUSTOM_SCORING) - assert _non_zero(sm.get_matching_blocks()) == [(0, 0, 1)] - -class AbstractTestLocalSequenceMatcher(AbstractTestCommonSequenceMatcher): - def _matcher(self, a, b, scoring=None): - return LocalSequenceMatcher( - a=self._convert(a), - b=self._convert(b), - scoring=scoring or DEFAULT_SCORING - ) - - def test_should_not_match_block_after_big_gap(self): - with self._wrap_test(): - sm = self._matcher(a='abcxyz', b='abc123456xyz') - assert _non_zero(sm.get_matching_blocks()) == [(0, 0, 3)] - -class TestLocalSequenceMatcherWithUnicode(AbstractTestLocalSequenceMatcher): - def _convert(self, x): - return as_u(x) - -class TestLocalSequenceMatcherWithBytes(AbstractTestLocalSequenceMatcher): - def _convert(self, x): - return as_b(x) - -class TestLocalSequenceMatcherWithIntList(AbstractTestLocalSequenceMatcher): - def _convert(self, x): - return [ord(c) for c in x] - -class TestLocalSequenceMatcherWithNumpyInt32Array(AbstractTestLocalSequenceMatcher): - def _convert(self, x): - return np.array([ord(c) for c in x], dtype=np.int32) - -class TestLocalSequenceMatcherWithNumpyInt64Array(AbstractTestLocalSequenceMatcher): - def _convert(self, x): - return np.array([ord(c) for c in x], dtype=np.int64) - -class TestLocalSequenceMatcherWithCustomObjectList(AbstractTestLocalSequenceMatcher): - def _convert(self, x): - return [CharWrapper(c) for c in x] - -class TestLocalSequenceMatcherWithNumpyInt32ArrayWithoutNative( - TestLocalSequenceMatcherWithNumpyInt32Array): - - @contextmanager - def _wrap_test(self): - with require_native(False): - yield - -class AbstractTestGlobalSequenceMatcher(AbstractTestCommonSequenceMatcher): - def _matcher(self, a, b, scoring=None): - with require_native(False): - return GlobalSequenceMatcher( - a=self._convert(a), - b=self._convert(b), - scoring=scoring or DEFAULT_SCORING - ) - - def test_should_prefer_match_block_after_big_gap(self): - with self._wrap_test(): - sm = self._matcher(a='abcxyz', b='abc123456xyz') - assert _non_zero(sm.get_matching_blocks()) == [(0, 0, 3), (3, 9, 3)] - -class TestGlobalSequenceMatcherWithUnicode(AbstractTestGlobalSequenceMatcher): - def _convert(self, x): - return as_u(x) diff --git a/sciencebeam_gym/beam_utils/__init__.py b/sciencebeam_gym/beam_utils/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/sciencebeam_gym/beam_utils/csv.py b/sciencebeam_gym/beam_utils/csv.py deleted file mode 100644 index b8a51e7..0000000 --- a/sciencebeam_gym/beam_utils/csv.py +++ /dev/null @@ -1,180 +0,0 @@ -from __future__ import absolute_import - -import logging -import csv -from io import BytesIO - -from six import string_types - -import apache_beam as beam -from apache_beam.io.textio import WriteToText, ReadFromText -from apache_beam.io.filesystem import CompressionTypes -from apache_beam.io.filebasedsource import FileBasedSource - -from sciencebeam_gym.beam_utils.utils import ( - TransformAndLog -) - -from sciencebeam_gym.utils.csv import ( - csv_delimiter_by_filename -) - -def get_logger(): - return logging.getLogger(__name__) - -def DictToList(fields): - def wrapper(x): - get_logger().debug('DictToList: %s -> %s', fields, x) - return [x.get(field) for field in fields] - return wrapper - -def format_csv_rows(rows, delimiter=','): - get_logger().debug('format_csv_rows, rows: %s', rows) - out = BytesIO() - writer = csv.writer(out, delimiter=delimiter) - writer.writerows([ - [ - x.encode('utf-8') if isinstance(x, string_types) else x - for x in row - ] - for row in rows - ]) - result = out.getvalue().decode('utf-8').rstrip('\r\n') - get_logger().debug('format_csv_rows, result: %s', result) - return result - -class WriteDictCsv(beam.PTransform): - def __init__(self, path, columns, file_name_suffix=None): - super(WriteDictCsv, self).__init__() - self.path = path - self.columns = columns - self.file_name_suffix = file_name_suffix - self.delimiter = csv_delimiter_by_filename(path + file_name_suffix) - - def expand(self, pcoll): - return ( - pcoll | - "ToList" >> beam.Map(DictToList(self.columns)) | - "Format" >> TransformAndLog( - beam.Map(lambda x: format_csv_rows([x], delimiter=self.delimiter)), - log_prefix='formatted csv: ', - log_level='debug' - ) | - "Utf8Encode" >> beam.Map(lambda x: x.encode('utf-8')) | - "Write" >> WriteToText( - self.path, - file_name_suffix=self.file_name_suffix, - header=format_csv_rows([self.columns], delimiter=self.delimiter).encode('utf-8') - ) - ) - -def _strip_quotes(s): - return s[1:-1] if len(s) >= 2 and s[0] == '"' and s[-1] == '"' else s - -# copied and modified from https://github.com/pabloem/beam_utils -# (move back if still active) - -class ReadLineIterator(object): - def __init__(self, obj): - self._obj = obj - - def __iter__(self): - return self - - def next(self): - line = self._obj.readline() - if line == None or line == '': - raise StopIteration - return line - -class CsvFileSource(FileBasedSource): - """ A source for a GCS or local comma-separated-file - Parses a text file assuming newline-delimited lines, - and comma-delimited fields. Assumes UTF-8 encoding. - """ - - def __init__( - self, file_pattern, - compression_type=CompressionTypes.AUTO, - delimiter=',', header=True, dictionary_output=True, - validate=True, limit=None): - """ Initialize a CsvFileSource. - Args: - delimiter: The delimiter character in the CSV file. - header: Whether the input file has a header or not. - Default: True - dictionary_output: The kind of records that the CsvFileSource outputs. - If True, then it will output dict()'s, if False it will output list()'s. - Default: True - Raises: - ValueError: If the input arguments are not consistent. - """ - super(CsvFileSource, self).__init__( - file_pattern, - compression_type=compression_type, - validate=validate, - splittable=False # Can't just split anywhere - ) - self.delimiter = delimiter - self.header = header - self.dictionary_output = dictionary_output - self.limit = limit - self._file = None - - if not self.header and dictionary_output: - raise ValueError( - 'header is required for the CSV reader to provide dictionary output' - ) - - def read_records(self, file_name, range_tracker): - # If a multi-file pattern was specified as a source then make sure the - # start/end offsets use the default values for reading the entire file. - headers = None - self._file = self.open_file(file_name) - - reader = csv.reader(ReadLineIterator(self._file), delimiter=self.delimiter) - - line_no = 0 - for i, row in enumerate(reader): - if self.header and i == 0: - headers = row - continue - - if self.limit and line_no >= self.limit: - break - - line_no += 1 - if self.dictionary_output: - yield { - header: value - for header, value in zip(headers, row) - } - else: - yield row - - -class ReadDictCsv(beam.PTransform): - """ - Simplified CSV parser, which does not support: - * multi-line values - * delimiter within value - """ - def __init__(self, filename, header=True, limit=None): - super(ReadDictCsv, self).__init__() - if not header: - raise RuntimeError('header required') - self.filename = filename - self.columns = None - self.delimiter = csv_delimiter_by_filename(filename) - self.limit = limit - self.row_num = 0 - - def expand(self, pcoll): - return ( - pcoll | - beam.io.Read(CsvFileSource( - self.filename, - delimiter=self.delimiter, - limit=self.limit - )) - ) diff --git a/sciencebeam_gym/beam_utils/csv_test.py b/sciencebeam_gym/beam_utils/csv_test.py deleted file mode 100644 index f4c5c4c..0000000 --- a/sciencebeam_gym/beam_utils/csv_test.py +++ /dev/null @@ -1,126 +0,0 @@ -from __future__ import absolute_import - -from contextlib import contextmanager -from mock import patch - -import pytest - -import apache_beam as beam -from apache_beam.testing.util import ( - assert_that, - equal_to -) - -from sciencebeam_gym.beam_utils.testing import ( - TestPipeline, - BeamTest, - MockWriteToText, - patch_beam_io -) - -from sciencebeam_gym.beam_utils.csv import ( - WriteDictCsv, - ReadDictCsv, - format_csv_rows -) - -MODULE_UNDER_TEST = 'sciencebeam_gym.beam_utils.csv' - -@contextmanager -def patch_module_under_test(**kwargs): - with patch.multiple( - MODULE_UNDER_TEST, - **kwargs - ) as mocks: - yield mocks - -def to_csv(rows, delimiter): - return format_csv_rows(rows, delimiter).encode('utf-8').replace('\r\n', '\n') + '\n' - -@pytest.mark.slow -class TestWriteDictCsv(BeamTest): - def test_should_write_tsv_with_header(self, test_context): - with patch_module_under_test(WriteToText=MockWriteToText): - with TestPipeline() as p: - _ = ( - p | - beam.Create([{ - 'a': 'a1', - 'b': 'b1' - }]) | - WriteDictCsv( - '.temp/dummy', - ['a', 'b'], - '.tsv' - ) - ) - assert test_context.get_file_content('.temp/dummy.tsv') == to_csv([ - ['a', 'b'], - ['a1', 'b1'] - ], '\t') - -@pytest.mark.slow -class TestReadDictCsv(BeamTest): - def test_should_read_rows_as_dict(self, test_context): - with patch_beam_io(): - test_context.set_file_content('.temp/dummy.tsv', to_csv([ - ['a', 'b'], - ['a1', 'b1'] - ], '\t')) - - with TestPipeline() as p: - result = ( - p | - ReadDictCsv('.temp/dummy.tsv') - ) - assert_that(result, equal_to([{ - 'a': 'a1', - 'b': 'b1' - }])) - - def test_should_read_multiple(self, test_context): - with patch_beam_io(): - test_context.set_file_content('.temp/dummy.tsv', to_csv([ - ['a', 'b'], - ['a1', 'b1'], - ['a2', 'b2'], - ['a3', 'b3'] - ], '\t')) - - with TestPipeline() as p: - result = ( - p | - ReadDictCsv('.temp/dummy.tsv') - ) - assert_that(result, equal_to([{ - 'a': 'a1', - 'b': 'b1' - }, { - 'a': 'a2', - 'b': 'b2' - }, { - 'a': 'a3', - 'b': 'b3' - }])) - - def test_should_limit_number_of_rows(self, test_context): - with patch_beam_io(): - test_context.set_file_content('.temp/dummy.tsv', to_csv([ - ['a', 'b'], - ['a1', 'b1'], - ['a2', 'b2'], - ['a3', 'b3'] - ], '\t')) - - with TestPipeline() as p: - result = ( - p | - ReadDictCsv('.temp/dummy.tsv', limit=2) - ) - assert_that(result, equal_to([{ - 'a': 'a1', - 'b': 'b1' - }, { - 'a': 'a2', - 'b': 'b2' - }])) diff --git a/sciencebeam_gym/beam_utils/files.py b/sciencebeam_gym/beam_utils/files.py deleted file mode 100644 index 9a3227c..0000000 --- a/sciencebeam_gym/beam_utils/files.py +++ /dev/null @@ -1,49 +0,0 @@ -from itertools import islice - -import apache_beam as beam - -from sciencebeam_gym.beam_utils.csv import ( - ReadDictCsv -) - -from sciencebeam_gym.beam_utils.io import ( - find_matching_filenames -) - -from sciencebeam_gym.beam_utils.utils import ( - GroupTransforms -) - -from sciencebeam_gym.utils.file_list import ( - load_file_list -) - -def find_matching_filenames_with_limit(pattern, limit=None): - return islice( - find_matching_filenames(pattern), - limit - ) - -def ReadFileList(file_list_path, column, limit=None): - file_list = load_file_list(file_list_path, column=column, limit=limit) - return beam.Create(file_list) - -def DeferredReadFileList(file_list_path, column, limit=None): - return GroupTransforms(lambda p: ( - p | - "ReadFileUrls" >> ReadDictCsv(file_list_path, limit=limit) | - "TranslateFileUrls" >> beam.Map(lambda row: row[column]) - )) - -def FindFiles(file_pattern, limit=None): - file_list = list(find_matching_filenames_with_limit(file_pattern, limit=limit)) - return beam.Create(file_list) - -def DeferredFindFiles(file_pattern, limit=None): - return GroupTransforms(lambda p: ( - p | - beam.Create([file_pattern]) | - "FindFiles" >> beam.FlatMap( - lambda pattern: find_matching_filenames_with_limit(pattern, limit) - ) - )) diff --git a/sciencebeam_gym/beam_utils/files_test.py b/sciencebeam_gym/beam_utils/files_test.py deleted file mode 100644 index c11b5dc..0000000 --- a/sciencebeam_gym/beam_utils/files_test.py +++ /dev/null @@ -1,77 +0,0 @@ -from mock import patch - -import apache_beam as beam -from apache_beam.testing.util import ( - assert_that, - equal_to -) - -from sciencebeam_gym.beam_utils.testing import ( - BeamTest, - TestPipeline -) - -import sciencebeam_gym.beam_utils.files as files_module -from sciencebeam_gym.beam_utils.files import ( - ReadFileList, - DeferredReadFileList, - FindFiles, - DeferredFindFiles -) - -FILE_1 = 'file1.pdf' -FILE_2 = 'file2.pdf' - -FILE_LIST_PATH = 'file-list.lst' -COLUMN = 'url' -LIMIT = 10 - -class TestReadFileList(BeamTest): - def test_should_use_load_file_list(self): - with patch.object(files_module, 'load_file_list') as load_file_list: - load_file_list.return_value = [FILE_1, FILE_2] - with TestPipeline() as p: - result = p | ReadFileList(FILE_LIST_PATH, column=COLUMN, limit=LIMIT) - assert_that(result, equal_to([FILE_1, FILE_2])) - load_file_list.assert_called_with(FILE_LIST_PATH, column=COLUMN, limit=LIMIT) - -class TestDeferredReadFileList(BeamTest): - def test_should_use_read_dict_csv(self): - with patch.object(files_module, 'ReadDictCsv') as ReadDictCsv: - ReadDictCsv.return_value = beam.Create([{COLUMN: FILE_1}, {COLUMN: FILE_2}]) - with TestPipeline() as p: - result = p | DeferredReadFileList(FILE_LIST_PATH, column=COLUMN, limit=LIMIT) - assert_that(result, equal_to([FILE_1, FILE_2])) - ReadDictCsv.assert_called_with(FILE_LIST_PATH, limit=LIMIT) - -class TestFindFiles(BeamTest): - def test_should_use_find_matching_filenames(self): - with patch.object(files_module, 'find_matching_filenames') as find_matching_filenames: - find_matching_filenames.return_value = [FILE_1, FILE_2] - with TestPipeline() as p: - result = p | FindFiles(FILE_LIST_PATH, limit=LIMIT) - assert_that(result, equal_to([FILE_1, FILE_2])) - find_matching_filenames.assert_called_with(FILE_LIST_PATH) - - def test_should_apply_limit(self): - with patch.object(files_module, 'find_matching_filenames') as find_matching_filenames: - find_matching_filenames.return_value = [FILE_1, FILE_2] - with TestPipeline() as p: - result = p | FindFiles(FILE_LIST_PATH, limit=1) - assert_that(result, equal_to([FILE_1])) - -class TestDeferredFindFiles(BeamTest): - def test_should_use_find_matching_filenames(self): - with patch.object(files_module, 'find_matching_filenames') as find_matching_filenames: - find_matching_filenames.return_value = [FILE_1, FILE_2] - with TestPipeline() as p: - result = p | DeferredFindFiles(FILE_LIST_PATH, limit=LIMIT) - assert_that(result, equal_to([FILE_1, FILE_2])) - find_matching_filenames.assert_called_with(FILE_LIST_PATH) - - def test_should_apply_limit(self): - with patch.object(files_module, 'find_matching_filenames') as find_matching_filenames: - find_matching_filenames.return_value = [FILE_1, FILE_2] - with TestPipeline() as p: - result = p | DeferredFindFiles(FILE_LIST_PATH, limit=1) - assert_that(result, equal_to([FILE_1])) diff --git a/sciencebeam_gym/beam_utils/io.py b/sciencebeam_gym/beam_utils/io.py deleted file mode 100644 index 0e2ec91..0000000 --- a/sciencebeam_gym/beam_utils/io.py +++ /dev/null @@ -1,46 +0,0 @@ -from __future__ import absolute_import - -from io import BytesIO -import logging - -from apache_beam.io.filesystems import FileSystems - -DEFAULT_BUFFER_SIZE = 4096 * 1024 - -def get_logger(): - return logging.getLogger(__name__) - -def read_all_from_path(path, buffer_size=DEFAULT_BUFFER_SIZE): - with FileSystems.open(path) as f: - out = BytesIO() - while True: - buf = f.read(buffer_size) - if not buf: - break - out.write(buf) - return out.getvalue() - -def dirname(path): - return FileSystems.split(path)[0] - -def basename(path): - return FileSystems.split(path)[1] - -def find_matching_filenames(pattern): - return (x.path for x in FileSystems.match([pattern])[0].metadata_list) - -def mkdirs_if_not_exists(path): - if not FileSystems.exists(path): - try: - get_logger().info('attempting to create directory: %s', path) - FileSystems.mkdirs(path) - except IOError: - if not FileSystems.exists(path): - raise - -def save_file_content(output_filename, data): - mkdirs_if_not_exists(dirname(output_filename)) - # Note: FileSystems.create transparently handles compression based on the file extension - with FileSystems.create(output_filename) as f: - f.write(data) - return output_filename diff --git a/sciencebeam_gym/beam_utils/main.py b/sciencebeam_gym/beam_utils/main.py deleted file mode 100644 index e5bb44f..0000000 --- a/sciencebeam_gym/beam_utils/main.py +++ /dev/null @@ -1,136 +0,0 @@ -import errno -import logging -import os -import subprocess - -def get_logger(): - return logging.getLogger(__name__) - -def create_fn_api_runner(): - from apache_beam.runners.portability.fn_api_runner import FnApiRunner - return FnApiRunner() - -def get_cloud_project(): - cmd = [ - 'gcloud', '-q', 'config', 'list', 'project', - '--format=value(core.project)' - ] - with open(os.devnull, 'w') as dev_null: - try: - res = subprocess.check_output(cmd, stderr=dev_null).strip() - if not res: - raise Exception( - '--cloud specified but no Google Cloud Platform ' - 'project found.\n' - 'Please specify your project name with the --project ' - 'flag or set a default project: ' - 'gcloud config set project YOUR_PROJECT_NAME' - ) - return res - except OSError as e: - if e.errno == errno.ENOENT: - raise Exception( - 'gcloud is not installed. The Google Cloud SDK is ' - 'necessary to communicate with the Cloud ML service. ' - 'Please install and set up gcloud.' - ) - raise - -def get_default_job_name(name, suffix=''): - from getpass import getuser - from time import gmtime, strftime - timestamp_str = strftime("%Y%m%d-%H%M%S", gmtime()) - return '%s-%s%s-%s' % (name or 'beamapp', getuser(), suffix or '', timestamp_str) - -def get_or_create_sciencebeam_gym_dist_path(): - import sys - import pkg_resources - - dist = pkg_resources.get_distribution("sciencebeam_gym") - sciencebeam_gym_path = dist.location - sciencebeam_gym_version = dist.version - subprocess.call([ - 'python', 'setup.py', 'sdist' - ], cwd=sciencebeam_gym_path, stdout=sys.stdout, stderr=sys.stderr) - sciencebeam_gym_dist_path = os.path.join( - sciencebeam_gym_path, - 'dist/sciencebeam_gym-%s.tar.gz' % sciencebeam_gym_version - ) - return sciencebeam_gym_dist_path - -def process_sciencebeam_gym_dep_args(args): - """ - If in cloud mode, add local sciencebeam-gym dependency and build distribution. - That way we don't need to keep an updated public package available. - (the project may be re-structured by then) - """ - if args.cloud: - sciencebeam_gym_dist_path = get_or_create_sciencebeam_gym_dist_path() - get_logger().info('sciencebeam_gym_dist_path: %s', sciencebeam_gym_dist_path) - args.extra_package = sciencebeam_gym_dist_path - -def add_cloud_args(parser): - parser.add_argument( - '--cloud', - default=False, - action='store_true' - ) - parser.add_argument( - '--runner', - required=False, - default=None, - help='Runner.' - ) - parser.add_argument( - '--project', - type=str, - help='The cloud project name to be used for running this pipeline' - ) - parser.add_argument( - '--num_workers', - default=1, - type=int, - help='The number of workers.' - ) - parser.add_argument( - '--job_name', type=str, required=False, - help='The name of the cloud job' - ) - parser.add_argument( - '--job-name-suffix', type=str, required=False, - help='A suffix appended to the job name' - ) - -def process_cloud_args(parsed_args, output_path, name=None): - if parsed_args.num_workers: - parsed_args.autoscaling_algorithm = 'NONE' - parsed_args.max_num_workers = parsed_args.num_workers - parsed_args.setup_file = './setup.py' - - if parsed_args.cloud: - # Flags which need to be set for cloud runs. - default_values = { - 'project': - get_cloud_project(), - 'temp_location': - os.path.join(os.path.dirname(output_path), 'temp'), - 'runner': - 'DataflowRunner', - 'save_main_session': - True, - } - if not parsed_args.job_name: - parsed_args.job_name = get_default_job_name(name, parsed_args.job_name_suffix) - else: - # Flags which need to be set for local runs. - default_values = { - 'runner': 'DirectRunner', - } - - get_logger().info('default_values: %s', default_values) - for kk, vv in default_values.iteritems(): - if kk not in parsed_args or not vars(parsed_args)[kk]: - vars(parsed_args)[kk] = vv - - if parsed_args.runner == 'FnApiRunner': - parsed_args.runner = create_fn_api_runner() diff --git a/sciencebeam_gym/beam_utils/testing.py b/sciencebeam_gym/beam_utils/testing.py deleted file mode 100644 index 0e0307b..0000000 --- a/sciencebeam_gym/beam_utils/testing.py +++ /dev/null @@ -1,242 +0,0 @@ -from __future__ import absolute_import - -import logging -from contextlib import contextmanager -from io import BytesIO -from mock import patch, Mock, MagicMock -from mock.mock import MagicProxy -from abc import ABCMeta, abstractmethod - -import pytest - -from six import with_metaclass - -import apache_beam as beam -from apache_beam.coders.coders import ToStringCoder, StrUtf8Coder -from apache_beam.testing.test_pipeline import TestPipeline as _TestPipeline -from apache_beam.io.filesystem import FileMetadata, MatchResult, CompressionTypes -from apache_beam.io.filesystems import FileSystems -from apache_beam.metrics.metric import MetricsFilter - - -class TestPipeline(_TestPipeline): - __test__ = False - - def __init__(self, *args, **kwargs): - super(TestPipeline, self).__init__(*args, **kwargs) - self._pipeline_result = None - - def run(self): - # Make sure we're only running the pipeline once - if not self._pipeline_result: - self._pipeline_result = super(TestPipeline, self).run() - return self._pipeline_result - - -_local = {} - -def get_logger(): - return logging.getLogger(__name__) - -class TestContext(object): - def __init__(self): - self.file_content_map = dict() - self.object_map = dict() - - def set_file_content(self, name, content): - get_logger().debug('set_file_content: %s (size: %d)', name, len(content)) - self.file_content_map[name] = content - - def get_file_content(self, name): - return self.file_content_map.get(name) - -def get_current_test_context(): - return _local['test_context'] - -# Apache Beam serialises everything, pretend Mocks being serialised -def unpickle_mock(state): - # get_logger().debug('unpickle mock: state=%s', state) - obj_id = state[0] if isinstance(state, tuple) else state - obj = get_current_test_context().object_map[obj_id] - return obj - -unpickle_mock.__safe_for_unpickling__ = True - -def mock_reduce(obj): - obj_id = id(obj) - # get_logger().debug('pickle mock, obj_id: %s', obj_id) - get_current_test_context().object_map[obj_id] = obj - return unpickle_mock, (obj_id,) - -for c in [Mock, MagicMock, MagicProxy]: - c.__reduce__ = mock_reduce - -@pytest.mark.filterwarnings('ignore::DeprecationWarning') -@pytest.mark.filterwarnings('ignore::UserWarning') -class BeamTest(object): - @pytest.fixture(name='test_context', autouse=True) - def init_test_context(self): - get_logger().debug('setting up test context') - test_context = TestContext() - _local['test_context'] = test_context - yield test_context - get_logger().debug('clearing test context') - del _local['test_context'] - -class MockWriteToText(beam.PTransform): - class WriteDoFn(beam.DoFn): - def __init__(self, file_path_prefix, - file_name_suffix='', - coder=ToStringCoder(), - header=None): - - self.filename = file_path_prefix + file_name_suffix - self.file_obj = None - self.coder = coder - self.header = header - - def start_bundle(self): - assert self.filename - self.file_obj = BytesIO() - if self.header: - self.file_obj.write(self.coder.encode(self.header) + '\n') - - def process(self, element): - assert self.file_obj - self.file_obj.write(self.coder.encode(element) + '\n') - - def finish_bundle(self): - assert self.file_obj - self.file_obj.flush() - file_content = self.file_obj.getvalue() - get_logger().debug('file content: %s: %s', self.filename, file_content) - test_context = get_current_test_context() - test_context.set_file_content(self.filename, file_content) - self.file_obj.close() - - def __init__(self, *args, **kwargs): - self._sink = MockWriteToText.WriteDoFn(*args, **kwargs) - - def expand(self, pcoll): - return pcoll | 'MockWriteToText' >> beam.ParDo(self._sink) - -def MockReadFromText( - file_pattern=None, - coder=StrUtf8Coder(), - skip_header_lines=0): - - file_content = get_current_test_context().get_file_content(file_pattern) - if file_content is None: - raise RuntimeError('no file content set for %s' % file_pattern) - lines = file_content.replace('\r\n', '\n').split('\n') - if skip_header_lines: - lines = lines[skip_header_lines:] - return 'MockReadFromText' >> beam.Create( - [ - coder.decode(line) - for line in lines - ] - ) - -class MockFileBasedSource(beam.io.filebasedsource.FileBasedSource): - def open_file(self, file_name): - file_content = get_current_test_context().get_file_content(file_name) - if file_content is None: - raise RuntimeError('no file content set for %s' % file_name) - return BytesIO(file_content) - -class AbstractFileSystem(with_metaclass(ABCMeta, object)): - @abstractmethod - def open( - self, path, mime_type='application/octet-stream', - compression_type=CompressionTypes.AUTO): - pass - - @abstractmethod - def create( - self, path, mime_type='application/octet-stream', - compression_type=CompressionTypes.AUTO): - pass - -class MockFileSystem(AbstractFileSystem): - @classmethod - def scheme(cls): - return 'mock' - - def match(self, patterns, limits=None): - test_context = get_current_test_context() - file_content_map = test_context.file_content_map - all_files = file_content_map.keys() - if limits is None: - limits = [None] * len(patterns) - results = [] - for pattern, limit in zip(patterns, limits): - files = all_files[:limit] - metadata = [ - FileMetadata(f, len(file_content_map[f])) - for f in files - ] - results.append(MatchResult(pattern, metadata)) - return results - - def open( - self, path, mime_type='application/octet-stream', - compression_type=CompressionTypes.AUTO): - - file_content = get_current_test_context().get_file_content(path) - if file_content is None: - raise RuntimeError('no file content set for %s' % path) - return BytesIO(file_content) - - def create( - self, path, mime_type='application/octet-stream', - compression_type=CompressionTypes.AUTO): - - out = BytesIO() - out.close = lambda: ( - get_current_test_context() - .set_file_content(path, out.getvalue()) - ) - return out - - def rename(self, source_file_names, destination_file_names): - test_context = get_current_test_context() - file_content_map = test_context.file_content_map - for source_file_name, destination_file_name in zip(source_file_names, destination_file_names): - get_logger().debug('renaming %s to %s', source_file_name, destination_file_name) - if source_file_name not in file_content_map: - raise IOError('mock file does not exist: %s' % source_file_name) - if destination_file_name in file_content_map: - raise IOError('mock file already exists: %s' % destination_file_name) - file_content_map[destination_file_name] = file_content_map[source_file_name] - del file_content_map[source_file_name] - - def mkdirs(self, path): - get_logger().debug('mkdirs: %s (no-op)', path) - -def mock_get_filesystem(*_): - return MockFileSystem() - -@contextmanager -def patch_beam_io(): - with patch.object(FileSystems, 'get_filesystem', classmethod(mock_get_filesystem)): - yield - -def get_counter_values(pipeline_result, names, wait_until_finish=True): - if wait_until_finish: - pipeline_result.wait_until_finish() - counter_values = dict() - for name in names: - counter = pipeline_result.metrics().query( - MetricsFilter().with_name(name) - )['counters'] - assert len(counter) <= 1 - if len(counter) == 1: - counter_values[name] = counter[0].committed - return counter_values - -def get_counter_value(pipeline_result, name, default_value=None, wait_until_finish=True): - counter_values = get_counter_values( - pipeline_result, [name], wait_until_finish=wait_until_finish - ) - return counter_values.get(name, default_value) diff --git a/sciencebeam_gym/beam_utils/utils.py b/sciencebeam_gym/beam_utils/utils.py deleted file mode 100644 index 5b16af0..0000000 --- a/sciencebeam_gym/beam_utils/utils.py +++ /dev/null @@ -1,107 +0,0 @@ -import logging -from random import getrandbits - -import apache_beam as beam -from apache_beam.metrics.metric import Metrics - -def get_logger(): - return logging.getLogger(__name__) - -def Spy(f): - def spy_wrapper(x): - f(x) - return x - return spy_wrapper - -def MapSpy(f): - return beam.Map(Spy(f)) - -def MapOrLog(fn, log_fn=None, error_count=None): - if log_fn is None: - log_fn = lambda e, x: ( - get_logger().warning( - 'caught exception (ignoring item): %s, input: %.100s...', - e, x, exc_info=e - ) - ) - error_counter = ( - Metrics.counter('MapOrLog', error_count) - if error_count - else None - ) - def wrapper(x): - try: - yield fn(x) - except Exception as e: - if error_counter: - error_counter.inc() - log_fn(e, x) - return beam.FlatMap(wrapper) - -LEVEL_MAP = { - 'info': logging.INFO, - 'debug': logging.DEBUG -} - -def Count(name, counter_value_fn): - counter = Metrics.counter('Count', name) - def wrapper(x): - counter.inc(counter_value_fn(x) if counter_value_fn else 1) - return x - return name >> beam.Map(wrapper) - -class GroupTransforms(beam.PTransform): - """ - Convenience method to allow a PTransform for grouping purpose - to be defined using a lambda function. - (Completely unrelated to GroupBy transforms) - """ - def __init__(self, expand_fn): - super(GroupTransforms, self).__init__() - self.expand_fn = expand_fn - - def expand(self, pcoll): # pylint: disable=W0221 - return self.expand_fn(pcoll) - -def TransformAndCount(transform, counter_name, counter_value_fn=None): - return GroupTransforms(lambda pcoll: ( - pcoll | - transform | - "Count" >> Count(counter_name, counter_value_fn) - )) - -def TransformAndLog(transform, log_fn=None, log_prefix='', log_value_fn=None, log_level='info'): - if log_fn is None: - if log_value_fn is None: - log_value_fn = lambda x: x - log_level = LEVEL_MAP.get(log_level, log_level) - log_fn = lambda x: get_logger().log( - log_level, '%s%.50s...', log_prefix, log_value_fn(x) - ) - - return GroupTransforms(lambda pcoll: ( - pcoll | - transform | - "Log" >> MapSpy(log_fn) - )) - -def random_key(): - return getrandbits(32) - -def PreventFusion(key_fn=None, name="PreventFusion"): - """ - Prevents fusion to allow better distribution across workers. - - See: - https://cloud.google.com/dataflow/service/dataflow-service-desc#preventing-fusion - - TODO Replace by: https://github.com/apache/beam/pull/4040 - """ - if key_fn is None: - key_fn = lambda _: random_key() - return name >> GroupTransforms(lambda pcoll: ( - pcoll | - "AddKey" >> beam.Map(lambda x: (key_fn(x), x)) | - "GroupByKey" >> beam.GroupByKey() | - "Ungroup" >> beam.FlatMap(lambda element: element[1]) - )) diff --git a/sciencebeam_gym/beam_utils/utils_test.py b/sciencebeam_gym/beam_utils/utils_test.py deleted file mode 100644 index f5804a5..0000000 --- a/sciencebeam_gym/beam_utils/utils_test.py +++ /dev/null @@ -1,152 +0,0 @@ -import logging - -import pytest - -import apache_beam as beam -from apache_beam.testing.util import ( - assert_that, - equal_to -) - -from sciencebeam_gym.beam_utils.testing import ( - BeamTest, - TestPipeline, - get_counter_value -) - -from sciencebeam_gym.beam_utils.utils import ( - MapOrLog, - TransformAndLog, - TransformAndCount, - PreventFusion -) - -SOME_VALUE_1 = 'value 1' -SOME_VALUE_2 = 'value 2' -SOME_VALUE_CAUSING_EXCEPTION = 1 - -SOME_FN = lambda x: x.upper() -def FN_RAISING_EXCEPTION(_): - raise RuntimeError('oh dear') - -ERROR_COUNT_METRIC_NAME = 'error_count' -COUNT_METRIC_NAME_1 = 'count_1' - -def get_logger(): - return logging.getLogger(__name__) - -def setup_module(): - logging.basicConfig(level='DEBUG') - -@pytest.mark.slow -class TestMapOrLog(BeamTest): - def test_should_pass_through_return_value_if_no_exception_was_raised(self): - fn = lambda x: x.upper() - with TestPipeline() as p: - result = ( - p | - beam.Create([SOME_VALUE_1]) | - MapOrLog(SOME_FN) - ) - assert_that(result, equal_to([fn(SOME_VALUE_1)])) - - def test_should_skip_entries_that_cause_an_exception(self): - with TestPipeline() as p: - result = ( - p | - beam.Create([SOME_VALUE_1]) | - MapOrLog(FN_RAISING_EXCEPTION) - ) - assert_that(result, equal_to([])) - - def test_should_not_increase_error_metric_counter_if_no_exception_raised(self): - with TestPipeline() as p: - _ = ( - p | - beam.Create([SOME_VALUE_1]) | - MapOrLog(FN_RAISING_EXCEPTION, error_count=ERROR_COUNT_METRIC_NAME) - ) - assert get_counter_value(p.run(), ERROR_COUNT_METRIC_NAME) == 1 - - def test_should_increase_error_metric_counter_if_exception_was_raised(self): - with TestPipeline() as p: - _ = ( - p | - beam.Create([SOME_VALUE_1]) | - MapOrLog(FN_RAISING_EXCEPTION, error_count=ERROR_COUNT_METRIC_NAME) - ) - assert get_counter_value(p.run(), ERROR_COUNT_METRIC_NAME) == 1 - -@pytest.mark.slow -class TestTransformAndCount(BeamTest): - def test_should_not_change_result(self): - with TestPipeline() as p: - result = ( - p | - beam.Create([SOME_VALUE_1.lower()]) | - TransformAndCount( - beam.Map(lambda x: x.upper()), - COUNT_METRIC_NAME_1 - ) - ) - assert_that(result, equal_to([SOME_VALUE_1.upper()])) - - def test_should_increase_count_per_item(self): - with TestPipeline() as p: - _ = ( - p | - beam.Create([SOME_VALUE_1, SOME_VALUE_2]) | - TransformAndCount( - beam.Map(lambda x: x), - COUNT_METRIC_NAME_1 - ) - ) - assert get_counter_value(p.run(), COUNT_METRIC_NAME_1) == 2 - - def test_should_increase_count_per_item_using_function(self): - with TestPipeline() as p: - _ = ( - p | - beam.Create([SOME_VALUE_1, SOME_VALUE_2]) | - TransformAndCount( - beam.Map(lambda x: x), - COUNT_METRIC_NAME_1, - lambda x: len(x) - ) - ) - assert get_counter_value(p.run(), COUNT_METRIC_NAME_1) == ( - len(SOME_VALUE_1) + len(SOME_VALUE_2) - ) - -@pytest.mark.slow -class TestTransformAndLog(BeamTest): - def test_should_not_change_result(self): - with TestPipeline() as p: - result = ( - p | - beam.Create([SOME_VALUE_1.lower()]) | - TransformAndLog( - beam.Map(lambda x: x.upper()) - ) - ) - assert_that(result, equal_to([SOME_VALUE_1.upper()])) - -@pytest.mark.slow -class TestPreventFusion(BeamTest): - def test_should_not_change_result_with_default_random_key(self): - with TestPipeline() as p: - result = ( - p | - beam.Create([SOME_VALUE_1, SOME_VALUE_2]) | - PreventFusion() - ) - assert_that(result, equal_to([SOME_VALUE_1, SOME_VALUE_2])) - - def test_should_not_change_result_with_constant_key(self): - with TestPipeline() as p: - result = ( - p | - beam.Create([SOME_VALUE_1, SOME_VALUE_2]) | - PreventFusion(lambda _: 1) - ) - assert_that(result, equal_to([SOME_VALUE_1, SOME_VALUE_2])) diff --git a/sciencebeam_gym/convert/conversion_pipeline.py b/sciencebeam_gym/convert/conversion_pipeline.py index ddafb5f..3a47ef3 100644 --- a/sciencebeam_gym/convert/conversion_pipeline.py +++ b/sciencebeam_gym/convert/conversion_pipeline.py @@ -12,34 +12,39 @@ from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions from lxml import etree -from sciencebeam_gym.utils.collection import ( - extend_dict, - remove_keys_from_dict -) - -from sciencebeam_gym.beam_utils.utils import ( +from sciencebeam_utils.beam_utils.utils import ( TransformAndCount, TransformAndLog, MapOrLog, PreventFusion ) -from sciencebeam_gym.beam_utils.files import ( +from sciencebeam_utils.beam_utils.files import ( ReadFileList, FindFiles ) -from sciencebeam_gym.beam_utils.io import ( +from sciencebeam_utils.beam_utils.io import ( read_all_from_path, save_file_content ) -from sciencebeam_gym.beam_utils.main import ( +from sciencebeam_utils.beam_utils.main import ( add_cloud_args, process_cloud_args, process_sciencebeam_gym_dep_args ) +from sciencebeam_utils.utils.collection import ( + extend_dict, + remove_keys_from_dict +) + +from sciencebeam_utils.utils.file_path import ( + join_if_relative_path, + get_output_file +) + from sciencebeam_gym.structured_document.structured_document_loader import ( load_structured_document ) @@ -49,12 +54,10 @@ from sciencebeam_gym.structured_document.lxml import ( ) from sciencebeam_gym.preprocess.preprocessing_utils import ( - join_if_relative_path, convert_pdf_bytes_to_lxml, parse_page_range, save_pages, - pdf_bytes_to_png_pages, - get_output_file + pdf_bytes_to_png_pages ) from sciencebeam_gym.inference_model.extract_to_xml import ( diff --git a/sciencebeam_gym/convert/conversion_pipeline_test.py b/sciencebeam_gym/convert/conversion_pipeline_test.py index aec25b1..4bf728d 100644 --- a/sciencebeam_gym/convert/conversion_pipeline_test.py +++ b/sciencebeam_gym/convert/conversion_pipeline_test.py @@ -5,7 +5,7 @@ import pytest import apache_beam as beam -from sciencebeam_gym.beam_utils.testing import ( +from sciencebeam_utils.beam_utils.testing import ( BeamTest, TestPipeline ) diff --git a/sciencebeam_gym/convert/grobid/grobid_service_wrapper.py b/sciencebeam_gym/convert/grobid/grobid_service_wrapper.py index f9e0c8e..3f47f37 100644 --- a/sciencebeam_gym/convert/grobid/grobid_service_wrapper.py +++ b/sciencebeam_gym/convert/grobid/grobid_service_wrapper.py @@ -10,8 +10,8 @@ from zipfile import ZipFile from shutil import rmtree from urllib import URLopener -from sciencebeam_gym.utils.io import makedirs -from sciencebeam_gym.utils.zip import extract_all_with_executable_permission +from sciencebeam_utils.utils.io import makedirs +from sciencebeam_utils.utils.zip import extract_all_with_executable_permission def get_logger(): return logging.getLogger(__name__) diff --git a/sciencebeam_gym/inference_model/__init___test.py b/sciencebeam_gym/inference_model/__init___test.py index 873d5e0..9eb117f 100644 --- a/sciencebeam_gym/inference_model/__init___test.py +++ b/sciencebeam_gym/inference_model/__init___test.py @@ -5,7 +5,7 @@ from shutil import rmtree import tensorflow as tf import numpy as np -from sciencebeam_gym.utils.num import ( +from sciencebeam_utils.utils.num import ( assert_all_close ) diff --git a/sciencebeam_gym/inference_model/annotate_using_predictions.py b/sciencebeam_gym/inference_model/annotate_using_predictions.py index f2f0702..d3af139 100644 --- a/sciencebeam_gym/inference_model/annotate_using_predictions.py +++ b/sciencebeam_gym/inference_model/annotate_using_predictions.py @@ -7,7 +7,7 @@ from io import BytesIO import numpy as np from PIL import Image -from sciencebeam_gym.beam_utils.io import ( +from sciencebeam_utils.beam_utils.io import ( read_all_from_path ) diff --git a/sciencebeam_gym/inference_model/extract_to_xml.py b/sciencebeam_gym/inference_model/extract_to_xml.py index 7436434..fba57f7 100644 --- a/sciencebeam_gym/inference_model/extract_to_xml.py +++ b/sciencebeam_gym/inference_model/extract_to_xml.py @@ -4,7 +4,7 @@ import logging from lxml import etree from lxml.builder import E -from sciencebeam_gym.beam_utils.io import ( +from sciencebeam_utils.beam_utils.io import ( save_file_content ) diff --git a/sciencebeam_gym/inference_model/extract_to_xml_test.py b/sciencebeam_gym/inference_model/extract_to_xml_test.py index 223a905..dad436a 100644 --- a/sciencebeam_gym/inference_model/extract_to_xml_test.py +++ b/sciencebeam_gym/inference_model/extract_to_xml_test.py @@ -5,7 +5,7 @@ from backports.tempfile import TemporaryDirectory from lxml import etree from lxml.builder import E -from sciencebeam_gym.utils.xml import ( +from sciencebeam_utils.utils.xml import ( get_text_content, get_text_content_list ) diff --git a/sciencebeam_gym/models/text/crf/crfsuite_training_pipeline.py b/sciencebeam_gym/models/text/crf/crfsuite_training_pipeline.py index bbe3307..50d7c80 100644 --- a/sciencebeam_gym/models/text/crf/crfsuite_training_pipeline.py +++ b/sciencebeam_gym/models/text/crf/crfsuite_training_pipeline.py @@ -8,11 +8,15 @@ from six import raise_from from tqdm import tqdm -from sciencebeam_gym.utils.stopwatch import ( +from sciencebeam_utils.beam_utils.io import ( + save_file_content +) + +from sciencebeam_utils.utils.stopwatch import ( StopWatchRecorder ) -from sciencebeam_gym.utils.file_list import ( +from sciencebeam_utils.utils.file_list import ( load_file_list ) @@ -36,10 +40,6 @@ from sciencebeam_gym.models.text.crf.crfsuite_model import ( CrfSuiteModel ) -from sciencebeam_gym.beam_utils.io import ( - save_file_content -) - def get_logger(): return logging.getLogger(__name__) diff --git a/sciencebeam_gym/models/text/crf/crfsuite_training_pipeline_test.py b/sciencebeam_gym/models/text/crf/crfsuite_training_pipeline_test.py index 79e7e19..24cd23e 100644 --- a/sciencebeam_gym/models/text/crf/crfsuite_training_pipeline_test.py +++ b/sciencebeam_gym/models/text/crf/crfsuite_training_pipeline_test.py @@ -2,7 +2,7 @@ from mock import patch, Mock, ANY import pytest -from sciencebeam_gym.utils.collection import ( +from sciencebeam_utils.utils.collection import ( to_namedtuple ) diff --git a/sciencebeam_gym/pdf/pdf_to_lxml_wrapper.py b/sciencebeam_gym/pdf/pdf_to_lxml_wrapper.py index 14bd082..0c6fadb 100644 --- a/sciencebeam_gym/pdf/pdf_to_lxml_wrapper.py +++ b/sciencebeam_gym/pdf/pdf_to_lxml_wrapper.py @@ -7,8 +7,8 @@ from shutil import rmtree from urllib import URLopener from tempfile import NamedTemporaryFile -from sciencebeam_gym.utils.io import makedirs -from sciencebeam_gym.utils.zip import extract_all_with_executable_permission +from sciencebeam_utils.utils.io import makedirs +from sciencebeam_utils.utils.zip import extract_all_with_executable_permission def get_logger(): return logging.getLogger(__name__) diff --git a/sciencebeam_gym/preprocess/annotation/annotation_evaluation.py b/sciencebeam_gym/preprocess/annotation/annotation_evaluation.py index eccb7d2..5ba16ce 100644 --- a/sciencebeam_gym/preprocess/annotation/annotation_evaluation.py +++ b/sciencebeam_gym/preprocess/annotation/annotation_evaluation.py @@ -4,7 +4,7 @@ from collections import Counter from six import iteritems -from sciencebeam_gym.utils.collection import ( +from sciencebeam_utils.utils.collection import ( flatten ) diff --git a/sciencebeam_gym/preprocess/annotation/find_line_numbers_test.py b/sciencebeam_gym/preprocess/annotation/find_line_numbers_test.py index 32688bf..367ed0b 100644 --- a/sciencebeam_gym/preprocess/annotation/find_line_numbers_test.py +++ b/sciencebeam_gym/preprocess/annotation/find_line_numbers_test.py @@ -1,4 +1,4 @@ -from sciencebeam_gym.utils.collection import ( +from sciencebeam_utils.utils.collection import ( flatten ) diff --git a/sciencebeam_gym/preprocess/annotation/fuzzy_match.py b/sciencebeam_gym/preprocess/annotation/fuzzy_match.py index 7f4f595..aa80315 100644 --- a/sciencebeam_gym/preprocess/annotation/fuzzy_match.py +++ b/sciencebeam_gym/preprocess/annotation/fuzzy_match.py @@ -2,16 +2,16 @@ from __future__ import division import logging -from sciencebeam_gym.utils.string import ( +from sciencebeam_utils.utils.string import ( LazyStr ) -from sciencebeam_gym.alignment.align import ( +from sciencebeam_alignment.align import ( LocalSequenceMatcher, SimpleScoring ) -from sciencebeam_gym.alignment.WordSequenceMatcher import ( +from sciencebeam_alignment.word_sequence_matcher import ( WordSequenceMatcher ) diff --git a/sciencebeam_gym/preprocess/annotation/matching_annotator.py b/sciencebeam_gym/preprocess/annotation/matching_annotator.py index b148828..564d45c 100644 --- a/sciencebeam_gym/preprocess/annotation/matching_annotator.py +++ b/sciencebeam_gym/preprocess/annotation/matching_annotator.py @@ -7,20 +7,20 @@ from itertools import tee, islice from six.moves import zip_longest -from sciencebeam_gym.utils.compat import ( +from sciencebeam_utils.utils.compat import ( python_2_unicode_compatible ) -from sciencebeam_gym.utils.csv import ( +from sciencebeam_utils.utils.csv import ( csv_delimiter_by_filename, write_csv_row ) -from sciencebeam_gym.utils.string import ( +from sciencebeam_utils.utils.string import ( LazyStr ) -from sciencebeam_gym.utils.collection import ( +from sciencebeam_utils.utils.collection import ( iter_flatten, extract_from_dict ) diff --git a/sciencebeam_gym/preprocess/annotation/matching_annotator_test.py b/sciencebeam_gym/preprocess/annotation/matching_annotator_test.py index 0bfb597..76fb765 100644 --- a/sciencebeam_gym/preprocess/annotation/matching_annotator_test.py +++ b/sciencebeam_gym/preprocess/annotation/matching_annotator_test.py @@ -2,6 +2,10 @@ from __future__ import division import logging +from sciencebeam_utils.utils.collection import ( + flatten +) + from sciencebeam_gym.structured_document import ( SimpleStructuredDocument, SimpleLine, @@ -21,10 +25,6 @@ from sciencebeam_gym.preprocess.annotation.matching_annotator import ( EM_DASH ) -from sciencebeam_gym.utils.collection import ( - flatten -) - TAG1 = 'tag1' TAG2 = 'tag2' diff --git a/sciencebeam_gym/preprocess/annotation/target_annotation.py b/sciencebeam_gym/preprocess/annotation/target_annotation.py index 073358a..b1ff43d 100644 --- a/sciencebeam_gym/preprocess/annotation/target_annotation.py +++ b/sciencebeam_gym/preprocess/annotation/target_annotation.py @@ -8,20 +8,20 @@ from six.moves.configparser import ConfigParser # pylint: disable=E0401 from lxml import etree -from sciencebeam_gym.utils.compat import ( +from sciencebeam_utils.utils.compat import ( python_2_unicode_compatible ) -from sciencebeam_gym.utils.string import ( +from sciencebeam_utils.utils.string import ( LazyStr ) -from sciencebeam_gym.utils.xml import ( +from sciencebeam_utils.utils.xml import ( get_text_content, get_immediate_text ) -from sciencebeam_gym.utils.collection import ( +from sciencebeam_utils.utils.collection import ( filter_truthy, strip_all ) diff --git a/sciencebeam_gym/preprocess/check_file_list.py b/sciencebeam_gym/preprocess/check_file_list.py deleted file mode 100644 index 79e21be..0000000 --- a/sciencebeam_gym/preprocess/check_file_list.py +++ /dev/null @@ -1,85 +0,0 @@ -from __future__ import division - -import argparse -import logging -from concurrent.futures import ThreadPoolExecutor - -from apache_beam.io.filesystems import FileSystems - -from sciencebeam_gym.utils.file_list import ( - load_file_list -) - -def get_logger(): - return logging.getLogger(__name__) - -def parse_args(argv=None): - parser = argparse.ArgumentParser( - 'Check file list' - ) - - source = parser.add_argument_group('source') - source.add_argument( - '--file-list', type=str, required=True, - help='path to source file list (tsv/csv/lst)' - ) - source.add_argument( - '--file-column', type=str, required=False, - default='url', - help='csv/tsv column (ignored for plain file list)' - ) - - parser.add_argument( - '--limit', type=int, required=False, - help='limit the files to process' - ) - - parser.add_argument( - '--debug', action='store_true', default=False, - help='enable debug output' - ) - return parser.parse_args(argv) - -def map_file_list_to_file_exists(file_list): - with ThreadPoolExecutor(max_workers=50) as executor: - return list(executor.map(FileSystems.exists, file_list)) - -def format_file_exists_results(file_exists): - if not file_exists: - return 'empty file list' - file_exists_count = sum(file_exists) - file_missing_count = len(file_exists) - file_exists_count - return ( - 'files exist: %d (%.0f%%), files missing: %d (%.0f%%)' % - ( - file_exists_count, 100.0 * file_exists_count / len(file_exists), - file_missing_count, 100.0 * file_missing_count / len(file_exists) - ) - ) - -def check_files_and_report_result(file_list): - file_exists = map_file_list_to_file_exists(file_list) - get_logger().info('%s', format_file_exists_results(file_exists)) - assert sum(file_exists) > 0 - -def run(opt): - file_list = load_file_list( - opt.file_list, - column=opt.file_column, - limit=opt.limit - ) - check_files_and_report_result(file_list) - -def main(argv=None): - args = parse_args(argv) - - if args.debug: - logging.getLogger().setLevel('DEBUG') - - run(args) - -if __name__ == '__main__': - logging.basicConfig(level='INFO') - logging.getLogger('oauth2client').setLevel('WARNING') - - main() diff --git a/sciencebeam_gym/preprocess/check_file_list_test.py b/sciencebeam_gym/preprocess/check_file_list_test.py deleted file mode 100644 index 775769b..0000000 --- a/sciencebeam_gym/preprocess/check_file_list_test.py +++ /dev/null @@ -1,58 +0,0 @@ -from mock import patch - -import pytest - -import sciencebeam_gym.preprocess.check_file_list as check_file_list_module -from sciencebeam_gym.preprocess.check_file_list import ( - map_file_list_to_file_exists, - format_file_exists_results, - check_files_and_report_result -) - -FILE_1 = 'file1' -FILE_2 = 'file2' - -class TestMapFileListToFileExists(object): - def test_should_return_single_file_exists(self): - m = check_file_list_module - with patch.object(m, 'FileSystems') as FileSystems: - assert map_file_list_to_file_exists( - [FILE_1] - ) == [FileSystems.exists.return_value] - FileSystems.exists.assert_called_with(FILE_1) - -class TestFormatFileExistsResults(object): - def test_should_format_no_files(self): - assert ( - format_file_exists_results([]) == - 'empty file list' - ) - - def test_should_format_all_files_exist(self): - assert ( - format_file_exists_results([True, True]) == - 'files exist: 2 (100%), files missing: 0 (0%)' - ) - - def test_should_format_files_partially_exist(self): - assert ( - format_file_exists_results([True, False]) == - 'files exist: 1 (50%), files missing: 1 (50%)' - ) - -class TestCheckFileListAndReportResults(object): - def test_should_pass_file_list_to_format(self): - m = check_file_list_module - with patch.object(m, 'map_file_list_to_file_exists') as map_file_list_to_file_exists_mock: - with patch.object(m, 'format_file_exists_results') as format_file_exists_results_mock: - map_file_list_to_file_exists_mock.return_value = [True, True] - check_files_and_report_result([FILE_1, FILE_2]) - map_file_list_to_file_exists_mock.assert_called_with([FILE_1, FILE_2]) - format_file_exists_results_mock.assert_called_with([True, True]) - - def test_should_raise_error_if_none_of_the_files_were_found(self): - m = check_file_list_module - with patch.object(m, 'map_file_list_to_file_exists') as map_file_list_to_file_exists_mock: - with pytest.raises(AssertionError): - map_file_list_to_file_exists_mock.return_value = [False, False] - check_files_and_report_result([FILE_1, FILE_2]) diff --git a/sciencebeam_gym/preprocess/find_file_pairs.py b/sciencebeam_gym/preprocess/find_file_pairs.py deleted file mode 100644 index ff727d1..0000000 --- a/sciencebeam_gym/preprocess/find_file_pairs.py +++ /dev/null @@ -1,93 +0,0 @@ -import argparse -import csv -import logging - -from apache_beam.io.filesystems import FileSystems - -from sciencebeam_gym.utils.csv import ( - csv_delimiter_by_filename, - write_csv_rows -) - -from sciencebeam_gym.beam_utils.io import ( - dirname, - mkdirs_if_not_exists -) - -from sciencebeam_gym.utils.file_path import ( - join_if_relative_path, - relative_path -) - -from sciencebeam_gym.preprocess.preprocessing_utils import ( - find_file_pairs_grouped_by_parent_directory_or_name -) - -def get_logger(): - return logging.getLogger(__name__) - -def parse_args(argv=None): - parser = argparse.ArgumentParser() - parser.add_argument( - '--data-path', type=str, required=True, - help='base data path' - ) - parser.add_argument( - '--source-pattern', type=str, required=True, - help='source pattern' - ) - parser.add_argument( - '--xml-pattern', type=str, required=True, - help='xml pattern' - ) - parser.add_argument( - '--out', type=str, required=True, - help='output csv/tsv file' - ) - - parser.add_argument( - '--use-relative-paths', action='store_true', - help='create a file list with relative paths (relative to the data path)' - ) - - return parser.parse_args(argv) - - -def save_file_pairs_to_csv(output_path, source_xml_pairs): - mkdirs_if_not_exists(dirname(output_path)) - delimiter = csv_delimiter_by_filename(output_path) - mime_type = 'text/tsv' if delimiter == '\t' else 'text/csv' - with FileSystems.create(output_path, mime_type=mime_type) as f: - writer = csv.writer(f, delimiter=delimiter) - write_csv_rows(writer, [['source_url', 'xml_url']]) - write_csv_rows(writer, source_xml_pairs) - get_logger().info('written results to %s', output_path) - -def to_relative_file_pairs(base_path, file_pairs): - return ( - (relative_path(base_path, source_url), relative_path(base_path, xml_url)) - for source_url, xml_url in file_pairs - ) - -def run(args): - get_logger().info('finding file pairs') - source_xml_pairs = find_file_pairs_grouped_by_parent_directory_or_name([ - join_if_relative_path(args.data_path, args.source_pattern), - join_if_relative_path(args.data_path, args.xml_pattern) - ]) - - if args.use_relative_paths: - source_xml_pairs = to_relative_file_pairs(args.data_path, source_xml_pairs) - - source_xml_pairs = list(source_xml_pairs) - - save_file_pairs_to_csv(args.out, source_xml_pairs) - -def main(argv=None): - args = parse_args(argv) - run(args) - -if __name__ == '__main__': - logging.basicConfig(level='INFO') - - main() diff --git a/sciencebeam_gym/preprocess/find_file_pairs_test.py b/sciencebeam_gym/preprocess/find_file_pairs_test.py deleted file mode 100644 index 4aaee56..0000000 --- a/sciencebeam_gym/preprocess/find_file_pairs_test.py +++ /dev/null @@ -1,152 +0,0 @@ -import logging -import os -from mock import patch - -import pytest - -import sciencebeam_gym.preprocess.find_file_pairs as find_file_pairs -from sciencebeam_gym.preprocess.find_file_pairs import ( - to_relative_file_pairs, - run, - parse_args, - main -) - - -LOGGER = logging.getLogger(__name__) - -BASE_SOURCE_PATH = '/source' - -PDF_FILE_1 = BASE_SOURCE_PATH + '/file1.pdf' -XML_FILE_1 = BASE_SOURCE_PATH + '/file1.xml' -PDF_FILE_2 = BASE_SOURCE_PATH + '/file2.pdf' -XML_FILE_2 = BASE_SOURCE_PATH + '/file2.xml' - -SOURCE_PATTERN = '*.pdf' -XML_PATTERN = '*.xml' -OUTPUT_FILE = 'file-list.tsv' - -SOME_ARGV = [ - '--data-path=%s' % BASE_SOURCE_PATH, - '--source-pattern=%s' % SOURCE_PATTERN, - '--xml-pattern=%s' % XML_PATTERN, - '--out=%s' % OUTPUT_FILE -] - -@pytest.fixture(name='to_relative_file_pairs_mock') -def _to_relative_file_pairs(): - with patch.object(find_file_pairs, 'to_relative_file_pairs') as m: - yield m - -@pytest.fixture(name='find_file_pairs_grouped_by_parent_directory_or_name_mock') -def _find_file_pairs_grouped_by_parent_directory_or_name(): - with patch.object(find_file_pairs, 'find_file_pairs_grouped_by_parent_directory_or_name') as m: - m.return_value = [ - (PDF_FILE_1, XML_FILE_1), - (PDF_FILE_2, XML_FILE_2) - ] - yield m - -@pytest.fixture(name='save_file_pairs_to_csv_mock') -def _save_file_pairs_to_csv(): - with patch.object(find_file_pairs, 'save_file_pairs_to_csv') as m: - yield m - -@pytest.fixture(name='save_file_pairs_to_csv_mock') -def _save_file_pairs_to_csv(): - with patch.object(find_file_pairs, 'save_file_pairs_to_csv') as m: - yield m - -@pytest.fixture(name='parse_args_mock') -def _parse_args(): - with patch.object(find_file_pairs, 'parse_args') as m: - yield m - -@pytest.fixture(name='run_mock') -def _run(): - with patch.object(find_file_pairs, 'run') as m: - yield m - -def _touch(path): - path.write(b'', ensure=True) - return path - -@pytest.fixture(name='pdf_file_1') -def _pdf_file_1(tmpdir): - return _touch(tmpdir.join(PDF_FILE_1)) - -@pytest.fixture(name='xml_file_1') -def _xml_file_1(tmpdir): - return _touch(tmpdir.join(XML_FILE_1)) - -@pytest.fixture(name='data_path') -def _data_path(tmpdir): - return tmpdir.join(BASE_SOURCE_PATH) - -@pytest.fixture(name='out_file') -def _out_file(tmpdir): - return tmpdir.join(OUTPUT_FILE) - -class TestToRelativeFilePairs(object): - def test_should_make_paths_relative(self): - assert list(to_relative_file_pairs( - '/parent', - [('/parent/sub/file1', '/parent/sub/file2')] - )) == [('sub/file1', 'sub/file2')] - -class TestRun(object): - def test_should_pass_around_parameters( - self, - find_file_pairs_grouped_by_parent_directory_or_name_mock, - save_file_pairs_to_csv_mock): - - opt = parse_args(SOME_ARGV) - run(opt) - find_file_pairs_grouped_by_parent_directory_or_name_mock.assert_called_with([ - os.path.join(BASE_SOURCE_PATH, SOURCE_PATTERN), - os.path.join(BASE_SOURCE_PATH, XML_PATTERN) - ]) - save_file_pairs_to_csv_mock.assert_called_with( - opt.out, - find_file_pairs_grouped_by_parent_directory_or_name_mock.return_value - ) - - def test_should_use_relative_paths_if_enabled( - self, - find_file_pairs_grouped_by_parent_directory_or_name_mock, - to_relative_file_pairs_mock, - save_file_pairs_to_csv_mock): - - opt = parse_args(SOME_ARGV) - opt.use_relative_paths = True - - to_relative_file_pairs_mock.return_value = [('file1.pdf', 'file1.xml')] - - run(opt) - to_relative_file_pairs_mock.assert_called_with( - BASE_SOURCE_PATH, - find_file_pairs_grouped_by_parent_directory_or_name_mock.return_value - ) - save_file_pairs_to_csv_mock.assert_called_with( - opt.out, - to_relative_file_pairs_mock.return_value - ) - - def test_should_generate_file_list(self, data_path, pdf_file_1, xml_file_1, out_file): - LOGGER.debug('pdf_file_1: %s, xml_file: %s', pdf_file_1, xml_file_1) - opt = parse_args(SOME_ARGV) - opt.data_path = str(data_path) - opt.out = str(out_file) - run(opt) - out_lines = [s.strip() for s in out_file.read().strip().split('\n')] - LOGGER.debug('out_lines: %s', out_lines) - assert out_lines == [ - 'source_url\txml_url', - '%s\t%s' % (pdf_file_1, xml_file_1) - ] - -class TestMain(object): - def test_should_parse_args_and_call_run(self, parse_args_mock, run_mock): - main(SOME_ARGV) - parse_args_mock.assert_called_with(SOME_ARGV) - run_mock.assert_called_with(parse_args_mock.return_value) diff --git a/sciencebeam_gym/preprocess/get_output_files.py b/sciencebeam_gym/preprocess/get_output_files.py deleted file mode 100644 index 73eae63..0000000 --- a/sciencebeam_gym/preprocess/get_output_files.py +++ /dev/null @@ -1,153 +0,0 @@ -import argparse -import logging - -from sciencebeam_gym.utils.file_list import ( - load_file_list, - save_file_list, - to_relative_file_list -) - -from sciencebeam_gym.utils.file_path import ( - join_if_relative_path -) - -from sciencebeam_gym.preprocess.preprocessing_utils import ( - get_or_validate_base_path, - get_output_file -) - -from sciencebeam_gym.preprocess.check_file_list import ( - check_files_and_report_result -) - -def get_logger(): - return logging.getLogger(__name__) - -def parse_args(argv=None): - parser = argparse.ArgumentParser( - 'Get output files based on source files and suffix.' - ) - - source = parser.add_argument_group('source') - source.add_argument( - '--source-file-list', type=str, required=True, - help='path to source file list (tsv/csv/lst)' - ) - source.add_argument( - '--source-file-column', type=str, required=False, - default='url', - help='csv/tsv column (ignored for plain file list)' - ) - source.add_argument( - '--source-base-path', type=str, required=False, - help='base data path for source file urls' - ) - - output = parser.add_argument_group('output') - output.add_argument( - '--output-file-list', type=str, required=True, - help='path to output file list (tsv/csv/lst)' - ) - output.add_argument( - '--output-file-column', type=str, required=False, - default='url', - help='csv/tsv column (ignored for plain file list)' - ) - output.add_argument( - '--output-file-suffix', type=str, required=False, - help='file suffix (will be added to source urls after removing ext)' - ) - output.add_argument( - '--output-base-path', type=str, required=False, - help='base output path (by default source base path with"-results" suffix)' - ) - output.add_argument( - '--use-relative-paths', action='store_true', - help='create a file list with relative paths (relative to the output data path)' - ) - - parser.add_argument( - '--limit', type=int, required=False, - help='limit the files to process' - ) - - parser.add_argument( - '--check', action='store_true', default=False, - help='check whether the output files exist' - ) - parser.add_argument( - '--check-limit', type=int, required=False, - help='limit the files to check' - ) - - parser.add_argument( - '--debug', action='store_true', default=False, - help='enable debug output' - ) - return parser.parse_args(argv) - -def get_output_file_list(file_list, source_base_path, output_base_path, output_file_suffix): - return [ - get_output_file(filename, source_base_path, output_base_path, output_file_suffix) - for filename in file_list - ] - -def run(opt): - source_file_list = load_file_list( - join_if_relative_path( - opt.source_base_path, - opt.source_file_list - ), - column=opt.source_file_column, - limit=opt.limit - ) - source_base_path = get_or_validate_base_path( - source_file_list, opt.source_base_path - ) - - target_file_list = get_output_file_list( - source_file_list, source_base_path, opt.output_base_path, opt.output_file_suffix - ) - - if opt.check: - check_file_list = ( - target_file_list[:opt.check_limit] if opt.check_limit - else target_file_list - ) - get_logger().info( - 'checking %d (out of %d) files...', - len(check_file_list), len(target_file_list) - ) - check_files_and_report_result(check_file_list) - - if opt.use_relative_paths: - target_file_list = to_relative_file_list(opt.output_base_path, target_file_list) - - get_logger().info( - 'saving file list (with %d files) to: %s', - len(target_file_list), opt.output_file_list - ) - save_file_list( - opt.output_file_list, - target_file_list, - column=opt.output_file_column - ) - -def process_args(args): - if not args.output_base_path: - args.output_base_path = args.source_base_path + '-results' - -def main(argv=None): - args = parse_args(argv) - process_args(args) - - if args.debug: - logging.getLogger().setLevel('DEBUG') - - run(args) - -if __name__ == '__main__': - logging.basicConfig(level='INFO') - logging.getLogger('oauth2client').setLevel('WARNING') - - main() diff --git a/sciencebeam_gym/preprocess/get_output_files_test.py b/sciencebeam_gym/preprocess/get_output_files_test.py deleted file mode 100644 index 41f665a..0000000 --- a/sciencebeam_gym/preprocess/get_output_files_test.py +++ /dev/null @@ -1,181 +0,0 @@ -import os -from mock import patch, ANY - -import pytest - -import sciencebeam_gym.preprocess.get_output_files as get_output_files -from sciencebeam_gym.preprocess.get_output_files import ( - get_output_file_list, - run, - parse_args, - main -) - -SOME_ARGV = [ - '--source-file-list=source.csv', - '--output-file-list=output.csv', - '--limit=10' -] - -BASE_SOURCE_PATH = '/source' - -FILE_1 = BASE_SOURCE_PATH + '/file1' -FILE_2 = BASE_SOURCE_PATH + '/file2' - - -@pytest.fixture(name='load_file_list_mock') -def _load_file_list(): - with patch.object(get_output_files, 'load_file_list') as m: - m.return_value = [FILE_1, FILE_2] - yield m - -@pytest.fixture(name='get_output_file_list_mock') -def _get_output_file_list(): - with patch.object(get_output_files, 'get_output_file_list') as m: - yield m - -@pytest.fixture(name='save_file_list_mock') -def _save_file_list(): - with patch.object(get_output_files, 'save_file_list') as m: - yield m - -@pytest.fixture(name='check_files_and_report_result_mock') -def _check_files_and_report_result(): - with patch.object(get_output_files, 'check_files_and_report_result') as m: - yield m - -@pytest.fixture(name='to_relative_file_list_mock') -def _to_relative_file_list(): - with patch.object(get_output_files, 'to_relative_file_list') as m: - yield m - -class TestGetOutputFileList(object): - def test_should_return_output_file_with_path_and_change_ext(self): - assert get_output_file_list( - ['/source/path/file.pdf'], - '/source', - '/output', - '.xml' - ) == ['/output/path/file.xml'] - -@pytest.mark.usefixtures( - "load_file_list_mock", "get_output_file_list_mock", "save_file_list_mock", - "to_relative_file_list_mock" -) -class TestRun(object): - def test_should_pass_around_parameters( - self, - load_file_list_mock, - get_output_file_list_mock, - save_file_list_mock): - - load_file_list_mock.return_value = [FILE_1, FILE_2] - opt = parse_args(SOME_ARGV) - run(opt) - load_file_list_mock.assert_called_with( - opt.source_file_list, - column=opt.source_file_column, - limit=opt.limit - ) - get_output_file_list_mock.assert_called_with( - load_file_list_mock.return_value, - BASE_SOURCE_PATH, - opt.output_base_path, - opt.output_file_suffix - ) - save_file_list_mock.assert_called_with( - opt.output_file_list, - get_output_file_list_mock.return_value, - column=opt.source_file_column - ) - - def test_should_make_file_list_absolute_if_it_is_relative( - self, - load_file_list_mock): - - opt = parse_args(SOME_ARGV) - opt.source_base_path = BASE_SOURCE_PATH - opt.source_file_list = 'source.tsv' - run(opt) - load_file_list_mock.assert_called_with( - os.path.join(opt.source_base_path, opt.source_file_list), - column=opt.source_file_column, - limit=opt.limit - ) - - def test_should_raise_error_if_source_path_is_invalid(self): - opt = parse_args(SOME_ARGV) - opt.source_base_path = '/other/path' - with pytest.raises(AssertionError): - run(opt) - - def test_should_use_passed_in_source_path_if_valid( - self, - get_output_file_list_mock, - load_file_list_mock): - - opt = parse_args(SOME_ARGV) - opt.source_base_path = '/base' - load_file_list_mock.return_value = ['/base/source/file1', '/base/source/file2'] - run(opt) - get_output_file_list_mock.assert_called_with( - ANY, - opt.source_base_path, - ANY, - ANY - ) - - def test_should_check_file_list_if_enabled( - self, - get_output_file_list_mock, - check_files_and_report_result_mock): - - opt = parse_args(SOME_ARGV) - opt.check = True - run(opt) - check_files_and_report_result_mock.assert_called_with( - get_output_file_list_mock.return_value - ) - - def test_should_limit_files_to_check( - self, - load_file_list_mock, - get_output_file_list_mock, - check_files_and_report_result_mock): - - opt = parse_args(SOME_ARGV) - opt.check = True - opt.check_limit = 1 - load_file_list_mock.return_value = [FILE_1, FILE_2] - run(opt) - check_files_and_report_result_mock.assert_called_with( - get_output_file_list_mock.return_value[:opt.check_limit] - ) - - def test_should_save_relative_paths_if_enabled( - self, - get_output_file_list_mock, - to_relative_file_list_mock, - save_file_list_mock): - - opt = parse_args(SOME_ARGV) - opt.use_relative_paths = True - run(opt) - to_relative_file_list_mock.assert_called_with( - opt.output_base_path, - get_output_file_list_mock.return_value, - ) - save_file_list_mock.assert_called_with( - opt.output_file_list, - to_relative_file_list_mock.return_value, - column=opt.source_file_column - ) - -class TestMain(object): - def test_should_parse_args_and_call_run(self): - m = get_output_files - with patch.object(m, 'parse_args') as parse_args_mock: - with patch.object(m, 'run') as run_mock: - main(SOME_ARGV) - parse_args_mock.assert_called_with(SOME_ARGV) - run_mock.assert_called_with(parse_args_mock.return_value) diff --git a/sciencebeam_gym/preprocess/lxml_to_svg.py b/sciencebeam_gym/preprocess/lxml_to_svg.py index 5cd417b..8186739 100644 --- a/sciencebeam_gym/preprocess/lxml_to_svg.py +++ b/sciencebeam_gym/preprocess/lxml_to_svg.py @@ -4,15 +4,15 @@ import os from lxml import etree -from sciencebeam_gym.utils.bounding_box import ( - BoundingBox -) - -from sciencebeam_gym.utils.csv import ( +from sciencebeam_utils.utils.csv import ( open_csv_output, write_dict_csv ) +from sciencebeam_gym.utils.bounding_box import ( + BoundingBox +) + from sciencebeam_gym.preprocess.annotation.annotator import ( Annotator, DEFAULT_ANNOTATORS diff --git a/sciencebeam_gym/preprocess/preprocessing_pipeline.py b/sciencebeam_gym/preprocess/preprocessing_pipeline.py index e1ce7ae..9f36244 100644 --- a/sciencebeam_gym/preprocess/preprocessing_pipeline.py +++ b/sciencebeam_gym/preprocess/preprocessing_pipeline.py @@ -9,39 +9,44 @@ import apache_beam as beam from apache_beam.io.filesystems import FileSystems from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions -from sciencebeam_gym.utils.collection import ( - extend_dict, - remove_keys_from_dict -) - -from sciencebeam_gym.utils.file_path import ( - relative_path, - join_if_relative_path -) - -from sciencebeam_gym.beam_utils.utils import ( +from sciencebeam_utils.beam_utils.utils import ( TransformAndCount, TransformAndLog, MapOrLog, PreventFusion ) -from sciencebeam_gym.beam_utils.csv import ( +from sciencebeam_utils.beam_utils.csv import ( WriteDictCsv, ReadDictCsv ) -from sciencebeam_gym.beam_utils.io import ( +from sciencebeam_utils.beam_utils.io import ( read_all_from_path, basename, save_file_content ) -from sciencebeam_gym.beam_utils.main import ( +from sciencebeam_utils.beam_utils.main import ( add_cloud_args, process_cloud_args ) +from sciencebeam_utils.utils.collection import ( + extend_dict, + remove_keys_from_dict +) + +from sciencebeam_utils.utils.file_path import ( + change_ext, + relative_path, + join_if_relative_path +) + +from sciencebeam_utils.utils.file_pairs import ( + find_file_pairs_grouped_by_parent_directory_or_name, +) + from sciencebeam_gym.structured_document.svg import ( SvgStructuredDocument ) @@ -61,8 +66,6 @@ from sciencebeam_gym.preprocess.annotation.annotation_evaluation import ( ) from sciencebeam_gym.preprocess.preprocessing_utils import ( - change_ext, - find_file_pairs_grouped_by_parent_directory_or_name, convert_pdf_bytes_to_lxml, convert_and_annotate_lxml_content, pdf_bytes_to_png_pages, diff --git a/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py b/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py index e42cf0b..f444ce7 100644 --- a/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py +++ b/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py @@ -6,21 +6,21 @@ import pytest import apache_beam as beam -from sciencebeam_gym.utils.collection import ( - extend_dict -) - -from sciencebeam_gym.beam_utils.utils import ( +from sciencebeam_utils.beam_utils.utils import ( TransformAndLog ) -from sciencebeam_gym.beam_utils.testing import ( +from sciencebeam_utils.beam_utils.testing import ( BeamTest, TestPipeline, get_current_test_context, get_counter_value ) +from sciencebeam_utils.utils.collection import ( + extend_dict +) + from sciencebeam_gym.preprocess.preprocessing_pipeline import ( parse_args, configure_pipeline, diff --git a/sciencebeam_gym/preprocess/preprocessing_transforms_test.py b/sciencebeam_gym/preprocess/preprocessing_transforms_test.py index c821ff6..732700c 100644 --- a/sciencebeam_gym/preprocess/preprocessing_transforms_test.py +++ b/sciencebeam_gym/preprocess/preprocessing_transforms_test.py @@ -3,19 +3,19 @@ import pytest import apache_beam as beam from apache_beam.io.filesystems import FileSystems -from sciencebeam_gym.beam_utils.io import ( +from sciencebeam_utils.beam_utils.io import ( find_matching_filenames ) -from sciencebeam_gym.utils.tfrecord import ( - iter_read_tfrecord_file_as_dict_list -) - -from sciencebeam_gym.beam_utils.testing import ( +from sciencebeam_utils.beam_utils.testing import ( BeamTest, TestPipeline ) +from sciencebeam_gym.utils.tfrecord import ( + iter_read_tfrecord_file_as_dict_list +) + from sciencebeam_gym.preprocess.preprocessing_transforms import ( WritePropsToTFRecord ) diff --git a/sciencebeam_gym/preprocess/preprocessing_utils.py b/sciencebeam_gym/preprocess/preprocessing_utils.py index 96d6ff6..62d7be8 100644 --- a/sciencebeam_gym/preprocess/preprocessing_utils.py +++ b/sciencebeam_gym/preprocess/preprocessing_utils.py @@ -11,29 +11,33 @@ from lxml import etree from apache_beam.io.filesystems import FileSystems -from sciencebeam_gym.utils.xml import ( +from sciencebeam_alignment.align import ( + native_enabled as align_native_enabled +) + +from sciencebeam_utils.beam_utils.io import ( + find_matching_filenames +) + +from sciencebeam_utils.utils.xml import ( xml_from_string_with_recover ) -from sciencebeam_gym.utils.stopwatch import ( +from sciencebeam_utils.utils.stopwatch import ( StopWatchRecorder ) -from sciencebeam_gym.utils.collection import ( +from sciencebeam_utils.utils.collection import ( groupby_to_dict, sort_and_groupby_to_dict ) -from sciencebeam_gym.utils.pages_zip import ( - save_pages -) - -from sciencebeam_gym.beam_utils.io import ( - find_matching_filenames +from sciencebeam_utils.utils.file_path import ( + relative_path ) -from sciencebeam_gym.utils.file_path import ( - relative_path +from sciencebeam_gym.utils.pages_zip import ( + save_pages ) from sciencebeam_gym.preprocess.lxml_to_svg import ( @@ -49,10 +53,6 @@ from sciencebeam_gym.preprocess.annotation.annotator import ( DEFAULT_ANNOTATORS ) -from sciencebeam_gym.alignment.align import ( - native_enabled as align_native_enabled -) - from sciencebeam_gym.preprocess.annotation.matching_annotator import ( MatchingAnnotator ) @@ -77,85 +77,10 @@ from sciencebeam_gym.pdf import ( PdfToPng ) -# deprecated, moved to sciencebeam_gym.utils.file_path -# pylint: disable=wrong-import-position, unused-import -from sciencebeam_gym.utils.file_path import ( - join_if_relative_path, -) -# pylint: enable=wrong-import-position, unused-import - def get_logger(): return logging.getLogger(__name__) -def group_files_by_parent_directory(filenames): - return groupby_to_dict(sorted(filenames), lambda x: os.path.dirname(x)) - -def get_ext(filename): - name, ext = os.path.splitext(filename) - if ext == '.gz': - ext = get_ext(name) + ext - return ext - -def strip_ext(filename): - # strip of gz, assuming there will be another extension before .gz - if filename.endswith('.gz'): - filename = filename[:-3] - return os.path.splitext(filename)[0] - -def group_files_by_name_excl_ext(filenames): - return sort_and_groupby_to_dict(filenames, strip_ext) - -def zip_by_keys(*dict_list): - keys = reduce(lambda agg, v: agg | set(v.keys()), dict_list, set()) - return ( - [d.get(k) for d in dict_list] - for k in sorted(keys) - ) - -def group_file_pairs_by_parent_directory_or_name(files_by_type): - grouped_files_by_pattern = [ - group_files_by_parent_directory(files) for files in files_by_type - ] - for files_in_group_by_pattern in zip_by_keys(*grouped_files_by_pattern): - if all(len(files or []) == 1 for files in files_in_group_by_pattern): - yield tuple([files[0] for files in files_in_group_by_pattern]) - else: - grouped_by_name = [ - group_files_by_name_excl_ext(files or []) - for files in files_in_group_by_pattern - ] - for files_by_name in zip_by_keys(*grouped_by_name): - if all(len(files or []) == 1 for files in files_by_name): - yield tuple([files[0] for files in files_by_name]) - else: - get_logger().info( - 'no exclusively matching files found: %s', - [files for files in files_by_name] - ) - -def find_file_pairs_grouped_by_parent_directory_or_name(patterns, limit=None): - matching_files_by_pattern = [ - list(find_matching_filenames(pattern)) for pattern in patterns - ] - get_logger().info( - 'found number of files %s', - ', '.join( - '%s: %d' % (pattern, len(files)) - for pattern, files in zip(patterns, matching_files_by_pattern) - ) - ) - patterns_without_files = [ - pattern - for pattern, files in zip(patterns, matching_files_by_pattern) - if len(files) == 0 - ] - if patterns_without_files: - raise RuntimeError('no files found for: %s' % patterns_without_files) - return group_file_pairs_by_parent_directory_or_name( - matching_files_by_pattern - ) - def convert_pdf_bytes_to_lxml(pdf_content, path=None, page_range=None): stop_watch_recorder = StopWatchRecorder() @@ -222,45 +147,6 @@ def convert_and_annotate_lxml_content(lxml_content, xml_content, xml_mapping, na return svg_roots -def change_ext(path, old_ext, new_ext): - if old_ext is None: - old_ext = os.path.splitext(path)[1] - if old_ext == '.gz': - path = path[:-len(old_ext)] - old_ext = os.path.splitext(path)[1] - if old_ext and path.endswith(old_ext): - return path[:-len(old_ext)] + new_ext - else: - return path + new_ext - -def base_path_for_file_list(file_list): - common_prefix = os.path.commonprefix(file_list) - i = max(common_prefix.rfind('/'), common_prefix.rfind('\\')) - if i >= 0: - return common_prefix[:i] - else: - return '' - -def get_or_validate_base_path(file_list, base_path): - common_path = base_path_for_file_list(file_list) - if base_path: - if not common_path.startswith(base_path): - raise AssertionError( - "invalid base path '%s', common path is: '%s'" % (base_path, common_path) - ) - return base_path - else: - return common_path - -def get_output_file(filename, source_base_path, output_base_path, output_file_suffix): - return FileSystems.join( - output_base_path, - change_ext( - relative_path(source_base_path, filename), - None, output_file_suffix - ) - ) - def save_svg_roots(output_filename, svg_pages): return save_pages(output_filename, '.svg', ( etree.tostring(svg_page) diff --git a/sciencebeam_gym/preprocess/preprocessing_utils_test.py b/sciencebeam_gym/preprocess/preprocessing_utils_test.py index 12b20f7..a287dd3 100644 --- a/sciencebeam_gym/preprocess/preprocessing_utils_test.py +++ b/sciencebeam_gym/preprocess/preprocessing_utils_test.py @@ -1,7 +1,5 @@ from mock import patch, MagicMock, DEFAULT -import pytest - from lxml import etree from sciencebeam_gym.structured_document.svg import ( @@ -10,12 +8,7 @@ from sciencebeam_gym.structured_document.svg import ( from sciencebeam_gym.preprocess.preprocessing_utils import ( svg_page_to_blockified_png_bytes, - group_file_pairs_by_parent_directory_or_name, convert_pdf_bytes_to_lxml, - change_ext, - base_path_for_file_list, - get_or_validate_base_path, - get_output_file, parse_page_range, ) @@ -36,58 +29,6 @@ class TestSvgPageToBlockifiedPngBytes(object): kwargs = call_args[1] assert (kwargs.get('width'), kwargs.get('height')) == (100.1, 200.9) -class TestGroupFilePairsByParentDirectoryOrName(object): - def test_should_return_empty_list_with_empty_input_file_lists(self): - assert list(group_file_pairs_by_parent_directory_or_name([ - [], - [] - ])) == [] - - def test_should_group_single_file(self): - assert list(group_file_pairs_by_parent_directory_or_name([ - ['parent1/file.x'], - ['parent1/file.y'] - ])) == [('parent1/file.x', 'parent1/file.y')] - - def test_should_group_single_file_in_directory_with_different_names(self): - assert list(group_file_pairs_by_parent_directory_or_name([ - ['parent1/file1.x'], - ['parent1/file2.y'] - ])) == [('parent1/file1.x', 'parent1/file2.y')] - - def test_should_ignore_files_in_different_directories(self): - assert list(group_file_pairs_by_parent_directory_or_name([ - ['parent1/file.x'], - ['parent2/file.y'] - ])) == [] - - def test_should_group_multiple_files_in_separate_parent_directories(self): - assert list(group_file_pairs_by_parent_directory_or_name([ - ['parent1/file.x', 'parent2/file.x'], - ['parent1/file.y', 'parent2/file.y'] - ])) == [ - ('parent1/file.x', 'parent1/file.y'), - ('parent2/file.x', 'parent2/file.y') - ] - - def test_should_group_multiple_files_in_same_parent_directory_with_same_name(self): - assert list(group_file_pairs_by_parent_directory_or_name([ - ['parent1/file1.x', 'parent1/file2.x'], - ['parent1/file1.y', 'parent1/file2.y'] - ])) == [ - ('parent1/file1.x', 'parent1/file1.y'), - ('parent1/file2.x', 'parent1/file2.y') - ] - - def test_should_group_multiple_files_in_same_parent_directory_with_same_name_gzipped(self): - assert list(group_file_pairs_by_parent_directory_or_name([ - ['parent1/file1.x.gz', 'parent1/file2.x.gz'], - ['parent1/file1.y.gz', 'parent1/file2.y.gz'] - ])) == [ - ('parent1/file1.x.gz', 'parent1/file1.y.gz'), - ('parent1/file2.x.gz', 'parent1/file2.y.gz') - ] - DEFAULT_PDF_TO_LXML_ARGS = ['-blocks', '-noImageInline', '-noImage', '-fullFontName'] LXML_CONTENT_1 = b'lxml content 1' @@ -115,74 +56,6 @@ class TestConvertPdfBytesToLxml(object): ) assert lxml_content == LXML_CONTENT_1 -class TestChangeExt(object): - def test_should_replace_simple_ext_with_simple_ext(self): - assert change_ext('file.pdf', None, '.xml') == 'file.xml' - - def test_should_replace_simple_ext_with_combined_ext(self): - assert change_ext('file.pdf', None, '.svg.zip') == 'file.svg.zip' - - def test_should_remove_gz_ext_before_replacing_ext(self): - assert change_ext('file.pdf.gz', None, '.svg.zip') == 'file.svg.zip' - -class TestBasePathForFileList(object): - def test_should_return_empty_string_if_file_list_is_empty(self): - assert base_path_for_file_list([]) == '' - - def test_should_return_empty_string_if_filename_is_empty(self): - assert base_path_for_file_list(['']) == '' - - def test_should_return_parent_directory_of_single_file(self): - assert base_path_for_file_list(['/base/path/1/file']) == '/base/path/1' - - def test_should_return_common_path_of_two_files(self): - assert base_path_for_file_list(['/base/path/1/file', '/base/path/2/file']) == '/base/path' - - def test_should_return_common_path_of_two_files_using_protocol(self): - assert base_path_for_file_list([ - 'a://base/path/1/file', 'a://base/path/2/file' - ]) == 'a://base/path' - - def test_should_return_common_path_of_two_files_using_forward_slash(self): - assert base_path_for_file_list([ - '\\base\\path\\1\\file', '\\base\\path\\2\\file' - ]) == '\\base\\path' - - def test_should_return_empty_string_if_no_common_path_was_found(self): - assert base_path_for_file_list(['a://base/path/1/file', 'b://base/path/2/file']) == '' - - def test_should_return_common_path_ignoring_partial_name_match(self): - assert base_path_for_file_list(['/base/path/file1', '/base/path/file2']) == '/base/path' - -class TestGetOrValidateBasePath(object): - def test_should_return_base_path_of_two_files_if_no_base_path_was_provided(self): - assert get_or_validate_base_path( - ['/base/path/1/file', '/base/path/2/file'], - None - ) == '/base/path' - - def test_should_return_passed_in_base_path_if_valid(self): - assert get_or_validate_base_path( - ['/base/path/1/file', '/base/path/2/file'], - '/base' - ) == '/base' - - def test_should_raise_error_if_passed_in_base_path_is_invalid(self): - with pytest.raises(AssertionError): - get_or_validate_base_path( - ['/base/path/1/file', '/base/path/2/file'], - '/base/other' - ) - -class TestGetOutputFile(object): - def test_should_return_output_file_with_path_and_change_ext(self): - assert get_output_file( - '/source/path/file.pdf', - '/source', - '/output', - '.xml' - ) == '/output/path/file.xml' - class TestPageRange(object): def test_should_parse_single_page_number_as_range(self): assert parse_page_range('1') == (1, 1) diff --git a/sciencebeam_gym/preprocess/split_csv_dataset.py b/sciencebeam_gym/preprocess/split_csv_dataset.py deleted file mode 100644 index 9035fba..0000000 --- a/sciencebeam_gym/preprocess/split_csv_dataset.py +++ /dev/null @@ -1,142 +0,0 @@ -import argparse -import csv -import logging -from math import trunc -from random import shuffle - -from apache_beam.io.filesystems import FileSystems - -from sciencebeam_gym.utils.csv import ( - csv_delimiter_by_filename, - write_csv_rows -) - -from sciencebeam_gym.preprocess.preprocessing_utils import ( - strip_ext, - get_ext -) - -def get_logger(): - return logging.getLogger(__name__) - -def extract_proportions_from_args(args): - digits = 3 - proportions = [ - (name, round(p, digits)) - for name, p in [ - ('train', args.train), - ('test', args.test), - ('validation', args.validation) - ] - if p > 0 - ] - if sum(p for _, p in proportions) > 1.0: - raise ValueError('proportions add up to more than 1.0') - if not args.test: - proportions.append(('test', 1.0 - sum(p for _, p in proportions))) - elif not args.validation: - proportions.append(('validation', round(1.0 - sum(p for _, p in proportions), digits))) - proportions = [(name, p) for name, p in proportions if p > 0] - return proportions - -def split_rows(rows, percentages, fill=False): - size = len(rows) - chunk_size_list = [int(trunc(p * size)) for p in percentages] - if fill: - chunk_size_list[-1] = size - sum(chunk_size_list[:-1]) - chunk_offset_list = [0] - for chunk_size in chunk_size_list[0:-1]: - chunk_offset_list.append(chunk_offset_list[-1] + chunk_size) - get_logger().debug('chunk_offset_list: %s', chunk_offset_list) - get_logger().debug('chunk_size_list: %s', chunk_size_list) - return [ - rows[chunk_offset:chunk_offset + chunk_size] - for chunk_offset, chunk_size in zip(chunk_offset_list, chunk_size_list) - ] - -def output_filenames_for_names(names, prefix, ext): - return [ - prefix + ('' if prefix.endswith('/') else '-') + name + ext - for name in names - ] - -def parse_args(argv=None): - parser = argparse.ArgumentParser() - parser.add_argument( - '--input', type=str, required=True, - help='input csv/tsv file' - ) - parser.add_argument( - '--train', type=float, required=True, - help='Train dataset proportion' - ) - parser.add_argument( - '--test', type=float, required=False, - help='Test dataset proportion (if not specified it is assumed to be the remaining percentage)' - ) - parser.add_argument( - '--validation', type=float, required=False, - help='Validation dataset proportion (requires test-proportion)' - ) - parser.add_argument( - '--random', action='store_true', default=False, - help='randomise samples before doing the split' - ) - parser.add_argument( - '--fill', action='store_true', default=False, - help='use up all of the remaining data rows for the last set' - ) - parser.add_argument( - '--no-header', action='store_true', default=False, - help='input file does not contain a header' - ) - parser.add_argument( - '--out', type=str, required=False, - help='output csv/tsv file prefix or directory (if ending with slash)' - ' will use input file name by default' - ) - return parser.parse_args(argv) - -def process_args(args): - if not args.out: - args.out = strip_ext(args.input) - -def main(argv=None): - args = parse_args(argv) - process_args(args) - ext = get_ext(args.input) - proportions = extract_proportions_from_args(args) - output_filenames = output_filenames_for_names( - [name for name, _ in proportions], - args.out, - ext - ) - get_logger().info('proportions: %s', proportions) - get_logger().info('output_filenames: %s', output_filenames) - delimiter = csv_delimiter_by_filename(args.input) - with FileSystems.open(args.input) as f: - reader = csv.reader(f, delimiter=delimiter) - header_row = None if args.no_header else next(reader) - data_rows = list(reader) - get_logger().info('number of rows: %d', len(data_rows)) - if args.random: - shuffle(data_rows) - data_rows_by_set = split_rows( - data_rows, - [p for _, p in proportions], - fill=args.fill - ) - - mime_type = 'text/tsv' if delimiter == '\t' else 'text/csv' - for output_filename, set_data_rows in zip(output_filenames, data_rows_by_set): - get_logger().info('set size: %d (%s)', len(set_data_rows), output_filename) - with FileSystems.create(output_filename, mime_type=mime_type) as f: - writer = csv.writer(f, delimiter=delimiter) - if header_row: - write_csv_rows(writer, [header_row]) - write_csv_rows(writer, set_data_rows) - -if __name__ == '__main__': - logging.basicConfig(level='INFO') - - main() diff --git a/sciencebeam_gym/preprocess/split_csv_dataset_test.py b/sciencebeam_gym/preprocess/split_csv_dataset_test.py deleted file mode 100644 index a3c470a..0000000 --- a/sciencebeam_gym/preprocess/split_csv_dataset_test.py +++ /dev/null @@ -1,63 +0,0 @@ -from collections import namedtuple - -from sciencebeam_gym.preprocess.split_csv_dataset import ( - extract_proportions_from_args, - split_rows, - output_filenames_for_names -) - -def create_args(**kwargs): - return namedtuple('args', kwargs.keys())(**kwargs) - -class TestExtractProportionsFromArgs(object): - def test_should_create_train_test_split_with_only_train_specified(self): - assert extract_proportions_from_args( - create_args(train=0.6, test=None, validation=None) - ) == [('train', 0.6), ('test', 0.4)] - - def test_should_create_train_test_validation_split_with_train_and_test_specified(self): - assert extract_proportions_from_args( - create_args(train=0.6, test=0.3, validation=None) - ) == [('train', 0.6), ('test', 0.3), ('validation', 0.1)] - - def test_should_not_add_validation_if_remaining_percentage_is_zero(self): - assert extract_proportions_from_args( - create_args(train=0.6, test=0.4, validation=None) - ) == [('train', 0.6), ('test', 0.4)] - -class TestSplitRows(object): - def test_should_split_train_test(self): - assert split_rows(list(range(10)), [0.6, 0.4]) == [ - list(range(6)), - list(range(6, 10)) - ] - - def test_should_split_train_test_validation(self): - assert split_rows(list(range(10)), [0.6, 0.3, 0.1]) == [ - list(range(6)), - list(range(6, 9)), - list(range(9, 10)) - ] - - def test_should_round_down(self): - assert split_rows(list(range(11)), [0.6, 0.4]) == [ - list(range(6)), - list(range(6, 10)) - ] - - def test_should_fill_last_chunk_if_enabled(self): - assert split_rows(list(range(11)), [0.6, 0.4], fill=True) == [ - list(range(6)), - list(range(6, 11)) - ] - -class TestGetOutputFilenamesForNames(object): - def test_should_add_name_and_ext_with_path_sep_if_out_ends_with_slash(self): - assert output_filenames_for_names( - ['train', 'test'], 'out/', '.tsv' - ) == ['out/train.tsv', 'out/test.tsv'] - - def test_should_add_name_and_ext_with_hyphen_if_out_does_not_end_with_slash(self): - assert output_filenames_for_names( - ['train', 'test'], 'out', '.tsv' - ) == ['out-train.tsv', 'out-test.tsv'] diff --git a/sciencebeam_gym/structured_document/lxml.py b/sciencebeam_gym/structured_document/lxml.py index 6cb374b..5219529 100644 --- a/sciencebeam_gym/structured_document/lxml.py +++ b/sciencebeam_gym/structured_document/lxml.py @@ -1,9 +1,9 @@ -from sciencebeam_gym.utils.bounding_box import ( - BoundingBox +from sciencebeam_utils.utils.xml import ( + set_or_remove_attrib ) -from sciencebeam_gym.utils.xml import ( - set_or_remove_attrib +from sciencebeam_gym.utils.bounding_box import ( + BoundingBox ) from sciencebeam_gym.structured_document import ( diff --git a/sciencebeam_gym/structured_document/structured_document_saver.py b/sciencebeam_gym/structured_document/structured_document_saver.py index 97d170a..5ca17bd 100644 --- a/sciencebeam_gym/structured_document/structured_document_saver.py +++ b/sciencebeam_gym/structured_document/structured_document_saver.py @@ -2,7 +2,7 @@ from __future__ import absolute_import from lxml import etree -from sciencebeam_gym.beam_utils.io import ( +from sciencebeam_utils.beam_utils.io import ( save_file_content ) diff --git a/sciencebeam_gym/structured_document/svg.py b/sciencebeam_gym/structured_document/svg.py index 27655c2..194ccf9 100644 --- a/sciencebeam_gym/structured_document/svg.py +++ b/sciencebeam_gym/structured_document/svg.py @@ -1,9 +1,9 @@ -from sciencebeam_gym.utils.bounding_box import ( - BoundingBox +from sciencebeam_utils.utils.xml import ( + set_or_remove_attrib ) -from sciencebeam_gym.utils.xml import ( - set_or_remove_attrib +from sciencebeam_gym.utils.bounding_box import ( + BoundingBox ) from sciencebeam_gym.structured_document import ( diff --git a/sciencebeam_gym/tools/calculate_class_weights_test.py b/sciencebeam_gym/tools/calculate_class_weights_test.py index bd9053f..5dd9935 100644 --- a/sciencebeam_gym/tools/calculate_class_weights_test.py +++ b/sciencebeam_gym/tools/calculate_class_weights_test.py @@ -6,7 +6,7 @@ from io import BytesIO from backports.tempfile import TemporaryDirectory -from sciencebeam_gym.utils.num import ( +from sciencebeam_utils.utils.num import ( assert_close, assert_all_close ) diff --git a/sciencebeam_gym/trainer/data/examples.py b/sciencebeam_gym/trainer/data/examples.py index bc2b313..0c13b04 100644 --- a/sciencebeam_gym/trainer/data/examples.py +++ b/sciencebeam_gym/trainer/data/examples.py @@ -4,7 +4,7 @@ from functools import partial import tensorflow as tf from tensorflow.python.lib.io import file_io # pylint: disable=E0611 -from sciencebeam_gym.utils.collection import ( +from sciencebeam_utils.utils.collection import ( extend_dict ) diff --git a/sciencebeam_gym/trainer/data/examples_test.py b/sciencebeam_gym/trainer/data/examples_test.py index 970dac4..7a2886a 100644 --- a/sciencebeam_gym/trainer/data/examples_test.py +++ b/sciencebeam_gym/trainer/data/examples_test.py @@ -5,7 +5,7 @@ import pytest import tensorflow as tf -from sciencebeam_gym.utils.collection import ( +from sciencebeam_utils.utils.collection import ( extend_dict ) diff --git a/sciencebeam_gym/trainer/evaluator_test.py b/sciencebeam_gym/trainer/evaluator_test.py index 9c1927a..3f257f0 100644 --- a/sciencebeam_gym/trainer/evaluator_test.py +++ b/sciencebeam_gym/trainer/evaluator_test.py @@ -3,7 +3,7 @@ import logging import pytest import tensorflow as tf -from sciencebeam_gym.utils.collection import ( +from sciencebeam_utils.utils.collection import ( to_namedtuple ) diff --git a/sciencebeam_gym/trainer/models/pix2pix/evaluate_test.py b/sciencebeam_gym/trainer/models/pix2pix/evaluate_test.py index caf7f49..99e9c6f 100644 --- a/sciencebeam_gym/trainer/models/pix2pix/evaluate_test.py +++ b/sciencebeam_gym/trainer/models/pix2pix/evaluate_test.py @@ -6,7 +6,7 @@ import pytest import tensorflow as tf import numpy as np -from sciencebeam_gym.utils.num import ( +from sciencebeam_utils.utils.num import ( assert_close ) diff --git a/sciencebeam_gym/trainer/models/pix2pix/loss_test.py b/sciencebeam_gym/trainer/models/pix2pix/loss_test.py index be5c89f..25916b3 100644 --- a/sciencebeam_gym/trainer/models/pix2pix/loss_test.py +++ b/sciencebeam_gym/trainer/models/pix2pix/loss_test.py @@ -5,7 +5,7 @@ from six import raise_from import tensorflow as tf import numpy as np -from sciencebeam_gym.utils.num import ( +from sciencebeam_utils.utils.num import ( assert_close ) diff --git a/sciencebeam_gym/trainer/models/pix2pix/pix2pix_core_test.py b/sciencebeam_gym/trainer/models/pix2pix/pix2pix_core_test.py index 488efe6..b514198 100644 --- a/sciencebeam_gym/trainer/models/pix2pix/pix2pix_core_test.py +++ b/sciencebeam_gym/trainer/models/pix2pix/pix2pix_core_test.py @@ -6,12 +6,12 @@ import tensorflow as tf import numpy as np import pytest -from sciencebeam_gym.utils.num import ( +from sciencebeam_utils.utils.num import ( assert_all_close, assert_all_not_close ) -from sciencebeam_gym.utils.collection import ( +from sciencebeam_utils.utils.collection import ( extend_dict ) diff --git a/sciencebeam_gym/trainer/models/pix2pix/pix2pix_model_test.py b/sciencebeam_gym/trainer/models/pix2pix/pix2pix_model_test.py index ce921ec..7377812 100644 --- a/sciencebeam_gym/trainer/models/pix2pix/pix2pix_model_test.py +++ b/sciencebeam_gym/trainer/models/pix2pix/pix2pix_model_test.py @@ -6,7 +6,7 @@ from pytest import raises import tensorflow as tf -from sciencebeam_gym.utils.collection import ( +from sciencebeam_utils.utils.collection import ( extend_dict ) diff --git a/sciencebeam_gym/trainer/models/pix2pix/tf_utils_test.py b/sciencebeam_gym/trainer/models/pix2pix/tf_utils_test.py index abb269f..22cfc0d 100644 --- a/sciencebeam_gym/trainer/models/pix2pix/tf_utils_test.py +++ b/sciencebeam_gym/trainer/models/pix2pix/tf_utils_test.py @@ -4,7 +4,7 @@ from __future__ import division import tensorflow as tf import numpy as np -from sciencebeam_gym.utils.num import ( +from sciencebeam_utils.utils.num import ( assert_all_close ) diff --git a/sciencebeam_gym/utils/collection.py b/sciencebeam_gym/utils/collection.py deleted file mode 100644 index c790410..0000000 --- a/sciencebeam_gym/utils/collection.py +++ /dev/null @@ -1,63 +0,0 @@ -from __future__ import absolute_import - -from collections import namedtuple -from itertools import groupby - -from six import iteritems - -flatten = lambda l: [item for sublist in l for item in sublist] - -iter_flatten = lambda l: (item for sublist in l for item in sublist) - -def filter_truthy(list_of_something): - return [l for l in list_of_something if l] - -def strip_all(list_of_strings): - return [(s or '').strip() for s in list_of_strings if s] - -def remove_key_from_dict(d, key): - return {k: v for k, v in iteritems(d) if k != key} - -def remove_keys_from_dict(d, keys_to_remove): - if not keys_to_remove: - return d - return { - k: v - for k, v in iteritems(d) - if k not in keys_to_remove - } - -def extract_from_dict(d, key, default_value=None): - return d.get(key, default_value), remove_key_from_dict(d, key) - -def extend_dict(d, *other_dicts, **kwargs): - """ - example: - - extend_dict(d1, d2) - - is equivalent to Python 3 syntax: - { - **d1, - **d2 - } - """ - d = d.copy() - for other_dict in other_dicts: - d.update(other_dict) - d.update(kwargs) - return d - -def groupby_to_dict(iterable, key): - return { - k: list(v) - for k, v in groupby(iterable, key=key) - } - -def sort_and_groupby_to_dict(iterable, key): - return groupby_to_dict(sorted(iterable, key=key), key) - -def to_namedtuple(*args, **kwargs): - name = kwargs.pop('name', 'Tuple') - d = extend_dict(*list(args) + [kwargs]) - return namedtuple(name, d.keys())(**d) diff --git a/sciencebeam_gym/utils/compat.py b/sciencebeam_gym/utils/compat.py deleted file mode 100644 index a52ca94..0000000 --- a/sciencebeam_gym/utils/compat.py +++ /dev/null @@ -1,14 +0,0 @@ -from six import PY3 - -def python_2_unicode_compatible(cls): - """ - Same as futures.utils.python_2_unicode_compatible but with support for __repr__ - """ - if not PY3: - if cls.__repr__ is not object.__repr__: - unicode_repr = cls.__repr__ - cls.__repr__ = lambda self: unicode_repr(self).encode('utf-8') - if cls.__str__ is not object.__str__: - cls.__unicode__ = cls.__str__ - cls.__str__ = lambda self: self.__unicode__().encode('utf-8') - return cls diff --git a/sciencebeam_gym/utils/compat_test.py b/sciencebeam_gym/utils/compat_test.py deleted file mode 100644 index 8c7d380..0000000 --- a/sciencebeam_gym/utils/compat_test.py +++ /dev/null @@ -1,57 +0,0 @@ -from six import text_type - -from sciencebeam_gym.utils.compat import ( - python_2_unicode_compatible -) - -ASCII_VALUE = 'abc' -UNICODE_VALUE = u'a\u1234b' - -@python_2_unicode_compatible -class ReprWrapper(object): - def __init__(self, value): - self.value = value - - def __repr__(self): - return self.value - -@python_2_unicode_compatible -class StrWrapper(object): - def __init__(self, value): - self.value = value - - def __str__(self): - return self.value - -@python_2_unicode_compatible -class ReprStrWrapper(object): - def __init__(self, value): - self.value = value - - def __repr__(self): - return self.value - - def __str__(self): - return self.value - -class TestPython2UnicodeCompatible(object): - def test_should_return_repr_ascii_value(self): - assert repr(ReprWrapper(text_type(ASCII_VALUE))) == ASCII_VALUE - - def test_should_encode_repr_unicode_value_without_str(self): - assert repr(ReprWrapper(UNICODE_VALUE)) == UNICODE_VALUE.encode('utf-8') - - def test_should_encode_repr_unicode_value_with_str(self): - assert repr(ReprStrWrapper(UNICODE_VALUE)) == UNICODE_VALUE.encode('utf-8') - - def test_should_return_str_ascii_value(self): - assert str(StrWrapper(text_type(ASCII_VALUE))) == ASCII_VALUE - - def test_should_encode_str_unicode_value_without_repr(self): - assert str(StrWrapper(UNICODE_VALUE)) == UNICODE_VALUE.encode('utf-8') - - def test_should_encode_str_unicode_value_with_repr(self): - assert str(ReprStrWrapper(UNICODE_VALUE)) == UNICODE_VALUE.encode('utf-8') - - def test_should_encode_str_unicode_value_with_repr_but_without_str(self): - assert str(ReprWrapper(UNICODE_VALUE)) == UNICODE_VALUE.encode('utf-8') diff --git a/sciencebeam_gym/utils/csv.py b/sciencebeam_gym/utils/csv.py deleted file mode 100644 index a4d7f99..0000000 --- a/sciencebeam_gym/utils/csv.py +++ /dev/null @@ -1,56 +0,0 @@ -import os -import csv - -import six - -TEMP_FILE_SUFFIX = '.part' - -def csv_delimiter_by_filename(filename): - if '.tsv' in filename: - return '\t' - else: - return ',' - -def open_csv_output(filename): - return open(filename, 'w') - -def write_csv_rows(writer, iterable): - if six.PY2: - for row in iterable: - writer.writerow([ - x.encode('utf-8') if isinstance(x, six.text_type) else x - for x in row - ]) - else: - for row in iterable: - writer.writerow(row) - -def write_csv_row(writer, row): - write_csv_rows(writer, [row]) - -def write_csv(filename, columns, iterable, delimiter=None): - if delimiter is None: - delimiter = csv_delimiter_by_filename(filename) - is_stdout = filename in {'stdout', '/dev/stdout'} - temp_filename = ( - filename + TEMP_FILE_SUFFIX - if is_stdout - else filename - ) - if not is_stdout and os.path.isfile(filename): - os.remove(filename) - with open_csv_output(temp_filename) as csv_f: - writer = csv.writer(csv_f, delimiter=delimiter) - write_csv_rows(writer, [columns]) - write_csv_rows(writer, iterable) - if not is_stdout: - os.rename(temp_filename, filename) - -def iter_dict_to_list(iterable, fields): - return ( - [item.get(field) for field in fields] - for item in iterable - ) - -def write_dict_csv(filename, columns, iterable, delimiter=None): - write_csv(filename, columns, iter_dict_to_list(iterable, columns), delimiter=delimiter) diff --git a/sciencebeam_gym/utils/file_list.py b/sciencebeam_gym/utils/file_list.py deleted file mode 100644 index b062236..0000000 --- a/sciencebeam_gym/utils/file_list.py +++ /dev/null @@ -1,88 +0,0 @@ -from __future__ import absolute_import - -import codecs -import csv -import os -from itertools import islice - -from apache_beam.io.filesystems import FileSystems - -from sciencebeam_gym.utils.csv import ( - csv_delimiter_by_filename -) - -from .file_path import ( - relative_path, - join_if_relative_path -) - - -def is_csv_or_tsv_file_list(file_list_path): - return '.csv' in file_list_path or '.tsv' in file_list_path - -def load_plain_file_list(file_list_path, limit=None): - with FileSystems.open(file_list_path) as f: - lines = (x.rstrip() for x in codecs.getreader('utf-8')(f)) - if limit: - lines = islice(lines, 0, limit) - return list(lines) - -def load_csv_or_tsv_file_list(file_list_path, column, header=True, limit=None): - delimiter = csv_delimiter_by_filename(file_list_path) - with FileSystems.open(file_list_path) as f: - reader = csv.reader(f, delimiter=delimiter) - if not header: - assert isinstance(column, int) - column_index = column - else: - header_row = next(reader) - if isinstance(column, int): - column_index = column - else: - try: - column_index = header_row.index(column) - except ValueError: - raise ValueError( - 'column %s not found, available columns: %s' % - (column, header_row) - ) - lines = (x[column_index].decode('utf-8') for x in reader) - if limit: - lines = islice(lines, 0, limit) - return list(lines) - -def to_absolute_file_list(base_path, file_list): - return [join_if_relative_path(base_path, s) for s in file_list] - -def to_relative_file_list(base_path, file_list): - return [relative_path(base_path, s) for s in file_list] - -def load_file_list(file_list_path, column, header=True, limit=None, to_absolute=True): - if is_csv_or_tsv_file_list(file_list_path): - file_list = load_csv_or_tsv_file_list( - file_list_path, column=column, header=header, limit=limit - ) - else: - file_list = load_plain_file_list(file_list_path, limit=limit) - if to_absolute: - file_list = to_absolute_file_list( - os.path.dirname(file_list_path), file_list - ) - return file_list - -def save_plain_file_list(file_list_path, file_list): - with FileSystems.create(file_list_path) as f: - f.write('\n'.join(file_list).encode('utf-8')) - -def save_csv_or_tsv_file_list(file_list_path, file_list, column, header=True): - if header: - file_list = [column] + file_list - save_plain_file_list(file_list_path, file_list) - -def save_file_list(file_list_path, file_list, column, header=True): - if is_csv_or_tsv_file_list(file_list_path): - return save_csv_or_tsv_file_list( - file_list_path, file_list, column=column, header=header - ) - else: - return save_plain_file_list(file_list_path, file_list) diff --git a/sciencebeam_gym/utils/file_list_test.py b/sciencebeam_gym/utils/file_list_test.py deleted file mode 100644 index efeff9a..0000000 --- a/sciencebeam_gym/utils/file_list_test.py +++ /dev/null @@ -1,202 +0,0 @@ -import os -from tempfile import NamedTemporaryFile -from mock import patch -from backports.tempfile import TemporaryDirectory - -import pytest - -import sciencebeam_gym.utils.file_list as file_list_loader -from sciencebeam_gym.utils.file_list import ( - is_csv_or_tsv_file_list, - load_plain_file_list, - load_csv_or_tsv_file_list, - to_absolute_file_list, - to_relative_file_list, - load_file_list, - save_plain_file_list, - save_csv_or_tsv_file_list, - save_file_list -) - -FILE_1 = 'file1.pdf' -FILE_2 = 'file2.pdf' -UNICODE_FILE_1 = u'file1\u1234.pdf' -FILE_LIST = [FILE_1, FILE_2] - -@pytest.fixture(name='load_plain_file_list_mock') -def _load_plain_file_list(): - with patch.object(file_list_loader, 'load_plain_file_list') as mock: - yield mock - -@pytest.fixture(name='load_csv_or_tsv_file_list_mock') -def _load_csv_or_tsv_file_list(): - with patch.object(file_list_loader, 'load_csv_or_tsv_file_list') as mock: - yield mock - -@pytest.fixture(name='to_absolute_file_list_mock') -def _to_absolute_file_list(): - with patch.object(file_list_loader, 'to_absolute_file_list') as mock: - yield mock - -class TestIsCsvOrTsvFileList(object): - def test_should_return_true_if_file_ext_is_csv(self): - assert is_csv_or_tsv_file_list('files.csv') - - def test_should_return_true_if_file_ext_is_csv_gz(self): - assert is_csv_or_tsv_file_list('files.csv.gz') - - def test_should_return_true_if_file_ext_is_tsv(self): - assert is_csv_or_tsv_file_list('files.tsv') - - def test_should_return_true_if_file_ext_is_tsv_gz(self): - assert is_csv_or_tsv_file_list('files.tsv.gz') - - def test_should_return_false_if_file_ext_is_lst(self): - assert not is_csv_or_tsv_file_list('files.lst') - - def test_should_return_false_if_file_ext_is_lst_gz(self): - assert not is_csv_or_tsv_file_list('files.lst.gz') - -class TestLoadPlainFileList(object): - def test_should_read_multiple_file_paths_from_file(self): - with NamedTemporaryFile() as f: - f.write('\n'.join([FILE_1, FILE_2])) - f.flush() - assert load_plain_file_list(f.name) == [FILE_1, FILE_2] - - def test_should_read_unicode_file(self): - with NamedTemporaryFile() as f: - f.write('\n'.join([UNICODE_FILE_1.encode('utf-8')])) - f.flush() - assert load_plain_file_list(f.name) == [UNICODE_FILE_1] - - def test_should_apply_limit(self): - with NamedTemporaryFile() as f: - f.write('\n'.join([FILE_1, FILE_2])) - f.flush() - assert load_plain_file_list(f.name, limit=1) == [FILE_1] - -class TestLoadCsvOrTsvFileList(object): - def test_should_read_multiple_file_paths_from_file_with_header_using_column_name(self): - with NamedTemporaryFile() as f: - f.write('\n'.join(['url', FILE_1, FILE_2])) - f.flush() - assert load_csv_or_tsv_file_list(f.name, 'url') == [FILE_1, FILE_2] - - def test_should_read_multiple_file_paths_from_file_with_header_using_column_index(self): - with NamedTemporaryFile() as f: - f.write('\n'.join(['url', FILE_1, FILE_2])) - f.flush() - assert load_csv_or_tsv_file_list(f.name, 0) == [FILE_1, FILE_2] - - def test_should_read_multiple_file_paths_from_file_without_header(self): - with NamedTemporaryFile() as f: - f.write('\n'.join([FILE_1, FILE_2])) - f.flush() - assert load_csv_or_tsv_file_list(f.name, 0, header=False) == [FILE_1, FILE_2] - - def test_should_read_unicode_file(self): - with NamedTemporaryFile() as f: - f.write('\n'.join(['url', UNICODE_FILE_1.encode('utf-8')])) - f.flush() - assert load_csv_or_tsv_file_list(f.name, 'url') == [UNICODE_FILE_1] - - def test_should_raise_exception_if_column_name_is_invalid(self): - with pytest.raises(ValueError): - with NamedTemporaryFile() as f: - f.write('\n'.join(['url', FILE_1, FILE_2])) - f.flush() - assert load_csv_or_tsv_file_list(f.name, 'xyz') == [FILE_1, FILE_2] - - def test_should_raise_exception_if_column_index_is_invalid(self): - with pytest.raises(IndexError): - with NamedTemporaryFile() as f: - f.write('\n'.join(['url', FILE_1, FILE_2])) - f.flush() - assert load_csv_or_tsv_file_list(f.name, 1) == [FILE_1, FILE_2] - - def test_should_apply_limit(self): - with NamedTemporaryFile() as f: - f.write('\n'.join(['url', FILE_1, FILE_2])) - f.flush() - assert load_csv_or_tsv_file_list(f.name, 'url', limit=1) == [FILE_1] - -class TestToAbsoluteFileList(object): - def test_should_make_path_absolute(self): - assert to_absolute_file_list('/base/path', ['sub/file1']) == ['/base/path/sub/file1'] - - def test_should_not_change_absolute_paths(self): - assert to_absolute_file_list('/base/path', ['/other/file1']) == ['/other/file1'] - -class TestToRelativeFileList(object): - def test_should_make_path_absolute(self): - assert to_relative_file_list('/base/path', ['/base/path/sub/file1']) == ['sub/file1'] - - def test_should_not_change_path_outside_base_path(self): - assert to_relative_file_list('/base/path', ['/other/file1']) == ['/other/file1'] - -@pytest.mark.usefixtures( - 'load_plain_file_list_mock', 'load_csv_or_tsv_file_list_mock', 'to_absolute_file_list_mock' -) -class TestLoadFileList(object): - def test_should_call_load_plain_file_list(self, load_plain_file_list_mock): - result = load_file_list( - 'file-list.lst', column='url', header=True, limit=1, to_absolute=False - ) - load_plain_file_list_mock.assert_called_with('file-list.lst', limit=1) - assert result == load_plain_file_list_mock.return_value - - def test_should_call_load_csv_or_tsv_file_list(self, load_csv_or_tsv_file_list_mock): - result = load_file_list( - 'file-list.csv', column='url', header=True, limit=1, to_absolute=False - ) - load_csv_or_tsv_file_list_mock.assert_called_with( - 'file-list.csv', column='url', header=True, limit=1 - ) - assert result == load_csv_or_tsv_file_list_mock.return_value - - def test_should_make_file_list_absolute( - self, load_plain_file_list_mock, to_absolute_file_list_mock): - - result = load_file_list('/base/path/file-list.lst', column='url', to_absolute=True) - to_absolute_file_list_mock.assert_called_with( - '/base/path', load_plain_file_list_mock.return_value - ) - assert result == to_absolute_file_list_mock.return_value - -class TestSavePlainFileList(object): - def test_should_write_multiple_file_paths(self): - with TemporaryDirectory() as path: - file_list_path = os.path.join(path, 'out.lst') - save_plain_file_list(file_list_path, [FILE_1, FILE_2]) - assert load_plain_file_list(file_list_path) == [FILE_1, FILE_2] - - def test_should_write_unicode_file(self): - with TemporaryDirectory() as path: - file_list_path = os.path.join(path, 'out.lst') - save_plain_file_list(file_list_path, [UNICODE_FILE_1]) - assert load_plain_file_list(file_list_path) == [UNICODE_FILE_1] - -class TestSaveCsvOrTsvFileList(object): - def test_should_write_multiple_file_paths(self): - with TemporaryDirectory() as path: - file_list_path = os.path.join(path, 'out.csv') - save_csv_or_tsv_file_list(file_list_path, [FILE_1, FILE_2], column='url') - assert load_csv_or_tsv_file_list(file_list_path, column='url') == [FILE_1, FILE_2] - - def test_should_write_unicode_file(self): - with TemporaryDirectory() as path: - file_list_path = os.path.join(path, 'out.lst') - save_csv_or_tsv_file_list(file_list_path, [UNICODE_FILE_1], column='url') - assert load_csv_or_tsv_file_list(file_list_path, column='url') == [UNICODE_FILE_1] - -class TestSaveFileList(object): - def test_should_call_save_plain_file_list(self): - with patch.object(file_list_loader, 'save_plain_file_list') as mock: - save_file_list('file-list.lst', FILE_LIST, column='url', header=True) - mock.assert_called_with('file-list.lst', FILE_LIST) - - def test_should_call_save_csv_or_tsv_file_list(self): - with patch.object(file_list_loader, 'save_csv_or_tsv_file_list') as mock: - save_file_list('file-list.csv', FILE_LIST, column='url', header=True) - mock.assert_called_with('file-list.csv', FILE_LIST, column='url', header=True) diff --git a/sciencebeam_gym/utils/file_path.py b/sciencebeam_gym/utils/file_path.py deleted file mode 100644 index cc798c1..0000000 --- a/sciencebeam_gym/utils/file_path.py +++ /dev/null @@ -1,21 +0,0 @@ - -from __future__ import absolute_import - -from apache_beam.io.filesystems import FileSystems - -def relative_path(base_path, path): - if not base_path: - return path - if not base_path.endswith('/'): - base_path += '/' - return path[len(base_path):] if path.startswith(base_path) else path - -def is_relative_path(path): - return not path.startswith('/') and '://' not in path - -def join_if_relative_path(base_path, path): - return ( - FileSystems.join(base_path, path) - if base_path and is_relative_path(path) - else path - ) diff --git a/sciencebeam_gym/utils/file_path_test.py b/sciencebeam_gym/utils/file_path_test.py deleted file mode 100644 index 9ef1680..0000000 --- a/sciencebeam_gym/utils/file_path_test.py +++ /dev/null @@ -1,24 +0,0 @@ -from .file_path import ( - relative_path, - join_if_relative_path -) - -class TestRelativePath(object): - def test_should_return_path_if_base_path_is_none(self): - assert relative_path(None, 'file') == 'file' - - def test_should_return_path_if_path_outside_base_path(self): - assert relative_path('/parent', '/other/file') == '/other/file' - - def test_should_return_absolute_path_if_base_path_matches(self): - assert relative_path('/parent', '/parent/file') == 'file' - -class TestJoinIfRelativePath(object): - def test_should_return_path_if_base_path_is_none(self): - assert join_if_relative_path(None, 'file') == 'file' - - def test_should_return_path_if_not_relative(self): - assert join_if_relative_path('/parent', '/other/file') == '/other/file' - - def test_should_return_joined_path_if_relative(self): - assert join_if_relative_path('/parent', 'file') == '/parent/file' diff --git a/sciencebeam_gym/utils/io.py b/sciencebeam_gym/utils/io.py deleted file mode 100644 index ab9644c..0000000 --- a/sciencebeam_gym/utils/io.py +++ /dev/null @@ -1,16 +0,0 @@ -import os -import errno - -def makedirs(path, exists_ok=False): - try: - # Python 3 - os.makedirs(path, exists_ok=exists_ok) - except TypeError: - # Python 2 - try: - os.makedirs(path) - except OSError as e: - if e.errno == errno.EEXIST and os.path.isdir(path) and exists_ok: - pass - else: - raise diff --git a/sciencebeam_gym/utils/num.py b/sciencebeam_gym/utils/num.py deleted file mode 100644 index 2e77f18..0000000 --- a/sciencebeam_gym/utils/num.py +++ /dev/null @@ -1,21 +0,0 @@ -from six import raise_from - -import numpy as np - -def assert_close(a, b, atol=1.e-8): - try: - assert np.allclose([a], [b], atol=atol) - except AssertionError as e: - raise_from(AssertionError('expected %s to be close to %s (atol=%s)' % (a, b, atol)), e) - -def assert_all_close(a, b, atol=1.e-8): - try: - assert np.allclose(a, b, atol=atol) - except AssertionError as e: - raise_from(AssertionError('expected %s to be close to %s (atol=%s)' % (a, b, atol)), e) - -def assert_all_not_close(a, b, atol=1.e-8): - try: - assert not np.allclose(a, b, atol=atol) - except AssertionError as e: - raise_from(AssertionError('expected %s not to be close to %s (atol=%s)' % (a, b, atol)), e) diff --git a/sciencebeam_gym/utils/pages_zip.py b/sciencebeam_gym/utils/pages_zip.py index 59d421d..f992400 100644 --- a/sciencebeam_gym/utils/pages_zip.py +++ b/sciencebeam_gym/utils/pages_zip.py @@ -3,7 +3,7 @@ from zipfile import ZipFile, ZIP_DEFLATED from apache_beam.io.filesystems import FileSystems -from sciencebeam_gym.beam_utils.io import ( +from sciencebeam_utils.beam_utils.io import ( dirname, mkdirs_if_not_exists ) diff --git a/sciencebeam_gym/utils/stopwatch.py b/sciencebeam_gym/utils/stopwatch.py deleted file mode 100644 index bc750b4..0000000 --- a/sciencebeam_gym/utils/stopwatch.py +++ /dev/null @@ -1,46 +0,0 @@ -import sys -import time - -try: - perf_counter = time.perf_counter -except AttributeError: - # as per original timeit source (before perf_counter) - if sys.platform == "win32": - # On Windows, the best timer is time.clock() - perf_counter = time.clock - else: - # On most other platforms the best timer is time.time() - perf_counter = time.time - -class StopWatch(object): - def __init__(self): - self.start = perf_counter() - - def get_elapsed_seconds(self, reset=False): - end = perf_counter() - elapsed = end - self.start - if reset: - self.start = end - return elapsed - -class StopWatchRecorder(object): - def __init__(self): - self.stop_watch = StopWatch() - self.recorded_timings = [] - self.started = None - - def stop(self): - self.start(None) - - def start(self, name): - elapsed = self.stop_watch.get_elapsed_seconds(reset=True) - if self.started: - self.recorded_timings.append((self.started, elapsed)) - self.started = name - - def __str__(self): - total = ('total', sum(elapsed for _, elapsed in self.recorded_timings)) - return ', '.join( - '%s: %.6fs' % (name, elapsed) - for name, elapsed in self.recorded_timings + [total] - ) diff --git a/sciencebeam_gym/utils/string.py b/sciencebeam_gym/utils/string.py deleted file mode 100644 index 00aa95d..0000000 --- a/sciencebeam_gym/utils/string.py +++ /dev/null @@ -1,9 +0,0 @@ -from future.utils import python_2_unicode_compatible - -@python_2_unicode_compatible -class LazyStr(object): - def __init__(self, fn): - self.fn = fn - - def __str__(self): - return self.fn() diff --git a/sciencebeam_gym/utils/xml.py b/sciencebeam_gym/utils/xml.py deleted file mode 100644 index 764c6a7..0000000 --- a/sciencebeam_gym/utils/xml.py +++ /dev/null @@ -1,41 +0,0 @@ -from lxml import etree - -def _get_text_content_and_exclude(node, exclude): - result = '' - if node.text is not None: - result += node.text - result += ''.join([ - ( - _get_text_content_and_exclude(c, exclude) - if c not in exclude - else '' - ) + - (c.tail if c.tail is not None else '') - for c in node.iterchildren() - ]) - return result - -def get_text_content(node, exclude=None): - ''' - Strip tags and return text content - ''' - if not exclude: - return ''.join(node.itertext()) - return _get_text_content_and_exclude(node, exclude) - -def get_immediate_text(node): - return node.xpath('text()') - -def get_text_content_list(nodes, exclude=None): - return [get_text_content(node, exclude=exclude) for node in nodes] - -def xml_from_string_with_recover(s): - parser = etree.XMLParser(recover=True) - return etree.fromstring(s, parser=parser) - -def set_or_remove_attrib(attrib, name, value): - if value is None: - if name in attrib: - del attrib[name] - else: - attrib[name] = value diff --git a/sciencebeam_gym/utils/xml_test.py b/sciencebeam_gym/utils/xml_test.py deleted file mode 100644 index cbfbefa..0000000 --- a/sciencebeam_gym/utils/xml_test.py +++ /dev/null @@ -1,62 +0,0 @@ -from lxml.builder import E - -from sciencebeam_gym.utils.xml import ( - get_text_content, - get_immediate_text, - xml_from_string_with_recover -) - -SOME_VALUE_1 = 'some value1' -SOME_VALUE_2 = 'some value2' - -class TestGetTextContent(object): - def test_should_return_simple_text(self): - node = E.parent(SOME_VALUE_1) - assert get_text_content(node) == SOME_VALUE_1 - - def test_should_return_text_of_child_element(self): - node = E.parent(E.child(SOME_VALUE_1)) - assert get_text_content(node) == SOME_VALUE_1 - - def test_should_return_text_of_child_element_and_preceeding_text(self): - node = E.parent(SOME_VALUE_1, E.child(SOME_VALUE_2)) - assert get_text_content(node) == SOME_VALUE_1 + SOME_VALUE_2 - - def test_should_return_text_of_child_element_and_trailing_text(self): - node = E.parent(E.child(SOME_VALUE_1), SOME_VALUE_2) - assert get_text_content(node) == SOME_VALUE_1 + SOME_VALUE_2 - - def test_should_return_text_of_parent_excluding_children_to_exclude(self): - child = E.child(SOME_VALUE_1) - node = E.parent(child, SOME_VALUE_2) - assert get_text_content(node, exclude=[child]) == SOME_VALUE_2 - -class TestGetImmediateText(object): - def test_should_return_simple_text(self): - node = E.parent(SOME_VALUE_1) - assert get_immediate_text(node) == [SOME_VALUE_1] - - def test_should_not_return_text_of_child_element(self): - node = E.parent(E.child(SOME_VALUE_1)) - assert get_immediate_text(node) == [] - -class TestXmlFromStringWithRecover(object): - def test_should_parse_clean_xml(self): - root = xml_from_string_with_recover('<root><child1>%s</child1></root>' % SOME_VALUE_1) - node = root.find('child1') - assert node is not None - assert node.text == SOME_VALUE_1 - - def test_should_parse_xml_with_unencoded_ampersand(self): - value = 'A & B' - root = xml_from_string_with_recover('<root><child1>%s</child1></root>' % value) - node = root.find('child1') - assert node is not None - assert node.text == 'A B' - - def test_should_parse_xml_with_unencoded_unknown_entity(self): - value = 'A &unknown; B' - root = xml_from_string_with_recover('<root><child1>%s</child1></root>' % value) - node = root.find('child1') - assert node is not None - assert node.text == 'A B' diff --git a/sciencebeam_gym/utils/zip.py b/sciencebeam_gym/utils/zip.py deleted file mode 100644 index 1155a99..0000000 --- a/sciencebeam_gym/utils/zip.py +++ /dev/null @@ -1,25 +0,0 @@ -import os -from stat import S_IXUSR - -ZIP_UNIX_SYSTEM = 3 - -def make_executable(path): - os.chmod(path, os.stat(path).st_mode | S_IXUSR) - -def extract_all_with_permission(zf, target_dir): - for info in zf.infolist(): - extracted_path = zf.extract(info, target_dir) - - if info.create_system == ZIP_UNIX_SYSTEM: - unix_attributes = info.external_attr >> 16 - if unix_attributes: - os.chmod(extracted_path, unix_attributes) - -def extract_all_with_executable_permission(zf, target_dir): - for info in zf.infolist(): - extracted_path = zf.extract(info, target_dir) - - if info.create_system == ZIP_UNIX_SYSTEM and os.path.isfile(extracted_path): - unix_attributes = info.external_attr >> 16 - if unix_attributes & S_IXUSR: - make_executable(extracted_path) -- GitLab