From 1a58fbaa3b35eba92f417d19024f0d7f72e00eaf Mon Sep 17 00:00:00 2001 From: Daniel Ecer <de-code@users.noreply.github.com> Date: Fri, 6 Jul 2018 16:11:45 +0100 Subject: [PATCH] added support for relative file lists (#33) --- sciencebeam_gym/preprocess/find_file_pairs.py | 24 ++++++- .../preprocess/find_file_pairs_test.py | 41 +++++++++-- .../preprocess/get_output_files.py | 17 ++++- .../preprocess/get_output_files_test.py | 29 +++++++- .../preprocess/preprocessing_pipeline.py | 7 +- .../preprocess/preprocessing_utils.py | 30 ++++---- .../preprocess/preprocessing_utils_test.py | 11 --- sciencebeam_gym/utils/file_list.py | 24 ++++++- sciencebeam_gym/utils/file_list_test.py | 69 ++++++++++++++++--- sciencebeam_gym/utils/file_path.py | 21 ++++++ sciencebeam_gym/utils/file_path_test.py | 24 +++++++ 11 files changed, 242 insertions(+), 55 deletions(-) create mode 100644 sciencebeam_gym/utils/file_path.py create mode 100644 sciencebeam_gym/utils/file_path_test.py diff --git a/sciencebeam_gym/preprocess/find_file_pairs.py b/sciencebeam_gym/preprocess/find_file_pairs.py index a65a371..ff727d1 100644 --- a/sciencebeam_gym/preprocess/find_file_pairs.py +++ b/sciencebeam_gym/preprocess/find_file_pairs.py @@ -14,9 +14,13 @@ from sciencebeam_gym.beam_utils.io import ( mkdirs_if_not_exists ) +from sciencebeam_gym.utils.file_path import ( + join_if_relative_path, + relative_path +) + from sciencebeam_gym.preprocess.preprocessing_utils import ( - find_file_pairs_grouped_by_parent_directory_or_name, - join_if_relative_path + find_file_pairs_grouped_by_parent_directory_or_name ) def get_logger(): @@ -40,6 +44,12 @@ def parse_args(argv=None): '--out', type=str, required=True, help='output csv/tsv file' ) + + parser.add_argument( + '--use-relative-paths', action='store_true', + help='create a file list with relative paths (relative to the data path)' + ) + return parser.parse_args(argv) @@ -53,12 +63,22 @@ def save_file_pairs_to_csv(output_path, source_xml_pairs): write_csv_rows(writer, source_xml_pairs) get_logger().info('written results to %s', output_path) +def to_relative_file_pairs(base_path, file_pairs): + return ( + (relative_path(base_path, source_url), relative_path(base_path, xml_url)) + for source_url, xml_url in file_pairs + ) + def run(args): get_logger().info('finding file pairs') source_xml_pairs = find_file_pairs_grouped_by_parent_directory_or_name([ join_if_relative_path(args.data_path, args.source_pattern), join_if_relative_path(args.data_path, args.xml_pattern) ]) + + if args.use_relative_paths: + source_xml_pairs = to_relative_file_pairs(args.data_path, source_xml_pairs) + source_xml_pairs = list(source_xml_pairs) save_file_pairs_to_csv(args.out, source_xml_pairs) diff --git a/sciencebeam_gym/preprocess/find_file_pairs_test.py b/sciencebeam_gym/preprocess/find_file_pairs_test.py index a93a7c4..4aaee56 100644 --- a/sciencebeam_gym/preprocess/find_file_pairs_test.py +++ b/sciencebeam_gym/preprocess/find_file_pairs_test.py @@ -6,6 +6,7 @@ import pytest import sciencebeam_gym.preprocess.find_file_pairs as find_file_pairs from sciencebeam_gym.preprocess.find_file_pairs import ( + to_relative_file_pairs, run, parse_args, main @@ -32,10 +33,18 @@ SOME_ARGV = [ '--out=%s' % OUTPUT_FILE ] +@pytest.fixture(name='to_relative_file_pairs_mock') +def _to_relative_file_pairs(): + with patch.object(find_file_pairs, 'to_relative_file_pairs') as m: + yield m @pytest.fixture(name='find_file_pairs_grouped_by_parent_directory_or_name_mock') def _find_file_pairs_grouped_by_parent_directory_or_name(): with patch.object(find_file_pairs, 'find_file_pairs_grouped_by_parent_directory_or_name') as m: + m.return_value = [ + (PDF_FILE_1, XML_FILE_1), + (PDF_FILE_2, XML_FILE_2) + ] yield m @pytest.fixture(name='save_file_pairs_to_csv_mock') @@ -78,6 +87,13 @@ def _data_path(tmpdir): def _out_file(tmpdir): return tmpdir.join(OUTPUT_FILE) +class TestToRelativeFilePairs(object): + def test_should_make_paths_relative(self): + assert list(to_relative_file_pairs( + '/parent', + [('/parent/sub/file1', '/parent/sub/file2')] + )) == [('sub/file1', 'sub/file2')] + class TestRun(object): def test_should_pass_around_parameters( self, @@ -85,10 +101,6 @@ class TestRun(object): save_file_pairs_to_csv_mock): opt = parse_args(SOME_ARGV) - find_file_pairs_grouped_by_parent_directory_or_name_mock.return_value = [ - (PDF_FILE_1, XML_FILE_1), - (PDF_FILE_2, XML_FILE_2) - ] run(opt) find_file_pairs_grouped_by_parent_directory_or_name_mock.assert_called_with([ os.path.join(BASE_SOURCE_PATH, SOURCE_PATTERN), @@ -99,6 +111,27 @@ class TestRun(object): find_file_pairs_grouped_by_parent_directory_or_name_mock.return_value ) + def test_should_use_relative_paths_if_enabled( + self, + find_file_pairs_grouped_by_parent_directory_or_name_mock, + to_relative_file_pairs_mock, + save_file_pairs_to_csv_mock): + + opt = parse_args(SOME_ARGV) + opt.use_relative_paths = True + + to_relative_file_pairs_mock.return_value = [('file1.pdf', 'file1.xml')] + + run(opt) + to_relative_file_pairs_mock.assert_called_with( + BASE_SOURCE_PATH, + find_file_pairs_grouped_by_parent_directory_or_name_mock.return_value + ) + save_file_pairs_to_csv_mock.assert_called_with( + opt.out, + to_relative_file_pairs_mock.return_value + ) + def test_should_generate_file_list(self, data_path, pdf_file_1, xml_file_1, out_file): LOGGER.debug('pdf_file_1: %s, xml_file: %s', pdf_file_1, xml_file_1) opt = parse_args(SOME_ARGV) diff --git a/sciencebeam_gym/preprocess/get_output_files.py b/sciencebeam_gym/preprocess/get_output_files.py index 979bfbf..73eae63 100644 --- a/sciencebeam_gym/preprocess/get_output_files.py +++ b/sciencebeam_gym/preprocess/get_output_files.py @@ -3,13 +3,17 @@ import logging from sciencebeam_gym.utils.file_list import ( load_file_list, - save_file_list + save_file_list, + to_relative_file_list +) + +from sciencebeam_gym.utils.file_path import ( + join_if_relative_path ) from sciencebeam_gym.preprocess.preprocessing_utils import ( get_or_validate_base_path, - get_output_file, - join_if_relative_path + get_output_file ) from sciencebeam_gym.preprocess.check_file_list import ( @@ -57,6 +61,10 @@ def parse_args(argv=None): '--output-base-path', type=str, required=False, help='base output path (by default source base path with"-results" suffix)' ) + output.add_argument( + '--use-relative-paths', action='store_true', + help='create a file list with relative paths (relative to the output data path)' + ) parser.add_argument( '--limit', type=int, required=False, @@ -112,6 +120,9 @@ def run(opt): ) check_files_and_report_result(check_file_list) + if opt.use_relative_paths: + target_file_list = to_relative_file_list(opt.output_base_path, target_file_list) + get_logger().info( 'saving file list (with %d files) to: %s', len(target_file_list), opt.output_file_list diff --git a/sciencebeam_gym/preprocess/get_output_files_test.py b/sciencebeam_gym/preprocess/get_output_files_test.py index b0f27d9..41f665a 100644 --- a/sciencebeam_gym/preprocess/get_output_files_test.py +++ b/sciencebeam_gym/preprocess/get_output_files_test.py @@ -44,6 +44,11 @@ def _check_files_and_report_result(): with patch.object(get_output_files, 'check_files_and_report_result') as m: yield m +@pytest.fixture(name='to_relative_file_list_mock') +def _to_relative_file_list(): + with patch.object(get_output_files, 'to_relative_file_list') as m: + yield m + class TestGetOutputFileList(object): def test_should_return_output_file_with_path_and_change_ext(self): assert get_output_file_list( @@ -53,7 +58,10 @@ class TestGetOutputFileList(object): '.xml' ) == ['/output/path/file.xml'] -@pytest.mark.usefixtures("load_file_list_mock", "get_output_file_list_mock", "save_file_list_mock") +@pytest.mark.usefixtures( + "load_file_list_mock", "get_output_file_list_mock", "save_file_list_mock", + "to_relative_file_list_mock" +) class TestRun(object): def test_should_pass_around_parameters( self, @@ -144,6 +152,25 @@ class TestRun(object): get_output_file_list_mock.return_value[:opt.check_limit] ) + def test_should_save_relative_paths_if_enabled( + self, + get_output_file_list_mock, + to_relative_file_list_mock, + save_file_list_mock): + + opt = parse_args(SOME_ARGV) + opt.use_relative_paths = True + run(opt) + to_relative_file_list_mock.assert_called_with( + opt.output_base_path, + get_output_file_list_mock.return_value, + ) + save_file_list_mock.assert_called_with( + opt.output_file_list, + to_relative_file_list_mock.return_value, + column=opt.source_file_column + ) + class TestMain(object): def test_should_parse_args_and_call_run(self): m = get_output_files diff --git a/sciencebeam_gym/preprocess/preprocessing_pipeline.py b/sciencebeam_gym/preprocess/preprocessing_pipeline.py index b128b8b..e1ce7ae 100644 --- a/sciencebeam_gym/preprocess/preprocessing_pipeline.py +++ b/sciencebeam_gym/preprocess/preprocessing_pipeline.py @@ -14,6 +14,11 @@ from sciencebeam_gym.utils.collection import ( remove_keys_from_dict ) +from sciencebeam_gym.utils.file_path import ( + relative_path, + join_if_relative_path +) + from sciencebeam_gym.beam_utils.utils import ( TransformAndCount, TransformAndLog, @@ -57,8 +62,6 @@ from sciencebeam_gym.preprocess.annotation.annotation_evaluation import ( from sciencebeam_gym.preprocess.preprocessing_utils import ( change_ext, - relative_path, - join_if_relative_path, find_file_pairs_grouped_by_parent_directory_or_name, convert_pdf_bytes_to_lxml, convert_and_annotate_lxml_content, diff --git a/sciencebeam_gym/preprocess/preprocessing_utils.py b/sciencebeam_gym/preprocess/preprocessing_utils.py index e5d16ec..96d6ff6 100644 --- a/sciencebeam_gym/preprocess/preprocessing_utils.py +++ b/sciencebeam_gym/preprocess/preprocessing_utils.py @@ -29,9 +29,11 @@ from sciencebeam_gym.utils.pages_zip import ( ) from sciencebeam_gym.beam_utils.io import ( - dirname, - find_matching_filenames, - mkdirs_if_not_exists + find_matching_filenames +) + +from sciencebeam_gym.utils.file_path import ( + relative_path ) from sciencebeam_gym.preprocess.lxml_to_svg import ( @@ -75,6 +77,13 @@ from sciencebeam_gym.pdf import ( PdfToPng ) +# deprecated, moved to sciencebeam_gym.utils.file_path +# pylint: disable=wrong-import-position, unused-import +from sciencebeam_gym.utils.file_path import ( + join_if_relative_path, +) +# pylint: enable=wrong-import-position, unused-import + def get_logger(): return logging.getLogger(__name__) @@ -213,21 +222,6 @@ def convert_and_annotate_lxml_content(lxml_content, xml_content, xml_mapping, na return svg_roots -def relative_path(base_path, path): - if not base_path.endswith('/'): - base_path += '/' - return path[len(base_path):] if path.startswith(base_path) else path - -def is_relative_path(path): - return not path.startswith('/') and '://' not in path - -def join_if_relative_path(base_path, path): - return ( - FileSystems.join(base_path, path) - if base_path and is_relative_path(path) - else path - ) - def change_ext(path, old_ext, new_ext): if old_ext is None: old_ext = os.path.splitext(path)[1] diff --git a/sciencebeam_gym/preprocess/preprocessing_utils_test.py b/sciencebeam_gym/preprocess/preprocessing_utils_test.py index befec13..12b20f7 100644 --- a/sciencebeam_gym/preprocess/preprocessing_utils_test.py +++ b/sciencebeam_gym/preprocess/preprocessing_utils_test.py @@ -12,7 +12,6 @@ from sciencebeam_gym.preprocess.preprocessing_utils import ( svg_page_to_blockified_png_bytes, group_file_pairs_by_parent_directory_or_name, convert_pdf_bytes_to_lxml, - join_if_relative_path, change_ext, base_path_for_file_list, get_or_validate_base_path, @@ -116,16 +115,6 @@ class TestConvertPdfBytesToLxml(object): ) assert lxml_content == LXML_CONTENT_1 -class TestJoinIfRelativePath(object): - def test_should_return_path_if_base_path_is_none(self): - assert join_if_relative_path(None, 'file') == 'file' - - def test_should_return_path_if_not_relative(self): - assert join_if_relative_path('/parent', '/other/file') == '/other/file' - - def test_should_return_joined_path_if_relative(self): - assert join_if_relative_path('/parent', 'file') == '/parent/file' - class TestChangeExt(object): def test_should_replace_simple_ext_with_simple_ext(self): assert change_ext('file.pdf', None, '.xml') == 'file.xml' diff --git a/sciencebeam_gym/utils/file_list.py b/sciencebeam_gym/utils/file_list.py index 27000da..b062236 100644 --- a/sciencebeam_gym/utils/file_list.py +++ b/sciencebeam_gym/utils/file_list.py @@ -2,6 +2,7 @@ from __future__ import absolute_import import codecs import csv +import os from itertools import islice from apache_beam.io.filesystems import FileSystems @@ -10,6 +11,12 @@ from sciencebeam_gym.utils.csv import ( csv_delimiter_by_filename ) +from .file_path import ( + relative_path, + join_if_relative_path +) + + def is_csv_or_tsv_file_list(file_list_path): return '.csv' in file_list_path or '.tsv' in file_list_path @@ -44,13 +51,24 @@ def load_csv_or_tsv_file_list(file_list_path, column, header=True, limit=None): lines = islice(lines, 0, limit) return list(lines) -def load_file_list(file_list_path, column, header=True, limit=None): +def to_absolute_file_list(base_path, file_list): + return [join_if_relative_path(base_path, s) for s in file_list] + +def to_relative_file_list(base_path, file_list): + return [relative_path(base_path, s) for s in file_list] + +def load_file_list(file_list_path, column, header=True, limit=None, to_absolute=True): if is_csv_or_tsv_file_list(file_list_path): - return load_csv_or_tsv_file_list( + file_list = load_csv_or_tsv_file_list( file_list_path, column=column, header=header, limit=limit ) else: - return load_plain_file_list(file_list_path, limit=limit) + file_list = load_plain_file_list(file_list_path, limit=limit) + if to_absolute: + file_list = to_absolute_file_list( + os.path.dirname(file_list_path), file_list + ) + return file_list def save_plain_file_list(file_list_path, file_list): with FileSystems.create(file_list_path) as f: diff --git a/sciencebeam_gym/utils/file_list_test.py b/sciencebeam_gym/utils/file_list_test.py index 66a8855..efeff9a 100644 --- a/sciencebeam_gym/utils/file_list_test.py +++ b/sciencebeam_gym/utils/file_list_test.py @@ -10,6 +10,8 @@ from sciencebeam_gym.utils.file_list import ( is_csv_or_tsv_file_list, load_plain_file_list, load_csv_or_tsv_file_list, + to_absolute_file_list, + to_relative_file_list, load_file_list, save_plain_file_list, save_csv_or_tsv_file_list, @@ -21,6 +23,21 @@ FILE_2 = 'file2.pdf' UNICODE_FILE_1 = u'file1\u1234.pdf' FILE_LIST = [FILE_1, FILE_2] +@pytest.fixture(name='load_plain_file_list_mock') +def _load_plain_file_list(): + with patch.object(file_list_loader, 'load_plain_file_list') as mock: + yield mock + +@pytest.fixture(name='load_csv_or_tsv_file_list_mock') +def _load_csv_or_tsv_file_list(): + with patch.object(file_list_loader, 'load_csv_or_tsv_file_list') as mock: + yield mock + +@pytest.fixture(name='to_absolute_file_list_mock') +def _to_absolute_file_list(): + with patch.object(file_list_loader, 'to_absolute_file_list') as mock: + yield mock + class TestIsCsvOrTsvFileList(object): def test_should_return_true_if_file_ext_is_csv(self): assert is_csv_or_tsv_file_list('files.csv') @@ -104,18 +121,48 @@ class TestLoadCsvOrTsvFileList(object): f.flush() assert load_csv_or_tsv_file_list(f.name, 'url', limit=1) == [FILE_1] +class TestToAbsoluteFileList(object): + def test_should_make_path_absolute(self): + assert to_absolute_file_list('/base/path', ['sub/file1']) == ['/base/path/sub/file1'] + + def test_should_not_change_absolute_paths(self): + assert to_absolute_file_list('/base/path', ['/other/file1']) == ['/other/file1'] + +class TestToRelativeFileList(object): + def test_should_make_path_absolute(self): + assert to_relative_file_list('/base/path', ['/base/path/sub/file1']) == ['sub/file1'] + + def test_should_not_change_path_outside_base_path(self): + assert to_relative_file_list('/base/path', ['/other/file1']) == ['/other/file1'] + +@pytest.mark.usefixtures( + 'load_plain_file_list_mock', 'load_csv_or_tsv_file_list_mock', 'to_absolute_file_list_mock' +) class TestLoadFileList(object): - def test_should_call_load_plain_file_list(self): - with patch.object(file_list_loader, 'load_plain_file_list') as mock: - result = load_file_list('file-list.lst', column='url', header=True, limit=1) - mock.assert_called_with('file-list.lst', limit=1) - assert result == mock.return_value - - def test_should_call_load_csv_or_tsv_file_list(self): - with patch.object(file_list_loader, 'load_csv_or_tsv_file_list') as mock: - result = load_file_list('file-list.csv', column='url', header=True, limit=1) - mock.assert_called_with('file-list.csv', column='url', header=True, limit=1) - assert result == mock.return_value + def test_should_call_load_plain_file_list(self, load_plain_file_list_mock): + result = load_file_list( + 'file-list.lst', column='url', header=True, limit=1, to_absolute=False + ) + load_plain_file_list_mock.assert_called_with('file-list.lst', limit=1) + assert result == load_plain_file_list_mock.return_value + + def test_should_call_load_csv_or_tsv_file_list(self, load_csv_or_tsv_file_list_mock): + result = load_file_list( + 'file-list.csv', column='url', header=True, limit=1, to_absolute=False + ) + load_csv_or_tsv_file_list_mock.assert_called_with( + 'file-list.csv', column='url', header=True, limit=1 + ) + assert result == load_csv_or_tsv_file_list_mock.return_value + + def test_should_make_file_list_absolute( + self, load_plain_file_list_mock, to_absolute_file_list_mock): + + result = load_file_list('/base/path/file-list.lst', column='url', to_absolute=True) + to_absolute_file_list_mock.assert_called_with( + '/base/path', load_plain_file_list_mock.return_value + ) + assert result == to_absolute_file_list_mock.return_value class TestSavePlainFileList(object): def test_should_write_multiple_file_paths(self): diff --git a/sciencebeam_gym/utils/file_path.py b/sciencebeam_gym/utils/file_path.py new file mode 100644 index 0000000..cc798c1 --- /dev/null +++ b/sciencebeam_gym/utils/file_path.py @@ -0,0 +1,21 @@ + +from __future__ import absolute_import + +from apache_beam.io.filesystems import FileSystems + +def relative_path(base_path, path): + if not base_path: + return path + if not base_path.endswith('/'): + base_path += '/' + return path[len(base_path):] if path.startswith(base_path) else path + +def is_relative_path(path): + return not path.startswith('/') and '://' not in path + +def join_if_relative_path(base_path, path): + return ( + FileSystems.join(base_path, path) + if base_path and is_relative_path(path) + else path + ) diff --git a/sciencebeam_gym/utils/file_path_test.py b/sciencebeam_gym/utils/file_path_test.py new file mode 100644 index 0000000..9ef1680 --- /dev/null +++ b/sciencebeam_gym/utils/file_path_test.py @@ -0,0 +1,24 @@ +from .file_path import ( + relative_path, + join_if_relative_path +) + +class TestRelativePath(object): + def test_should_return_path_if_base_path_is_none(self): + assert relative_path(None, 'file') == 'file' + + def test_should_return_path_if_path_outside_base_path(self): + assert relative_path('/parent', '/other/file') == '/other/file' + + def test_should_return_absolute_path_if_base_path_matches(self): + assert relative_path('/parent', '/parent/file') == 'file' + +class TestJoinIfRelativePath(object): + def test_should_return_path_if_base_path_is_none(self): + assert join_if_relative_path(None, 'file') == 'file' + + def test_should_return_path_if_not_relative(self): + assert join_if_relative_path('/parent', '/other/file') == '/other/file' + + def test_should_return_joined_path_if_relative(self): + assert join_if_relative_path('/parent', 'file') == '/parent/file' -- GitLab