From 1dae709566c5e26b685b2d825705211946f6828d Mon Sep 17 00:00:00 2001 From: Daniel Ecer <de-code@users.noreply.github.com> Date: Fri, 6 Jul 2018 09:15:48 +0100 Subject: [PATCH] make source file list relative to source base path if it is not an absolute path (#32) --- .../preprocess/get_output_files.py | 8 +- .../preprocess/get_output_files_test.py | 167 +++++++++++------- .../preprocess/preprocessing_utils.py | 6 +- .../preprocess/preprocessing_utils_test.py | 13 +- 4 files changed, 123 insertions(+), 71 deletions(-) diff --git a/sciencebeam_gym/preprocess/get_output_files.py b/sciencebeam_gym/preprocess/get_output_files.py index 11f92b4..979bfbf 100644 --- a/sciencebeam_gym/preprocess/get_output_files.py +++ b/sciencebeam_gym/preprocess/get_output_files.py @@ -8,7 +8,8 @@ from sciencebeam_gym.utils.file_list import ( from sciencebeam_gym.preprocess.preprocessing_utils import ( get_or_validate_base_path, - get_output_file + get_output_file, + join_if_relative_path ) from sciencebeam_gym.preprocess.check_file_list import ( @@ -85,7 +86,10 @@ def get_output_file_list(file_list, source_base_path, output_base_path, output_f def run(opt): source_file_list = load_file_list( - opt.source_file_list, + join_if_relative_path( + opt.source_base_path, + opt.source_file_list + ), column=opt.source_file_column, limit=opt.limit ) diff --git a/sciencebeam_gym/preprocess/get_output_files_test.py b/sciencebeam_gym/preprocess/get_output_files_test.py index c406842..b0f27d9 100644 --- a/sciencebeam_gym/preprocess/get_output_files_test.py +++ b/sciencebeam_gym/preprocess/get_output_files_test.py @@ -1,3 +1,4 @@ +import os from mock import patch, ANY import pytest @@ -21,6 +22,28 @@ BASE_SOURCE_PATH = '/source' FILE_1 = BASE_SOURCE_PATH + '/file1' FILE_2 = BASE_SOURCE_PATH + '/file2' + +@pytest.fixture(name='load_file_list_mock') +def _load_file_list(): + with patch.object(get_output_files, 'load_file_list') as m: + m.return_value = [FILE_1, FILE_2] + yield m + +@pytest.fixture(name='get_output_file_list_mock') +def _get_output_file_list(): + with patch.object(get_output_files, 'get_output_file_list') as m: + yield m + +@pytest.fixture(name='save_file_list_mock') +def _save_file_list(): + with patch.object(get_output_files, 'save_file_list') as m: + yield m + +@pytest.fixture(name='check_files_and_report_result_mock') +def _check_files_and_report_result(): + with patch.object(get_output_files, 'check_files_and_report_result') as m: + yield m + class TestGetOutputFileList(object): def test_should_return_output_file_with_path_and_change_ext(self): assert get_output_file_list( @@ -30,86 +53,96 @@ class TestGetOutputFileList(object): '.xml' ) == ['/output/path/file.xml'] +@pytest.mark.usefixtures("load_file_list_mock", "get_output_file_list_mock", "save_file_list_mock") class TestRun(object): - def test_should_pass_around_parameters(self): - m = get_output_files + def test_should_pass_around_parameters( + self, + load_file_list_mock, + get_output_file_list_mock, + save_file_list_mock): + + load_file_list_mock.return_value = [FILE_1, FILE_2] + opt = parse_args(SOME_ARGV) + run(opt) + load_file_list_mock.assert_called_with( + opt.source_file_list, + column=opt.source_file_column, + limit=opt.limit + ) + get_output_file_list_mock.assert_called_with( + load_file_list_mock.return_value, + BASE_SOURCE_PATH, + opt.output_base_path, + opt.output_file_suffix + ) + save_file_list_mock.assert_called_with( + opt.output_file_list, + get_output_file_list_mock.return_value, + column=opt.source_file_column + ) + + def test_should_make_file_list_absolute_if_it_is_relative( + self, + load_file_list_mock): + opt = parse_args(SOME_ARGV) - with patch.object(m, 'load_file_list') as load_file_list: - with patch.object(m, 'get_output_file_list') as get_output_file_list_mock: - with patch.object(m, 'save_file_list') as save_file_list: - load_file_list.return_value = [FILE_1, FILE_2] - run(opt) - load_file_list.assert_called_with( - opt.source_file_list, - column=opt.source_file_column, - limit=opt.limit - ) - get_output_file_list_mock.assert_called_with( - load_file_list.return_value, - BASE_SOURCE_PATH, - opt.output_base_path, - opt.output_file_suffix - ) - save_file_list.assert_called_with( - opt.output_file_list, - get_output_file_list_mock.return_value, - column=opt.source_file_column - ) + opt.source_base_path = BASE_SOURCE_PATH + opt.source_file_list = 'source.tsv' + run(opt) + load_file_list_mock.assert_called_with( + os.path.join(opt.source_base_path, opt.source_file_list), + column=opt.source_file_column, + limit=opt.limit + ) def test_should_raise_error_if_source_path_is_invalid(self): - m = get_output_files opt = parse_args(SOME_ARGV) opt.source_base_path = '/other/path' - with patch.object(m, 'load_file_list') as load_file_list: - with patch.object(m, 'get_output_file_list'): - with patch.object(m, 'save_file_list'): - with pytest.raises(AssertionError): - load_file_list.return_value = [FILE_1, FILE_2] - run(opt) - - def test_should_use_passed_in_source_path_if_valid(self): - m = get_output_files + with pytest.raises(AssertionError): + run(opt) + + def test_should_use_passed_in_source_path_if_valid( + self, + get_output_file_list_mock, + load_file_list_mock): + opt = parse_args(SOME_ARGV) opt.source_base_path = '/base' - with patch.object(m, 'load_file_list') as load_file_list: - with patch.object(m, 'get_output_file_list') as get_output_file_list_mock: - with patch.object(m, 'save_file_list'): - load_file_list.return_value = ['/base/source/file1', '/base/source/file2'] - run(opt) - get_output_file_list_mock.assert_called_with( - ANY, - opt.source_base_path, - ANY, - ANY - ) - def test_should_check_file_list_if_enabled(self): - m = get_output_files + load_file_list_mock.return_value = ['/base/source/file1', '/base/source/file2'] + run(opt) + get_output_file_list_mock.assert_called_with( + ANY, + opt.source_base_path, + ANY, + ANY + ) + + def test_should_check_file_list_if_enabled( + self, + get_output_file_list_mock, + check_files_and_report_result_mock): + opt = parse_args(SOME_ARGV) opt.check = True - with patch.object(m, 'load_file_list') as load_file_list: - with patch.object(m, 'get_output_file_list') as get_output_file_list_mock: - with patch.object(m, 'save_file_list'): - with patch.object(m, 'check_files_and_report_result') as check_files_and_report_result: - load_file_list.return_value = [FILE_1, FILE_2] - run(opt) - check_files_and_report_result.assert_called_with( - get_output_file_list_mock.return_value - ) - - def test_should_limit_files_to_check(self): - m = get_output_files + run(opt) + check_files_and_report_result_mock.assert_called_with( + get_output_file_list_mock.return_value + ) + + def test_should_limit_files_to_check( + self, + load_file_list_mock, + get_output_file_list_mock, + check_files_and_report_result_mock): + opt = parse_args(SOME_ARGV) opt.check = True opt.check_limit = 1 - with patch.object(m, 'load_file_list') as load_file_list: - with patch.object(m, 'get_output_file_list') as get_output_file_list_mock: - with patch.object(m, 'save_file_list'): - with patch.object(m, 'check_files_and_report_result') as check_files_and_report_result: - load_file_list.return_value = [FILE_1, FILE_2] - run(opt) - check_files_and_report_result.assert_called_with( - get_output_file_list_mock.return_value[:opt.check_limit] - ) + load_file_list_mock.return_value = [FILE_1, FILE_2] + run(opt) + check_files_and_report_result_mock.assert_called_with( + get_output_file_list_mock.return_value[:opt.check_limit] + ) class TestMain(object): def test_should_parse_args_and_call_run(self): diff --git a/sciencebeam_gym/preprocess/preprocessing_utils.py b/sciencebeam_gym/preprocess/preprocessing_utils.py index 1ea5c1e..e5d16ec 100644 --- a/sciencebeam_gym/preprocess/preprocessing_utils.py +++ b/sciencebeam_gym/preprocess/preprocessing_utils.py @@ -222,7 +222,11 @@ def is_relative_path(path): return not path.startswith('/') and '://' not in path def join_if_relative_path(base_path, path): - return FileSystems.join(base_path, path) if is_relative_path(path) else path + return ( + FileSystems.join(base_path, path) + if base_path and is_relative_path(path) + else path + ) def change_ext(path, old_ext, new_ext): if old_ext is None: diff --git a/sciencebeam_gym/preprocess/preprocessing_utils_test.py b/sciencebeam_gym/preprocess/preprocessing_utils_test.py index 210cefd..befec13 100644 --- a/sciencebeam_gym/preprocess/preprocessing_utils_test.py +++ b/sciencebeam_gym/preprocess/preprocessing_utils_test.py @@ -12,11 +12,12 @@ from sciencebeam_gym.preprocess.preprocessing_utils import ( svg_page_to_blockified_png_bytes, group_file_pairs_by_parent_directory_or_name, convert_pdf_bytes_to_lxml, + join_if_relative_path, change_ext, base_path_for_file_list, get_or_validate_base_path, get_output_file, - parse_page_range + parse_page_range, ) PROCESSING_UTILS = 'sciencebeam_gym.preprocess.preprocessing_utils' @@ -115,6 +116,16 @@ class TestConvertPdfBytesToLxml(object): ) assert lxml_content == LXML_CONTENT_1 +class TestJoinIfRelativePath(object): + def test_should_return_path_if_base_path_is_none(self): + assert join_if_relative_path(None, 'file') == 'file' + + def test_should_return_path_if_not_relative(self): + assert join_if_relative_path('/parent', '/other/file') == '/other/file' + + def test_should_return_joined_path_if_relative(self): + assert join_if_relative_path('/parent', 'file') == '/parent/file' + class TestChangeExt(object): def test_should_replace_simple_ext_with_simple_ext(self): assert change_ext('file.pdf', None, '.xml') == 'file.xml' -- GitLab