From 67c0e3084b7ebe0ddfb4000a0c0cd012a9ebcf67 Mon Sep 17 00:00:00 2001 From: Daniel Ecer <de-code@users.noreply.github.com> Date: Fri, 6 Jul 2018 09:15:32 +0100 Subject: [PATCH] renamed pdf_url to source_url (#31) --- README.md | 10 +- sciencebeam_gym/conftest.py | 8 ++ sciencebeam_gym/preprocess/find_file_pairs.py | 37 +++--- .../preprocess/find_file_pairs_test.py | 119 ++++++++++++++++++ .../preprocess/preprocessing_pipeline.py | 2 +- .../preprocess/preprocessing_pipeline_test.py | 8 +- 6 files changed, 159 insertions(+), 25 deletions(-) create mode 100644 sciencebeam_gym/conftest.py create mode 100644 sciencebeam_gym/preprocess/find_file_pairs_test.py diff --git a/README.md b/README.md index f1e1c8c..8a6bb46 100644 --- a/README.md +++ b/README.md @@ -107,24 +107,24 @@ The parent directory per manuscript is optional. If that is not the case then th Run: ```bash -python -m sciencebeam_lab.preprocess.find_file_pairs \ +python -m sciencebeam_gym.preprocess.find_file_pairs \ --data-path <source directory> \ ---pdf-pattern *.pdf.gz --xml-pattern *.nxml.gz \ +--source-pattern *.pdf.gz --xml-pattern *.nxml.gz \ --out <output file list csv/tsv> ``` e.g.: ```bash -python -m sciencebeam_lab.preprocess.find_file_pairs \ +python -m sciencebeam_gym.preprocess.find_file_pairs \ --data-path gs://some-bucket/some-dataset \ ---pdf-pattern *.pdf.gz --xml-pattern *.nxml.gz \ +--source-pattern *.pdf.gz --xml-pattern *.nxml.gz \ --out gs://some-bucket/some-dataset/file-list.tsv ``` That will create the TSV (tab separated) file `file-list.tsv` with the following columns: -- _pdf_url_ +- _source_url_ - _xml_url_ That file could also be generated using any other preferred method. diff --git a/sciencebeam_gym/conftest.py b/sciencebeam_gym/conftest.py new file mode 100644 index 0000000..b12470c --- /dev/null +++ b/sciencebeam_gym/conftest.py @@ -0,0 +1,8 @@ +import logging + +import pytest + +@pytest.fixture(scope='session', autouse=True) +def setup_logging(): + logging.root.handlers = [] + logging.basicConfig(level='DEBUG') diff --git a/sciencebeam_gym/preprocess/find_file_pairs.py b/sciencebeam_gym/preprocess/find_file_pairs.py index a97358e..a65a371 100644 --- a/sciencebeam_gym/preprocess/find_file_pairs.py +++ b/sciencebeam_gym/preprocess/find_file_pairs.py @@ -29,8 +29,8 @@ def parse_args(argv=None): help='base data path' ) parser.add_argument( - '--pdf-pattern', type=str, required=True, - help='pdf pattern' + '--source-pattern', type=str, required=True, + help='source pattern' ) parser.add_argument( '--xml-pattern', type=str, required=True, @@ -42,23 +42,30 @@ def parse_args(argv=None): ) return parser.parse_args(argv) -def main(argv=None): - args = parse_args(argv) + +def save_file_pairs_to_csv(output_path, source_xml_pairs): + mkdirs_if_not_exists(dirname(output_path)) + delimiter = csv_delimiter_by_filename(output_path) + mime_type = 'text/tsv' if delimiter == '\t' else 'text/csv' + with FileSystems.create(output_path, mime_type=mime_type) as f: + writer = csv.writer(f, delimiter=delimiter) + write_csv_rows(writer, [['source_url', 'xml_url']]) + write_csv_rows(writer, source_xml_pairs) + get_logger().info('written results to %s', output_path) + +def run(args): get_logger().info('finding file pairs') - pdf_xml_pairs = find_file_pairs_grouped_by_parent_directory_or_name([ - join_if_relative_path(args.data_path, args.pdf_pattern), + source_xml_pairs = find_file_pairs_grouped_by_parent_directory_or_name([ + join_if_relative_path(args.data_path, args.source_pattern), join_if_relative_path(args.data_path, args.xml_pattern) ]) - pdf_xml_pairs = list(pdf_xml_pairs) + source_xml_pairs = list(source_xml_pairs) - mkdirs_if_not_exists(dirname(args.out)) - delimiter = csv_delimiter_by_filename(args.out) - mime_type = 'text/tsv' if delimiter == '\t' else 'text/csv' - with FileSystems.create(args.out, mime_type=mime_type) as f: - writer = csv.writer(f, delimiter=delimiter) - write_csv_rows(writer, [['pdf_url', 'xml_url']]) - write_csv_rows(writer, pdf_xml_pairs) - get_logger().info('written results to %s', args.out) + save_file_pairs_to_csv(args.out, source_xml_pairs) + +def main(argv=None): + args = parse_args(argv) + run(args) if __name__ == '__main__': logging.basicConfig(level='INFO') diff --git a/sciencebeam_gym/preprocess/find_file_pairs_test.py b/sciencebeam_gym/preprocess/find_file_pairs_test.py new file mode 100644 index 0000000..a93a7c4 --- /dev/null +++ b/sciencebeam_gym/preprocess/find_file_pairs_test.py @@ -0,0 +1,119 @@ +import logging +import os +from mock import patch + +import pytest + +import sciencebeam_gym.preprocess.find_file_pairs as find_file_pairs +from sciencebeam_gym.preprocess.find_file_pairs import ( + run, + parse_args, + main +) + + +LOGGER = logging.getLogger(__name__) + +BASE_SOURCE_PATH = '/source' + +PDF_FILE_1 = BASE_SOURCE_PATH + '/file1.pdf' +XML_FILE_1 = BASE_SOURCE_PATH + '/file1.xml' +PDF_FILE_2 = BASE_SOURCE_PATH + '/file2.pdf' +XML_FILE_2 = BASE_SOURCE_PATH + '/file2.xml' + +SOURCE_PATTERN = '*.pdf' +XML_PATTERN = '*.xml' +OUTPUT_FILE = 'file-list.tsv' + +SOME_ARGV = [ + '--data-path=%s' % BASE_SOURCE_PATH, + '--source-pattern=%s' % SOURCE_PATTERN, + '--xml-pattern=%s' % XML_PATTERN, + '--out=%s' % OUTPUT_FILE +] + + +@pytest.fixture(name='find_file_pairs_grouped_by_parent_directory_or_name_mock') +def _find_file_pairs_grouped_by_parent_directory_or_name(): + with patch.object(find_file_pairs, 'find_file_pairs_grouped_by_parent_directory_or_name') as m: + yield m + +@pytest.fixture(name='save_file_pairs_to_csv_mock') +def _save_file_pairs_to_csv(): + with patch.object(find_file_pairs, 'save_file_pairs_to_csv') as m: + yield m + +@pytest.fixture(name='save_file_pairs_to_csv_mock') +def _save_file_pairs_to_csv(): + with patch.object(find_file_pairs, 'save_file_pairs_to_csv') as m: + yield m + +@pytest.fixture(name='parse_args_mock') +def _parse_args(): + with patch.object(find_file_pairs, 'parse_args') as m: + yield m + +@pytest.fixture(name='run_mock') +def _run(): + with patch.object(find_file_pairs, 'run') as m: + yield m + +def _touch(path): + path.write(b'', ensure=True) + return path + +@pytest.fixture(name='pdf_file_1') +def _pdf_file_1(tmpdir): + return _touch(tmpdir.join(PDF_FILE_1)) + +@pytest.fixture(name='xml_file_1') +def _xml_file_1(tmpdir): + return _touch(tmpdir.join(XML_FILE_1)) + +@pytest.fixture(name='data_path') +def _data_path(tmpdir): + return tmpdir.join(BASE_SOURCE_PATH) + +@pytest.fixture(name='out_file') +def _out_file(tmpdir): + return tmpdir.join(OUTPUT_FILE) + +class TestRun(object): + def test_should_pass_around_parameters( + self, + find_file_pairs_grouped_by_parent_directory_or_name_mock, + save_file_pairs_to_csv_mock): + + opt = parse_args(SOME_ARGV) + find_file_pairs_grouped_by_parent_directory_or_name_mock.return_value = [ + (PDF_FILE_1, XML_FILE_1), + (PDF_FILE_2, XML_FILE_2) + ] + run(opt) + find_file_pairs_grouped_by_parent_directory_or_name_mock.assert_called_with([ + os.path.join(BASE_SOURCE_PATH, SOURCE_PATTERN), + os.path.join(BASE_SOURCE_PATH, XML_PATTERN) + ]) + save_file_pairs_to_csv_mock.assert_called_with( + opt.out, + find_file_pairs_grouped_by_parent_directory_or_name_mock.return_value + ) + + def test_should_generate_file_list(self, data_path, pdf_file_1, xml_file_1, out_file): + LOGGER.debug('pdf_file_1: %s, xml_file: %s', pdf_file_1, xml_file_1) + opt = parse_args(SOME_ARGV) + opt.data_path = str(data_path) + opt.out = str(out_file) + run(opt) + out_lines = [s.strip() for s in out_file.read().strip().split('\n')] + LOGGER.debug('out_lines: %s', out_lines) + assert out_lines == [ + 'source_url\txml_url', + '%s\t%s' % (pdf_file_1, xml_file_1) + ] + +class TestMain(object): + def test_should_parse_args_and_call_run(self, parse_args_mock, run_mock): + main(SOME_ARGV) + parse_args_mock.assert_called_with(SOME_ARGV) + run_mock.assert_called_with(parse_args_mock.return_value) diff --git a/sciencebeam_gym/preprocess/preprocessing_pipeline.py b/sciencebeam_gym/preprocess/preprocessing_pipeline.py index 4229426..b128b8b 100644 --- a/sciencebeam_gym/preprocess/preprocessing_pipeline.py +++ b/sciencebeam_gym/preprocess/preprocessing_pipeline.py @@ -125,7 +125,7 @@ def configure_pipeline(p, opt): pdf_xml_url_pairs = ( p | "ReadFilePairUrls" >> ReadDictCsv(opt.pdf_xml_file_list, limit=opt.limit) | - "TranslateFilePairUrls" >> beam.Map(lambda row: (row['pdf_url'], row['xml_url'])) + "TranslateFilePairUrls" >> beam.Map(lambda row: (row['source_url'], row['xml_url'])) ) else: pdf_xml_url_pairs = ( diff --git a/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py b/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py index 52917ef..e42cf0b 100644 --- a/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py +++ b/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py @@ -173,7 +173,7 @@ class TestConfigurePipeline(BeamTest): opt.save_tfrecords = True with TestPipeline() as p: mocks['ReadDictCsv'].return_value = beam.Create([{ - 'pdf_url': PDF_FILE_1, + 'source_url': PDF_FILE_1, 'xml_url': XML_FILE_1 }]) _setup_mocks_for_pages(mocks, [1]) @@ -193,10 +193,10 @@ class TestConfigurePipeline(BeamTest): opt.save_tfrecords = True with TestPipeline() as p: mocks['ReadDictCsv'].return_value = beam.Create([{ - 'pdf_url': PDF_FILE_1, + 'source_url': PDF_FILE_1, 'xml_url': XML_FILE_1 }, { - 'pdf_url': PDF_FILE_2, + 'source_url': PDF_FILE_2, 'xml_url': XML_FILE_2 }]) _setup_mocks_for_pages(mocks, [1], file_count=2) @@ -219,7 +219,7 @@ class TestConfigurePipeline(BeamTest): opt.save_tfrecords = True with TestPipeline() as p: mocks['ReadDictCsv'].return_value = beam.Create([{ - 'pdf_url': PDF_FILE_1, + 'source_url': PDF_FILE_1, 'xml_url': XML_FILE_1 }]) _setup_mocks_for_pages(mocks, [1]) -- GitLab