From 67c0e3084b7ebe0ddfb4000a0c0cd012a9ebcf67 Mon Sep 17 00:00:00 2001
From: Daniel Ecer <de-code@users.noreply.github.com>
Date: Fri, 6 Jul 2018 09:15:32 +0100
Subject: [PATCH] renamed pdf_url to source_url (#31)

---
 README.md                                     |  10 +-
 sciencebeam_gym/conftest.py                   |   8 ++
 sciencebeam_gym/preprocess/find_file_pairs.py |  37 +++---
 .../preprocess/find_file_pairs_test.py        | 119 ++++++++++++++++++
 .../preprocess/preprocessing_pipeline.py      |   2 +-
 .../preprocess/preprocessing_pipeline_test.py |   8 +-
 6 files changed, 159 insertions(+), 25 deletions(-)
 create mode 100644 sciencebeam_gym/conftest.py
 create mode 100644 sciencebeam_gym/preprocess/find_file_pairs_test.py

diff --git a/README.md b/README.md
index f1e1c8c..8a6bb46 100644
--- a/README.md
+++ b/README.md
@@ -107,24 +107,24 @@ The parent directory per manuscript is optional. If that is not the case then th
 Run:
 
 ```bash
-python -m sciencebeam_lab.preprocess.find_file_pairs \
+python -m sciencebeam_gym.preprocess.find_file_pairs \
 --data-path <source directory> \
---pdf-pattern *.pdf.gz --xml-pattern *.nxml.gz \
+--source-pattern '*.pdf.gz' --xml-pattern '*.nxml.gz' \
 --out <output file list csv/tsv>
 ```
 
 e.g.:
 
 ```bash
-python -m sciencebeam_lab.preprocess.find_file_pairs \
+python -m sciencebeam_gym.preprocess.find_file_pairs \
 --data-path gs://some-bucket/some-dataset \
---pdf-pattern *.pdf.gz --xml-pattern *.nxml.gz \
+--source-pattern '*.pdf.gz' --xml-pattern '*.nxml.gz' \
 --out gs://some-bucket/some-dataset/file-list.tsv
 ```
 
 That will create the TSV (tab separated) file `file-list.tsv` with the following columns:
 
-- _pdf_url_
+- _source_url_
 - _xml_url_
 
 That file could also be generated using any other preferred method.
diff --git a/sciencebeam_gym/conftest.py b/sciencebeam_gym/conftest.py
new file mode 100644
index 0000000..b12470c
--- /dev/null
+++ b/sciencebeam_gym/conftest.py
@@ -0,0 +1,11 @@
+import logging
+
+import pytest
+
+@pytest.fixture(scope='session', autouse=True)
+def setup_logging():
+  """Configure DEBUG-level root logging once for the whole test session."""
+  # basicConfig does nothing while the root logger still has handlers,
+  # so drop any existing ones first
+  logging.root.handlers = []
+  logging.basicConfig(level='DEBUG')
diff --git a/sciencebeam_gym/preprocess/find_file_pairs.py b/sciencebeam_gym/preprocess/find_file_pairs.py
index a97358e..a65a371 100644
--- a/sciencebeam_gym/preprocess/find_file_pairs.py
+++ b/sciencebeam_gym/preprocess/find_file_pairs.py
@@ -29,8 +29,8 @@ def parse_args(argv=None):
     help='base data path'
   )
   parser.add_argument(
-    '--pdf-pattern', type=str, required=True,
-    help='pdf pattern'
+    '--source-pattern', type=str, required=True,
+    help='source pattern'
   )
   parser.add_argument(
     '--xml-pattern', type=str, required=True,
@@ -42,23 +42,42 @@ def parse_args(argv=None):
   )
   return parser.parse_args(argv)
 
-def main(argv=None):
-  args = parse_args(argv)
+
+def save_file_pairs_to_csv(output_path, source_xml_pairs):
+  """Write the file pairs to ``output_path``, preceded by a header row.
+
+  The delimiter comes from ``csv_delimiter_by_filename(output_path)``; a tab
+  delimiter selects the ``text/tsv`` mime type, anything else ``text/csv``.
+  """
+  mkdirs_if_not_exists(dirname(output_path))
+  delimiter = csv_delimiter_by_filename(output_path)
+  mime_type = 'text/tsv' if delimiter == '\t' else 'text/csv'
+  with FileSystems.create(output_path, mime_type=mime_type) as f:
+    writer = csv.writer(f, delimiter=delimiter)
+    write_csv_rows(writer, [['source_url', 'xml_url']])
+    write_csv_rows(writer, source_xml_pairs)
+  get_logger().info('written results to %s', output_path)
+
+def run(args):
+  """Find the source/xml file pairs and save them to ``args.out``.
+
+  ``args.source_pattern`` and ``args.xml_pattern`` are each combined with
+  ``args.data_path`` via ``join_if_relative_path`` before the search.
+  """
   get_logger().info('finding file pairs')
-  pdf_xml_pairs = find_file_pairs_grouped_by_parent_directory_or_name([
-    join_if_relative_path(args.data_path, args.pdf_pattern),
+  source_xml_pairs = find_file_pairs_grouped_by_parent_directory_or_name([
+    join_if_relative_path(args.data_path, args.source_pattern),
     join_if_relative_path(args.data_path, args.xml_pattern)
   ])
-  pdf_xml_pairs = list(pdf_xml_pairs)
+  # materialise the pairs before the output file gets created
+  source_xml_pairs = list(source_xml_pairs)
 
-  mkdirs_if_not_exists(dirname(args.out))
-  delimiter = csv_delimiter_by_filename(args.out)
-  mime_type = 'text/tsv' if delimiter == '\t' else 'text/csv'
-  with FileSystems.create(args.out, mime_type=mime_type) as f:
-    writer = csv.writer(f, delimiter=delimiter)
-    write_csv_rows(writer, [['pdf_url', 'xml_url']])
-    write_csv_rows(writer, pdf_xml_pairs)
-  get_logger().info('written results to %s', args.out)
+  save_file_pairs_to_csv(args.out, source_xml_pairs)
+
+def main(argv=None):
+  """Command line entry point: parse ``argv`` and hand the result to ``run``."""
+  args = parse_args(argv)
+  run(args)
 
 if __name__ == '__main__':
   logging.basicConfig(level='INFO')
diff --git a/sciencebeam_gym/preprocess/find_file_pairs_test.py b/sciencebeam_gym/preprocess/find_file_pairs_test.py
new file mode 100644
index 0000000..a93a7c4
--- /dev/null
+++ b/sciencebeam_gym/preprocess/find_file_pairs_test.py
@@ -0,0 +1,119 @@
+"""Tests for sciencebeam_gym.preprocess.find_file_pairs."""
+
+import logging
+import os
+from mock import patch
+
+import pytest
+
+import sciencebeam_gym.preprocess.find_file_pairs as find_file_pairs
+from sciencebeam_gym.preprocess.find_file_pairs import (
+  run,
+  parse_args,
+  main
+)
+
+
+LOGGER = logging.getLogger(__name__)
+
+BASE_SOURCE_PATH = '/source'
+
+PDF_FILE_1 = BASE_SOURCE_PATH + '/file1.pdf'
+XML_FILE_1 = BASE_SOURCE_PATH + '/file1.xml'
+PDF_FILE_2 = BASE_SOURCE_PATH + '/file2.pdf'
+XML_FILE_2 = BASE_SOURCE_PATH + '/file2.xml'
+
+SOURCE_PATTERN = '*.pdf'
+XML_PATTERN = '*.xml'
+OUTPUT_FILE = 'file-list.tsv'
+
+SOME_ARGV = [
+  '--data-path=%s' % BASE_SOURCE_PATH,
+  '--source-pattern=%s' % SOURCE_PATTERN,
+  '--xml-pattern=%s' % XML_PATTERN,
+  '--out=%s' % OUTPUT_FILE
+]
+
+
+@pytest.fixture(name='find_file_pairs_grouped_by_parent_directory_or_name_mock')
+def _find_file_pairs_grouped_by_parent_directory_or_name():
+  with patch.object(find_file_pairs, 'find_file_pairs_grouped_by_parent_directory_or_name') as m:
+    yield m
+
+@pytest.fixture(name='save_file_pairs_to_csv_mock')
+def _save_file_pairs_to_csv():
+  with patch.object(find_file_pairs, 'save_file_pairs_to_csv') as m:
+    yield m
+
+@pytest.fixture(name='parse_args_mock')
+def _parse_args():
+  with patch.object(find_file_pairs, 'parse_args') as m:
+    yield m
+
+@pytest.fixture(name='run_mock')
+def _run():
+  with patch.object(find_file_pairs, 'run') as m:
+    yield m
+
+def _touch(path):
+  """Write an empty file to ``path`` (``ensure=True`` creates missing parent dirs)."""
+  path.write(b'', ensure=True)
+  return path
+
+@pytest.fixture(name='pdf_file_1')
+def _pdf_file_1(tmpdir):
+  return _touch(tmpdir.join(PDF_FILE_1))
+
+@pytest.fixture(name='xml_file_1')
+def _xml_file_1(tmpdir):
+  return _touch(tmpdir.join(XML_FILE_1))
+
+@pytest.fixture(name='data_path')
+def _data_path(tmpdir):
+  return tmpdir.join(BASE_SOURCE_PATH)
+
+@pytest.fixture(name='out_file')
+def _out_file(tmpdir):
+  return tmpdir.join(OUTPUT_FILE)
+
+class TestRun(object):
+  def test_should_pass_around_parameters(
+    self,
+    find_file_pairs_grouped_by_parent_directory_or_name_mock,
+    save_file_pairs_to_csv_mock):
+
+    opt = parse_args(SOME_ARGV)
+    find_file_pairs_grouped_by_parent_directory_or_name_mock.return_value = [
+      (PDF_FILE_1, XML_FILE_1),
+      (PDF_FILE_2, XML_FILE_2)
+    ]
+    run(opt)
+    find_file_pairs_grouped_by_parent_directory_or_name_mock.assert_called_with([
+      os.path.join(BASE_SOURCE_PATH, SOURCE_PATTERN),
+      os.path.join(BASE_SOURCE_PATH, XML_PATTERN)
+    ])
+    save_file_pairs_to_csv_mock.assert_called_with(
+      opt.out,
+      find_file_pairs_grouped_by_parent_directory_or_name_mock.return_value
+    )
+
+  def test_should_generate_file_list(self, data_path, pdf_file_1, xml_file_1, out_file):
+    """Run against real files below tmpdir and check the generated TSV content."""
+    LOGGER.debug('pdf_file_1: %s, xml_file: %s', pdf_file_1, xml_file_1)
+    opt = parse_args(SOME_ARGV)
+    # point the parsed options at the temporary directory
+    opt.data_path = str(data_path)
+    opt.out = str(out_file)
+    run(opt)
+    out_lines = [s.strip() for s in out_file.read().strip().split('\n')]
+    LOGGER.debug('out_lines: %s', out_lines)
+    assert out_lines == [
+      'source_url\txml_url',
+      '%s\t%s' % (pdf_file_1, xml_file_1)
+    ]
+
+class TestMain(object):
+  def test_should_parse_args_and_call_run(self, parse_args_mock, run_mock):
+    main(SOME_ARGV)
+    parse_args_mock.assert_called_with(SOME_ARGV)
+    run_mock.assert_called_with(parse_args_mock.return_value)
diff --git a/sciencebeam_gym/preprocess/preprocessing_pipeline.py b/sciencebeam_gym/preprocess/preprocessing_pipeline.py
index 4229426..b128b8b 100644
--- a/sciencebeam_gym/preprocess/preprocessing_pipeline.py
+++ b/sciencebeam_gym/preprocess/preprocessing_pipeline.py
@@ -125,7 +125,7 @@ def configure_pipeline(p, opt):
       pdf_xml_url_pairs = (
         p |
         "ReadFilePairUrls" >> ReadDictCsv(opt.pdf_xml_file_list, limit=opt.limit) |
-        "TranslateFilePairUrls" >> beam.Map(lambda row: (row['pdf_url'], row['xml_url']))
+        "TranslateFilePairUrls" >> beam.Map(lambda row: (row['source_url'], row['xml_url']))
       )
     else:
       pdf_xml_url_pairs = (
diff --git a/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py b/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py
index 52917ef..e42cf0b 100644
--- a/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py
+++ b/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py
@@ -173,7 +173,7 @@ class TestConfigurePipeline(BeamTest):
       opt.save_tfrecords = True
       with TestPipeline() as p:
         mocks['ReadDictCsv'].return_value = beam.Create([{
-          'pdf_url': PDF_FILE_1,
+          'source_url': PDF_FILE_1,
           'xml_url': XML_FILE_1
         }])
         _setup_mocks_for_pages(mocks, [1])
@@ -193,10 +193,10 @@ class TestConfigurePipeline(BeamTest):
       opt.save_tfrecords = True
       with TestPipeline() as p:
         mocks['ReadDictCsv'].return_value = beam.Create([{
-          'pdf_url': PDF_FILE_1,
+          'source_url': PDF_FILE_1,
           'xml_url': XML_FILE_1
         }, {
-          'pdf_url': PDF_FILE_2,
+          'source_url': PDF_FILE_2,
           'xml_url': XML_FILE_2
         }])
         _setup_mocks_for_pages(mocks, [1], file_count=2)
@@ -219,7 +219,7 @@ class TestConfigurePipeline(BeamTest):
       opt.save_tfrecords = True
       with TestPipeline() as p:
         mocks['ReadDictCsv'].return_value = beam.Create([{
-          'pdf_url': PDF_FILE_1,
+          'source_url': PDF_FILE_1,
           'xml_url': XML_FILE_1
         }])
         _setup_mocks_for_pages(mocks, [1])
-- 
GitLab