From 1a58fbaa3b35eba92f417d19024f0d7f72e00eaf Mon Sep 17 00:00:00 2001
From: Daniel Ecer <de-code@users.noreply.github.com>
Date: Fri, 6 Jul 2018 16:11:45 +0100
Subject: [PATCH] added support for relative file lists (#33)

---
 sciencebeam_gym/preprocess/find_file_pairs.py | 24 ++++++-
 .../preprocess/find_file_pairs_test.py        | 41 +++++++++--
 .../preprocess/get_output_files.py            | 17 ++++-
 .../preprocess/get_output_files_test.py       | 29 +++++++-
 .../preprocess/preprocessing_pipeline.py      |  7 +-
 .../preprocess/preprocessing_utils.py         | 30 ++++----
 .../preprocess/preprocessing_utils_test.py    | 11 ---
 sciencebeam_gym/utils/file_list.py            | 24 ++++++-
 sciencebeam_gym/utils/file_list_test.py       | 69 ++++++++++++++++---
 sciencebeam_gym/utils/file_path.py            | 21 ++++++
 sciencebeam_gym/utils/file_path_test.py       | 24 +++++++
 11 files changed, 242 insertions(+), 55 deletions(-)
 create mode 100644 sciencebeam_gym/utils/file_path.py
 create mode 100644 sciencebeam_gym/utils/file_path_test.py

diff --git a/sciencebeam_gym/preprocess/find_file_pairs.py b/sciencebeam_gym/preprocess/find_file_pairs.py
index a65a371..ff727d1 100644
--- a/sciencebeam_gym/preprocess/find_file_pairs.py
+++ b/sciencebeam_gym/preprocess/find_file_pairs.py
@@ -14,9 +14,13 @@ from sciencebeam_gym.beam_utils.io import (
   mkdirs_if_not_exists
 )
 
+from sciencebeam_gym.utils.file_path import (
+  join_if_relative_path,
+  relative_path
+)
+
 from sciencebeam_gym.preprocess.preprocessing_utils import (
-  find_file_pairs_grouped_by_parent_directory_or_name,
-  join_if_relative_path
+  find_file_pairs_grouped_by_parent_directory_or_name
 )
 
 def get_logger():
@@ -40,6 +44,12 @@ def parse_args(argv=None):
     '--out', type=str, required=True,
     help='output csv/tsv file'
   )
+
+  parser.add_argument(
+    '--use-relative-paths', action='store_true',
+    help='create a file list with relative paths (relative to the data path)'
+  )
+
   return parser.parse_args(argv)
 
 
@@ -53,12 +63,22 @@ def save_file_pairs_to_csv(output_path, source_xml_pairs):
     write_csv_rows(writer, source_xml_pairs)
   get_logger().info('written results to %s', output_path)
 
+def to_relative_file_pairs(base_path, file_pairs):
+  return (
+    (relative_path(base_path, source_url), relative_path(base_path, xml_url))
+    for source_url, xml_url in file_pairs
+  )
+
 def run(args):
   get_logger().info('finding file pairs')
   source_xml_pairs = find_file_pairs_grouped_by_parent_directory_or_name([
     join_if_relative_path(args.data_path, args.source_pattern),
     join_if_relative_path(args.data_path, args.xml_pattern)
   ])
+
+  if args.use_relative_paths:
+    source_xml_pairs = to_relative_file_pairs(args.data_path, source_xml_pairs)
+
   source_xml_pairs = list(source_xml_pairs)
 
   save_file_pairs_to_csv(args.out, source_xml_pairs)
diff --git a/sciencebeam_gym/preprocess/find_file_pairs_test.py b/sciencebeam_gym/preprocess/find_file_pairs_test.py
index a93a7c4..4aaee56 100644
--- a/sciencebeam_gym/preprocess/find_file_pairs_test.py
+++ b/sciencebeam_gym/preprocess/find_file_pairs_test.py
@@ -6,6 +6,7 @@ import pytest
 
 import sciencebeam_gym.preprocess.find_file_pairs as find_file_pairs
 from sciencebeam_gym.preprocess.find_file_pairs import (
+  to_relative_file_pairs,
   run,
   parse_args,
   main
@@ -32,10 +33,18 @@ SOME_ARGV = [
   '--out=%s' % OUTPUT_FILE
 ]
 
+@pytest.fixture(name='to_relative_file_pairs_mock')
+def _to_relative_file_pairs():
+  with patch.object(find_file_pairs, 'to_relative_file_pairs') as m:
+    yield m
 
 @pytest.fixture(name='find_file_pairs_grouped_by_parent_directory_or_name_mock')
 def _find_file_pairs_grouped_by_parent_directory_or_name():
   with patch.object(find_file_pairs, 'find_file_pairs_grouped_by_parent_directory_or_name') as m:
+    m.return_value = [
+      (PDF_FILE_1, XML_FILE_1),
+      (PDF_FILE_2, XML_FILE_2)
+    ]
     yield m
 
 @pytest.fixture(name='save_file_pairs_to_csv_mock')
@@ -78,6 +87,13 @@ def _data_path(tmpdir):
 def _out_file(tmpdir):
   return tmpdir.join(OUTPUT_FILE)
 
+class TestToRelativeFilePairs(object):
+  def test_should_make_paths_relative(self):
+    assert list(to_relative_file_pairs(
+      '/parent',
+      [('/parent/sub/file1', '/parent/sub/file2')]
+    )) == [('sub/file1', 'sub/file2')]
+
 class TestRun(object):
   def test_should_pass_around_parameters(
     self,
@@ -85,10 +101,6 @@ class TestRun(object):
     save_file_pairs_to_csv_mock):
 
     opt = parse_args(SOME_ARGV)
-    find_file_pairs_grouped_by_parent_directory_or_name_mock.return_value = [
-      (PDF_FILE_1, XML_FILE_1),
-      (PDF_FILE_2, XML_FILE_2)
-    ]
     run(opt)
     find_file_pairs_grouped_by_parent_directory_or_name_mock.assert_called_with([
       os.path.join(BASE_SOURCE_PATH, SOURCE_PATTERN),
@@ -99,6 +111,27 @@ class TestRun(object):
       find_file_pairs_grouped_by_parent_directory_or_name_mock.return_value
     )
 
+  def test_should_use_relative_paths_if_enabled(
+    self,
+    find_file_pairs_grouped_by_parent_directory_or_name_mock,
+    to_relative_file_pairs_mock,
+    save_file_pairs_to_csv_mock):
+
+    opt = parse_args(SOME_ARGV)
+    opt.use_relative_paths = True
+
+    to_relative_file_pairs_mock.return_value = [('file1.pdf', 'file1.xml')]
+
+    run(opt)
+    to_relative_file_pairs_mock.assert_called_with(
+      BASE_SOURCE_PATH,
+      find_file_pairs_grouped_by_parent_directory_or_name_mock.return_value
+    )
+    save_file_pairs_to_csv_mock.assert_called_with(
+      opt.out,
+      to_relative_file_pairs_mock.return_value
+    )
+
   def test_should_generate_file_list(self, data_path, pdf_file_1, xml_file_1, out_file):
     LOGGER.debug('pdf_file_1: %s, xml_file: %s', pdf_file_1, xml_file_1)
     opt = parse_args(SOME_ARGV)
diff --git a/sciencebeam_gym/preprocess/get_output_files.py b/sciencebeam_gym/preprocess/get_output_files.py
index 979bfbf..73eae63 100644
--- a/sciencebeam_gym/preprocess/get_output_files.py
+++ b/sciencebeam_gym/preprocess/get_output_files.py
@@ -3,13 +3,17 @@ import logging
 
 from sciencebeam_gym.utils.file_list import (
   load_file_list,
-  save_file_list
+  save_file_list,
+  to_relative_file_list
+)
+
+from sciencebeam_gym.utils.file_path import (
+  join_if_relative_path
 )
 
 from sciencebeam_gym.preprocess.preprocessing_utils import (
   get_or_validate_base_path,
-  get_output_file,
-  join_if_relative_path
+  get_output_file
 )
 
 from sciencebeam_gym.preprocess.check_file_list import (
@@ -57,6 +61,10 @@ def parse_args(argv=None):
     '--output-base-path', type=str, required=False,
     help='base output path (by default source base path with"-results" suffix)'
   )
+  output.add_argument(
+    '--use-relative-paths', action='store_true',
+    help='create a file list with relative paths (relative to the output data path)'
+  )
 
   parser.add_argument(
     '--limit', type=int, required=False,
@@ -112,6 +120,9 @@ def run(opt):
     )
     check_files_and_report_result(check_file_list)
 
+  if opt.use_relative_paths:
+    target_file_list = to_relative_file_list(opt.output_base_path, target_file_list)
+
   get_logger().info(
     'saving file list (with %d files) to: %s',
     len(target_file_list), opt.output_file_list
diff --git a/sciencebeam_gym/preprocess/get_output_files_test.py b/sciencebeam_gym/preprocess/get_output_files_test.py
index b0f27d9..41f665a 100644
--- a/sciencebeam_gym/preprocess/get_output_files_test.py
+++ b/sciencebeam_gym/preprocess/get_output_files_test.py
@@ -44,6 +44,11 @@ def _check_files_and_report_result():
   with patch.object(get_output_files, 'check_files_and_report_result') as m:
     yield m
 
+@pytest.fixture(name='to_relative_file_list_mock')
+def _to_relative_file_list():
+  with patch.object(get_output_files, 'to_relative_file_list') as m:
+    yield m
+
 class TestGetOutputFileList(object):
   def test_should_return_output_file_with_path_and_change_ext(self):
     assert get_output_file_list(
@@ -53,7 +58,10 @@ class TestGetOutputFileList(object):
       '.xml'
     ) == ['/output/path/file.xml']
 
-@pytest.mark.usefixtures("load_file_list_mock", "get_output_file_list_mock", "save_file_list_mock")
+@pytest.mark.usefixtures(
+  "load_file_list_mock", "get_output_file_list_mock", "save_file_list_mock",
+  "to_relative_file_list_mock"
+)
 class TestRun(object):
   def test_should_pass_around_parameters(
     self,
@@ -144,6 +152,25 @@ class TestRun(object):
       get_output_file_list_mock.return_value[:opt.check_limit]
     )
 
+  def test_should_save_relative_paths_if_enabled(
+    self,
+    get_output_file_list_mock,
+    to_relative_file_list_mock,
+    save_file_list_mock):
+
+    opt = parse_args(SOME_ARGV)
+    opt.use_relative_paths = True
+    run(opt)
+    to_relative_file_list_mock.assert_called_with(
+      opt.output_base_path,
+      get_output_file_list_mock.return_value,
+    )
+    save_file_list_mock.assert_called_with(
+      opt.output_file_list,
+      to_relative_file_list_mock.return_value,
+      column=opt.source_file_column
+    )
+
 class TestMain(object):
   def test_should_parse_args_and_call_run(self):
     m = get_output_files
diff --git a/sciencebeam_gym/preprocess/preprocessing_pipeline.py b/sciencebeam_gym/preprocess/preprocessing_pipeline.py
index b128b8b..e1ce7ae 100644
--- a/sciencebeam_gym/preprocess/preprocessing_pipeline.py
+++ b/sciencebeam_gym/preprocess/preprocessing_pipeline.py
@@ -14,6 +14,11 @@ from sciencebeam_gym.utils.collection import (
   remove_keys_from_dict
 )
 
+from sciencebeam_gym.utils.file_path import (
+  relative_path,
+  join_if_relative_path
+)
+
 from sciencebeam_gym.beam_utils.utils import (
   TransformAndCount,
   TransformAndLog,
@@ -57,8 +62,6 @@ from sciencebeam_gym.preprocess.annotation.annotation_evaluation import (
 
 from sciencebeam_gym.preprocess.preprocessing_utils import (
   change_ext,
-  relative_path,
-  join_if_relative_path,
   find_file_pairs_grouped_by_parent_directory_or_name,
   convert_pdf_bytes_to_lxml,
   convert_and_annotate_lxml_content,
diff --git a/sciencebeam_gym/preprocess/preprocessing_utils.py b/sciencebeam_gym/preprocess/preprocessing_utils.py
index e5d16ec..96d6ff6 100644
--- a/sciencebeam_gym/preprocess/preprocessing_utils.py
+++ b/sciencebeam_gym/preprocess/preprocessing_utils.py
@@ -29,9 +29,11 @@ from sciencebeam_gym.utils.pages_zip import (
 )
 
 from sciencebeam_gym.beam_utils.io import (
-  dirname,
-  find_matching_filenames,
-  mkdirs_if_not_exists
+  find_matching_filenames
+)
+
+from sciencebeam_gym.utils.file_path import (
+  relative_path
 )
 
 from sciencebeam_gym.preprocess.lxml_to_svg import (
@@ -75,6 +77,13 @@ from sciencebeam_gym.pdf import (
   PdfToPng
 )
 
+# deprecated, moved to sciencebeam_gym.utils.file_path
+# pylint: disable=wrong-import-position, unused-import
+from sciencebeam_gym.utils.file_path import (
+  join_if_relative_path,
+)
+# pylint: enable=wrong-import-position, unused-import
+
 
 def get_logger():
   return logging.getLogger(__name__)
@@ -213,21 +222,6 @@ def convert_and_annotate_lxml_content(lxml_content, xml_content, xml_mapping, na
 
   return svg_roots
 
-def relative_path(base_path, path):
-  if not base_path.endswith('/'):
-    base_path += '/'
-  return path[len(base_path):] if path.startswith(base_path) else path
-
-def is_relative_path(path):
-  return not path.startswith('/') and '://' not in path
-
-def join_if_relative_path(base_path, path):
-  return (
-    FileSystems.join(base_path, path)
-    if base_path and is_relative_path(path)
-    else path
-  )
-
 def change_ext(path, old_ext, new_ext):
   if old_ext is None:
     old_ext = os.path.splitext(path)[1]
diff --git a/sciencebeam_gym/preprocess/preprocessing_utils_test.py b/sciencebeam_gym/preprocess/preprocessing_utils_test.py
index befec13..12b20f7 100644
--- a/sciencebeam_gym/preprocess/preprocessing_utils_test.py
+++ b/sciencebeam_gym/preprocess/preprocessing_utils_test.py
@@ -12,7 +12,6 @@ from sciencebeam_gym.preprocess.preprocessing_utils import (
   svg_page_to_blockified_png_bytes,
   group_file_pairs_by_parent_directory_or_name,
   convert_pdf_bytes_to_lxml,
-  join_if_relative_path,
   change_ext,
   base_path_for_file_list,
   get_or_validate_base_path,
@@ -116,16 +115,6 @@ class TestConvertPdfBytesToLxml(object):
       )
       assert lxml_content == LXML_CONTENT_1
 
-class TestJoinIfRelativePath(object):
-  def test_should_return_path_if_base_path_is_none(self):
-    assert join_if_relative_path(None, 'file') == 'file'
-
-  def test_should_return_path_if_not_relative(self):
-    assert join_if_relative_path('/parent', '/other/file') == '/other/file'
-
-  def test_should_return_joined_path_if_relative(self):
-    assert join_if_relative_path('/parent', 'file') == '/parent/file'
-
 class TestChangeExt(object):
   def test_should_replace_simple_ext_with_simple_ext(self):
     assert change_ext('file.pdf', None, '.xml') == 'file.xml'
diff --git a/sciencebeam_gym/utils/file_list.py b/sciencebeam_gym/utils/file_list.py
index 27000da..b062236 100644
--- a/sciencebeam_gym/utils/file_list.py
+++ b/sciencebeam_gym/utils/file_list.py
@@ -2,6 +2,7 @@ from __future__ import absolute_import
 
 import codecs
 import csv
+import os
 from itertools import islice
 
 from apache_beam.io.filesystems import FileSystems
@@ -10,6 +11,12 @@ from sciencebeam_gym.utils.csv import (
   csv_delimiter_by_filename
 )
 
+from .file_path import (
+  relative_path,
+  join_if_relative_path
+)
+
+
 def is_csv_or_tsv_file_list(file_list_path):
   return '.csv' in file_list_path or '.tsv' in file_list_path
 
@@ -44,13 +51,24 @@ def load_csv_or_tsv_file_list(file_list_path, column, header=True, limit=None):
       lines = islice(lines, 0, limit)
     return list(lines)
 
-def load_file_list(file_list_path, column, header=True, limit=None):
+def to_absolute_file_list(base_path, file_list):
+  return [join_if_relative_path(base_path, s) for s in file_list]
+
+def to_relative_file_list(base_path, file_list):
+  return [relative_path(base_path, s) for s in file_list]
+
+def load_file_list(file_list_path, column, header=True, limit=None, to_absolute=True):
   if is_csv_or_tsv_file_list(file_list_path):
-    return load_csv_or_tsv_file_list(
+    file_list = load_csv_or_tsv_file_list(
       file_list_path, column=column, header=header, limit=limit
     )
   else:
-    return load_plain_file_list(file_list_path, limit=limit)
+    file_list = load_plain_file_list(file_list_path, limit=limit)
+  if to_absolute:
+    file_list = to_absolute_file_list(
+      os.path.dirname(file_list_path), file_list
+    )
+  return file_list
 
 def save_plain_file_list(file_list_path, file_list):
   with FileSystems.create(file_list_path) as f:
diff --git a/sciencebeam_gym/utils/file_list_test.py b/sciencebeam_gym/utils/file_list_test.py
index 66a8855..efeff9a 100644
--- a/sciencebeam_gym/utils/file_list_test.py
+++ b/sciencebeam_gym/utils/file_list_test.py
@@ -10,6 +10,8 @@ from sciencebeam_gym.utils.file_list import (
   is_csv_or_tsv_file_list,
   load_plain_file_list,
   load_csv_or_tsv_file_list,
+  to_absolute_file_list,
+  to_relative_file_list,
   load_file_list,
   save_plain_file_list,
   save_csv_or_tsv_file_list,
@@ -21,6 +23,21 @@ FILE_2 = 'file2.pdf'
 UNICODE_FILE_1 = u'file1\u1234.pdf'
 FILE_LIST = [FILE_1, FILE_2]
 
+@pytest.fixture(name='load_plain_file_list_mock')
+def _load_plain_file_list():
+  with patch.object(file_list_loader, 'load_plain_file_list') as mock:
+    yield mock
+
+@pytest.fixture(name='load_csv_or_tsv_file_list_mock')
+def _load_csv_or_tsv_file_list():
+  with patch.object(file_list_loader, 'load_csv_or_tsv_file_list') as mock:
+    yield mock
+
+@pytest.fixture(name='to_absolute_file_list_mock')
+def _to_absolute_file_list():
+  with patch.object(file_list_loader, 'to_absolute_file_list') as mock:
+    yield mock
+
 class TestIsCsvOrTsvFileList(object):
   def test_should_return_true_if_file_ext_is_csv(self):
     assert is_csv_or_tsv_file_list('files.csv')
@@ -104,18 +121,48 @@ class TestLoadCsvOrTsvFileList(object):
       f.flush()
       assert load_csv_or_tsv_file_list(f.name, 'url', limit=1) == [FILE_1]
 
+class TestToAbsoluteFileList(object):
+  def test_should_make_path_absolute(self):
+    assert to_absolute_file_list('/base/path', ['sub/file1']) == ['/base/path/sub/file1']
+
+  def test_should_not_change_absolute_paths(self):
+    assert to_absolute_file_list('/base/path', ['/other/file1']) == ['/other/file1']
+
+class TestToRelativeFileList(object):
+  def test_should_make_path_absolute(self):
+    assert to_relative_file_list('/base/path', ['/base/path/sub/file1']) == ['sub/file1']
+
+  def test_should_not_change_path_outside_base_path(self):
+    assert to_relative_file_list('/base/path', ['/other/file1']) == ['/other/file1']
+
+@pytest.mark.usefixtures(
+  'load_plain_file_list_mock', 'load_csv_or_tsv_file_list_mock', 'to_absolute_file_list_mock'
+)
 class TestLoadFileList(object):
-  def test_should_call_load_plain_file_list(self):
-    with patch.object(file_list_loader, 'load_plain_file_list') as mock:
-      result = load_file_list('file-list.lst', column='url', header=True, limit=1)
-      mock.assert_called_with('file-list.lst', limit=1)
-      assert result == mock.return_value
-
-  def test_should_call_load_csv_or_tsv_file_list(self):
-    with patch.object(file_list_loader, 'load_csv_or_tsv_file_list') as mock:
-      result = load_file_list('file-list.csv', column='url', header=True, limit=1)
-      mock.assert_called_with('file-list.csv', column='url', header=True, limit=1)
-      assert result == mock.return_value
+  def test_should_call_load_plain_file_list(self, load_plain_file_list_mock):
+    result = load_file_list(
+      'file-list.lst', column='url', header=True, limit=1, to_absolute=False
+    )
+    load_plain_file_list_mock.assert_called_with('file-list.lst', limit=1)
+    assert result == load_plain_file_list_mock.return_value
+
+  def test_should_call_load_csv_or_tsv_file_list(self, load_csv_or_tsv_file_list_mock):
+    result = load_file_list(
+      'file-list.csv', column='url', header=True, limit=1, to_absolute=False
+    )
+    load_csv_or_tsv_file_list_mock.assert_called_with(
+      'file-list.csv', column='url', header=True, limit=1
+    )
+    assert result == load_csv_or_tsv_file_list_mock.return_value
+
+  def test_should_make_file_list_absolute(
+    self, load_plain_file_list_mock, to_absolute_file_list_mock):
+
+    result = load_file_list('/base/path/file-list.lst', column='url', to_absolute=True)
+    to_absolute_file_list_mock.assert_called_with(
+      '/base/path', load_plain_file_list_mock.return_value
+    )
+    assert result == to_absolute_file_list_mock.return_value
 
 class TestSavePlainFileList(object):
   def test_should_write_multiple_file_paths(self):
diff --git a/sciencebeam_gym/utils/file_path.py b/sciencebeam_gym/utils/file_path.py
new file mode 100644
index 0000000..cc798c1
--- /dev/null
+++ b/sciencebeam_gym/utils/file_path.py
@@ -0,0 +1,21 @@
+
+from __future__ import absolute_import
+
+from apache_beam.io.filesystems import FileSystems
+
+def relative_path(base_path, path):
+  if not base_path:
+    return path
+  if not base_path.endswith('/'):
+    base_path += '/'
+  return path[len(base_path):] if path.startswith(base_path) else path
+
+def is_relative_path(path):
+  return not path.startswith('/') and '://' not in path
+
+def join_if_relative_path(base_path, path):
+  return (
+    FileSystems.join(base_path, path)
+    if base_path and is_relative_path(path)
+    else path
+  )
diff --git a/sciencebeam_gym/utils/file_path_test.py b/sciencebeam_gym/utils/file_path_test.py
new file mode 100644
index 0000000..9ef1680
--- /dev/null
+++ b/sciencebeam_gym/utils/file_path_test.py
@@ -0,0 +1,24 @@
+from .file_path import (
+  relative_path,
+  join_if_relative_path
+)
+
+class TestRelativePath(object):
+  def test_should_return_path_if_base_path_is_none(self):
+    assert relative_path(None, 'file') == 'file'
+
+  def test_should_return_path_if_path_outside_base_path(self):
+    assert relative_path('/parent', '/other/file') == '/other/file'
+
+  def test_should_return_absolute_path_if_base_path_matches(self):
+    assert relative_path('/parent', '/parent/file') == 'file'
+
+class TestJoinIfRelativePath(object):
+  def test_should_return_path_if_base_path_is_none(self):
+    assert join_if_relative_path(None, 'file') == 'file'
+
+  def test_should_return_path_if_not_relative(self):
+    assert join_if_relative_path('/parent', '/other/file') == '/other/file'
+
+  def test_should_return_joined_path_if_relative(self):
+    assert join_if_relative_path('/parent', 'file') == '/parent/file'
-- 
GitLab