From 1dae709566c5e26b685b2d825705211946f6828d Mon Sep 17 00:00:00 2001
From: Daniel Ecer <de-code@users.noreply.github.com>
Date: Fri, 6 Jul 2018 09:15:48 +0100
Subject: [PATCH] make source file list relative to source base path if it is
 not an absolute path (#32)

---
 .../preprocess/get_output_files.py            |   8 +-
 .../preprocess/get_output_files_test.py       | 167 +++++++++++-------
 .../preprocess/preprocessing_utils.py         |   6 +-
 .../preprocess/preprocessing_utils_test.py    |  13 +-
 4 files changed, 123 insertions(+), 71 deletions(-)

diff --git a/sciencebeam_gym/preprocess/get_output_files.py b/sciencebeam_gym/preprocess/get_output_files.py
index 11f92b4..979bfbf 100644
--- a/sciencebeam_gym/preprocess/get_output_files.py
+++ b/sciencebeam_gym/preprocess/get_output_files.py
@@ -8,7 +8,8 @@ from sciencebeam_gym.utils.file_list import (
 
 from sciencebeam_gym.preprocess.preprocessing_utils import (
   get_or_validate_base_path,
-  get_output_file
+  get_output_file,
+  join_if_relative_path
 )
 
 from sciencebeam_gym.preprocess.check_file_list import (
@@ -85,7 +86,10 @@ def get_output_file_list(file_list, source_base_path, output_base_path, output_f
 
 def run(opt):
   source_file_list = load_file_list(
-    opt.source_file_list,
+    join_if_relative_path(
+      opt.source_base_path,
+      opt.source_file_list
+    ),
     column=opt.source_file_column,
     limit=opt.limit
   )
diff --git a/sciencebeam_gym/preprocess/get_output_files_test.py b/sciencebeam_gym/preprocess/get_output_files_test.py
index c406842..b0f27d9 100644
--- a/sciencebeam_gym/preprocess/get_output_files_test.py
+++ b/sciencebeam_gym/preprocess/get_output_files_test.py
@@ -1,3 +1,4 @@
+import os
 from mock import patch, ANY
 
 import pytest
@@ -21,6 +22,28 @@ BASE_SOURCE_PATH = '/source'
 FILE_1 = BASE_SOURCE_PATH + '/file1'
 FILE_2 = BASE_SOURCE_PATH + '/file2'
 
+
+@pytest.fixture(name='load_file_list_mock')
+def _load_file_list():
+  with patch.object(get_output_files, 'load_file_list') as m:
+    m.return_value = [FILE_1, FILE_2]
+    yield m
+
+@pytest.fixture(name='get_output_file_list_mock')
+def _get_output_file_list():
+  with patch.object(get_output_files, 'get_output_file_list') as m:
+    yield m
+
+@pytest.fixture(name='save_file_list_mock')
+def _save_file_list():
+  with patch.object(get_output_files, 'save_file_list') as m:
+    yield m
+
+@pytest.fixture(name='check_files_and_report_result_mock')
+def _check_files_and_report_result():
+  with patch.object(get_output_files, 'check_files_and_report_result') as m:
+    yield m
+
 class TestGetOutputFileList(object):
   def test_should_return_output_file_with_path_and_change_ext(self):
     assert get_output_file_list(
@@ -30,86 +53,96 @@ class TestGetOutputFileList(object):
       '.xml'
     ) == ['/output/path/file.xml']
 
+@pytest.mark.usefixtures("load_file_list_mock", "get_output_file_list_mock", "save_file_list_mock")
 class TestRun(object):
-  def test_should_pass_around_parameters(self):
-    m = get_output_files
+  def test_should_pass_around_parameters(
+    self,
+    load_file_list_mock,
+    get_output_file_list_mock,
+    save_file_list_mock):
+
+    load_file_list_mock.return_value = [FILE_1, FILE_2]
+    opt = parse_args(SOME_ARGV)
+    run(opt)
+    load_file_list_mock.assert_called_with(
+      opt.source_file_list,
+      column=opt.source_file_column,
+      limit=opt.limit
+    )
+    get_output_file_list_mock.assert_called_with(
+      load_file_list_mock.return_value,
+      BASE_SOURCE_PATH,
+      opt.output_base_path,
+      opt.output_file_suffix
+    )
+    save_file_list_mock.assert_called_with(
+      opt.output_file_list,
+      get_output_file_list_mock.return_value,
+      column=opt.source_file_column
+    )
+
+  def test_should_make_file_list_absolute_if_it_is_relative(
+    self,
+    load_file_list_mock):
+
     opt = parse_args(SOME_ARGV)
-    with patch.object(m, 'load_file_list') as load_file_list:
-      with patch.object(m, 'get_output_file_list') as get_output_file_list_mock:
-        with patch.object(m, 'save_file_list') as save_file_list:
-          load_file_list.return_value = [FILE_1, FILE_2]
-          run(opt)
-          load_file_list.assert_called_with(
-            opt.source_file_list,
-            column=opt.source_file_column,
-            limit=opt.limit
-          )
-          get_output_file_list_mock.assert_called_with(
-            load_file_list.return_value,
-            BASE_SOURCE_PATH,
-            opt.output_base_path,
-            opt.output_file_suffix
-          )
-          save_file_list.assert_called_with(
-            opt.output_file_list,
-            get_output_file_list_mock.return_value,
-            column=opt.source_file_column
-          )
+    opt.source_base_path = BASE_SOURCE_PATH
+    opt.source_file_list = 'source.tsv'
+    run(opt)
+    load_file_list_mock.assert_called_with(
+      os.path.join(opt.source_base_path, opt.source_file_list),
+      column=opt.source_file_column,
+      limit=opt.limit
+    )
 
   def test_should_raise_error_if_source_path_is_invalid(self):
-    m = get_output_files
     opt = parse_args(SOME_ARGV)
     opt.source_base_path = '/other/path'
-    with patch.object(m, 'load_file_list') as load_file_list:
-      with patch.object(m, 'get_output_file_list'):
-        with patch.object(m, 'save_file_list'):
-          with pytest.raises(AssertionError):
-            load_file_list.return_value = [FILE_1, FILE_2]
-            run(opt)
-
-  def test_should_use_passed_in_source_path_if_valid(self):
-    m = get_output_files
+    with pytest.raises(AssertionError):
+      run(opt)
+
+  def test_should_use_passed_in_source_path_if_valid(
+    self,
+    get_output_file_list_mock,
+    load_file_list_mock):
+
     opt = parse_args(SOME_ARGV)
     opt.source_base_path = '/base'
-    with patch.object(m, 'load_file_list') as load_file_list:
-      with patch.object(m, 'get_output_file_list') as get_output_file_list_mock:
-        with patch.object(m, 'save_file_list'):
-          load_file_list.return_value = ['/base/source/file1', '/base/source/file2']
-          run(opt)
-          get_output_file_list_mock.assert_called_with(
-            ANY,
-            opt.source_base_path,
-            ANY,
-            ANY
-          )
-  def test_should_check_file_list_if_enabled(self):
-    m = get_output_files
+    load_file_list_mock.return_value = ['/base/source/file1', '/base/source/file2']
+    run(opt)
+    get_output_file_list_mock.assert_called_with(
+      ANY,
+      opt.source_base_path,
+      ANY,
+      ANY
+    )
+
+  def test_should_check_file_list_if_enabled(
+    self,
+    get_output_file_list_mock,
+    check_files_and_report_result_mock):
+
     opt = parse_args(SOME_ARGV)
     opt.check = True
-    with patch.object(m, 'load_file_list') as load_file_list:
-      with patch.object(m, 'get_output_file_list') as get_output_file_list_mock:
-        with patch.object(m, 'save_file_list'):
-          with patch.object(m, 'check_files_and_report_result') as check_files_and_report_result:
-            load_file_list.return_value = [FILE_1, FILE_2]
-            run(opt)
-            check_files_and_report_result.assert_called_with(
-              get_output_file_list_mock.return_value
-            )
-
-  def test_should_limit_files_to_check(self):
-    m = get_output_files
+    run(opt)
+    check_files_and_report_result_mock.assert_called_with(
+      get_output_file_list_mock.return_value
+    )
+
+  def test_should_limit_files_to_check(
+    self,
+    load_file_list_mock,
+    get_output_file_list_mock,
+    check_files_and_report_result_mock):
+
     opt = parse_args(SOME_ARGV)
     opt.check = True
     opt.check_limit = 1
-    with patch.object(m, 'load_file_list') as load_file_list:
-      with patch.object(m, 'get_output_file_list') as get_output_file_list_mock:
-        with patch.object(m, 'save_file_list'):
-          with patch.object(m, 'check_files_and_report_result') as check_files_and_report_result:
-            load_file_list.return_value = [FILE_1, FILE_2]
-            run(opt)
-            check_files_and_report_result.assert_called_with(
-              get_output_file_list_mock.return_value[:opt.check_limit]
-            )
+    load_file_list_mock.return_value = [FILE_1, FILE_2]
+    run(opt)
+    check_files_and_report_result_mock.assert_called_with(
+      get_output_file_list_mock.return_value[:opt.check_limit]
+    )
 
 class TestMain(object):
   def test_should_parse_args_and_call_run(self):
diff --git a/sciencebeam_gym/preprocess/preprocessing_utils.py b/sciencebeam_gym/preprocess/preprocessing_utils.py
index 1ea5c1e..e5d16ec 100644
--- a/sciencebeam_gym/preprocess/preprocessing_utils.py
+++ b/sciencebeam_gym/preprocess/preprocessing_utils.py
@@ -222,7 +222,11 @@ def is_relative_path(path):
   return not path.startswith('/') and '://' not in path
 
 def join_if_relative_path(base_path, path):
-  return FileSystems.join(base_path, path) if is_relative_path(path) else path
+  return (
+    FileSystems.join(base_path, path)
+    if base_path and is_relative_path(path)
+    else path
+  )
 
 def change_ext(path, old_ext, new_ext):
   if old_ext is None:
diff --git a/sciencebeam_gym/preprocess/preprocessing_utils_test.py b/sciencebeam_gym/preprocess/preprocessing_utils_test.py
index 210cefd..befec13 100644
--- a/sciencebeam_gym/preprocess/preprocessing_utils_test.py
+++ b/sciencebeam_gym/preprocess/preprocessing_utils_test.py
@@ -12,11 +12,12 @@ from sciencebeam_gym.preprocess.preprocessing_utils import (
   svg_page_to_blockified_png_bytes,
   group_file_pairs_by_parent_directory_or_name,
   convert_pdf_bytes_to_lxml,
+  join_if_relative_path,
   change_ext,
   base_path_for_file_list,
   get_or_validate_base_path,
   get_output_file,
-  parse_page_range
+  parse_page_range,
 )
 
 PROCESSING_UTILS = 'sciencebeam_gym.preprocess.preprocessing_utils'
@@ -115,6 +116,16 @@ class TestConvertPdfBytesToLxml(object):
       )
       assert lxml_content == LXML_CONTENT_1
 
+class TestJoinIfRelativePath(object):
+  def test_should_return_path_if_base_path_is_none(self):
+    assert join_if_relative_path(None, 'file') == 'file'
+
+  def test_should_return_path_if_not_relative(self):
+    assert join_if_relative_path('/parent', '/other/file') == '/other/file'
+
+  def test_should_return_joined_path_if_relative(self):
+    assert join_if_relative_path('/parent', 'file') == '/parent/file'
+
 class TestChangeExt(object):
   def test_should_replace_simple_ext_with_simple_ext(self):
     assert change_ext('file.pdf', None, '.xml') == 'file.xml'
-- 
GitLab