From bd903d0fa902d2edff767f7b6a20bf94ae732836 Mon Sep 17 00:00:00 2001
From: Daniel Ecer <de-code@users.noreply.github.com>
Date: Fri, 1 Dec 2017 18:03:28 +0000
Subject: [PATCH] allow a page range to be specified

---
 sciencebeam_gym/beam_utils/testing.py         | 22 +++++-
 sciencebeam_gym/pdf/pdf_to_png.py             |  9 ++-
 sciencebeam_gym/pdf/pdf_to_png_test.py        | 68 +++++++++++++++++++
 .../preprocess/preprocessing_pipeline.py      | 15 +++-
 .../preprocess/preprocessing_pipeline_test.py | 57 ++++++++++++----
 .../preprocess/preprocessing_utils.py         | 24 +++++--
 .../preprocess/preprocessing_utils_test.py    | 48 ++++++++++++-
 7 files changed, 218 insertions(+), 25 deletions(-)
 create mode 100644 sciencebeam_gym/pdf/pdf_to_png_test.py

diff --git a/sciencebeam_gym/beam_utils/testing.py b/sciencebeam_gym/beam_utils/testing.py
index bf5a0bf..af5bfa3 100644
--- a/sciencebeam_gym/beam_utils/testing.py
+++ b/sciencebeam_gym/beam_utils/testing.py
@@ -3,7 +3,8 @@ from __future__ import absolute_import
 import logging
 from contextlib import contextmanager
 from io import BytesIO
-from mock import patch
+from mock import patch, Mock, MagicMock
+from mock.mock import MagicProxy
 
 import pytest
 
@@ -25,6 +26,7 @@ def get_logger():
 class TestContext(object):
   def __init__(self):
     self.file_content_map = dict()
+    self.object_map = dict()
 
   def set_file_content(self, name, content):
     get_logger().debug('set_file_content: %s (size: %d)', name, len(content))
@@ -36,6 +38,24 @@ class TestContext(object):
 def get_current_test_context():
   return _local['test_context']
 
+# Apache Beam serialises everything, pretend Mocks being serialised
+def unpickle_mock(state):
+  get_logger().debug('unpickle mock: state=%s', state)
+  obj_id = state[0] if isinstance(state, tuple) else state
+  obj = get_current_test_context().object_map[obj_id]
+  return obj
+
+unpickle_mock.__safe_for_unpickling__ = True
+
+def mock_reduce(obj):
+  obj_id = id(obj)
+  get_logger().debug('pickle mock, obj_id: %s', obj_id)
+  get_current_test_context().object_map[obj_id] = obj
+  return unpickle_mock, (obj_id,)
+
+for c in [Mock, MagicMock, MagicProxy]:
+  c.__reduce__ = mock_reduce
+
 @pytest.mark.filterwarnings('ignore::DeprecationWarning')
 @pytest.mark.filterwarnings('ignore::UserWarning')
 class BeamTest(object):
diff --git a/sciencebeam_gym/pdf/pdf_to_png.py b/sciencebeam_gym/pdf/pdf_to_png.py
index e3095aa..0bc816a 100644
--- a/sciencebeam_gym/pdf/pdf_to_png.py
+++ b/sciencebeam_gym/pdf/pdf_to_png.py
@@ -2,24 +2,27 @@ import logging
 import os
 from subprocess import Popen, PIPE
 
-from backports import tempfile
+from backports.tempfile import TemporaryDirectory
 
 def get_logger():
   return logging.getLogger(__name__)
 
 class PdfToPng(object):
-  def __init__(self, dpi=None, image_size=None):
+  def __init__(self, dpi=None, image_size=None, page_range=None):
     self.dpi = dpi
     self.image_size = image_size
+    self.page_range = page_range
 
   def iter_pdf_bytes_to_png_fp(self, pdf_bytes):
     cmd = ['pdftoppm', '-png']
+    if self.page_range:
+      cmd += ['-f', str(self.page_range[0]), '-l', str(self.page_range[1])]
     if self.image_size:
       cmd += ['-scale-to-x', str(self.image_size[0]), '-scale-to-y', str(self.image_size[1])]
     elif self.dpi:
       cmd += ['-r', str(self.dpi)]
     cmd += ['-']
-    with tempfile.TemporaryDirectory() as path:
+    with TemporaryDirectory() as path:
       cmd += [os.path.join(path, 'page')]
 
       p = Popen(cmd, stdout=PIPE, stdin=PIPE, stderr=PIPE)
diff --git a/sciencebeam_gym/pdf/pdf_to_png_test.py b/sciencebeam_gym/pdf/pdf_to_png_test.py
new file mode 100644
index 0000000..4a19024
--- /dev/null
+++ b/sciencebeam_gym/pdf/pdf_to_png_test.py
@@ -0,0 +1,68 @@
+import logging
+from subprocess import PIPE
+from contextlib import contextmanager
+from mock import patch
+
+from sciencebeam_gym.pdf.pdf_to_png import (
+  PdfToPng
+)
+
+import sciencebeam_gym.pdf.pdf_to_png as pdf_to_png
+
+TEMP_DIR = '/tmp/1'
+
+PDF_CONTENT_1 = b'pdf content 1'
+
+ARGS_PREFIX = ['pdftoppm', '-png']
+ARGS_SUFFIX = ['-', TEMP_DIR + '/page']
+DEFAULT_KWARGS = dict(stdout=PIPE, stdin=PIPE, stderr=PIPE)
+
+@contextmanager
+def patch_popen():
+  with patch.object(pdf_to_png, 'Popen') as mock:
+    p = mock.return_value
+    p.communicate.return_value = (None, None)
+    p.returncode = 0
+    yield mock
+
+@contextmanager
+def mock_temp_dir():
+  with patch.object(pdf_to_png, 'TemporaryDirectory') as mock:
+    mock.return_value.__enter__.return_value = TEMP_DIR
+    with patch('os.listdir') as listdir:
+      listdir.return_value = []
+      yield mock
+
+class TestPdfToPng(object):
+  def test_should_pass_default_args_to_Popen(self):
+    with patch_popen() as mock:
+      with mock_temp_dir():
+        list(PdfToPng().iter_pdf_bytes_to_png_fp(PDF_CONTENT_1))
+        assert mock.called
+        mock.assert_called_with(
+          ARGS_PREFIX + ARGS_SUFFIX, **DEFAULT_KWARGS
+        )
+
+  def test_should_add_page_range_to_args(self):
+    with patch_popen() as mock:
+      with mock_temp_dir():
+        list(PdfToPng(page_range=(1, 3)).iter_pdf_bytes_to_png_fp(PDF_CONTENT_1))
+        mock.assert_called_with(
+          ARGS_PREFIX + ['-f', '1', '-l', '3'] + ARGS_SUFFIX, **DEFAULT_KWARGS
+        )
+
+  def test_should_add_image_size_to_args(self):
+    with patch_popen() as mock:
+      with mock_temp_dir():
+        list(PdfToPng(image_size=(100, 200)).iter_pdf_bytes_to_png_fp(PDF_CONTENT_1))
+        mock.assert_called_with(
+          ARGS_PREFIX + ['-scale-to-x', '100', '-scale-to-y', '200'] + ARGS_SUFFIX, **DEFAULT_KWARGS
+        )
+
+  def test_should_add_dpi_to_args(self):
+    with patch_popen() as mock:
+      with mock_temp_dir():
+        list(PdfToPng(dpi=200).iter_pdf_bytes_to_png_fp(PDF_CONTENT_1))
+        mock.assert_called_with(
+          ARGS_PREFIX + ['-r', '200'] + ARGS_SUFFIX, **DEFAULT_KWARGS
+        )
diff --git a/sciencebeam_gym/preprocess/preprocessing_pipeline.py b/sciencebeam_gym/preprocess/preprocessing_pipeline.py
index 99cdf1e..48d3ce4 100644
--- a/sciencebeam_gym/preprocess/preprocessing_pipeline.py
+++ b/sciencebeam_gym/preprocess/preprocessing_pipeline.py
@@ -65,7 +65,8 @@ from sciencebeam_gym.preprocess.preprocessing_utils import (
   save_pages,
   save_svg_roots,
   filter_list_props_by_indices,
-  get_page_indices_with_min_annotation_percentage
+  get_page_indices_with_min_annotation_percentage,
+  parse_page_range
 )
 
 from sciencebeam_gym.preprocess.preprocessing_transforms import (
@@ -81,6 +82,7 @@ def configure_pipeline(p, opt):
     if opt.image_width and opt.image_height
     else None
   )
+  page_range = opt.pages
   xml_mapping = parse_xml_mapping(opt.xml_mapping_path)
   if opt.lxml_path:
     lxml_xml_file_pairs = (
@@ -146,7 +148,8 @@ def configure_pipeline(p, opt):
       "ConvertPdfToLxml" >> MapOrLog(lambda v: remove_keys_from_dict(
         extend_dict(v, {
           'lxml_content': convert_pdf_bytes_to_lxml(
-            v['pdf_content'], path=v['source_filename']
+            v['pdf_content'], path=v['source_filename'],
+            page_range=page_range
           )
         }),
         # we don't need the pdf_content unless we are writing tf_records
@@ -169,7 +172,8 @@ def configure_pipeline(p, opt):
           'pdf_png_pages':  list(pdf_bytes_to_png_pages(
             v['pdf_content'],
             dpi=opt.png_dpi,
-            image_size=image_size
+            image_size=image_size,
+            page_range=page_range
           ))
         }),
         {'pdf_content'} # we no longer need the pdf_content
@@ -425,6 +429,11 @@ def add_main_args(parser):
     help='path to xml mapping file'
   )
 
+  parser.add_argument(
+    '--pages', type=parse_page_range, default=None,
+    help='only processes the selected pages'
+  )
+
   parser.add_argument(
     '--save-tfrecords', default=False, action='store_true',
     help='Save TFRecords with PDF PNG and Annotation PNG'
diff --git a/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py b/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py
index 05191c9..be6158d 100644
--- a/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py
+++ b/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py
@@ -1,6 +1,6 @@
 from contextlib import contextmanager
 import logging
-from mock import Mock, patch, DEFAULT
+from mock import Mock, patch, DEFAULT, MagicMock
 
 import pytest
 
@@ -43,8 +43,8 @@ def get_logger():
 def fake_content(path):
   return 'fake content: %s' % path
 
-def fake_lxml_for_pdf(pdf, path):
-  return 'fake lxml for pdf: %s (%s)' % (pdf, path)
+def fake_lxml_for_pdf(pdf, path, page_range=None):
+  return 'fake lxml for pdf: %s (%s) [%s]' % (pdf, path, page_range)
 
 fake_svg_page = lambda i=0: 'fake svg page: %d' % i
 fake_pdf_png_page = lambda i=0: 'fake pdf png page: %d' % i
@@ -56,14 +56,11 @@ def get_global_tfrecords_mock():
 
 @contextmanager
 def patch_preprocessing_pipeline(**kwargs):
-  def DummyWritePropsToTFRecord(file_path, extract_props):
-    return TransformAndLog(beam.Map(
-      lambda v: get_global_tfrecords_mock()(file_path, list(extract_props(v)))
-    ), log_fn=lambda x: get_logger().info('tfrecords: %s', x))
-
   always_mock = {
     'find_file_pairs_grouped_by_parent_directory_or_name',
+    'read_all_from_path',
     'pdf_bytes_to_png_pages',
+    'convert_pdf_bytes_to_lxml',
     'convert_and_annotate_lxml_content',
     'svg_page_to_blockified_png_bytes',
     'save_svg_roots',
@@ -72,24 +69,35 @@ def patch_preprocessing_pipeline(**kwargs):
     'ReadDictCsv'
   }
   tfrecords_mock = Mock(name='tfrecords_mock')
-  get_current_test_context().tfrecords_mock = tfrecords_mock
+
+  def DummyWritePropsToTFRecord(file_path, extract_props):
+    return TransformAndLog(beam.Map(
+      lambda v: tfrecords_mock(file_path, list(extract_props(v)))
+    ), log_fn=lambda x: get_logger().info('tfrecords: %s', x))
 
   with patch.multiple(
     PREPROCESSING_PIPELINE,
-    read_all_from_path=fake_content,
-    convert_pdf_bytes_to_lxml=fake_lxml_for_pdf,
     WritePropsToTFRecord=DummyWritePropsToTFRecord,
     **{
       k: kwargs.get(k, DEFAULT)
       for k in always_mock
     }
   ) as mocks:
-    # mocks['read_all_from_path'] = lambda path: fake_content(path)
+    get_current_test_context().mocks = mocks
+    mocks['read_all_from_path'].side_effect = fake_content
+    mocks['convert_pdf_bytes_to_lxml'].side_effect = fake_lxml_for_pdf
     yield extend_dict(
       mocks,
       {'tfrecords': tfrecords_mock}
     )
 
+MIN_ARGV = [
+  '--data-path=' + BASE_DATA_PATH,
+  '--pdf-path=' + PDF_PATH,
+  '--xml-path=' + XML_PATH,
+  '--save-svg'
+]
+
 def get_default_args():
   return parse_args([
     '--data-path=' + BASE_DATA_PATH,
@@ -292,6 +300,25 @@ class TestConfigurePipeline(BeamTest):
         for i in [1]
       ])
 
+  def test_should_only_process_selected_pages(self):
+    with patch_preprocessing_pipeline() as mocks:
+      opt = get_default_args()
+      opt.save_tfrecords = True
+      opt.save_png = True
+      opt.pages = (1, 3)
+      with TestPipeline() as p:
+        mocks['find_file_pairs_grouped_by_parent_directory_or_name'].return_value = [
+          (PDF_FILE_1, XML_FILE_1)
+        ]
+        _setup_mocks_for_pages(mocks, [1, 2])
+        configure_pipeline(p, opt)
+
+      assert mocks['convert_pdf_bytes_to_lxml'].called
+      assert mocks['convert_pdf_bytes_to_lxml'].call_args[1].get('page_range') == opt.pages
+
+      assert mocks['pdf_bytes_to_png_pages'].called
+      assert mocks['pdf_bytes_to_png_pages'].call_args[1].get('page_range') == opt.pages
+
 class TestParseArgs(object):
   def test_should_raise_error_without_arguments(self):
     with pytest.raises(SystemExit):
@@ -372,3 +399,9 @@ class TestParseArgs(object):
     parse_args([
       '--data-path=test', '--pdf-xml-file-list=test', '--xml-path=test', '--save-tfrecords'
     ])
+
+  def test_should_have_none_page_range_by_default(self):
+    assert parse_args(MIN_ARGV).pages is None
+
+  def test_should_parse_pages_as_list(self):
+    assert parse_args(MIN_ARGV + ['--pages=1-3']).pages == (1, 3)
diff --git a/sciencebeam_gym/preprocess/preprocessing_utils.py b/sciencebeam_gym/preprocess/preprocessing_utils.py
index 697403e..0816aea 100644
--- a/sciencebeam_gym/preprocess/preprocessing_utils.py
+++ b/sciencebeam_gym/preprocess/preprocessing_utils.py
@@ -142,13 +142,17 @@ def find_file_pairs_grouped_by_parent_directory_or_name(patterns, limit=None):
     matching_files_by_pattern
   )
 
-def convert_pdf_bytes_to_lxml(pdf_content, path=None):
+def convert_pdf_bytes_to_lxml(pdf_content, path=None, page_range=None):
   stop_watch_recorder = StopWatchRecorder()
 
+  args = '-blocks -noImageInline -noImage -fullFontName'.split()
+  if page_range:
+    args += ['-f', str(page_range[0]), '-l', str(page_range[1])]
+
   stop_watch_recorder.start('convert to lxml')
   lxml_content = PdfToLxmlWrapper().process_input(
     pdf_content,
-    '-blocks -noImageInline -noImage -fullFontName'.split()
+    args
   )
   stop_watch_recorder.stop()
 
@@ -238,8 +242,8 @@ def save_svg_roots(output_filename, svg_pages):
     for svg_page in svg_pages
   ))
 
-def pdf_bytes_to_png_pages(pdf_bytes, dpi, image_size):
-  pdf_to_png = PdfToPng(dpi=dpi, image_size=image_size)
+def pdf_bytes_to_png_pages(pdf_bytes, dpi, image_size, page_range=None):
+  pdf_to_png = PdfToPng(dpi=dpi, image_size=image_size, page_range=page_range)
   return (
     fp.read()
     for fp in pdf_to_png.iter_pdf_bytes_to_png_fp(pdf_bytes)
@@ -288,3 +292,15 @@ def get_page_indices_with_min_annotation_percentage(
     for i, page_evaluation in enumerate(annotation_evaluation)
     if page_evaluation['percentage'].get(None) <= (1 - min_annotation_percentage)
   ]
+
+def parse_page_range(s):
+  s = s.strip()
+  if not s:
+    return None
+  a = tuple([int(x) for x in s.split('-')])
+  if len(a) == 1:
+    return (a[0], a[0])
+  elif len(a) == 2:
+    return a
+  else:
+    raise TypeError('invalid page range: %s' % s)
diff --git a/sciencebeam_gym/preprocess/preprocessing_utils_test.py b/sciencebeam_gym/preprocess/preprocessing_utils_test.py
index 3895255..59dfe62 100644
--- a/sciencebeam_gym/preprocess/preprocessing_utils_test.py
+++ b/sciencebeam_gym/preprocess/preprocessing_utils_test.py
@@ -1,4 +1,4 @@
-from mock import patch, DEFAULT
+from mock import patch, MagicMock, DEFAULT
 
 from lxml import etree
 
@@ -8,11 +8,15 @@ from sciencebeam_gym.structured_document.svg import (
 
 from sciencebeam_gym.preprocess.preprocessing_utils import (
   svg_page_to_blockified_png_bytes,
-  group_file_pairs_by_parent_directory_or_name
+  group_file_pairs_by_parent_directory_or_name,
+  convert_pdf_bytes_to_lxml,
+  parse_page_range
 )
 
 PROCESSING_UTILS = 'sciencebeam_gym.preprocess.preprocessing_utils'
 
+PDF_CONTENT_1 = b'pdf content 1'
+
 class TestSvgPageToBlockifiedPngBytes(object):
   def test_should_parse_viewbox_and_pass_width_and_height_to_annotated_blocks_to_image(self):
     with patch.multiple(PROCESSING_UTILS, annotated_blocks_to_image=DEFAULT) as mocks:
@@ -77,3 +81,43 @@ class TestGroupFilePairsByParentDirectoryOrName(object):
       ('parent1/file1.x.gz', 'parent1/file1.y.gz'),
       ('parent1/file2.x.gz', 'parent1/file2.y.gz')
     ]
+
+DEFAULT_PDF_TO_LXML_ARGS = ['-blocks', '-noImageInline', '-noImage', '-fullFontName']
+
+LXML_CONTENT_1 = b'lxml content 1'
+
+class TestConvertPdfBytesToLxml(object):
+  def test_should_pass_pdf_content_and_default_args_to_process_input(self):
+    mock = MagicMock()
+    with patch.multiple(PROCESSING_UTILS, PdfToLxmlWrapper=mock):
+      mock.return_value.process_input.return_value = LXML_CONTENT_1
+      lxml_content = convert_pdf_bytes_to_lxml(PDF_CONTENT_1)
+      mock.return_value.process_input.assert_called_with(
+        PDF_CONTENT_1,
+        DEFAULT_PDF_TO_LXML_ARGS
+      )
+      assert lxml_content == LXML_CONTENT_1
+
+  def test_should_pass_include_page_range_in_args(self):
+    mock = MagicMock()
+    with patch.multiple(PROCESSING_UTILS, PdfToLxmlWrapper=mock):
+      mock.return_value.process_input.return_value = LXML_CONTENT_1
+      lxml_content = convert_pdf_bytes_to_lxml(PDF_CONTENT_1, page_range=(1, 3))
+      mock.return_value.process_input.assert_called_with(
+        PDF_CONTENT_1,
+        DEFAULT_PDF_TO_LXML_ARGS + ['-f', '1', '-l', '3']
+      )
+      assert lxml_content == LXML_CONTENT_1
+
+class TestPageRange(object):
+  def test_should_parse_single_page_number_as_range(self):
+    assert parse_page_range('1') == (1, 1)
+
+  def test_should_parse_range_with_hyphen(self):
+    assert parse_page_range('1-3') == (1, 3)
+
+  def test_should_parse_range_with_spaces(self):
+    assert parse_page_range(' 1 - 3 ') == (1, 3)
+
+  def test_should_return_none_for_empty_range(self):
+    assert parse_page_range('') is None
-- 
GitLab