allow a page range to be specified

bd903d0f · Daniel Ecer · 52f1f762 · bd903d0f · bd903d0f · bd903d0f
Commit bd903d0f authored 7 years ago by Daniel Ecer
--- a/sciencebeam_gym/beam_utils/testing.py
+++ b/sciencebeam_gym/beam_utils/testing.py
@@ -3,7 +3,8 @@ from __future__ import absolute_import
 import logging
 from contextlib import contextmanager
 from io import BytesIO
-from mock import patch
+from mock import patch, Mock, MagicMock
+from mock.mock import MagicProxy

 import pytest

@@ -25,6 +26,7 @@ def get_logger():
 class TestContext(object):
  def __init__(self):
    self.file_content_map = dict()
+    self.object_map = dict()

  def set_file_content(self, name, content):
    get_logger().debug('set_file_content: %s (size: %d)', name, len(content))
@@ -36,6 +38,24 @@ class TestContext(object):
 def get_current_test_context():
  return _local['test_context']

+# Apache Beam serialises everything, so pretend Mocks are serialised (they are stored and looked up by id)
+def unpickle_mock(state):
+  get_logger().debug('unpickle mock: state=%s', state)
+  obj_id = state[0] if isinstance(state, tuple) else state
+  obj = get_current_test_context().object_map[obj_id]
+  return obj
+
+unpickle_mock.__safe_for_unpickling__ = True
+
+def mock_reduce(obj):
+  obj_id = id(obj)
+  get_logger().debug('pickle mock, obj_id: %s', obj_id)
+  get_current_test_context().object_map[obj_id] = obj
+  return unpickle_mock, (obj_id,)
+
+for c in [Mock, MagicMock, MagicProxy]:
+  c.__reduce__ = mock_reduce
+
 @pytest.mark.filterwarnings('ignore::DeprecationWarning')
 @pytest.mark.filterwarnings('ignore::UserWarning')
 class BeamTest(object):

--- a/sciencebeam_gym/pdf/pdf_to_png.py
+++ b/sciencebeam_gym/pdf/pdf_to_png.py
@@ -2,24 +2,27 @@ import logging
 import os
 from subprocess import Popen, PIPE

-from backports import tempfile
+from backports.tempfile import TemporaryDirectory

 def get_logger():
  return logging.getLogger(__name__)

 class PdfToPng(object):
-  def __init__(self, dpi=None, image_size=None):
+  def __init__(self, dpi=None, image_size=None, page_range=None):
    self.dpi = dpi
    self.image_size = image_size
+    self.page_range = page_range

  def iter_pdf_bytes_to_png_fp(self, pdf_bytes):
    cmd = ['pdftoppm', '-png']
+    if self.page_range:
+      cmd += ['-f', str(self.page_range[0]), '-l', str(self.page_range[1])]
    if self.image_size:
      cmd += ['-scale-to-x', str(self.image_size[0]), '-scale-to-y', str(self.image_size[1])]
    elif self.dpi:
      cmd += ['-r', str(self.dpi)]
    cmd += ['-']
-    with tempfile.TemporaryDirectory() as path:
+    with TemporaryDirectory() as path:
      cmd += [os.path.join(path, 'page')]

      p = Popen(cmd, stdout=PIPE, stdin=PIPE, stderr=PIPE)

--- a/sciencebeam_gym/pdf/pdf_to_png_test.py
+++ b/sciencebeam_gym/pdf/pdf_to_png_test.py
+import logging
+from subprocess import PIPE
+from contextlib import contextmanager
+from mock import patch
+
+from sciencebeam_gym.pdf.pdf_to_png import (
+  PdfToPng
+)
+
+import sciencebeam_gym.pdf.pdf_to_png as pdf_to_png
+
+TEMP_DIR = '/tmp/1'
+
+PDF_CONTENT_1 = b'pdf content 1'
+
+ARGS_PREFIX = ['pdftoppm', '-png']
+ARGS_SUFFIX = ['-', TEMP_DIR + '/page']
+DEFAULT_KWARGS = dict(stdout=PIPE, stdin=PIPE, stderr=PIPE)
+
+@contextmanager
+def patch_popen():
+  with patch.object(pdf_to_png, 'Popen') as mock:
+    p = mock.return_value
+    p.communicate.return_value = (None, None)
+    p.returncode = 0
+    yield mock
+
+@contextmanager
+def mock_temp_dir():
+  with patch.object(pdf_to_png, 'TemporaryDirectory') as mock:
+    mock.return_value.__enter__.return_value = TEMP_DIR
+    with patch('os.listdir') as listdir:
+      listdir.return_value = []
+      yield mock
+
+class TestPdfToPng(object):
+  def test_should_pass_default_args_to_Popen(self):
+    with patch_popen() as mock:
+      with mock_temp_dir():
+        list(PdfToPng().iter_pdf_bytes_to_png_fp(PDF_CONTENT_1))
+        assert mock.called
+        mock.assert_called_with(
+          ARGS_PREFIX + ARGS_SUFFIX, **DEFAULT_KWARGS
+        )
+
+  def test_should_add_page_range_to_args(self):
+    with patch_popen() as mock:
+      with mock_temp_dir():
+        list(PdfToPng(page_range=(1, 3)).iter_pdf_bytes_to_png_fp(PDF_CONTENT_1))
+        mock.assert_called_with(
+          ARGS_PREFIX + ['-f', '1', '-l', '3'] + ARGS_SUFFIX, **DEFAULT_KWARGS
+        )
+
+  def test_should_add_image_size_to_args(self):
+    with patch_popen() as mock:
+      with mock_temp_dir():
+        list(PdfToPng(image_size=(100, 200)).iter_pdf_bytes_to_png_fp(PDF_CONTENT_1))
+        mock.assert_called_with(
+          ARGS_PREFIX + ['-scale-to-x', '100', '-scale-to-y', '200'] + ARGS_SUFFIX, **DEFAULT_KWARGS
+        )
+
+  def test_should_add_dpi_to_args(self):
+    with patch_popen() as mock:
+      with mock_temp_dir():
+        list(PdfToPng(dpi=200).iter_pdf_bytes_to_png_fp(PDF_CONTENT_1))
+        mock.assert_called_with(
+          ARGS_PREFIX + ['-r', '200'] + ARGS_SUFFIX, **DEFAULT_KWARGS
+        )
--- a/sciencebeam_gym/preprocess/preprocessing_pipeline.py
+++ b/sciencebeam_gym/preprocess/preprocessing_pipeline.py
@@ -65,7 +65,8 @@ from sciencebeam_gym.preprocess.preprocessing_utils import (
  save_pages,
  save_svg_roots,
  filter_list_props_by_indices,
-  get_page_indices_with_min_annotation_percentage
+  get_page_indices_with_min_annotation_percentage,
+  parse_page_range
 )

 from sciencebeam_gym.preprocess.preprocessing_transforms import (
@@ -81,6 +82,7 @@ def configure_pipeline(p, opt):
    if opt.image_width and opt.image_height
    else None
  )
+  page_range = opt.pages
  xml_mapping = parse_xml_mapping(opt.xml_mapping_path)
  if opt.lxml_path:
    lxml_xml_file_pairs = (
@@ -146,7 +148,8 @@ def configure_pipeline(p, opt):
      "ConvertPdfToLxml" >> MapOrLog(lambda v: remove_keys_from_dict(
        extend_dict(v, {
          'lxml_content': convert_pdf_bytes_to_lxml(
-            v['pdf_content'], path=v['source_filename']
+            v['pdf_content'], path=v['source_filename'],
+            page_range=page_range
          )
        }),
        # we don't need the pdf_content unless we are writing tf_records
@@ -169,7 +172,8 @@ def configure_pipeline(p, opt):
          'pdf_png_pages':  list(pdf_bytes_to_png_pages(
            v['pdf_content'],
            dpi=opt.png_dpi,
-            image_size=image_size
+            image_size=image_size,
+            page_range=page_range
          ))
        }),
        {'pdf_content'} # we no longer need the pdf_content
@@ -425,6 +429,11 @@ def add_main_args(parser):
    help='path to xml mapping file'
  )

+  parser.add_argument(
+    '--pages', type=parse_page_range, default=None,
+    help='only processes the selected pages'
+  )
+
  parser.add_argument(
    '--save-tfrecords', default=False, action='store_true',
    help='Save TFRecords with PDF PNG and Annotation PNG'

--- a/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py
+++ b/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py
 from contextlib import contextmanager
 import logging
-from mock import Mock, patch, DEFAULT
+from mock import Mock, patch, DEFAULT, MagicMock

 import pytest

@@ -43,8 +43,8 @@ def get_logger():
 def fake_content(path):
  return 'fake content: %s' % path

-def fake_lxml_for_pdf(pdf, path):
-  return 'fake lxml for pdf: %s (%s)' % (pdf, path)
+def fake_lxml_for_pdf(pdf, path, page_range=None):
+  return 'fake lxml for pdf: %s (%s) [%s]' % (pdf, path, page_range)

 fake_svg_page = lambda i=0: 'fake svg page: %d' % i
 fake_pdf_png_page = lambda i=0: 'fake pdf png page: %d' % i
@@ -56,14 +56,11 @@ def get_global_tfrecords_mock():

 @contextmanager
 def patch_preprocessing_pipeline(**kwargs):
-  def DummyWritePropsToTFRecord(file_path, extract_props):
-    return TransformAndLog(beam.Map(
-      lambda v: get_global_tfrecords_mock()(file_path, list(extract_props(v)))
-    ), log_fn=lambda x: get_logger().info('tfrecords: %s', x))
-
  always_mock = {
    'find_file_pairs_grouped_by_parent_directory_or_name',
+    'read_all_from_path',
    'pdf_bytes_to_png_pages',
+    'convert_pdf_bytes_to_lxml',
    'convert_and_annotate_lxml_content',
    'svg_page_to_blockified_png_bytes',
    'save_svg_roots',
@@ -72,24 +69,35 @@ def patch_preprocessing_pipeline(**kwargs):
    'ReadDictCsv'
  }
  tfrecords_mock = Mock(name='tfrecords_mock')
-  get_current_test_context().tfrecords_mock = tfrecords_mock
+
+  def DummyWritePropsToTFRecord(file_path, extract_props):
+    return TransformAndLog(beam.Map(
+      lambda v: tfrecords_mock(file_path, list(extract_props(v)))
+    ), log_fn=lambda x: get_logger().info('tfrecords: %s', x))

  with patch.multiple(
    PREPROCESSING_PIPELINE,
-    read_all_from_path=fake_content,
-    convert_pdf_bytes_to_lxml=fake_lxml_for_pdf,
    WritePropsToTFRecord=DummyWritePropsToTFRecord,
    **{
      k: kwargs.get(k, DEFAULT)
      for k in always_mock
    }
  ) as mocks:
-    # mocks['read_all_from_path'] = lambda path: fake_content(path)
+    get_current_test_context().mocks = mocks
+    mocks['read_all_from_path'].side_effect = fake_content
+    mocks['convert_pdf_bytes_to_lxml'].side_effect = fake_lxml_for_pdf
    yield extend_dict(
      mocks,
      {'tfrecords': tfrecords_mock}
    )

+MIN_ARGV = [
+  '--data-path=' + BASE_DATA_PATH,
+  '--pdf-path=' + PDF_PATH,
+  '--xml-path=' + XML_PATH,
+  '--save-svg'
+]
+
 def get_default_args():
  return parse_args([
    '--data-path=' + BASE_DATA_PATH,
@@ -292,6 +300,25 @@ class TestConfigurePipeline(BeamTest):
        for i in [1]
      ])

+  def test_should_only_process_selected_pages(self):
+    with patch_preprocessing_pipeline() as mocks:
+      opt = get_default_args()
+      opt.save_tfrecords = True
+      opt.save_png = True
+      opt.pages = (1, 3)
+      with TestPipeline() as p:
+        mocks['find_file_pairs_grouped_by_parent_directory_or_name'].return_value = [
+          (PDF_FILE_1, XML_FILE_1)
+        ]
+        _setup_mocks_for_pages(mocks, [1, 2])
+        configure_pipeline(p, opt)
+
+      assert mocks['convert_pdf_bytes_to_lxml'].called
+      assert mocks['convert_pdf_bytes_to_lxml'].call_args[1].get('page_range') == opt.pages
+
+      assert mocks['pdf_bytes_to_png_pages'].called
+      assert mocks['pdf_bytes_to_png_pages'].call_args[1].get('page_range') == opt.pages
+
 class TestParseArgs(object):
  def test_should_raise_error_without_arguments(self):
    with pytest.raises(SystemExit):
@@ -372,3 +399,9 @@ class TestParseArgs(object):
    parse_args([
      '--data-path=test', '--pdf-xml-file-list=test', '--xml-path=test', '--save-tfrecords'
    ])
+
+  def test_should_have_none_page_range_by_default(self):
+    assert parse_args(MIN_ARGV).pages is None
+
+  def test_should_parse_pages_as_list(self):
+    assert parse_args(MIN_ARGV + ['--pages=1-3']).pages == (1, 3)
--- a/sciencebeam_gym/preprocess/preprocessing_utils.py
+++ b/sciencebeam_gym/preprocess/preprocessing_utils.py
@@ -142,13 +142,17 @@ def find_file_pairs_grouped_by_parent_directory_or_name(patterns, limit=None):
    matching_files_by_pattern
  )

-def convert_pdf_bytes_to_lxml(pdf_content, path=None):
+def convert_pdf_bytes_to_lxml(pdf_content, path=None, page_range=None):
  stop_watch_recorder = StopWatchRecorder()

+  args = '-blocks -noImageInline -noImage -fullFontName'.split()
+  if page_range:
+    args += ['-f', str(page_range[0]), '-l', str(page_range[1])]
+
  stop_watch_recorder.start('convert to lxml')
  lxml_content = PdfToLxmlWrapper().process_input(
    pdf_content,
-    '-blocks -noImageInline -noImage -fullFontName'.split()
+    args
  )
  stop_watch_recorder.stop()

@@ -238,8 +242,8 @@ def save_svg_roots(output_filename, svg_pages):
    for svg_page in svg_pages
  ))

-def pdf_bytes_to_png_pages(pdf_bytes, dpi, image_size):
-  pdf_to_png = PdfToPng(dpi=dpi, image_size=image_size)
+def pdf_bytes_to_png_pages(pdf_bytes, dpi, image_size, page_range=None):
+  pdf_to_png = PdfToPng(dpi=dpi, image_size=image_size, page_range=page_range)
  return (
    fp.read()
    for fp in pdf_to_png.iter_pdf_bytes_to_png_fp(pdf_bytes)
@@ -288,3 +292,15 @@ def get_page_indices_with_min_annotation_percentage(
    for i, page_evaluation in enumerate(annotation_evaluation)
    if page_evaluation['percentage'].get(None) <= (1 - min_annotation_percentage)
  ]
+
+def parse_page_range(s):
+  s = s.strip()
+  if not s:
+    return None
+  a = tuple([int(x) for x in s.split('-')])
+  if len(a) == 1:
+    return (a[0], a[0])
+  elif len(a) == 2:
+    return a
+  else:
+    raise TypeError('invalid page range: %s' % s)
--- a/sciencebeam_gym/preprocess/preprocessing_utils_test.py
+++ b/sciencebeam_gym/preprocess/preprocessing_utils_test.py
-from mock import patch, DEFAULT
+from mock import patch, MagicMock, DEFAULT

 from lxml import etree

@@ -8,11 +8,15 @@ from sciencebeam_gym.structured_document.svg import (

 from sciencebeam_gym.preprocess.preprocessing_utils import (
  svg_page_to_blockified_png_bytes,
-  group_file_pairs_by_parent_directory_or_name
+  group_file_pairs_by_parent_directory_or_name,
+  convert_pdf_bytes_to_lxml,
+  parse_page_range
 )

 PROCESSING_UTILS = 'sciencebeam_gym.preprocess.preprocessing_utils'

+PDF_CONTENT_1 = b'pdf content 1'
+
 class TestSvgPageToBlockifiedPngBytes(object):
  def test_should_parse_viewbox_and_pass_width_and_height_to_annotated_blocks_to_image(self):
    with patch.multiple(PROCESSING_UTILS, annotated_blocks_to_image=DEFAULT) as mocks:
@@ -77,3 +81,43 @@ class TestGroupFilePairsByParentDirectoryOrName(object):
      ('parent1/file1.x.gz', 'parent1/file1.y.gz'),
      ('parent1/file2.x.gz', 'parent1/file2.y.gz')
    ]
+
+DEFAULT_PDF_TO_LXML_ARGS = ['-blocks', '-noImageInline', '-noImage', '-fullFontName']
+
+LXML_CONTENT_1 = b'lxml content 1'
+
+class TestConvertPdfBytesToLxml(object):
+  def test_should_pass_pdf_content_and_default_args_to_process_input(self):
+    mock = MagicMock()
+    with patch.multiple(PROCESSING_UTILS, PdfToLxmlWrapper=mock):
+      mock.return_value.process_input.return_value = LXML_CONTENT_1
+      lxml_content = convert_pdf_bytes_to_lxml(PDF_CONTENT_1)
+      mock.return_value.process_input.assert_called_with(
+        PDF_CONTENT_1,
+        DEFAULT_PDF_TO_LXML_ARGS
+      )
+      assert lxml_content == LXML_CONTENT_1
+
+  def test_should_pass_include_page_range_in_args(self):
+    mock = MagicMock()
+    with patch.multiple(PROCESSING_UTILS, PdfToLxmlWrapper=mock):
+      mock.return_value.process_input.return_value = LXML_CONTENT_1
+      lxml_content = convert_pdf_bytes_to_lxml(PDF_CONTENT_1, page_range=(1, 3))
+      mock.return_value.process_input.assert_called_with(
+        PDF_CONTENT_1,
+        DEFAULT_PDF_TO_LXML_ARGS + ['-f', '1', '-l', '3']
+      )
+      assert lxml_content == LXML_CONTENT_1
+
+class TestPageRange(object):
+  def test_should_parse_single_page_number_as_range(self):
+    assert parse_page_range('1') == (1, 1)
+
+  def test_should_parse_range_with_hyphen(self):
+    assert parse_page_range('1-3') == (1, 3)
+
+  def test_should_parse_range_with_spaces(self):
+    assert parse_page_range(' 1 - 3 ') == (1, 3)
+
+  def test_should_return_none_for_empty_range(self):
+    assert parse_page_range('') is None