From bd903d0fa902d2edff767f7b6a20bf94ae732836 Mon Sep 17 00:00:00 2001 From: Daniel Ecer <de-code@users.noreply.github.com> Date: Fri, 1 Dec 2017 18:03:28 +0000 Subject: [PATCH] allow a page range to be specified --- sciencebeam_gym/beam_utils/testing.py | 22 +++++- sciencebeam_gym/pdf/pdf_to_png.py | 9 ++- sciencebeam_gym/pdf/pdf_to_png_test.py | 68 +++++++++++++++++++ .../preprocess/preprocessing_pipeline.py | 15 +++- .../preprocess/preprocessing_pipeline_test.py | 57 ++++++++++++---- .../preprocess/preprocessing_utils.py | 24 +++++-- .../preprocess/preprocessing_utils_test.py | 48 ++++++++++++- 7 files changed, 218 insertions(+), 25 deletions(-) create mode 100644 sciencebeam_gym/pdf/pdf_to_png_test.py diff --git a/sciencebeam_gym/beam_utils/testing.py b/sciencebeam_gym/beam_utils/testing.py index bf5a0bf..af5bfa3 100644 --- a/sciencebeam_gym/beam_utils/testing.py +++ b/sciencebeam_gym/beam_utils/testing.py @@ -3,7 +3,8 @@ from __future__ import absolute_import import logging from contextlib import contextmanager from io import BytesIO -from mock import patch +from mock import patch, Mock, MagicMock +from mock.mock import MagicProxy import pytest @@ -25,6 +26,7 @@ def get_logger(): class TestContext(object): def __init__(self): self.file_content_map = dict() + self.object_map = dict() def set_file_content(self, name, content): get_logger().debug('set_file_content: %s (size: %d)', name, len(content)) @@ -36,6 +38,24 @@ class TestContext(object): def get_current_test_context(): return _local['test_context'] +# Apache Beam serialises everything, pretend Mocks being serialised +def unpickle_mock(state): + get_logger().debug('unpickle mock: state=%s', state) + obj_id = state[0] if isinstance(state, tuple) else state + obj = get_current_test_context().object_map[obj_id] + return obj + +unpickle_mock.__safe_for_unpickling__ = True + +def mock_reduce(obj): + obj_id = id(obj) + get_logger().debug('pickle mock, obj_id: %s', obj_id) + get_current_test_context().object_map[obj_id] = obj + return unpickle_mock, (obj_id,) + +for c in [Mock, MagicMock, MagicProxy]: + c.__reduce__ = mock_reduce + @pytest.mark.filterwarnings('ignore::DeprecationWarning') @pytest.mark.filterwarnings('ignore::UserWarning') class BeamTest(object): diff --git a/sciencebeam_gym/pdf/pdf_to_png.py b/sciencebeam_gym/pdf/pdf_to_png.py index e3095aa..0bc816a 100644 --- a/sciencebeam_gym/pdf/pdf_to_png.py +++ b/sciencebeam_gym/pdf/pdf_to_png.py @@ -2,24 +2,27 @@ import logging import os from subprocess import Popen, PIPE -from backports import tempfile +from backports.tempfile import TemporaryDirectory def get_logger(): return logging.getLogger(__name__) class PdfToPng(object): - def __init__(self, dpi=None, image_size=None): + def __init__(self, dpi=None, image_size=None, page_range=None): self.dpi = dpi self.image_size = image_size + self.page_range = page_range def iter_pdf_bytes_to_png_fp(self, pdf_bytes): cmd = ['pdftoppm', '-png'] + if self.page_range: + cmd += ['-f', str(self.page_range[0]), '-l', str(self.page_range[1])] if self.image_size: cmd += ['-scale-to-x', str(self.image_size[0]), '-scale-to-y', str(self.image_size[1])] elif self.dpi: cmd += ['-r', str(self.dpi)] cmd += ['-'] - with tempfile.TemporaryDirectory() as path: + with TemporaryDirectory() as path: cmd += [os.path.join(path, 'page')] p = Popen(cmd, stdout=PIPE, stdin=PIPE, stderr=PIPE) diff --git a/sciencebeam_gym/pdf/pdf_to_png_test.py b/sciencebeam_gym/pdf/pdf_to_png_test.py new file mode 100644 index 0000000..4a19024 --- /dev/null +++ b/sciencebeam_gym/pdf/pdf_to_png_test.py @@ -0,0 +1,68 @@ +import logging +from subprocess import PIPE +from contextlib import contextmanager +from mock import patch + +from sciencebeam_gym.pdf.pdf_to_png import ( + PdfToPng +) + +import sciencebeam_gym.pdf.pdf_to_png as pdf_to_png + +TEMP_DIR = '/tmp/1' + +PDF_CONTENT_1 = b'pdf content 1' + +ARGS_PREFIX = ['pdftoppm', '-png'] +ARGS_SUFFIX = ['-', TEMP_DIR + '/page'] +DEFAULT_KWARGS = dict(stdout=PIPE, stdin=PIPE, stderr=PIPE) + +@contextmanager +def patch_popen(): + with patch.object(pdf_to_png, 'Popen') as mock: + p = mock.return_value + p.communicate.return_value = (None, None) + p.returncode = 0 + yield mock + +@contextmanager +def mock_temp_dir(): + with patch.object(pdf_to_png, 'TemporaryDirectory') as mock: + mock.return_value.__enter__.return_value = TEMP_DIR + with patch('os.listdir') as listdir: + listdir.return_value = [] + yield mock + +class TestPdfToPng(object): + def test_should_pass_default_args_to_Popen(self): + with patch_popen() as mock: + with mock_temp_dir(): + list(PdfToPng().iter_pdf_bytes_to_png_fp(PDF_CONTENT_1)) + assert mock.called + mock.assert_called_with( + ARGS_PREFIX + ARGS_SUFFIX, **DEFAULT_KWARGS + ) + + def test_should_add_page_range_to_args(self): + with patch_popen() as mock: + with mock_temp_dir(): + list(PdfToPng(page_range=(1, 3)).iter_pdf_bytes_to_png_fp(PDF_CONTENT_1)) + mock.assert_called_with( + ARGS_PREFIX + ['-f', '1', '-l', '3'] + ARGS_SUFFIX, **DEFAULT_KWARGS + ) + + def test_should_add_image_size_to_args(self): + with patch_popen() as mock: + with mock_temp_dir(): + list(PdfToPng(image_size=(100, 200)).iter_pdf_bytes_to_png_fp(PDF_CONTENT_1)) + mock.assert_called_with( + ARGS_PREFIX + ['-scale-to-x', '100', '-scale-to-y', '200'] + ARGS_SUFFIX, **DEFAULT_KWARGS + ) + + def test_should_add_dpi_to_args(self): + with patch_popen() as mock: + with mock_temp_dir(): + list(PdfToPng(dpi=200).iter_pdf_bytes_to_png_fp(PDF_CONTENT_1)) + mock.assert_called_with( + ARGS_PREFIX + ['-r', '200'] + ARGS_SUFFIX, **DEFAULT_KWARGS + ) diff --git a/sciencebeam_gym/preprocess/preprocessing_pipeline.py b/sciencebeam_gym/preprocess/preprocessing_pipeline.py index 99cdf1e..48d3ce4 100644 --- a/sciencebeam_gym/preprocess/preprocessing_pipeline.py +++ b/sciencebeam_gym/preprocess/preprocessing_pipeline.py @@ -65,7 +65,8 @@ from sciencebeam_gym.preprocess.preprocessing_utils import ( save_pages, save_svg_roots, filter_list_props_by_indices, - get_page_indices_with_min_annotation_percentage + get_page_indices_with_min_annotation_percentage, + parse_page_range ) from sciencebeam_gym.preprocess.preprocessing_transforms import ( @@ -81,6 +82,7 @@ def configure_pipeline(p, opt): if opt.image_width and opt.image_height else None ) + page_range = opt.pages xml_mapping = parse_xml_mapping(opt.xml_mapping_path) if opt.lxml_path: lxml_xml_file_pairs = ( @@ -146,7 +148,8 @@ def configure_pipeline(p, opt): "ConvertPdfToLxml" >> MapOrLog(lambda v: remove_keys_from_dict( extend_dict(v, { 'lxml_content': convert_pdf_bytes_to_lxml( - v['pdf_content'], path=v['source_filename'] + v['pdf_content'], path=v['source_filename'], + page_range=page_range ) }), # we don't need the pdf_content unless we are writing tf_records @@ -169,7 +172,8 @@ def configure_pipeline(p, opt): 'pdf_png_pages': list(pdf_bytes_to_png_pages( v['pdf_content'], dpi=opt.png_dpi, - image_size=image_size + image_size=image_size, + page_range=page_range )) }), {'pdf_content'} # we no longer need the pdf_content @@ -425,6 +429,11 @@ def add_main_args(parser): help='path to xml mapping file' ) + parser.add_argument( + '--pages', type=parse_page_range, default=None, + help='only processes the selected pages' + ) + parser.add_argument( '--save-tfrecords', default=False, action='store_true', help='Save TFRecords with PDF PNG and Annotation PNG' diff --git a/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py b/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py index 05191c9..be6158d 100644 --- a/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py +++ b/sciencebeam_gym/preprocess/preprocessing_pipeline_test.py @@ -1,6 +1,6 @@ from contextlib import contextmanager import logging -from mock import Mock, patch, DEFAULT +from mock import Mock, patch, DEFAULT, MagicMock import pytest @@ -43,8 +43,8 @@ def get_logger(): def fake_content(path): return 'fake content: %s' % path -def fake_lxml_for_pdf(pdf, path): - return 'fake lxml for pdf: %s (%s)' % (pdf, path) +def fake_lxml_for_pdf(pdf, path, page_range=None): + return 'fake lxml for pdf: %s (%s) [%s]' % (pdf, path, page_range) fake_svg_page = lambda i=0: 'fake svg page: %d' % i fake_pdf_png_page = lambda i=0: 'fake pdf png page: %d' % i @@ -56,14 +56,11 @@ def get_global_tfrecords_mock(): @contextmanager def patch_preprocessing_pipeline(**kwargs): - def DummyWritePropsToTFRecord(file_path, extract_props): - return TransformAndLog(beam.Map( - lambda v: get_global_tfrecords_mock()(file_path, list(extract_props(v))) - ), log_fn=lambda x: get_logger().info('tfrecords: %s', x)) - always_mock = { 'find_file_pairs_grouped_by_parent_directory_or_name', + 'read_all_from_path', 'pdf_bytes_to_png_pages', + 'convert_pdf_bytes_to_lxml', 'convert_and_annotate_lxml_content', 'svg_page_to_blockified_png_bytes', 'save_svg_roots', @@ -72,24 +69,35 @@ def patch_preprocessing_pipeline(**kwargs): 'ReadDictCsv' } tfrecords_mock = Mock(name='tfrecords_mock') - get_current_test_context().tfrecords_mock = tfrecords_mock + + def DummyWritePropsToTFRecord(file_path, extract_props): + return TransformAndLog(beam.Map( + lambda v: tfrecords_mock(file_path, list(extract_props(v))) + ), log_fn=lambda x: get_logger().info('tfrecords: %s', x)) with patch.multiple( PREPROCESSING_PIPELINE, - read_all_from_path=fake_content, - convert_pdf_bytes_to_lxml=fake_lxml_for_pdf, WritePropsToTFRecord=DummyWritePropsToTFRecord, **{ k: kwargs.get(k, DEFAULT) for k in always_mock } ) as mocks: - # mocks['read_all_from_path'] = lambda path: fake_content(path) + get_current_test_context().mocks = mocks + mocks['read_all_from_path'].side_effect = fake_content + mocks['convert_pdf_bytes_to_lxml'].side_effect = fake_lxml_for_pdf yield extend_dict( mocks, {'tfrecords': tfrecords_mock} ) +MIN_ARGV = [ + '--data-path=' + BASE_DATA_PATH, + '--pdf-path=' + PDF_PATH, + '--xml-path=' + XML_PATH, + '--save-svg' +] + def get_default_args(): return parse_args([ '--data-path=' + BASE_DATA_PATH, @@ -292,6 +300,25 @@ class TestConfigurePipeline(BeamTest): for i in [1] ]) + def test_should_only_process_selected_pages(self): + with patch_preprocessing_pipeline() as mocks: + opt = get_default_args() + opt.save_tfrecords = True + opt.save_png = True + opt.pages = (1, 3) + with TestPipeline() as p: + mocks['find_file_pairs_grouped_by_parent_directory_or_name'].return_value = [ + (PDF_FILE_1, XML_FILE_1) + ] + _setup_mocks_for_pages(mocks, [1, 2]) + configure_pipeline(p, opt) + + assert mocks['convert_pdf_bytes_to_lxml'].called + assert mocks['convert_pdf_bytes_to_lxml'].call_args[1].get('page_range') == opt.pages + + assert mocks['pdf_bytes_to_png_pages'].called + assert mocks['pdf_bytes_to_png_pages'].call_args[1].get('page_range') == opt.pages + class TestParseArgs(object): def test_should_raise_error_without_arguments(self): with pytest.raises(SystemExit): @@ -372,3 +399,9 @@ class TestParseArgs(object): parse_args([ '--data-path=test', '--pdf-xml-file-list=test', '--xml-path=test', '--save-tfrecords' ]) + + def test_should_have_none_page_range_by_default(self): + assert parse_args(MIN_ARGV).pages is None + + def test_should_parse_pages_as_list(self): + assert parse_args(MIN_ARGV + ['--pages=1-3']).pages == (1, 3) diff --git a/sciencebeam_gym/preprocess/preprocessing_utils.py b/sciencebeam_gym/preprocess/preprocessing_utils.py index 697403e..0816aea 100644 --- a/sciencebeam_gym/preprocess/preprocessing_utils.py +++ b/sciencebeam_gym/preprocess/preprocessing_utils.py @@ -142,13 +142,17 @@ def find_file_pairs_grouped_by_parent_directory_or_name(patterns, limit=None): matching_files_by_pattern ) -def convert_pdf_bytes_to_lxml(pdf_content, path=None): +def convert_pdf_bytes_to_lxml(pdf_content, path=None, page_range=None): stop_watch_recorder = StopWatchRecorder() + args = '-blocks -noImageInline -noImage -fullFontName'.split() + if page_range: + args += ['-f', str(page_range[0]), '-l', str(page_range[1])] + stop_watch_recorder.start('convert to lxml') lxml_content = PdfToLxmlWrapper().process_input( pdf_content, - '-blocks -noImageInline -noImage -fullFontName'.split() + args ) stop_watch_recorder.stop() @@ -238,8 +242,8 @@ def save_svg_roots(output_filename, svg_pages): for svg_page in svg_pages )) -def pdf_bytes_to_png_pages(pdf_bytes, dpi, image_size): - pdf_to_png = PdfToPng(dpi=dpi, image_size=image_size) +def pdf_bytes_to_png_pages(pdf_bytes, dpi, image_size, page_range=None): + pdf_to_png = PdfToPng(dpi=dpi, image_size=image_size, page_range=page_range) return ( fp.read() for fp in pdf_to_png.iter_pdf_bytes_to_png_fp(pdf_bytes) @@ -288,3 +292,15 @@ def get_page_indices_with_min_annotation_percentage( for i, page_evaluation in enumerate(annotation_evaluation) if page_evaluation['percentage'].get(None) <= (1 - min_annotation_percentage) ] + +def parse_page_range(s): + s = s.strip() + if not s: + return None + a = tuple([int(x) for x in s.split('-')]) + if len(a) == 1: + return (a[0], a[0]) + elif len(a) == 2: + return a + else: + raise TypeError('invalid page range: %s' % s) diff --git a/sciencebeam_gym/preprocess/preprocessing_utils_test.py b/sciencebeam_gym/preprocess/preprocessing_utils_test.py index 3895255..59dfe62 100644 --- a/sciencebeam_gym/preprocess/preprocessing_utils_test.py +++ b/sciencebeam_gym/preprocess/preprocessing_utils_test.py @@ -1,4 +1,4 @@ -from mock import patch, DEFAULT +from mock import patch, MagicMock, DEFAULT from lxml import etree @@ -8,11 +8,15 @@ from sciencebeam_gym.structured_document.svg import ( from sciencebeam_gym.preprocess.preprocessing_utils import ( svg_page_to_blockified_png_bytes, - group_file_pairs_by_parent_directory_or_name + group_file_pairs_by_parent_directory_or_name, + convert_pdf_bytes_to_lxml, + parse_page_range ) PROCESSING_UTILS = 'sciencebeam_gym.preprocess.preprocessing_utils' +PDF_CONTENT_1 = b'pdf content 1' + class TestSvgPageToBlockifiedPngBytes(object): def test_should_parse_viewbox_and_pass_width_and_height_to_annotated_blocks_to_image(self): with patch.multiple(PROCESSING_UTILS, annotated_blocks_to_image=DEFAULT) as mocks: @@ -77,3 +81,43 @@ class TestGroupFilePairsByParentDirectoryOrName(object): ('parent1/file1.x.gz', 'parent1/file1.y.gz'), ('parent1/file2.x.gz', 'parent1/file2.y.gz') ] + +DEFAULT_PDF_TO_LXML_ARGS = ['-blocks', '-noImageInline', '-noImage', '-fullFontName'] + +LXML_CONTENT_1 = b'lxml content 1' + +class TestConvertPdfBytesToLxml(object): + def test_should_pass_pdf_content_and_default_args_to_process_input(self): + mock = MagicMock() + with patch.multiple(PROCESSING_UTILS, PdfToLxmlWrapper=mock): + mock.return_value.process_input.return_value = LXML_CONTENT_1 + lxml_content = convert_pdf_bytes_to_lxml(PDF_CONTENT_1) + mock.return_value.process_input.assert_called_with( + PDF_CONTENT_1, + DEFAULT_PDF_TO_LXML_ARGS + ) + assert lxml_content == LXML_CONTENT_1 + + def test_should_pass_include_page_range_in_args(self): + mock = MagicMock() + with patch.multiple(PROCESSING_UTILS, PdfToLxmlWrapper=mock): + mock.return_value.process_input.return_value = LXML_CONTENT_1 + lxml_content = convert_pdf_bytes_to_lxml(PDF_CONTENT_1, page_range=(1, 3)) + mock.return_value.process_input.assert_called_with( + PDF_CONTENT_1, + DEFAULT_PDF_TO_LXML_ARGS + ['-f', '1', '-l', '3'] + ) + assert lxml_content == LXML_CONTENT_1 + +class TestPageRange(object): + def test_should_parse_single_page_number_as_range(self): + assert parse_page_range('1') == (1, 1) + + def test_should_parse_range_with_hyphen(self): + assert parse_page_range('1-3') == (1, 3) + + def test_should_parse_range_with_spaces(self): + assert parse_page_range(' 1 - 3 ') == (1, 3) + + def test_should_return_none_for_empty_range(self): + assert parse_page_range('') is None -- GitLab