Skip to content
Snippets Groups Projects
Commit bd903d0f authored by Daniel Ecer
Browse files

allow a page range to be specified

parent 52f1f762
No related branches found
No related tags found
No related merge requests found
......@@ -3,7 +3,8 @@ from __future__ import absolute_import
import logging
from contextlib import contextmanager
from io import BytesIO
from mock import patch
from mock import patch, Mock, MagicMock
from mock.mock import MagicProxy
import pytest
......@@ -25,6 +26,7 @@ def get_logger():
class TestContext(object):
def __init__(self):
self.file_content_map = dict()
self.object_map = dict()
def set_file_content(self, name, content):
get_logger().debug('set_file_content: %s (size: %d)', name, len(content))
......@@ -36,6 +38,24 @@ class TestContext(object):
def get_current_test_context():
    """Return the object stored under the 'test_context' key of ``_local``."""
    current_context = _local['test_context']
    return current_context
# Apache Beam serialises everything, so pretend that Mocks are being serialised:
# a mock is "pickled" as its id and looked up again when "unpickled".
def unpickle_mock(state):
    """Return the registered object referenced by a pickled mock state.

    ``state`` is either the object id itself or a tuple whose first item is
    the object id. The id is looked up in the ``object_map`` of the current
    test context.
    """
    get_logger().debug('unpickle mock: state=%s', state)
    if isinstance(state, tuple):
        key = state[0]
    else:
        key = state
    return get_current_test_context().object_map[key]


unpickle_mock.__safe_for_unpickling__ = True
def mock_reduce(obj):
    """Reduce a mock to a reference that ``unpickle_mock`` can resolve.

    The object is stored in the ``object_map`` of the current test context
    under its ``id()``. The returned pair is ``unpickle_mock`` together with
    a one-item tuple holding that id.
    """
    key = id(obj)
    get_logger().debug('pickle mock, obj_id: %s', key)
    registry = get_current_test_context().object_map
    registry[key] = obj
    return unpickle_mock, (key,)
# Install mock_reduce as the __reduce__ implementation of each mock class.
for c in [Mock, MagicMock, MagicProxy]:
    c.__reduce__ = mock_reduce
@pytest.mark.filterwarnings('ignore::DeprecationWarning')
@pytest.mark.filterwarnings('ignore::UserWarning')
class BeamTest(object):
......
......@@ -2,24 +2,27 @@ import logging
import os
from subprocess import Popen, PIPE
from backports import tempfile
from backports.tempfile import TemporaryDirectory
def get_logger():
    """Return the logger named after this module."""
    module_logger = logging.getLogger(__name__)
    return module_logger
class PdfToPng(object):
def __init__(self, dpi=None, image_size=None):
def __init__(self, dpi=None, image_size=None, page_range=None):
self.dpi = dpi
self.image_size = image_size
self.page_range = page_range
def iter_pdf_bytes_to_png_fp(self, pdf_bytes):
cmd = ['pdftoppm', '-png']
if self.page_range:
cmd += ['-f', str(self.page_range[0]), '-l', str(self.page_range[1])]
if self.image_size:
cmd += ['-scale-to-x', str(self.image_size[0]), '-scale-to-y', str(self.image_size[1])]
elif self.dpi:
cmd += ['-r', str(self.dpi)]
cmd += ['-']
with tempfile.TemporaryDirectory() as path:
with TemporaryDirectory() as path:
cmd += [os.path.join(path, 'page')]
p = Popen(cmd, stdout=PIPE, stdin=PIPE, stderr=PIPE)
......
import logging
from subprocess import PIPE
from contextlib import contextmanager
from mock import patch
from sciencebeam_gym.pdf.pdf_to_png import (
PdfToPng
)
import sciencebeam_gym.pdf.pdf_to_png as pdf_to_png
TEMP_DIR = '/tmp/1'
PDF_CONTENT_1 = b'pdf content 1'
ARGS_PREFIX = ['pdftoppm', '-png']
ARGS_SUFFIX = ['-', TEMP_DIR + '/page']
DEFAULT_KWARGS = dict(stdout=PIPE, stdin=PIPE, stderr=PIPE)
@contextmanager
def patch_popen():
    """Patch ``Popen`` in ``pdf_to_png`` and yield the resulting mock.

    The mocked process looks successful: ``communicate()`` returns
    ``(None, None)`` and ``returncode`` is 0.
    """
    with patch.object(pdf_to_png, 'Popen') as popen_mock:
        process = popen_mock.return_value
        process.returncode = 0
        process.communicate.return_value = (None, None)
        yield popen_mock
@contextmanager
def mock_temp_dir():
    """Patch ``TemporaryDirectory`` in ``pdf_to_png`` and ``os.listdir``.

    Entering the mocked temporary directory gives ``TEMP_DIR`` and the
    patched ``os.listdir`` returns an empty list. The ``TemporaryDirectory``
    mock is what gets yielded.
    """
    with patch.object(pdf_to_png, 'TemporaryDirectory') as temp_dir_mock:
        enter_mock = temp_dir_mock.return_value.__enter__
        enter_mock.return_value = TEMP_DIR
        with patch('os.listdir') as listdir_mock:
            listdir_mock.return_value = []
            yield temp_dir_mock
class TestPdfToPng(object):
    # Each test checks the command line and keyword arguments that PdfToPng
    # passes to the (patched) Popen.

    def test_should_pass_default_args_to_Popen(self):
        with patch_popen() as popen_mock, mock_temp_dir():
            converter = PdfToPng()
            list(converter.iter_pdf_bytes_to_png_fp(PDF_CONTENT_1))
            assert popen_mock.called
            expected_args = ARGS_PREFIX + ARGS_SUFFIX
            popen_mock.assert_called_with(expected_args, **DEFAULT_KWARGS)

    def test_should_add_page_range_to_args(self):
        with patch_popen() as popen_mock, mock_temp_dir():
            converter = PdfToPng(page_range=(1, 3))
            list(converter.iter_pdf_bytes_to_png_fp(PDF_CONTENT_1))
            expected_args = ARGS_PREFIX + ['-f', '1', '-l', '3'] + ARGS_SUFFIX
            popen_mock.assert_called_with(expected_args, **DEFAULT_KWARGS)

    def test_should_add_image_size_to_args(self):
        with patch_popen() as popen_mock, mock_temp_dir():
            converter = PdfToPng(image_size=(100, 200))
            list(converter.iter_pdf_bytes_to_png_fp(PDF_CONTENT_1))
            expected_args = (
                ARGS_PREFIX +
                ['-scale-to-x', '100', '-scale-to-y', '200'] +
                ARGS_SUFFIX
            )
            popen_mock.assert_called_with(expected_args, **DEFAULT_KWARGS)

    def test_should_add_dpi_to_args(self):
        with patch_popen() as popen_mock, mock_temp_dir():
            converter = PdfToPng(dpi=200)
            list(converter.iter_pdf_bytes_to_png_fp(PDF_CONTENT_1))
            expected_args = ARGS_PREFIX + ['-r', '200'] + ARGS_SUFFIX
            popen_mock.assert_called_with(expected_args, **DEFAULT_KWARGS)
......@@ -65,7 +65,8 @@ from sciencebeam_gym.preprocess.preprocessing_utils import (
save_pages,
save_svg_roots,
filter_list_props_by_indices,
get_page_indices_with_min_annotation_percentage
get_page_indices_with_min_annotation_percentage,
parse_page_range
)
from sciencebeam_gym.preprocess.preprocessing_transforms import (
......@@ -81,6 +82,7 @@ def configure_pipeline(p, opt):
if opt.image_width and opt.image_height
else None
)
page_range = opt.pages
xml_mapping = parse_xml_mapping(opt.xml_mapping_path)
if opt.lxml_path:
lxml_xml_file_pairs = (
......@@ -146,7 +148,8 @@ def configure_pipeline(p, opt):
"ConvertPdfToLxml" >> MapOrLog(lambda v: remove_keys_from_dict(
extend_dict(v, {
'lxml_content': convert_pdf_bytes_to_lxml(
v['pdf_content'], path=v['source_filename']
v['pdf_content'], path=v['source_filename'],
page_range=page_range
)
}),
# we don't need the pdf_content unless we are writing tf_records
......@@ -169,7 +172,8 @@ def configure_pipeline(p, opt):
'pdf_png_pages': list(pdf_bytes_to_png_pages(
v['pdf_content'],
dpi=opt.png_dpi,
image_size=image_size
image_size=image_size,
page_range=page_range
))
}),
{'pdf_content'} # we no longer need the pdf_content
......@@ -425,6 +429,11 @@ def add_main_args(parser):
help='path to xml mapping file'
)
parser.add_argument(
'--pages', type=parse_page_range, default=None,
help='only processes the selected pages'
)
parser.add_argument(
'--save-tfrecords', default=False, action='store_true',
help='Save TFRecords with PDF PNG and Annotation PNG'
......
from contextlib import contextmanager
import logging
from mock import Mock, patch, DEFAULT
from mock import Mock, patch, DEFAULT, MagicMock
import pytest
......@@ -43,8 +43,8 @@ def get_logger():
def fake_content(path):
    """Return the placeholder content string for ``path``."""
    template = 'fake content: %s'
    return template % path
def fake_lxml_for_pdf(pdf, path):
return 'fake lxml for pdf: %s (%s)' % (pdf, path)
def fake_lxml_for_pdf(pdf, path, page_range=None):
    """Return the placeholder lxml string for a pdf, its path and a page range."""
    values = (pdf, path, page_range)
    return 'fake lxml for pdf: %s (%s) [%s]' % values
def fake_svg_page(i=0):
    """Return the placeholder svg page string for page index ``i`` (default 0)."""
    return 'fake svg page: %d' % i


def fake_pdf_png_page(i=0):
    """Return the placeholder pdf png page string for page index ``i`` (default 0)."""
    return 'fake pdf png page: %d' % i
......@@ -56,14 +56,11 @@ def get_global_tfrecords_mock():
@contextmanager
def patch_preprocessing_pipeline(**kwargs):
def DummyWritePropsToTFRecord(file_path, extract_props):
return TransformAndLog(beam.Map(
lambda v: get_global_tfrecords_mock()(file_path, list(extract_props(v)))
), log_fn=lambda x: get_logger().info('tfrecords: %s', x))
always_mock = {
'find_file_pairs_grouped_by_parent_directory_or_name',
'read_all_from_path',
'pdf_bytes_to_png_pages',
'convert_pdf_bytes_to_lxml',
'convert_and_annotate_lxml_content',
'svg_page_to_blockified_png_bytes',
'save_svg_roots',
......@@ -72,24 +69,35 @@ def patch_preprocessing_pipeline(**kwargs):
'ReadDictCsv'
}
tfrecords_mock = Mock(name='tfrecords_mock')
get_current_test_context().tfrecords_mock = tfrecords_mock
def DummyWritePropsToTFRecord(file_path, extract_props):
return TransformAndLog(beam.Map(
lambda v: tfrecords_mock(file_path, list(extract_props(v)))
), log_fn=lambda x: get_logger().info('tfrecords: %s', x))
with patch.multiple(
PREPROCESSING_PIPELINE,
read_all_from_path=fake_content,
convert_pdf_bytes_to_lxml=fake_lxml_for_pdf,
WritePropsToTFRecord=DummyWritePropsToTFRecord,
**{
k: kwargs.get(k, DEFAULT)
for k in always_mock
}
) as mocks:
# mocks['read_all_from_path'] = lambda path: fake_content(path)
get_current_test_context().mocks = mocks
mocks['read_all_from_path'].side_effect = fake_content
mocks['convert_pdf_bytes_to_lxml'].side_effect = fake_lxml_for_pdf
yield extend_dict(
mocks,
{'tfrecords': tfrecords_mock}
)
# Base command line arguments that the page range tests pass to parse_args,
# optionally with further arguments appended.
MIN_ARGV = [
    '--data-path=' + BASE_DATA_PATH,
    '--pdf-path=' + PDF_PATH,
    '--xml-path=' + XML_PATH,
    '--save-svg'
]
def get_default_args():
return parse_args([
'--data-path=' + BASE_DATA_PATH,
......@@ -292,6 +300,25 @@ class TestConfigurePipeline(BeamTest):
for i in [1]
])
def test_should_only_process_selected_pages(self):
    # Checks that opt.pages is passed on as the 'page_range' keyword argument
    # to both convert_pdf_bytes_to_lxml and pdf_bytes_to_png_pages.
    # NOTE(review): the nesting below was reconstructed from a flattened
    # diff - confirm against the repository.
    with patch_preprocessing_pipeline() as mocks:
        opt = get_default_args()
        opt.save_tfrecords = True
        opt.save_png = True
        opt.pages = (1, 3)
        with TestPipeline() as p:
            mocks['find_file_pairs_grouped_by_parent_directory_or_name'].return_value = [
                (PDF_FILE_1, XML_FILE_1)
            ]
            _setup_mocks_for_pages(mocks, [1, 2])
            configure_pipeline(p, opt)

        # presumably the pipeline has run once the TestPipeline block exits,
        # which is why the mocks are only inspected here - verify
        assert mocks['convert_pdf_bytes_to_lxml'].called
        # call_args[1] holds the keyword arguments of the most recent call
        assert mocks['convert_pdf_bytes_to_lxml'].call_args[1].get('page_range') == opt.pages
        assert mocks['pdf_bytes_to_png_pages'].called
        assert mocks['pdf_bytes_to_png_pages'].call_args[1].get('page_range') == opt.pages
class TestParseArgs(object):
def test_should_raise_error_without_arguments(self):
with pytest.raises(SystemExit):
......@@ -372,3 +399,9 @@ class TestParseArgs(object):
parse_args([
'--data-path=test', '--pdf-xml-file-list=test', '--xml-path=test', '--save-tfrecords'
])
def test_should_have_none_page_range_by_default(self):
    # without a --pages argument the parsed options carry no page range
    opt = parse_args(MIN_ARGV)
    assert opt.pages is None
def test_should_parse_pages_as_list(self):
    # '--pages=1-3' is parsed into the tuple (1, 3)
    argv = MIN_ARGV + ['--pages=1-3']
    opt = parse_args(argv)
    assert opt.pages == (1, 3)
......@@ -142,13 +142,17 @@ def find_file_pairs_grouped_by_parent_directory_or_name(patterns, limit=None):
matching_files_by_pattern
)
def convert_pdf_bytes_to_lxml(pdf_content, path=None):
def convert_pdf_bytes_to_lxml(pdf_content, path=None, page_range=None):
stop_watch_recorder = StopWatchRecorder()
args = '-blocks -noImageInline -noImage -fullFontName'.split()
if page_range:
args += ['-f', str(page_range[0]), '-l', str(page_range[1])]
stop_watch_recorder.start('convert to lxml')
lxml_content = PdfToLxmlWrapper().process_input(
pdf_content,
'-blocks -noImageInline -noImage -fullFontName'.split()
args
)
stop_watch_recorder.stop()
......@@ -238,8 +242,8 @@ def save_svg_roots(output_filename, svg_pages):
for svg_page in svg_pages
))
def pdf_bytes_to_png_pages(pdf_bytes, dpi, image_size):
pdf_to_png = PdfToPng(dpi=dpi, image_size=image_size)
def pdf_bytes_to_png_pages(pdf_bytes, dpi, image_size, page_range=None):
pdf_to_png = PdfToPng(dpi=dpi, image_size=image_size, page_range=page_range)
return (
fp.read()
for fp in pdf_to_png.iter_pdf_bytes_to_png_fp(pdf_bytes)
......@@ -288,3 +292,15 @@ def get_page_indices_with_min_annotation_percentage(
for i, page_evaluation in enumerate(annotation_evaluation)
if page_evaluation['percentage'].get(None) <= (1 - min_annotation_percentage)
]
def parse_page_range(s):
    """Parse a page range such as ``'3'`` or ``'1-3'`` into a tuple.

    Args:
        s: the page range string. Surrounding whitespace and whitespace
            around the hyphen are ignored.

    Returns:
        ``None`` for an empty or whitespace-only string, otherwise a
        ``(first, last)`` tuple of ints. A single page number ``n`` is
        returned as ``(n, n)``.

    Raises:
        ValueError: if a part of the range is not an integer.
        TypeError: if the range has more than two parts, or if the first
            page is after the last page.
    """
    s = s.strip()
    if not s:
        return None
    pages = tuple(int(part) for part in s.split('-'))
    if len(pages) == 1:
        return (pages[0], pages[0])
    if len(pages) != 2:
        raise TypeError('invalid page range: %s' % s)
    first_page, last_page = pages
    # A reversed range can never select any page; reject it here rather than
    # passing it on to the external tools as first/last page arguments.
    if first_page > last_page:
        raise TypeError(
            'invalid page range: %s (first page is after the last page)' % s
        )
    return pages
from mock import patch, DEFAULT
from mock import patch, MagicMock, DEFAULT
from lxml import etree
......@@ -8,11 +8,15 @@ from sciencebeam_gym.structured_document.svg import (
from sciencebeam_gym.preprocess.preprocessing_utils import (
svg_page_to_blockified_png_bytes,
group_file_pairs_by_parent_directory_or_name
group_file_pairs_by_parent_directory_or_name,
convert_pdf_bytes_to_lxml,
parse_page_range
)
PROCESSING_UTILS = 'sciencebeam_gym.preprocess.preprocessing_utils'
PDF_CONTENT_1 = b'pdf content 1'
class TestSvgPageToBlockifiedPngBytes(object):
def test_should_parse_viewbox_and_pass_width_and_height_to_annotated_blocks_to_image(self):
with patch.multiple(PROCESSING_UTILS, annotated_blocks_to_image=DEFAULT) as mocks:
......@@ -77,3 +81,43 @@ class TestGroupFilePairsByParentDirectoryOrName(object):
('parent1/file1.x.gz', 'parent1/file1.y.gz'),
('parent1/file2.x.gz', 'parent1/file2.y.gz')
]
DEFAULT_PDF_TO_LXML_ARGS = ['-blocks', '-noImageInline', '-noImage', '-fullFontName']
LXML_CONTENT_1 = b'lxml content 1'
class TestConvertPdfBytesToLxml(object):
    # Each test replaces PdfToLxmlWrapper with a mock and checks the
    # arguments passed to process_input as well as the returned content.

    def test_should_pass_pdf_content_and_default_args_to_process_input(self):
        wrapper_class_mock = MagicMock()
        with patch.multiple(PROCESSING_UTILS, PdfToLxmlWrapper=wrapper_class_mock):
            process_input = wrapper_class_mock.return_value.process_input
            process_input.return_value = LXML_CONTENT_1
            result = convert_pdf_bytes_to_lxml(PDF_CONTENT_1)
            process_input.assert_called_with(
                PDF_CONTENT_1,
                DEFAULT_PDF_TO_LXML_ARGS
            )
            assert result == LXML_CONTENT_1

    def test_should_pass_include_page_range_in_args(self):
        wrapper_class_mock = MagicMock()
        with patch.multiple(PROCESSING_UTILS, PdfToLxmlWrapper=wrapper_class_mock):
            process_input = wrapper_class_mock.return_value.process_input
            process_input.return_value = LXML_CONTENT_1
            result = convert_pdf_bytes_to_lxml(PDF_CONTENT_1, page_range=(1, 3))
            expected_args = DEFAULT_PDF_TO_LXML_ARGS + ['-f', '1', '-l', '3']
            process_input.assert_called_with(PDF_CONTENT_1, expected_args)
            assert result == LXML_CONTENT_1
class TestPageRange(object):
    # Covers the string forms accepted by parse_page_range.

    def test_should_parse_single_page_number_as_range(self):
        page_range = parse_page_range('1')
        assert page_range == (1, 1)

    def test_should_parse_range_with_hyphen(self):
        page_range = parse_page_range('1-3')
        assert page_range == (1, 3)

    def test_should_parse_range_with_spaces(self):
        page_range = parse_page_range(' 1 - 3 ')
        assert page_range == (1, 3)

    def test_should_return_none_for_empty_range(self):
        page_range = parse_page_range('')
        assert page_range is None
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment