Newer
Older
from mock import patch, MagicMock, DEFAULT
from lxml import etree
from sciencebeam_gym.structured_document.svg import (
SVG_DOC
)
from sciencebeam_gym.preprocess.preprocessing_utils import (
svg_page_to_blockified_png_bytes,
group_file_pairs_by_parent_directory_or_name,
convert_pdf_bytes_to_lxml,
parse_page_range
)
# Dotted path of the module under test; passed to `patch.multiple` as the
# patch target so that names looked up inside that module can be replaced.
PROCESSING_UTILS = 'sciencebeam_gym.preprocess.preprocessing_utils'
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
class TestSvgPageToBlockifiedPngBytes(object):
    """Tests for ``svg_page_to_blockified_png_bytes``."""

    def test_should_parse_viewbox_and_pass_width_and_height_to_annotated_blocks_to_image(self):
        # The SVG root carries a fractional viewBox; the width and height parsed
        # from it must reach `annotated_blocks_to_image` as keyword arguments,
        # unrounded, even though the requested image size is integral.
        expected_width, expected_height = 100.1, 200.9
        with patch.multiple(PROCESSING_UTILS, annotated_blocks_to_image=DEFAULT) as patched:
            page = etree.Element(SVG_DOC, attrib={'viewBox': '0 0 100.1 200.9'})
            svg_page_to_blockified_png_bytes(page, {}, (100, 200))
            # call_args is an (args, kwargs) pair; only the kwargs matter here.
            passed_kwargs = patched['annotated_blocks_to_image'].call_args[1]
            assert passed_kwargs.get('width') == expected_width
            assert passed_kwargs.get('height') == expected_height
class TestGroupFilePairsByParentDirectoryOrName(object):
    """Tests for ``group_file_pairs_by_parent_directory_or_name``.

    Every test passes a list holding two file lists and compares the
    materialised result against the expected list of file pairs.
    """

    def test_should_return_empty_list_with_empty_input_file_lists(self):
        file_lists = [[], []]
        pairs = list(group_file_pairs_by_parent_directory_or_name(file_lists))
        assert pairs == []

    def test_should_group_single_file(self):
        file_lists = [
            ['parent1/file.x'],
            ['parent1/file.y'],
        ]
        expected = [('parent1/file.x', 'parent1/file.y')]
        pairs = list(group_file_pairs_by_parent_directory_or_name(file_lists))
        assert pairs == expected

    def test_should_group_single_file_in_directory_with_different_names(self):
        # Same parent directory, different base names: still paired.
        file_lists = [
            ['parent1/file1.x'],
            ['parent1/file2.y'],
        ]
        expected = [('parent1/file1.x', 'parent1/file2.y')]
        pairs = list(group_file_pairs_by_parent_directory_or_name(file_lists))
        assert pairs == expected

    def test_should_ignore_files_in_different_directories(self):
        file_lists = [
            ['parent1/file.x'],
            ['parent2/file.y'],
        ]
        pairs = list(group_file_pairs_by_parent_directory_or_name(file_lists))
        assert pairs == []

    def test_should_group_multiple_files_in_separate_parent_directories(self):
        file_lists = [
            ['parent1/file.x', 'parent2/file.x'],
            ['parent1/file.y', 'parent2/file.y'],
        ]
        expected = [
            ('parent1/file.x', 'parent1/file.y'),
            ('parent2/file.x', 'parent2/file.y'),
        ]
        pairs = list(group_file_pairs_by_parent_directory_or_name(file_lists))
        assert pairs == expected

    def test_should_group_multiple_files_in_same_parent_directory_with_same_name(self):
        # Several files share one directory, so pairing falls back to the name.
        file_lists = [
            ['parent1/file1.x', 'parent1/file2.x'],
            ['parent1/file1.y', 'parent1/file2.y'],
        ]
        expected = [
            ('parent1/file1.x', 'parent1/file1.y'),
            ('parent1/file2.x', 'parent1/file2.y'),
        ]
        pairs = list(group_file_pairs_by_parent_directory_or_name(file_lists))
        assert pairs == expected

    def test_should_group_multiple_files_in_same_parent_directory_with_same_name_gzipped(self):
        # As above, but every file carries an additional `.gz` suffix.
        file_lists = [
            ['parent1/file1.x.gz', 'parent1/file2.x.gz'],
            ['parent1/file1.y.gz', 'parent1/file2.y.gz'],
        ]
        expected = [
            ('parent1/file1.x.gz', 'parent1/file1.y.gz'),
            ('parent1/file2.x.gz', 'parent1/file2.y.gz'),
        ]
        pairs = list(group_file_pairs_by_parent_directory_or_name(file_lists))
        assert pairs == expected
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# Arguments the tests expect to be handed to `process_input` when no page
# range is requested.
DEFAULT_PDF_TO_LXML_ARGS = [
    '-blocks',
    '-noImageInline',
    '-noImage',
    '-fullFontName',
]

# Opaque placeholder returned by the mocked `process_input`.
LXML_CONTENT_1 = b'lxml content 1'
class TestConvertPdfBytesToLxml(object):
    """Tests for ``convert_pdf_bytes_to_lxml`` with ``PdfToLxmlWrapper`` mocked."""

    # NOTE(review): PDF_CONTENT_1 is not defined in the visible part of this
    # file; it is assumed to be a module-level constant - confirm.

    def test_should_pass_pdf_content_and_default_args_to_process_input(self):
        wrapper_class = MagicMock()
        # The same child mock is returned on every attribute access, so it can
        # be bound once and used for both stubbing and verification.
        process_input = wrapper_class.return_value.process_input
        with patch.multiple(PROCESSING_UTILS, PdfToLxmlWrapper=wrapper_class):
            process_input.return_value = LXML_CONTENT_1
            result = convert_pdf_bytes_to_lxml(PDF_CONTENT_1)
            process_input.assert_called_with(PDF_CONTENT_1, DEFAULT_PDF_TO_LXML_ARGS)
            assert result == LXML_CONTENT_1

    def test_should_pass_include_page_range_in_args(self):
        wrapper_class = MagicMock()
        process_input = wrapper_class.return_value.process_input
        # A (1, 3) page range is expected to be appended to the default args
        # as '-f', '1', '-l', '3'.
        expected_args = DEFAULT_PDF_TO_LXML_ARGS + ['-f', '1', '-l', '3']
        with patch.multiple(PROCESSING_UTILS, PdfToLxmlWrapper=wrapper_class):
            process_input.return_value = LXML_CONTENT_1
            result = convert_pdf_bytes_to_lxml(PDF_CONTENT_1, page_range=(1, 3))
            process_input.assert_called_with(PDF_CONTENT_1, expected_args)
            assert result == LXML_CONTENT_1
class TestPageRange(object):
    """Tests for ``parse_page_range``."""

    def test_should_parse_single_page_number_as_range(self):
        # A lone page number yields a range that starts and ends on that page.
        page_range = parse_page_range('1')
        assert page_range == (1, 1)

    def test_should_parse_range_with_hyphen(self):
        page_range = parse_page_range('1-3')
        assert page_range == (1, 3)

    def test_should_parse_range_with_spaces(self):
        # Whitespace around the numbers and the hyphen is tolerated.
        page_range = parse_page_range(' 1 - 3 ')
        assert page_range == (1, 3)

    def test_should_return_none_for_empty_range(self):
        page_range = parse_page_range('')
        assert page_range is None