Skip to content
Snippets Groups Projects
Unverified Commit 67c0e308 authored by Daniel Ecer's avatar Daniel Ecer Committed by GitHub
Browse files

renamed pdf_url to source_url (#31)

parent 02255c14
No related branches found
No related tags found
No related merge requests found
......@@ -107,24 +107,24 @@ The parent directory per manuscript is optional. If that is not the case then th
Run:
```bash
python -m sciencebeam_lab.preprocess.find_file_pairs \
python -m sciencebeam_gym.preprocess.find_file_pairs \
--data-path <source directory> \
--pdf-pattern *.pdf.gz --xml-pattern *.nxml.gz \
--source-pattern *.pdf.gz --xml-pattern *.nxml.gz \
--out <output file list csv/tsv>
```
e.g.:
```bash
python -m sciencebeam_lab.preprocess.find_file_pairs \
python -m sciencebeam_gym.preprocess.find_file_pairs \
--data-path gs://some-bucket/some-dataset \
--pdf-pattern *.pdf.gz --xml-pattern *.nxml.gz \
--source-pattern *.pdf.gz --xml-pattern *.nxml.gz \
--out gs://some-bucket/some-dataset/file-list.tsv
```
That will create the TSV (tab separated) file `file-list.tsv` with the following columns:
- _pdf_url_
- _source_url_
- _xml_url_
That file could also be generated using any other preferred method.
......
import logging
import pytest
@pytest.fixture(scope='session', autouse=True)
def setup_logging():
    """Session-wide, automatically applied fixture that enables DEBUG logging.

    The root logger's handler list is replaced with an empty one first;
    ``logging.basicConfig`` does nothing when the root logger already has
    handlers, so this makes sure the DEBUG configuration is applied.
    """
    root_logger = logging.getLogger()
    root_logger.handlers = []
    logging.basicConfig(level='DEBUG')
......@@ -29,8 +29,8 @@ def parse_args(argv=None):
help='base data path'
)
parser.add_argument(
'--pdf-pattern', type=str, required=True,
help='pdf pattern'
'--source-pattern', type=str, required=True,
help='source pattern'
)
parser.add_argument(
'--xml-pattern', type=str, required=True,
......@@ -42,23 +42,30 @@ def parse_args(argv=None):
)
return parser.parse_args(argv)
def main(argv=None):
args = parse_args(argv)
def save_file_pairs_to_csv(output_path, source_xml_pairs):
    """Write the file pairs to ``output_path`` as a delimited text file.

    A header row ``source_url``, ``xml_url`` is written first, followed by
    the rows from ``source_xml_pairs``.

    Args:
        output_path: destination of the file list; its parent directory is
            passed to ``mkdirs_if_not_exists`` before writing.
        source_xml_pairs: rows to write after the header (each row is
            presumably a (source url, xml url) pair - confirm with caller).
    """
    mkdirs_if_not_exists(dirname(output_path))
    # The delimiter is derived from the output file name by the helper.
    field_delimiter = csv_delimiter_by_filename(output_path)
    if field_delimiter == '\t':
        output_mime_type = 'text/tsv'
    else:
        output_mime_type = 'text/csv'
    header_rows = [['source_url', 'xml_url']]
    with FileSystems.create(output_path, mime_type=output_mime_type) as out_fp:
        csv_writer = csv.writer(out_fp, delimiter=field_delimiter)
        write_csv_rows(csv_writer, header_rows)
        write_csv_rows(csv_writer, source_xml_pairs)
    get_logger().info('written results to %s', output_path)
def run(args):
    """Find source/XML file pairs below the data path and save the file list.

    Args:
        args: parsed command line options; the attributes ``data_path``,
            ``source_pattern``, ``xml_pattern`` and ``out`` are read.
    """
    get_logger().info('finding file pairs')
    # Both patterns are combined with the base data path by
    # join_if_relative_path before the search.
    patterns = [
        join_if_relative_path(args.data_path, args.source_pattern),
        join_if_relative_path(args.data_path, args.xml_pattern)
    ]
    # Materialise the pairs as a list before handing them to the writer.
    source_xml_pairs = list(
        find_file_pairs_grouped_by_parent_directory_or_name(patterns)
    )
    # Writing (directory creation, delimiter selection, header row) is
    # delegated to save_file_pairs_to_csv so it is not duplicated here.
    save_file_pairs_to_csv(args.out, source_xml_pairs)
def main(argv=None):
    """Command line entry point: parse ``argv`` and pass the result to ``run``.

    Args:
        argv: optional list of command line arguments, forwarded unchanged
            to ``parse_args``.
    """
    run(parse_args(argv))
if __name__ == '__main__':
logging.basicConfig(level='INFO')
......
import logging
import os
from mock import patch
import pytest
import sciencebeam_gym.preprocess.find_file_pairs as find_file_pairs
from sciencebeam_gym.preprocess.find_file_pairs import (
run,
parse_args,
main
)
LOGGER = logging.getLogger(__name__)

# Base directory (relative to the test's tmpdir) holding the sample files.
BASE_SOURCE_PATH = '/source'

# Two source/XML pairs sharing the same base name.
PDF_FILE_1 = '%s/file1.pdf' % BASE_SOURCE_PATH
XML_FILE_1 = '%s/file1.xml' % BASE_SOURCE_PATH
PDF_FILE_2 = '%s/file2.pdf' % BASE_SOURCE_PATH
XML_FILE_2 = '%s/file2.xml' % BASE_SOURCE_PATH

# Glob patterns passed via --source-pattern / --xml-pattern.
SOURCE_PATTERN = '*.pdf'
XML_PATTERN = '*.xml'

OUTPUT_FILE = 'file-list.tsv'

# Complete set of required command line arguments used by the tests.
SOME_ARGV = [
    '--data-path=' + BASE_SOURCE_PATH,
    '--source-pattern=' + SOURCE_PATTERN,
    '--xml-pattern=' + XML_PATTERN,
    '--out=' + OUTPUT_FILE
]
@pytest.fixture(name='find_file_pairs_grouped_by_parent_directory_or_name_mock')
def _find_file_pairs_grouped_by_parent_directory_or_name():
    """Patch the pair finder on the ``find_file_pairs`` module; yields the mock.

    The patch is active for the duration of the test using this fixture.
    """
    target_name = 'find_file_pairs_grouped_by_parent_directory_or_name'
    with patch.object(find_file_pairs, target_name) as finder_mock:
        yield finder_mock
@pytest.fixture(name='save_file_pairs_to_csv_mock')
def _save_file_pairs_to_csv():
    """Patch ``save_file_pairs_to_csv`` on the ``find_file_pairs`` module.

    Yields the mock so a test can assert on the arguments passed by ``run``
    without anything being written. Defined exactly once: a second,
    identical definition would only shadow this one.
    """
    with patch.object(find_file_pairs, 'save_file_pairs_to_csv') as m:
        yield m
@pytest.fixture(name='parse_args_mock')
def _parse_args():
    """Patch ``parse_args`` on the ``find_file_pairs`` module; yields the mock."""
    with patch.object(find_file_pairs, 'parse_args') as parse_args_mock:
        yield parse_args_mock
@pytest.fixture(name='run_mock')
def _run():
    """Patch ``run`` on the ``find_file_pairs`` module; yields the mock."""
    with patch.object(find_file_pairs, 'run') as run_mock:
        yield run_mock
def _touch(path):
    """Write an empty byte string to ``path`` and return ``path``.

    ``path`` is expected to offer ``write(data, ensure=...)`` (the callers
    pass pytest ``tmpdir`` paths); ``ensure=True`` is forwarded so the write
    can succeed even if the parent directory does not exist yet
    (per py.path's ``write`` - confirm against its documentation).
    """
    empty_content = b''
    path.write(empty_content, ensure=True)
    return path
@pytest.fixture(name='pdf_file_1')
def _pdf_file_1(tmpdir):
    """Create the empty sample file ``PDF_FILE_1`` below ``tmpdir``; returns its path."""
    pdf_path = tmpdir.join(PDF_FILE_1)
    return _touch(pdf_path)
@pytest.fixture(name='xml_file_1')
def _xml_file_1(tmpdir):
    """Create the empty sample file ``XML_FILE_1`` below ``tmpdir``; returns its path."""
    xml_path = tmpdir.join(XML_FILE_1)
    return _touch(xml_path)
@pytest.fixture(name='data_path')
def _data_path(tmpdir):
    """Return the path of ``BASE_SOURCE_PATH`` joined onto ``tmpdir`` (not created here)."""
    base_dir = tmpdir.join(BASE_SOURCE_PATH)
    return base_dir
@pytest.fixture(name='out_file')
def _out_file(tmpdir):
    """Return the path of ``OUTPUT_FILE`` joined onto ``tmpdir`` (not created here)."""
    output_path = tmpdir.join(OUTPUT_FILE)
    return output_path
class TestRun(object):
    """Tests for ``run``: one with mocked collaborators, one end-to-end on tmpdir."""

    def test_should_pass_around_parameters(
            self,
            find_file_pairs_grouped_by_parent_directory_or_name_mock,
            save_file_pairs_to_csv_mock):
        """``run`` passes the joined patterns to the finder and its result to the writer."""
        finder_mock = find_file_pairs_grouped_by_parent_directory_or_name_mock
        found_pairs = [
            (PDF_FILE_1, XML_FILE_1),
            (PDF_FILE_2, XML_FILE_2)
        ]
        finder_mock.return_value = found_pairs
        opt = parse_args(SOME_ARGV)

        run(opt)

        expected_patterns = [
            os.path.join(BASE_SOURCE_PATH, SOURCE_PATTERN),
            os.path.join(BASE_SOURCE_PATH, XML_PATTERN)
        ]
        finder_mock.assert_called_with(expected_patterns)
        save_file_pairs_to_csv_mock.assert_called_with(
            opt.out, finder_mock.return_value
        )

    def test_should_generate_file_list(self, data_path, pdf_file_1, xml_file_1, out_file):
        """``run`` on real files writes a header row plus one row for the pair."""
        LOGGER.debug('pdf_file_1: %s, xml_file: %s', pdf_file_1, xml_file_1)
        opt = parse_args(SOME_ARGV)
        # Redirect input and output to the per-test temporary directory.
        opt.data_path = str(data_path)
        opt.out = str(out_file)

        run(opt)

        written_text = out_file.read().strip()
        out_lines = [line.strip() for line in written_text.split('\n')]
        LOGGER.debug('out_lines: %s', out_lines)
        expected_lines = [
            'source_url\txml_url',
            '%s\t%s' % (pdf_file_1, xml_file_1)
        ]
        assert out_lines == expected_lines
class TestMain(object):
    """Tests for the ``main`` entry point."""

    def test_should_parse_args_and_call_run(self, parse_args_mock, run_mock):
        """``main`` parses the given argv and calls ``run`` with the parsed result."""
        main(SOME_ARGV)

        parse_args_mock.assert_called_with(SOME_ARGV)
        parsed_args = parse_args_mock.return_value
        run_mock.assert_called_with(parsed_args)
......@@ -125,7 +125,7 @@ def configure_pipeline(p, opt):
pdf_xml_url_pairs = (
p |
"ReadFilePairUrls" >> ReadDictCsv(opt.pdf_xml_file_list, limit=opt.limit) |
"TranslateFilePairUrls" >> beam.Map(lambda row: (row['pdf_url'], row['xml_url']))
"TranslateFilePairUrls" >> beam.Map(lambda row: (row['source_url'], row['xml_url']))
)
else:
pdf_xml_url_pairs = (
......
......@@ -173,7 +173,7 @@ class TestConfigurePipeline(BeamTest):
opt.save_tfrecords = True
with TestPipeline() as p:
mocks['ReadDictCsv'].return_value = beam.Create([{
'pdf_url': PDF_FILE_1,
'source_url': PDF_FILE_1,
'xml_url': XML_FILE_1
}])
_setup_mocks_for_pages(mocks, [1])
......@@ -193,10 +193,10 @@ class TestConfigurePipeline(BeamTest):
opt.save_tfrecords = True
with TestPipeline() as p:
mocks['ReadDictCsv'].return_value = beam.Create([{
'pdf_url': PDF_FILE_1,
'source_url': PDF_FILE_1,
'xml_url': XML_FILE_1
}, {
'pdf_url': PDF_FILE_2,
'source_url': PDF_FILE_2,
'xml_url': XML_FILE_2
}])
_setup_mocks_for_pages(mocks, [1], file_count=2)
......@@ -219,7 +219,7 @@ class TestConfigurePipeline(BeamTest):
opt.save_tfrecords = True
with TestPipeline() as p:
mocks['ReadDictCsv'].return_value = beam.Create([{
'pdf_url': PDF_FILE_1,
'source_url': PDF_FILE_1,
'xml_url': XML_FILE_1
}])
_setup_mocks_for_pages(mocks, [1])
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment