diff --git a/sciencebeam_gym/preprocess/preprocessing_utils.py b/sciencebeam_gym/preprocess/preprocessing_utils.py index 0816aea644060236b631c95fe02eb2ad608d4985..365ddc6ffd3198ddf7f21a0ee2ecbca7fb35fb75 100644 --- a/sciencebeam_gym/preprocess/preprocessing_utils.py +++ b/sciencebeam_gym/preprocess/preprocessing_utils.py @@ -221,6 +221,9 @@ def join_if_relative_path(base_path, path): def change_ext(path, old_ext, new_ext): if old_ext is None: old_ext = os.path.splitext(path)[1] + if old_ext == '.gz': + path = path[:-len(old_ext)] + old_ext = os.path.splitext(path)[1] if old_ext and path.endswith(old_ext): return path[:-len(old_ext)] + new_ext else: diff --git a/sciencebeam_gym/preprocess/preprocessing_utils_test.py b/sciencebeam_gym/preprocess/preprocessing_utils_test.py index 59dfe62c67a0b2993ed2ef8a809ca9dee1e7cd32..e9b2389747ca44cd08eec7fd996e24b8113b5a95 100644 --- a/sciencebeam_gym/preprocess/preprocessing_utils_test.py +++ b/sciencebeam_gym/preprocess/preprocessing_utils_test.py @@ -10,6 +10,7 @@ from sciencebeam_gym.preprocess.preprocessing_utils import ( svg_page_to_blockified_png_bytes, group_file_pairs_by_parent_directory_or_name, convert_pdf_bytes_to_lxml, + change_ext, parse_page_range ) @@ -109,6 +110,16 @@ class TestConvertPdfBytesToLxml(object): ) assert lxml_content == LXML_CONTENT_1 +class TestChangeExt(object): + def test_should_replace_simple_ext_with_simple_ext(self): + assert change_ext('file.pdf', None, '.xml') == 'file.xml' + + def test_should_replace_simple_ext_with_combined_ext(self): + assert change_ext('file.pdf', None, '.svg.zip') == 'file.svg.zip' + + def test_should_remove_gz_ext_before_replacing_ext(self): + assert change_ext('file.pdf.gz', None, '.svg.zip') == 'file.svg.zip' + class TestPageRange(object): def test_should_parse_single_page_number_as_range(self): assert parse_page_range('1') == (1, 1)