From 989dc350f85826fdffc9f1cd995a332bfd0b3ac7 Mon Sep 17 00:00:00 2001
From: Daniel Ecer <de-code@users.noreply.github.com>
Date: Tue, 9 Jan 2018 17:57:23 +0000
Subject: [PATCH] fix output file extension: .pdf was not removed for .pdf.gz inputs

---
 sciencebeam_gym/preprocess/preprocessing_utils.py     |  3 +++
 .../preprocess/preprocessing_utils_test.py            | 11 +++++++++++
 2 files changed, 14 insertions(+)

diff --git a/sciencebeam_gym/preprocess/preprocessing_utils.py b/sciencebeam_gym/preprocess/preprocessing_utils.py
index 0816aea..365ddc6 100644
--- a/sciencebeam_gym/preprocess/preprocessing_utils.py
+++ b/sciencebeam_gym/preprocess/preprocessing_utils.py
@@ -221,6 +221,9 @@ def join_if_relative_path(base_path, path):
 def change_ext(path, old_ext, new_ext):
   if old_ext is None:
     old_ext = os.path.splitext(path)[1]
+    if old_ext == '.gz':
+      path = path[:-len(old_ext)]
+      old_ext = os.path.splitext(path)[1]
   if old_ext and path.endswith(old_ext):
     return path[:-len(old_ext)] + new_ext
   else:
diff --git a/sciencebeam_gym/preprocess/preprocessing_utils_test.py b/sciencebeam_gym/preprocess/preprocessing_utils_test.py
index 59dfe62..e9b2389 100644
--- a/sciencebeam_gym/preprocess/preprocessing_utils_test.py
+++ b/sciencebeam_gym/preprocess/preprocessing_utils_test.py
@@ -10,6 +10,7 @@ from sciencebeam_gym.preprocess.preprocessing_utils import (
   svg_page_to_blockified_png_bytes,
   group_file_pairs_by_parent_directory_or_name,
   convert_pdf_bytes_to_lxml,
+  change_ext,
   parse_page_range
 )
 
@@ -109,6 +110,16 @@ class TestConvertPdfBytesToLxml(object):
       )
       assert lxml_content == LXML_CONTENT_1
 
+class TestChangeExt(object):
+  def test_should_replace_simple_ext_with_simple_ext(self):
+    assert change_ext('file.pdf', None, '.xml') == 'file.xml'
+
+  def test_should_replace_simple_ext_with_combined_ext(self):
+    assert change_ext('file.pdf', None, '.svg.zip') == 'file.svg.zip'
+
+  def test_should_remove_gz_ext_before_replacing_ext(self):
+    assert change_ext('file.pdf.gz', None, '.svg.zip') == 'file.svg.zip'
+
 class TestPageRange(object):
   def test_should_parse_single_page_number_as_range(self):
     assert parse_page_range('1') == (1, 1)
-- 
GitLab