diff --git a/sciencebeam_gym/preprocess/preprocessing_utils.py b/sciencebeam_gym/preprocess/preprocessing_utils.py
index 0816aea644060236b631c95fe02eb2ad608d4985..365ddc6ffd3198ddf7f21a0ee2ecbca7fb35fb75 100644
--- a/sciencebeam_gym/preprocess/preprocessing_utils.py
+++ b/sciencebeam_gym/preprocess/preprocessing_utils.py
@@ -221,6 +221,9 @@ def join_if_relative_path(base_path, path):
 def change_ext(path, old_ext, new_ext):
   if old_ext is None:
     old_ext = os.path.splitext(path)[1]
+    if old_ext == '.gz':
+      path = path[:-len(old_ext)]
+      old_ext = os.path.splitext(path)[1]
   if old_ext and path.endswith(old_ext):
     return path[:-len(old_ext)] + new_ext
   else:
diff --git a/sciencebeam_gym/preprocess/preprocessing_utils_test.py b/sciencebeam_gym/preprocess/preprocessing_utils_test.py
index 59dfe62c67a0b2993ed2ef8a809ca9dee1e7cd32..e9b2389747ca44cd08eec7fd996e24b8113b5a95 100644
--- a/sciencebeam_gym/preprocess/preprocessing_utils_test.py
+++ b/sciencebeam_gym/preprocess/preprocessing_utils_test.py
@@ -10,6 +10,7 @@ from sciencebeam_gym.preprocess.preprocessing_utils import (
   svg_page_to_blockified_png_bytes,
   group_file_pairs_by_parent_directory_or_name,
   convert_pdf_bytes_to_lxml,
+  change_ext,
   parse_page_range
 )
 
@@ -109,6 +110,16 @@ class TestConvertPdfBytesToLxml(object):
       )
       assert lxml_content == LXML_CONTENT_1
 
+class TestChangeExt(object):  # Tests for change_ext, always called with old_ext=None.
+  def test_should_replace_simple_ext_with_simple_ext(self):
+    assert change_ext('file.pdf', None, '.xml') == 'file.xml'  # '.pdf' is swapped for '.xml'
+
+  def test_should_replace_simple_ext_with_combined_ext(self):
+    assert change_ext('file.pdf', None, '.svg.zip') == 'file.svg.zip'  # new_ext may contain more than one dot
+
+  def test_should_remove_gz_ext_before_replacing_ext(self):
+    assert change_ext('file.pdf.gz', None, '.svg.zip') == 'file.svg.zip'  # both '.gz' and '.pdf' are dropped
+
 class TestPageRange(object):
   def test_should_parse_single_page_number_as_range(self):
     assert parse_page_range('1') == (1, 1)