Skip to content
Snippets Groups Projects
Unverified Commit cfd1a8bd authored by Daniel Ecer's avatar Daniel Ecer Committed by GitHub
Browse files

fixed xml when finding bounding boxes (#429)

* replace dagger html entity

* strip extra spaces at beginning of xml document
parent 7c73dfa0
No related branches found
No related tags found
No related merge requests found
......@@ -441,6 +441,14 @@ def get_cache(temp_dir: str, memory_cache_size: int):
])
def parse_and_fix_xml(xml_path: str) -> etree.ElementBase:
    """Read the XML file at ``xml_path``, repair known defects and parse it.

    Two repairs are applied to the raw bytes before they reach the parser:

    * Leading whitespace is stripped, because an XML declaration
      (``<?xml ...?>``) is only permitted at the very start of a document.
    * The HTML named entity ``&dagger;`` is rewritten to the numeric
      character reference ``&#x2020;``. XML only predefines ``lt``, ``gt``,
      ``amp``, ``apos`` and ``quot``, so an undeclared ``&dagger;`` is a
      parse error.

    Args:
        xml_path: Path of the XML document to load.

    Returns:
        The root element produced by ``etree.fromstring``.
    """
    LOGGER.info('parsing XML file(%r)', os.path.basename(xml_path))
    xml_data = read_bytes(xml_path).lstrip()
    # A numeric character reference is used (rather than the encoded
    # character itself) so the fix is independent of the document encoding.
    xml_data = xml_data.replace(b'&dagger;', b'&#x2020;')
    return etree.fromstring(xml_data)
def process_single_document(
pdf_path: str,
image_paths: Optional[List[str]],
......@@ -461,8 +469,7 @@ def process_single_document(
pdf_images = get_images_from_pdf(pdf_path, pdf_scale_to=pdf_scale_to)
xml_root: Optional[etree.ElementBase] = None
if xml_path:
LOGGER.info('parsing XML file(%r)', os.path.basename(xml_path))
xml_root = etree.fromstring(read_bytes(xml_path))
xml_root = parse_and_fix_xml(xml_path)
image_descriptors = get_graphic_element_descriptors_from_xml_node(
xml_root,
parent_dirname=os.path.dirname(xml_path)
......
......@@ -31,6 +31,7 @@ from sciencebeam_gym.tools.image_annotation.find_bounding_boxes_utils import (
GraphicImageNotFoundError,
format_coords_attribute_value,
main,
parse_and_fix_xml,
parse_args,
save_annotated_images
)
......@@ -104,6 +105,29 @@ def save_images_as_pdf(path_or_io: Union[str, Path, IO], images: List[PIL.Image.
)
class TestParseAndFixXml:
    """Tests for ``parse_and_fix_xml`` using XML files written to a temp dir."""

    @staticmethod
    def _write_and_parse(directory: Path, xml_text: str):
        """Write ``xml_text`` to ``test.xml`` in ``directory`` and parse it."""
        target = directory / 'test.xml'
        target.write_text(xml_text)
        return parse_and_fix_xml(str(target))

    def test_should_parse_valid_xml(self, tmp_path: Path):
        # Well-formed input must pass through the fix-ups unchanged.
        root = self._write_and_parse(
            tmp_path,
            '<?xml version="1.0" encoding="UTF-8"?><xml>text</xml>'
        )
        assert (root.tag, root.text) == ('xml', 'text')

    def test_should_parse_xml_with_missing_dagger_entity(self, tmp_path: Path):
        # The undeclared dagger entity is expected to come back as U+2020.
        root = self._write_and_parse(
            tmp_path,
            '<?xml version="1.0" encoding="UTF-8"?><xml>text&dagger;</xml>'
        )
        assert (root.tag, root.text) == ('xml', 'text\u2020')

    def test_should_parse_xml_with_extra_spaces_in_the_beginning(self, tmp_path: Path):
        # Whitespace before the XML declaration must not break parsing.
        root = self._write_and_parse(
            tmp_path,
            ' \n <?xml version="1.0" encoding="UTF-8"?>\n<xml>text</xml>'
        )
        assert (root.tag, root.text) == ('xml', 'text')
class TestSaveAnnotatedImages:
def test_should_not_fail(
self,
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment