From 9d7f7a0950a071d40b0279d5d7bf5945bf007e5a Mon Sep 17 00:00:00 2001 From: Daniel Ecer <de-code@users.noreply.github.com> Date: Tue, 30 Jan 2018 12:14:25 +0000 Subject: [PATCH] remove special characters from author name --- .../inference_model/extract_to_xml.py | 37 +++++++++++++++---- .../inference_model/extract_to_xml_test.py | 31 +++++++++++++--- 2 files changed, 56 insertions(+), 12 deletions(-) diff --git a/sciencebeam_gym/inference_model/extract_to_xml.py b/sciencebeam_gym/inference_model/extract_to_xml.py index f7559dc..9200807 100644 --- a/sciencebeam_gym/inference_model/extract_to_xml.py +++ b/sciencebeam_gym/inference_model/extract_to_xml.py @@ -77,12 +77,28 @@ def create_xml_text(xml_root, path, text): node.text = text return node +AUTHOR_JUNK_CHARS = ',.+*0123456789' + +def _clean_author_name(s): + i = len(s) + while ( + i > 0 and s[i - 1] in AUTHOR_JUNK_CHARS and + # don't remove dot after initials / upper character + (s[i - 1] != '.' or i < 2 or not s[i - 2].isupper()) + ): + i -= 1 + return s[:i] + class XmlMapping(object): - def __init__(self, xml_path, single_node=False, sub_mapping=None, attrib=None): + def __init__( + self, xml_path, single_node=False, sub_mapping=None, attrib=None, + clean_fn=None): + self.xml_path = xml_path self.single_node = single_node self.sub_mapping = sub_mapping self.attrib = attrib + self.clean_fn = clean_fn def _extract_items(parent_node, extracted_items, xml_mapping): previous_tag = None @@ -93,13 +109,16 @@ def _extract_items(parent_node, extracted_items, xml_mapping): if not mapping_entry: get_logger().warning('tag not configured: %s', tag) continue + extracted_text = extracted_item.text + if extracted_text and mapping_entry.clean_fn: + extracted_text = mapping_entry.clean_fn(extracted_text) path = mapping_entry.xml_path if mapping_entry.single_node: node = create_node_recursive(parent_node, path, exists_ok=True) if node.text is None: - node.text = extracted_item.text + node.text = extracted_text elif previous_tag == tag: - node.text += '\n' + extracted_item.text + node.text += '\n' + extracted_text else: get_logger().debug('ignoring tag %s, after tag %s', tag, previous_tag) else: @@ -110,7 +129,7 @@ def _extract_items(parent_node, extracted_items, xml_mapping): if extracted_item.sub_items and mapping_entry.sub_mapping: _extract_items(node, extracted_item.sub_items, mapping_entry.sub_mapping) else: - node.text = extracted_item.text + node.text = extracted_text previous_tag = tag def extracted_items_to_xml(extracted_items): @@ -118,11 +137,15 @@ def extracted_items_to_xml(extracted_items): Tags.TITLE: XmlMapping(XmlPaths.TITLE, single_node=True), Tags.ABSTRACT: XmlMapping(XmlPaths.ABSTRACT, single_node=True), Tags.AUTHOR: XmlMapping(XmlPaths.AUTHOR, sub_mapping={ - SubTags.AUTHOR_GIVEN_NAMES: XmlMapping(SubXmlPaths.AUTHOR_GIVEN_NAMES), - SubTags.AUTHOR_SURNAME: XmlMapping(SubXmlPaths.AUTHOR_SURNAME) + SubTags.AUTHOR_GIVEN_NAMES: XmlMapping( + SubXmlPaths.AUTHOR_GIVEN_NAMES, clean_fn=_clean_author_name + ), + SubTags.AUTHOR_SURNAME: XmlMapping( + SubXmlPaths.AUTHOR_SURNAME, clean_fn=_clean_author_name + ) }, attrib={ 'contrib-type': 'author' - }), + }, clean_fn=_clean_author_name), Tags.AUTHOR_AFF: XmlMapping(XmlPaths.AUTHOR_AFF) } xml_root = E.article() diff --git a/sciencebeam_gym/inference_model/extract_to_xml_test.py b/sciencebeam_gym/inference_model/extract_to_xml_test.py index b955bb2..fcae0e7 100644 --- a/sciencebeam_gym/inference_model/extract_to_xml_test.py +++ b/sciencebeam_gym/inference_model/extract_to_xml_test.py @@ -83,16 +83,37 @@ class TestExtractedItemsToXml(object): def test_should_extract_author_surname_and_given_names_from_single_author(self): xml_root = extracted_items_to_xml([ - ExtractedItem(Tags.AUTHOR, TEXT_1, sub_items=[ - ExtractedItem(SubTags.AUTHOR_GIVEN_NAMES, TEXT_2), - ExtractedItem(SubTags.AUTHOR_SURNAME, TEXT_3) + ExtractedItem(Tags.AUTHOR, ' '.join([TEXT_1, TEXT_2]), sub_items=[ + ExtractedItem(SubTags.AUTHOR_GIVEN_NAMES, TEXT_1), + ExtractedItem(SubTags.AUTHOR_SURNAME, TEXT_2) ]) ]) assert xml_root is not None author = xml_root.find(XmlPaths.AUTHOR) assert author is not None - assert get_text_content(author.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)) == TEXT_2 - assert get_text_content(author.find(SubXmlPaths.AUTHOR_SURNAME)) == TEXT_3 + assert get_text_content(author.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)) == TEXT_1 + assert get_text_content(author.find(SubXmlPaths.AUTHOR_SURNAME)) == TEXT_2 + + def test_should_remove_special_characters_and_numbers_from_author(self): + special_num_chars = ',.+*0123456789' + xml_root = extracted_items_to_xml(_create_author_extracted_items( + TEXT_1 + special_num_chars, TEXT_2 + special_num_chars + )) + assert xml_root is not None + author = xml_root.find(XmlPaths.AUTHOR) + assert author is not None + assert get_text_content(author.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)) == TEXT_1 + assert get_text_content(author.find(SubXmlPaths.AUTHOR_SURNAME)) == TEXT_2 + + def test_should_not_remove_dot_after_initials_from_author(self): + xml_root = extracted_items_to_xml(_create_author_extracted_items( + 'Mr T.', 'E.' + )) + assert xml_root is not None + author = xml_root.find(XmlPaths.AUTHOR) + assert author is not None + assert get_text_content(author.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)) == 'Mr T.' + assert get_text_content(author.find(SubXmlPaths.AUTHOR_SURNAME)) == 'E.' def test_should_add_contrib_type_author_attribute(self): xml_root = extracted_items_to_xml(_create_author_extracted_items(TEXT_1, TEXT_2)) -- GitLab