remove special characters from author name

9d7f7a09 · Daniel Ecer · 78ac72a5 · 9d7f7a09 · 9d7f7a09
Commit 9d7f7a09 authored 7 years ago by Daniel Ecer
--- a/sciencebeam_gym/inference_model/extract_to_xml.py
+++ b/sciencebeam_gym/inference_model/extract_to_xml.py
@@ -77,12 +77,28 @@ def create_xml_text(xml_root, path, text):
  node.text = text
  return node

+AUTHOR_JUNK_CHARS = ',.+*0123456789'
+
+def _clean_author_name(s):
+  i = len(s)
+  while (
+    i > 0 and s[i - 1] in AUTHOR_JUNK_CHARS and
+    # keep a '.' that directly follows an uppercase letter (e.g. the initial in 'T.')
+    (s[i - 1] != '.' or i < 2 or not s[i - 2].isupper())
+  ):
+    i -= 1
+  return s[:i]
+
 class XmlMapping(object):
-  def __init__(self, xml_path, single_node=False, sub_mapping=None, attrib=None):
+  def __init__(
+    self, xml_path, single_node=False, sub_mapping=None, attrib=None,
+    clean_fn=None):
+
    self.xml_path = xml_path
    self.single_node = single_node
    self.sub_mapping = sub_mapping
    self.attrib = attrib
+    self.clean_fn = clean_fn

 def _extract_items(parent_node, extracted_items, xml_mapping):
  previous_tag = None
@@ -93,13 +109,16 @@ def _extract_items(parent_node, extracted_items, xml_mapping):
      if not mapping_entry:
        get_logger().warning('tag not configured: %s', tag)
        continue
+      extracted_text = extracted_item.text
+      if extracted_text and mapping_entry.clean_fn:
+        extracted_text = mapping_entry.clean_fn(extracted_text)
      path = mapping_entry.xml_path
      if mapping_entry.single_node:
        node = create_node_recursive(parent_node, path, exists_ok=True)
        if node.text is None:
-          node.text = extracted_item.text
+          node.text = extracted_text
        elif previous_tag == tag:
-          node.text += '\n' + extracted_item.text
+          node.text += '\n' + extracted_text
        else:
          get_logger().debug('ignoring tag %s, after tag %s', tag, previous_tag)
      else:
@@ -110,7 +129,7 @@ def _extract_items(parent_node, extracted_items, xml_mapping):
        if extracted_item.sub_items and mapping_entry.sub_mapping:
          _extract_items(node, extracted_item.sub_items, mapping_entry.sub_mapping)
        else:
-          node.text = extracted_item.text
+          node.text = extracted_text
      previous_tag = tag

 def extracted_items_to_xml(extracted_items):
@@ -118,11 +137,15 @@ def extracted_items_to_xml(extracted_items):
    Tags.TITLE: XmlMapping(XmlPaths.TITLE, single_node=True),
    Tags.ABSTRACT: XmlMapping(XmlPaths.ABSTRACT, single_node=True),
    Tags.AUTHOR: XmlMapping(XmlPaths.AUTHOR, sub_mapping={
-      SubTags.AUTHOR_GIVEN_NAMES: XmlMapping(SubXmlPaths.AUTHOR_GIVEN_NAMES),
-      SubTags.AUTHOR_SURNAME: XmlMapping(SubXmlPaths.AUTHOR_SURNAME)
+      SubTags.AUTHOR_GIVEN_NAMES: XmlMapping(
+        SubXmlPaths.AUTHOR_GIVEN_NAMES, clean_fn=_clean_author_name
+      ),
+      SubTags.AUTHOR_SURNAME: XmlMapping(
+        SubXmlPaths.AUTHOR_SURNAME, clean_fn=_clean_author_name
+      )
    }, attrib={
      'contrib-type': 'author'
-    }),
+    }, clean_fn=_clean_author_name),
    Tags.AUTHOR_AFF: XmlMapping(XmlPaths.AUTHOR_AFF)
  }
  xml_root = E.article()

--- a/sciencebeam_gym/inference_model/extract_to_xml_test.py
+++ b/sciencebeam_gym/inference_model/extract_to_xml_test.py
@@ -83,16 +83,37 @@ class TestExtractedItemsToXml(object):

  def test_should_extract_author_surname_and_given_names_from_single_author(self):
    xml_root = extracted_items_to_xml([
-      ExtractedItem(Tags.AUTHOR, TEXT_1, sub_items=[
-        ExtractedItem(SubTags.AUTHOR_GIVEN_NAMES, TEXT_2),
-        ExtractedItem(SubTags.AUTHOR_SURNAME, TEXT_3)
+      ExtractedItem(Tags.AUTHOR, ' '.join([TEXT_1, TEXT_2]), sub_items=[
+        ExtractedItem(SubTags.AUTHOR_GIVEN_NAMES, TEXT_1),
+        ExtractedItem(SubTags.AUTHOR_SURNAME, TEXT_2)
      ])
    ])
    assert xml_root is not None
    author = xml_root.find(XmlPaths.AUTHOR)
    assert author is not None
-    assert get_text_content(author.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)) == TEXT_2
-    assert get_text_content(author.find(SubXmlPaths.AUTHOR_SURNAME)) == TEXT_3
+    assert get_text_content(author.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)) == TEXT_1
+    assert get_text_content(author.find(SubXmlPaths.AUTHOR_SURNAME)) == TEXT_2
+
+  def test_should_remove_special_characters_and_numbers_from_author(self):
+    special_num_chars = ',.+*0123456789'
+    xml_root = extracted_items_to_xml(_create_author_extracted_items(
+      TEXT_1 + special_num_chars, TEXT_2 + special_num_chars
+    ))
+    assert xml_root is not None
+    author = xml_root.find(XmlPaths.AUTHOR)
+    assert author is not None
+    assert get_text_content(author.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)) == TEXT_1
+    assert get_text_content(author.find(SubXmlPaths.AUTHOR_SURNAME)) == TEXT_2
+
+  def test_should_not_remove_dot_after_initials_from_author(self):
+    xml_root = extracted_items_to_xml(_create_author_extracted_items(
+      'Mr T.', 'E.'
+    ))
+    assert xml_root is not None
+    author = xml_root.find(XmlPaths.AUTHOR)
+    assert author is not None
+    assert get_text_content(author.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)) == 'Mr T.'
+    assert get_text_content(author.find(SubXmlPaths.AUTHOR_SURNAME)) == 'E.'

  def test_should_add_contrib_type_author_attribute(self):
    xml_root = extracted_items_to_xml(_create_author_extracted_items(TEXT_1, TEXT_2))