From 9d7f7a0950a071d40b0279d5d7bf5945bf007e5a Mon Sep 17 00:00:00 2001
From: Daniel Ecer <de-code@users.noreply.github.com>
Date: Tue, 30 Jan 2018 12:14:25 +0000
Subject: [PATCH] remove trailing special characters and digits from author name

---
 .../inference_model/extract_to_xml.py         | 37 +++++++++++++++----
 .../inference_model/extract_to_xml_test.py    | 31 +++++++++++++---
 2 files changed, 56 insertions(+), 12 deletions(-)

diff --git a/sciencebeam_gym/inference_model/extract_to_xml.py b/sciencebeam_gym/inference_model/extract_to_xml.py
index f7559dc..9200807 100644
--- a/sciencebeam_gym/inference_model/extract_to_xml.py
+++ b/sciencebeam_gym/inference_model/extract_to_xml.py
@@ -77,12 +77,28 @@ def create_xml_text(xml_root, path, text):
   node.text = text
   return node
 
+AUTHOR_JUNK_CHARS = ',.+*0123456789'
+
+def _clean_author_name(s):  # strip trailing AUTHOR_JUNK_CHARS from s
+  i = len(s)  # s[:i] is the part of the name that is kept
+  while (
+    i > 0 and s[i - 1] in AUTHOR_JUNK_CHARS and
+    # keep a dot that follows an upper-case character (e.g. an initial)
+    (s[i - 1] != '.' or i < 2 or not s[i - 2].isupper())
+  ):
+    i -= 1
+  return s[:i]
+
 class XmlMapping(object):
-  def __init__(self, xml_path, single_node=False, sub_mapping=None, attrib=None):
+  def __init__(
+    self, xml_path, single_node=False, sub_mapping=None, attrib=None,
+    clean_fn=None):
+
     self.xml_path = xml_path
     self.single_node = single_node
     self.sub_mapping = sub_mapping
     self.attrib = attrib
+    self.clean_fn = clean_fn  # optional callable applied to non-empty extracted text
 
 def _extract_items(parent_node, extracted_items, xml_mapping):
   previous_tag = None
@@ -93,13 +109,16 @@ def _extract_items(parent_node, extracted_items, xml_mapping):
       if not mapping_entry:
         get_logger().warning('tag not configured: %s', tag)
         continue
+      extracted_text = extracted_item.text
+      if extracted_text and mapping_entry.clean_fn:
+        extracted_text = mapping_entry.clean_fn(extracted_text)
       path = mapping_entry.xml_path
       if mapping_entry.single_node:
         node = create_node_recursive(parent_node, path, exists_ok=True)
         if node.text is None:
-          node.text = extracted_item.text
+          node.text = extracted_text
         elif previous_tag == tag:
-          node.text += '\n' + extracted_item.text
+          node.text += '\n' + extracted_text
         else:
           get_logger().debug('ignoring tag %s, after tag %s', tag, previous_tag)
       else:
@@ -110,7 +129,7 @@ def _extract_items(parent_node, extracted_items, xml_mapping):
         if extracted_item.sub_items and mapping_entry.sub_mapping:
           _extract_items(node, extracted_item.sub_items, mapping_entry.sub_mapping)
         else:
-          node.text = extracted_item.text
+          node.text = extracted_text
       previous_tag = tag
 
 def extracted_items_to_xml(extracted_items):
@@ -118,11 +137,15 @@ def extracted_items_to_xml(extracted_items):
     Tags.TITLE: XmlMapping(XmlPaths.TITLE, single_node=True),
     Tags.ABSTRACT: XmlMapping(XmlPaths.ABSTRACT, single_node=True),
     Tags.AUTHOR: XmlMapping(XmlPaths.AUTHOR, sub_mapping={
-      SubTags.AUTHOR_GIVEN_NAMES: XmlMapping(SubXmlPaths.AUTHOR_GIVEN_NAMES),
-      SubTags.AUTHOR_SURNAME: XmlMapping(SubXmlPaths.AUTHOR_SURNAME)
+      SubTags.AUTHOR_GIVEN_NAMES: XmlMapping(
+        SubXmlPaths.AUTHOR_GIVEN_NAMES, clean_fn=_clean_author_name
+      ),
+      SubTags.AUTHOR_SURNAME: XmlMapping(
+        SubXmlPaths.AUTHOR_SURNAME, clean_fn=_clean_author_name
+      )
     }, attrib={
       'contrib-type': 'author'
-    }),
+    }, clean_fn=_clean_author_name),
     Tags.AUTHOR_AFF: XmlMapping(XmlPaths.AUTHOR_AFF)
   }
   xml_root = E.article()
diff --git a/sciencebeam_gym/inference_model/extract_to_xml_test.py b/sciencebeam_gym/inference_model/extract_to_xml_test.py
index b955bb2..fcae0e7 100644
--- a/sciencebeam_gym/inference_model/extract_to_xml_test.py
+++ b/sciencebeam_gym/inference_model/extract_to_xml_test.py
@@ -83,16 +83,37 @@ class TestExtractedItemsToXml(object):
 
   def test_should_extract_author_surname_and_given_names_from_single_author(self):
     xml_root = extracted_items_to_xml([
-      ExtractedItem(Tags.AUTHOR, TEXT_1, sub_items=[
-        ExtractedItem(SubTags.AUTHOR_GIVEN_NAMES, TEXT_2),
-        ExtractedItem(SubTags.AUTHOR_SURNAME, TEXT_3)
+      ExtractedItem(Tags.AUTHOR, ' '.join([TEXT_1, TEXT_2]), sub_items=[
+        ExtractedItem(SubTags.AUTHOR_GIVEN_NAMES, TEXT_1),
+        ExtractedItem(SubTags.AUTHOR_SURNAME, TEXT_2)
       ])
     ])
     assert xml_root is not None
     author = xml_root.find(XmlPaths.AUTHOR)
     assert author is not None
-    assert get_text_content(author.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)) == TEXT_2
-    assert get_text_content(author.find(SubXmlPaths.AUTHOR_SURNAME)) == TEXT_3
+    assert get_text_content(author.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)) == TEXT_1
+    assert get_text_content(author.find(SubXmlPaths.AUTHOR_SURNAME)) == TEXT_2
+
+  def test_should_remove_special_characters_and_numbers_from_author(self):
+    special_num_chars = ',.+*0123456789'  # same characters as AUTHOR_JUNK_CHARS
+    xml_root = extracted_items_to_xml(_create_author_extracted_items(
+      TEXT_1 + special_num_chars, TEXT_2 + special_num_chars
+    ))
+    assert xml_root is not None
+    author = xml_root.find(XmlPaths.AUTHOR)
+    assert author is not None
+    assert get_text_content(author.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)) == TEXT_1
+    assert get_text_content(author.find(SubXmlPaths.AUTHOR_SURNAME)) == TEXT_2
+
+  def test_should_not_remove_dot_after_initials_from_author(self):
+    xml_root = extracted_items_to_xml(_create_author_extracted_items(
+      'Mr T.', 'E.'  # both end in a dot directly after an upper-case letter
+    ))
+    assert xml_root is not None
+    author = xml_root.find(XmlPaths.AUTHOR)
+    assert author is not None
+    assert get_text_content(author.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)) == 'Mr T.'
+    assert get_text_content(author.find(SubXmlPaths.AUTHOR_SURNAME)) == 'E.'
 
   def test_should_add_contrib_type_author_attribute(self):
     xml_root = extracted_items_to_xml(_create_author_extracted_items(TEXT_1, TEXT_2))
-- 
GitLab