Skip to content
Snippets Groups Projects
Commit 9d7f7a09 authored by Daniel Ecer
Browse files

remove special characters from author name

parent 78ac72a5
No related branches found
No related tags found
No related merge requests found
...@@ -77,12 +77,28 @@ def create_xml_text(xml_root, path, text): ...@@ -77,12 +77,28 @@ def create_xml_text(xml_root, path, text):
node.text = text node.text = text
return node return node
AUTHOR_JUNK_CHARS = ',.+*0123456789'
def _clean_author_name(s):
i = len(s)
while (
i > 0 and s[i - 1] in AUTHOR_JUNK_CHARS and
# don't remove dot after initials / upper character
(s[i - 1] != '.' or i < 2 or not s[i - 2].isupper())
):
i -= 1
return s[:i]
class XmlMapping(object):
    """Configuration of how one extracted tag is written to the XML tree.

    The constructor only stores its arguments; they are read by the code
    that converts extracted items to XML.

    Args:
        xml_path: path of the XML node the extracted text is written to.
        single_node: if true, the node at ``xml_path`` is re-used and the
            text of consecutive items with the same tag is appended to it.
        sub_mapping: optional dict of sub tag to ``XmlMapping``, used for
            the sub items of an extracted item.
        attrib: optional dict of attributes for the created node
            (e.g. ``{'contrib-type': 'author'}``).
        clean_fn: optional callable taking and returning the text; applied
            to non-empty extracted text before it is written.
    """

    def __init__(
            self, xml_path, single_node=False, sub_mapping=None, attrib=None,
            clean_fn=None):
        self.xml_path = xml_path
        self.single_node = single_node
        self.sub_mapping = sub_mapping
        self.attrib = attrib
        self.clean_fn = clean_fn
def _extract_items(parent_node, extracted_items, xml_mapping): def _extract_items(parent_node, extracted_items, xml_mapping):
previous_tag = None previous_tag = None
...@@ -93,13 +109,16 @@ def _extract_items(parent_node, extracted_items, xml_mapping): ...@@ -93,13 +109,16 @@ def _extract_items(parent_node, extracted_items, xml_mapping):
if not mapping_entry: if not mapping_entry:
get_logger().warning('tag not configured: %s', tag) get_logger().warning('tag not configured: %s', tag)
continue continue
extracted_text = extracted_item.text
if extracted_text and mapping_entry.clean_fn:
extracted_text = mapping_entry.clean_fn(extracted_text)
path = mapping_entry.xml_path path = mapping_entry.xml_path
if mapping_entry.single_node: if mapping_entry.single_node:
node = create_node_recursive(parent_node, path, exists_ok=True) node = create_node_recursive(parent_node, path, exists_ok=True)
if node.text is None: if node.text is None:
node.text = extracted_item.text node.text = extracted_text
elif previous_tag == tag: elif previous_tag == tag:
node.text += '\n' + extracted_item.text node.text += '\n' + extracted_text
else: else:
get_logger().debug('ignoring tag %s, after tag %s', tag, previous_tag) get_logger().debug('ignoring tag %s, after tag %s', tag, previous_tag)
else: else:
...@@ -110,7 +129,7 @@ def _extract_items(parent_node, extracted_items, xml_mapping): ...@@ -110,7 +129,7 @@ def _extract_items(parent_node, extracted_items, xml_mapping):
if extracted_item.sub_items and mapping_entry.sub_mapping: if extracted_item.sub_items and mapping_entry.sub_mapping:
_extract_items(node, extracted_item.sub_items, mapping_entry.sub_mapping) _extract_items(node, extracted_item.sub_items, mapping_entry.sub_mapping)
else: else:
node.text = extracted_item.text node.text = extracted_text
previous_tag = tag previous_tag = tag
def extracted_items_to_xml(extracted_items): def extracted_items_to_xml(extracted_items):
...@@ -118,11 +137,15 @@ def extracted_items_to_xml(extracted_items): ...@@ -118,11 +137,15 @@ def extracted_items_to_xml(extracted_items):
Tags.TITLE: XmlMapping(XmlPaths.TITLE, single_node=True), Tags.TITLE: XmlMapping(XmlPaths.TITLE, single_node=True),
Tags.ABSTRACT: XmlMapping(XmlPaths.ABSTRACT, single_node=True), Tags.ABSTRACT: XmlMapping(XmlPaths.ABSTRACT, single_node=True),
Tags.AUTHOR: XmlMapping(XmlPaths.AUTHOR, sub_mapping={ Tags.AUTHOR: XmlMapping(XmlPaths.AUTHOR, sub_mapping={
SubTags.AUTHOR_GIVEN_NAMES: XmlMapping(SubXmlPaths.AUTHOR_GIVEN_NAMES), SubTags.AUTHOR_GIVEN_NAMES: XmlMapping(
SubTags.AUTHOR_SURNAME: XmlMapping(SubXmlPaths.AUTHOR_SURNAME) SubXmlPaths.AUTHOR_GIVEN_NAMES, clean_fn=_clean_author_name
),
SubTags.AUTHOR_SURNAME: XmlMapping(
SubXmlPaths.AUTHOR_SURNAME, clean_fn=_clean_author_name
)
}, attrib={ }, attrib={
'contrib-type': 'author' 'contrib-type': 'author'
}), }, clean_fn=_clean_author_name),
Tags.AUTHOR_AFF: XmlMapping(XmlPaths.AUTHOR_AFF) Tags.AUTHOR_AFF: XmlMapping(XmlPaths.AUTHOR_AFF)
} }
xml_root = E.article() xml_root = E.article()
......
...@@ -83,16 +83,37 @@ class TestExtractedItemsToXml(object): ...@@ -83,16 +83,37 @@ class TestExtractedItemsToXml(object):
def test_should_extract_author_surname_and_given_names_from_single_author(self):
    # The author item carries the full name as its own text, while the
    # given names and the surname are provided as separate sub items.
    xml_root = extracted_items_to_xml([
        ExtractedItem(Tags.AUTHOR, ' '.join([TEXT_1, TEXT_2]), sub_items=[
            ExtractedItem(SubTags.AUTHOR_GIVEN_NAMES, TEXT_1),
            ExtractedItem(SubTags.AUTHOR_SURNAME, TEXT_2)
        ])
    ])
    assert xml_root is not None
    author = xml_root.find(XmlPaths.AUTHOR)
    assert author is not None
    assert get_text_content(author.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)) == TEXT_1
    assert get_text_content(author.find(SubXmlPaths.AUTHOR_SURNAME)) == TEXT_2
def test_should_remove_special_characters_and_numbers_from_author(self):
    # Both name parts get the same junk suffix appended; the generated XML
    # is expected to contain the bare names only.
    junk_suffix = ',.+*0123456789'
    given_names = TEXT_1 + junk_suffix
    surname = TEXT_2 + junk_suffix
    extracted_items = _create_author_extracted_items(given_names, surname)
    xml_root = extracted_items_to_xml(extracted_items)
    assert xml_root is not None
    author_node = xml_root.find(XmlPaths.AUTHOR)
    assert author_node is not None
    given_names_node = author_node.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)
    assert get_text_content(given_names_node) == TEXT_1
    surname_node = author_node.find(SubXmlPaths.AUTHOR_SURNAME)
    assert get_text_content(surname_node) == TEXT_2
def test_should_not_remove_dot_after_initials_from_author(self):
    # A dot that follows an upper case initial belongs to the name and
    # has to be present in the generated XML.
    extracted_items = _create_author_extracted_items('Mr T.', 'E.')
    xml_root = extracted_items_to_xml(extracted_items)
    assert xml_root is not None
    author_node = xml_root.find(XmlPaths.AUTHOR)
    assert author_node is not None
    given_names_node = author_node.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)
    assert get_text_content(given_names_node) == 'Mr T.'
    surname_node = author_node.find(SubXmlPaths.AUTHOR_SURNAME)
    assert get_text_content(surname_node) == 'E.'
def test_should_add_contrib_type_author_attribute(self): def test_should_add_contrib_type_author_attribute(self):
xml_root = extracted_items_to_xml(_create_author_extracted_items(TEXT_1, TEXT_2)) xml_root = extracted_items_to_xml(_create_author_extracted_items(TEXT_1, TEXT_2))
......
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment