Skip to content
Snippets Groups Projects
Commit 9d7f7a09 authored by Daniel Ecer's avatar Daniel Ecer
Browse files

remove special characters from author name

parent 78ac72a5
No related branches found
No related tags found
No related merge requests found
......@@ -77,12 +77,28 @@ def create_xml_text(xml_root, path, text):
node.text = text
return node
AUTHOR_JUNK_CHARS = ',.+*0123456789'
def _clean_author_name(s):
i = len(s)
while (
i > 0 and s[i - 1] in AUTHOR_JUNK_CHARS and
# don't remove dot after initials / upper character
(s[i - 1] != '.' or i < 2 or not s[i - 2].isupper())
):
i -= 1
return s[:i]
class XmlMapping(object):
    """Configuration describing how one extracted tag is mapped to XML.

    The instance is a plain value holder; the constructor stores every
    argument unchanged on an attribute of the same name.

    Args:
        xml_path: path of the XML node the tag's text is written to.
        single_node: when true, ``_extract_items`` reuses one node for the
            tag (creating it with ``exists_ok=True``) rather than adding a
            new node per item.
        sub_mapping: optional mapping from sub tag to ``XmlMapping``, used
            by ``_extract_items`` for an item's ``sub_items``.
        attrib: optional attributes for the created node, e.g.
            ``{'contrib-type': 'author'}``.
            NOTE(review): where these are applied is not visible in this
            hunk - confirm against ``_extract_items``.
        clean_fn: optional callable taking and returning text; when set,
            ``_extract_items`` applies it to non-empty extracted text
            before writing it to the node.
    """

    def __init__(
            self, xml_path, single_node=False, sub_mapping=None, attrib=None,
            clean_fn=None):
        self.xml_path = xml_path
        self.single_node = single_node
        self.sub_mapping = sub_mapping
        self.attrib = attrib
        self.clean_fn = clean_fn
def _extract_items(parent_node, extracted_items, xml_mapping):
previous_tag = None
......@@ -93,13 +109,16 @@ def _extract_items(parent_node, extracted_items, xml_mapping):
if not mapping_entry:
get_logger().warning('tag not configured: %s', tag)
continue
extracted_text = extracted_item.text
if extracted_text and mapping_entry.clean_fn:
extracted_text = mapping_entry.clean_fn(extracted_text)
path = mapping_entry.xml_path
if mapping_entry.single_node:
node = create_node_recursive(parent_node, path, exists_ok=True)
if node.text is None:
node.text = extracted_item.text
node.text = extracted_text
elif previous_tag == tag:
node.text += '\n' + extracted_item.text
node.text += '\n' + extracted_text
else:
get_logger().debug('ignoring tag %s, after tag %s', tag, previous_tag)
else:
......@@ -110,7 +129,7 @@ def _extract_items(parent_node, extracted_items, xml_mapping):
if extracted_item.sub_items and mapping_entry.sub_mapping:
_extract_items(node, extracted_item.sub_items, mapping_entry.sub_mapping)
else:
node.text = extracted_item.text
node.text = extracted_text
previous_tag = tag
def extracted_items_to_xml(extracted_items):
......@@ -118,11 +137,15 @@ def extracted_items_to_xml(extracted_items):
Tags.TITLE: XmlMapping(XmlPaths.TITLE, single_node=True),
Tags.ABSTRACT: XmlMapping(XmlPaths.ABSTRACT, single_node=True),
Tags.AUTHOR: XmlMapping(XmlPaths.AUTHOR, sub_mapping={
SubTags.AUTHOR_GIVEN_NAMES: XmlMapping(SubXmlPaths.AUTHOR_GIVEN_NAMES),
SubTags.AUTHOR_SURNAME: XmlMapping(SubXmlPaths.AUTHOR_SURNAME)
SubTags.AUTHOR_GIVEN_NAMES: XmlMapping(
SubXmlPaths.AUTHOR_GIVEN_NAMES, clean_fn=_clean_author_name
),
SubTags.AUTHOR_SURNAME: XmlMapping(
SubXmlPaths.AUTHOR_SURNAME, clean_fn=_clean_author_name
)
}, attrib={
'contrib-type': 'author'
}),
}, clean_fn=_clean_author_name),
Tags.AUTHOR_AFF: XmlMapping(XmlPaths.AUTHOR_AFF)
}
xml_root = E.article()
......
......@@ -83,16 +83,37 @@ class TestExtractedItemsToXml(object):
def test_should_extract_author_surname_and_given_names_from_single_author(self):
    """Given names and surname sub items end up in the author's sub nodes."""
    # The author item's own text is the full name; the sub items carry
    # the individual name parts.
    xml_root = extracted_items_to_xml([
        ExtractedItem(Tags.AUTHOR, ' '.join([TEXT_1, TEXT_2]), sub_items=[
            ExtractedItem(SubTags.AUTHOR_GIVEN_NAMES, TEXT_1),
            ExtractedItem(SubTags.AUTHOR_SURNAME, TEXT_2)
        ])
    ])
    assert xml_root is not None
    author = xml_root.find(XmlPaths.AUTHOR)
    assert author is not None
    assert get_text_content(author.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)) == TEXT_1
    assert get_text_content(author.find(SubXmlPaths.AUTHOR_SURNAME)) == TEXT_2
def test_should_remove_special_characters_and_numbers_from_author(self):
    """Trailing marker characters and digits are dropped from both name parts."""
    junk_suffix = ',.+*0123456789'
    given_names_with_junk = TEXT_1 + junk_suffix
    surname_with_junk = TEXT_2 + junk_suffix
    extracted_items = _create_author_extracted_items(
        given_names_with_junk, surname_with_junk
    )
    xml_root = extracted_items_to_xml(extracted_items)
    assert xml_root is not None
    author = xml_root.find(XmlPaths.AUTHOR)
    assert author is not None
    given_names_node = author.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)
    surname_node = author.find(SubXmlPaths.AUTHOR_SURNAME)
    assert get_text_content(given_names_node) == TEXT_1
    assert get_text_content(surname_node) == TEXT_2
def test_should_not_remove_dot_after_initials_from_author(self):
    """A dot following an upper-case initial survives the name clean-up."""
    extracted_items = _create_author_extracted_items('Mr T.', 'E.')
    xml_root = extracted_items_to_xml(extracted_items)
    assert xml_root is not None
    author = xml_root.find(XmlPaths.AUTHOR)
    assert author is not None
    given_names_node = author.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)
    surname_node = author.find(SubXmlPaths.AUTHOR_SURNAME)
    assert get_text_content(given_names_node) == 'Mr T.'
    assert get_text_content(surname_node) == 'E.'
def test_should_add_contrib_type_author_attribute(self):
xml_root = extracted_items_to_xml(_create_author_extracted_items(TEXT_1, TEXT_2))
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment