Skip to content
Snippets Groups Projects
Commit b06d0f6c authored by Daniel Ecer's avatar Daniel Ecer
Browse files

extract author given-names and surname using sub tags

parent 2dac94ce
No related branches found
No related tags found
No related merge requests found
......@@ -8,16 +8,20 @@ def get_logger():
return logging.getLogger(__name__)
class ExtractedItem(object):
def __init__(self, tag, text, tag_prefix=None):
def __init__(self, tag, text, tokens=None, tag_prefix=None, sub_items=None):
self.tag = tag
self.tag_prefix = tag_prefix
self.text = text
self.tokens = tokens or []
self.sub_items = sub_items or []
def extend(self, other_item):
return ExtractedItem(
self.tag,
self.text + '\n' + other_item.text,
tag_prefix=self.tag_prefix
tokens=self.tokens + other_item.tokens,
tag_prefix=self.tag_prefix,
sub_items=self.sub_items + other_item.sub_items
)
def get_lines(structured_document):
......@@ -25,12 +29,14 @@ def get_lines(structured_document):
for line in structured_document.get_lines_of_page(page):
yield line
def extract_from_annotated_tokens(structured_document, tokens, tag_scope=None):
def extract_from_annotated_tokens(structured_document, tokens, tag_scope=None, level=None):
previous_tokens = []
previous_tag = None
previous_tag_prefix = None
for token in tokens:
tag_prefix, tag = structured_document.get_tag_prefix_and_value(token, scope=tag_scope)
tag_prefix, tag = structured_document.get_tag_prefix_and_value(
token, scope=tag_scope, level=level
)
if not previous_tokens:
previous_tokens = [token]
previous_tag = tag
......@@ -41,6 +47,7 @@ def extract_from_annotated_tokens(structured_document, tokens, tag_scope=None):
yield ExtractedItem(
previous_tag,
' '.join(structured_document.get_text(t) for t in previous_tokens),
tokens=previous_tokens,
tag_prefix=previous_tag_prefix
)
previous_tokens = [token]
......@@ -50,9 +57,22 @@ def extract_from_annotated_tokens(structured_document, tokens, tag_scope=None):
yield ExtractedItem(
previous_tag,
' '.join(structured_document.get_text(t) for t in previous_tokens),
tag_prefix=previous_tag_prefix
tokens=previous_tokens,
tag_prefix=previous_tag_prefix
)
def with_sub_items(structured_document, extracted_item, tag_scope=None):
return ExtractedItem(
extracted_item.tag,
extracted_item.text,
tokens=extracted_item.tokens,
tag_prefix=extracted_item.tag_prefix,
sub_items=list(extract_from_annotated_tokens(
structured_document, extracted_item.tokens,
tag_scope=tag_scope, level=2
))
)
def extract_from_annotated_lines(structured_document, lines, tag_scope=None):
previous_item = None
for line in lines:
......@@ -62,12 +82,12 @@ def extract_from_annotated_lines(structured_document, lines, tag_scope=None):
if previous_item.tag == item.tag and item.tag_prefix != B_TAG_PREFIX:
previous_item = previous_item.extend(item)
else:
yield previous_item
yield with_sub_items(structured_document, previous_item, tag_scope=tag_scope)
previous_item = item
else:
previous_item = item
if previous_item is not None:
yield previous_item
yield with_sub_items(structured_document, previous_item, tag_scope=tag_scope)
def extract_from_annotated_document(structured_document, tag_scope=None):
return extract_from_annotated_lines(
......
......@@ -53,6 +53,12 @@ def annotated_tokens(tokens, tag):
def annotated_line(tokens, tag):
return with_tag(to_line(tokens), tag)
def _token_with_sub_tag(text, tag=None, tag_prefix=None, sub_tag=None, sub_tag_prefix=None):
token = SimpleToken(text, tag=tag, tag_prefix=tag_prefix)
if sub_tag:
token.set_tag(sub_tag, prefix=sub_tag_prefix, level=2)
return token
class TestExtractFromAnnotatedDocument(object):
def test_should_not_fail_on_empty_document(self):
structured_document = SimpleStructuredDocument()
......@@ -153,3 +159,98 @@ class TestExtractFromAnnotatedDocument(object):
(TAG_1, ' '.join([VALUE_1, VALUE_2, VALUE_3])),
(TAG_1, ' '.join([VALUE_1, VALUE_2, VALUE_3]))
]
def test_should_extract_sub_tags_from_single_item(self):
tokens = [
_token_with_sub_tag(
VALUE_1,
tag=TAG_1, tag_prefix=B_TAG_PREFIX,
sub_tag=TAG_2, sub_tag_prefix=B_TAG_PREFIX
),
_token_with_sub_tag(
VALUE_2,
tag=TAG_1, tag_prefix=I_TAG_PREFIX,
sub_tag=TAG_2, sub_tag_prefix=I_TAG_PREFIX
),
_token_with_sub_tag(
VALUE_3,
tag=TAG_1, tag_prefix=I_TAG_PREFIX,
sub_tag=TAG_3, sub_tag_prefix=B_TAG_PREFIX
)
]
structured_document = SimpleStructuredDocument(lines=[SimpleLine(tokens)])
extracted_items = list(extract_from_annotated_document(structured_document))
result = [
(
x.tag, x.text, [(sub.tag, sub.text) for sub in x.sub_items]
) for x in extracted_items
]
get_logger().debug('result: %s', result)
assert result == [
(
TAG_1, ' '.join([VALUE_1, VALUE_2, VALUE_3]),
[
(TAG_2, ' '.join([VALUE_1, VALUE_2])),
(TAG_3, VALUE_3)
]
)
]
def test_should_extract_sub_tags_from_multiple_items(self):
tokens = [
_token_with_sub_tag(
VALUE_1,
tag=TAG_1, tag_prefix=B_TAG_PREFIX,
sub_tag=TAG_2, sub_tag_prefix=B_TAG_PREFIX
),
_token_with_sub_tag(
VALUE_2,
tag=TAG_1, tag_prefix=I_TAG_PREFIX,
sub_tag=TAG_2, sub_tag_prefix=I_TAG_PREFIX
),
_token_with_sub_tag(
VALUE_3,
tag=TAG_1, tag_prefix=I_TAG_PREFIX,
sub_tag=TAG_3, sub_tag_prefix=B_TAG_PREFIX
),
_token_with_sub_tag(
VALUE_1,
tag=TAG_1, tag_prefix=B_TAG_PREFIX,
sub_tag=TAG_2, sub_tag_prefix=B_TAG_PREFIX
),
_token_with_sub_tag(
VALUE_2,
tag=TAG_1, tag_prefix=I_TAG_PREFIX,
sub_tag=TAG_3, sub_tag_prefix=B_TAG_PREFIX
),
_token_with_sub_tag(
VALUE_3,
tag=TAG_1, tag_prefix=I_TAG_PREFIX,
sub_tag=TAG_3, sub_tag_prefix=I_TAG_PREFIX
)
]
structured_document = SimpleStructuredDocument(lines=[SimpleLine(tokens)])
extracted_items = list(extract_from_annotated_document(structured_document))
result = [
(
x.tag, x.text, [(sub.tag, sub.text) for sub in x.sub_items]
) for x in extracted_items
]
get_logger().debug('result: %s', result)
assert result == [
(
TAG_1, ' '.join([VALUE_1, VALUE_2, VALUE_3]),
[
(TAG_2, ' '.join([VALUE_1, VALUE_2])),
(TAG_3, VALUE_3)
]
),
(
TAG_1, ' '.join([VALUE_1, VALUE_2, VALUE_3]),
[
(TAG_2, VALUE_1),
(TAG_3, ' '.join([VALUE_2, VALUE_3]))
]
)
]
......@@ -28,6 +28,14 @@ class XmlPaths(object):
AUTHOR = 'front/article-meta/contrib-group/contrib/name'
AUTHOR_AFF = 'front/article-meta/contrib-group/aff'
class SubTags(object):
AUTHOR_SURNAME = 'surname'
AUTHOR_GIVEN_NAMES = 'givennames'
class SubXmlPaths(object):
AUTHOR_SURNAME = 'surname'
AUTHOR_GIVEN_NAMES = 'given-names'
def get_logger():
return logging.getLogger(__name__)
......@@ -53,27 +61,29 @@ def create_node_recursive(xml_root, path, exists_ok=False):
parent_node.append(node)
return node
def create_xml_text(xml_root, path, text):
def create_and_append_xml_node(xml_root, path):
parent, base = rsplit_xml_path(path)
parent_node = create_node_recursive(xml_root, parent, exists_ok=True)
parent_node = (
create_node_recursive(xml_root, parent, exists_ok=True)
if parent
else xml_root
)
node = etree.Element(base)
node.text = text
parent_node.append(node)
return node
def create_xml_text(xml_root, path, text):
node = create_and_append_xml_node(xml_root, path)
node.text = text
return node
class XmlMapping(object):
def __init__(self, xml_path, single_node=False):
def __init__(self, xml_path, single_node=False, sub_mapping=None):
self.xml_path = xml_path
self.single_node = single_node
self.sub_mapping = sub_mapping
def extracted_items_to_xml(extracted_items):
xml_mapping = {
Tags.TITLE: XmlMapping(XmlPaths.TITLE, single_node=True),
Tags.ABSTRACT: XmlMapping(XmlPaths.ABSTRACT, single_node=True),
Tags.AUTHOR: XmlMapping(XmlPaths.AUTHOR),
Tags.AUTHOR_AFF: XmlMapping(XmlPaths.AUTHOR_AFF)
}
xml_root = E.article()
def _extract_items(parent_node, extracted_items, xml_mapping):
previous_tag = None
for extracted_item in extracted_items:
tag = extracted_item.tag
......@@ -83,8 +93,11 @@ def extracted_items_to_xml(extracted_items):
get_logger().warning('tag not configured: %s', tag)
continue
path = mapping_entry.xml_path
if mapping_entry.single_node:
node = create_node_recursive(xml_root, path, exists_ok=True)
if extracted_item.sub_items and mapping_entry.sub_mapping:
node = create_and_append_xml_node(parent_node, path)
_extract_items(node, extracted_item.sub_items, mapping_entry.sub_mapping)
elif mapping_entry.single_node:
node = create_node_recursive(parent_node, path, exists_ok=True)
if node.text is None:
node.text = extracted_item.text
elif previous_tag == tag:
......@@ -92,8 +105,21 @@ def extracted_items_to_xml(extracted_items):
else:
get_logger().debug('ignoring tag %s, after tag %s', tag, previous_tag)
else:
create_xml_text(xml_root, path, extracted_item.text)
create_xml_text(parent_node, path, extracted_item.text)
previous_tag = tag
def extracted_items_to_xml(extracted_items):
xml_mapping = {
Tags.TITLE: XmlMapping(XmlPaths.TITLE, single_node=True),
Tags.ABSTRACT: XmlMapping(XmlPaths.ABSTRACT, single_node=True),
Tags.AUTHOR: XmlMapping(XmlPaths.AUTHOR, sub_mapping={
SubTags.AUTHOR_GIVEN_NAMES: XmlMapping(SubXmlPaths.AUTHOR_GIVEN_NAMES),
SubTags.AUTHOR_SURNAME: XmlMapping(SubXmlPaths.AUTHOR_SURNAME)
}),
Tags.AUTHOR_AFF: XmlMapping(XmlPaths.AUTHOR_AFF)
}
xml_root = E.article()
_extract_items(xml_root, extracted_items, xml_mapping)
return xml_root
def extract_structured_document_to_xml(structured_document, tag_scope=None):
......
......@@ -18,6 +18,8 @@ from sciencebeam_gym.inference_model.extract_to_xml import (
extracted_items_to_xml,
Tags,
XmlPaths,
SubTags,
SubXmlPaths,
main
)
......@@ -71,6 +73,19 @@ class TestExtractedItemsToXml(object):
assert xml_root is not None
assert get_text_content_list(xml_root.findall(XmlPaths.AUTHOR)) == [TEXT_1, TEXT_2]
def test_should_extract_author_surname_and_given_names_from_single_author(self):
xml_root = extracted_items_to_xml([
ExtractedItem(Tags.AUTHOR, TEXT_1, sub_items=[
ExtractedItem(SubTags.AUTHOR_GIVEN_NAMES, TEXT_2),
ExtractedItem(SubTags.AUTHOR_SURNAME, TEXT_3)
])
])
assert xml_root is not None
author = xml_root.find(XmlPaths.AUTHOR)
assert author is not None
assert get_text_content(author.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)) == TEXT_2
assert get_text_content(author.find(SubXmlPaths.AUTHOR_SURNAME)) == TEXT_3
class TestMain(object):
def test_should_extract_from_simple_annotated_document(self):
with TemporaryDirectory() as path:
......
......@@ -87,8 +87,8 @@ class AbstractStructuredDocument(object, with_metaclass(ABCMeta)):
other_structured_document, other_token
)
def get_tag_prefix_and_value(self, parent, scope=None):
return split_tag_prefix(self.get_tag(parent, scope=scope))
def get_tag_prefix_and_value(self, parent, scope=None, level=None):
return split_tag_prefix(self.get_tag(parent, scope=scope, level=level))
def get_tag_value(self, parent, scope=None):
return self.get_tag_prefix_and_value(parent, scope=scope)[1]
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment