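# NOTE: "Newer" / "Older" were navigation labels captured from a web code
# viewer; they are not part of this module.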
import argparse
import logging
from io import BytesIO
from lxml import etree
from lxml.builder import E
from sciencebeam_gym.utils.tf import (
FileIO
)
from sciencebeam_gym.structured_document.lxml import (
LxmlStructuredDocument
)
from sciencebeam_gym.inference_model.extract_from_annotated_document import (
extract_from_annotated_document
)
class Tags(object):
    """Annotation tag names that are mapped to XML elements."""

    TITLE = 'manuscript_title'
    # NOTE(review): ABSTRACT, AUTHOR and AUTHOR_AFF are referenced by
    # extracted_items_to_xml but were not defined on this class, which made
    # building the tag mapping fail with AttributeError. The values below are
    # assumed to equal the tag names emitted by the annotation step - confirm.
    ABSTRACT = 'abstract'
    AUTHOR = 'author'
    AUTHOR_AFF = 'author_aff'
class XmlPaths(object):
    """Element paths, relative to the XML root, used for each extracted field."""

    TITLE = 'front/article-meta/title-group/article-title'
    # ABSTRACT is referenced by extracted_items_to_xml but was missing here.
    # NOTE(review): path follows the JATS layout (abstract directly below
    # article-meta) - confirm against the expected output schema.
    ABSTRACT = 'front/article-meta/abstract'
    AUTHOR = 'front/article-meta/contrib-group/contrib/name'
    AUTHOR_AFF = 'front/article-meta/contrib-group/aff'
def get_logger():
    """Return the logger named after this module."""
    module_logger = logging.getLogger(__name__)
    return module_logger
def rsplit_xml_path(path):
    """Split ``path`` at its last ``/`` into a ``(parent, base)`` tuple.

    ``parent`` is ``None`` when ``path`` contains no ``/`` at all; in that
    case ``base`` is the whole ``path``.
    """
    head, separator, tail = path.rpartition('/')
    if not separator:
        # No slash found: rpartition leaves the full string in the last slot.
        return None, tail
    return head, tail
def create_node_recursive(xml_root, path, exists_ok=False):
    """Return the element at ``path`` below ``xml_root``, creating it if needed.

    Missing ancestor elements are created on the way (existing ancestors are
    reused).

    Args:
        xml_root: element the ``/``-separated ``path`` is relative to.
        path: element path, e.g. ``'front/article-meta/abstract'``.
        exists_ok: when true, an already existing element is returned as-is.

    Returns:
        The existing or newly created element.

    Raises:
        RuntimeError: if the element already exists and ``exists_ok`` is false.
    """
    node = xml_root.find(path)
    # Only treat the node as existing when find() actually returned one.
    # Compare with None explicitly: an element without children is falsy.
    if node is not None:
        if not exists_ok:
            raise RuntimeError('xml node already exists: %s' % path)
        return node
    # Split off the last path component; parent is '' for a single component.
    parent, _, base = path.rpartition('/')
    if parent:
        parent_node = create_node_recursive(xml_root, parent, exists_ok=True)
    else:
        parent_node = xml_root
    return etree.SubElement(parent_node, base)
def create_xml_text(xml_root, path, text):
    """Append a new element at ``path`` below ``xml_root`` and set its text.

    Unlike ``create_node_recursive`` the last path component is always
    created as a new element, so repeated calls yield sibling elements.
    Missing ancestors are created; existing ones are reused.

    Args:
        xml_root: element the ``/``-separated ``path`` is relative to.
        path: element path of the node to add.
        text: text content assigned to the new element.

    Returns:
        The newly created element.
    """
    parent, base = rsplit_xml_path(path)
    if parent:
        parent_node = create_node_recursive(xml_root, parent, exists_ok=True)
    else:
        # A path without '/' has no parent component; attach to the root
        # instead of passing None on as a path.
        parent_node = xml_root
    node = etree.SubElement(parent_node, base)
    node.text = text
    return node
class XmlMapping(object):
    """Describes how one tag is written to the XML output.

    Attributes are plain values set from the constructor arguments:
    ``xml_path`` (target element path) and ``single_node`` (flag, ``False``
    by default).
    """

    def __init__(self, xml_path, single_node=False):
        self.xml_path, self.single_node = xml_path, single_node
def extracted_items_to_xml(extracted_items):
    """Build an XML tree from a sequence of extracted items.

    Each item is expected to expose ``tag`` and ``text`` attributes. Items
    without a tag are skipped; items whose tag has no mapping are skipped
    with a warning.

    For single-node mappings all text goes into one element: the first item
    sets the text, directly following items with the same tag are appended
    on a new line, and later occurrences (after a different tag) are ignored.
    For all other mappings every item becomes its own element.

    Args:
        extracted_items: iterable of items with ``tag`` and ``text``.

    Returns:
        The root element of the generated XML.
    """
    xml_mapping = {
        Tags.TITLE: XmlMapping(XmlPaths.TITLE, single_node=True),
        Tags.ABSTRACT: XmlMapping(XmlPaths.ABSTRACT, single_node=True),
        Tags.AUTHOR: XmlMapping(XmlPaths.AUTHOR),
        Tags.AUTHOR_AFF: XmlMapping(XmlPaths.AUTHOR_AFF),
    }
    # NOTE(review): root element name `article` is assumed (the mapped paths
    # all start at `front`, which sits below `article` in JATS) - confirm.
    xml_root = E.article()
    # Tag of the most recently written item; used to decide whether text may
    # be appended to an already populated single node.
    previous_tag = None
    for extracted_item in extracted_items:
        tag = extracted_item.tag
        if not tag:
            continue
        mapping_entry = xml_mapping.get(tag)
        if not mapping_entry:
            get_logger().warning('tag not configured: %s', tag)
            continue
        path = mapping_entry.xml_path
        if mapping_entry.single_node:
            node = create_node_recursive(xml_root, path, exists_ok=True)
            if node.text is None:
                node.text = extracted_item.text
            elif previous_tag == tag:
                node.text += '\n' + extracted_item.text
            else:
                get_logger().debug('ignoring tag %s, after tag %s', tag, previous_tag)
        else:
            create_xml_text(xml_root, path, extracted_item.text)
        previous_tag = tag
    return xml_root
def extract_structured_document_to_xml(structured_document):
    """Extract the items of ``structured_document`` and convert them to XML.

    Returns whatever ``extracted_items_to_xml`` returns for the items produced
    by ``extract_from_annotated_document``.
    """
    extracted_items = extract_from_annotated_document(structured_document)
    return extracted_items_to_xml(extracted_items)
def read_all(path, mode):
    """Open ``path`` with ``FileIO`` in ``mode`` and return everything read."""
    with FileIO(path, mode) as source:
        content = source.read()
    return content
def parse_args(argv=None):
    """Parse the command line arguments of this script.

    Args:
        argv: list of argument strings; ``None`` lets argparse fall back to
            ``sys.argv[1:]``.

    Returns:
        The parsed namespace with ``lxml_path``, ``output_path`` and ``debug``.
    """
    # The first positional parameter of ArgumentParser is `prog`, not the
    # description; pass the text by keyword so usage shows the real program
    # name and the help output shows this text as the description.
    parser = argparse.ArgumentParser(
        description='Extract JATSy XML from annotated LXML'
    )
    parser.add_argument(
        '--lxml-path', type=str, required=True,
        help='path to lxml document'
    )
    parser.add_argument(
        '--output-path', type=str, required=True,
        help='output path to annotated document'
    )
    parser.add_argument(
        '--debug', action='store_true', default=False,
        help='enable debug output'
    )
    return parser.parse_args(argv)
def main(argv=None):
    """Command line entry point: read the LXML input, extract, write the XML.

    Args:
        argv: optional list of argument strings forwarded to ``parse_args``.
    """
    args = parse_args(argv)
    if args.debug:
        logging.getLogger().setLevel('DEBUG')

    # Load the input through FileIO, then parse it from an in-memory buffer.
    lxml_bytes = read_all(args.lxml_path, 'rb')
    lxml_tree = etree.parse(BytesIO(lxml_bytes))
    structured_document = LxmlStructuredDocument(lxml_tree)

    xml_root = extract_structured_document_to_xml(structured_document)

    get_logger().info('writing result to: %s', args.output_path)
    with FileIO(args.output_path, 'w') as out_f:
        serialized_xml = etree.tostring(xml_root, pretty_print=True)
        out_f.write(serialized_xml)
# Script entry point: only runs when the module is executed directly.
if __name__ == '__main__':
    # Configure root logging first so messages emitted by main() are shown.
    logging.basicConfig(level='INFO')
    main()