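# NOTE: the words "Newer" / "Older" that stood here were page-navigation
# residue from a web view of this file; they are not part of the module.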
import argparse
import logging
from io import BytesIO
from lxml import etree
from lxml.builder import E
from sciencebeam_gym.utils.tf import (
FileIO
)
from sciencebeam_gym.structured_document.lxml import (
LxmlStructuredDocument
)
from sciencebeam_gym.inference_model.extract_from_annotated_document import (
extract_from_annotated_document
)
class Tags(object):
    """Tag names carried by extracted items (used as mapping keys)."""

    TITLE = 'manuscript_title'
    # NOTE(review): ABSTRACT is referenced by extracted_items_to_xml, but no
    # definition was visible in this file; the value below is an assumption.
    # TODO confirm against the annotation tag set.
    ABSTRACT = 'abstract'


class XmlPaths(object):
    """Element paths, relative to the ``article`` root, for each output field."""

    TITLE = 'front/article-meta/title-group/article-title'
    # NOTE(review): ABSTRACT is referenced by extracted_items_to_xml, but no
    # definition was visible in this file; the path below follows the usual
    # JATS layout and is an assumption. TODO confirm.
    ABSTRACT = 'front/article-meta/abstract'
def get_logger():
    """Return the logger registered under this module's name."""
    module_logger = logging.getLogger(__name__)
    return module_logger
def rsplit_xml_path(path):
    """Split ``path`` at its last ``/`` into ``(parent, last_segment)``.

    When ``path`` contains no ``/`` the parent is ``None`` and the whole
    path is returned as the last segment.
    """
    head, separator, tail = path.rpartition('/')
    if not separator:
        # No slash at all: there is no parent part.
        return None, path
    return head, tail
def create_node_recursive(xml_root, path, exists_ok=False):
    """Return the node at ``path`` below ``xml_root``, creating it if missing.

    Missing ancestors along ``path`` are created as well.

    Args:
        xml_root: element the path is resolved against.
        path: ``/``-separated element path relative to ``xml_root``.
        exists_ok: when true, an already existing node is returned as is;
            when false, an existing node is treated as an error.

    Returns:
        The existing or newly created node.

    Raises:
        RuntimeError: if the node already exists and ``exists_ok`` is false.
    """
    node = xml_root.find(path)
    if node is not None:
        # Only an *existing* node may trigger the error / early return;
        # otherwise fall through and create it.
        if not exists_ok:
            raise RuntimeError('xml node already exists: %s' % path)
        return node
    parent, base = rsplit_xml_path(path)
    if parent:
        # Ancestors may already exist (e.g. shared with a sibling node),
        # so they are looked up with exists_ok=True.
        parent_node = create_node_recursive(xml_root, parent, exists_ok=True)
    else:
        parent_node = xml_root
    node = etree.Element(base)
    parent_node.append(node)
    return node
def set_xml_text(xml_root, path, text):
    """Set or extend the text of the node at ``path`` and return that node.

    The node is obtained via ``create_node_recursive`` with
    ``exists_ok=True``. If the node has no text yet, ``text`` becomes its
    text; otherwise ``text`` is appended on a new line.
    """
    target = create_node_recursive(xml_root, path, exists_ok=True)
    current = target.text
    if current is None:
        target.text = text
        return target
    target.text = current + '\n' + text
    return target
def extracted_items_to_xml(extracted_items):
    """Build an ``article`` XML tree from the given extracted items.

    Each item's ``tag`` is mapped to an element path; the item's ``text`` is
    then written to (or appended to) the node at that path. Items without a
    tag are skipped silently; items whose tag has no configured path are
    skipped with a warning.

    Args:
        extracted_items: iterable of objects exposing ``tag`` and ``text``.

    Returns:
        The root ``article`` element.
    """
    simple_xml_mapping = {
        Tags.TITLE: XmlPaths.TITLE,
        Tags.ABSTRACT: XmlPaths.ABSTRACT,
    }
    xml_root = E.article()
    for extracted_item in extracted_items:
        tag = extracted_item.tag
        if not tag:
            continue
        path = simple_xml_mapping.get(tag)
        if not path:
            get_logger().warning('tag not configured: %s', tag)
            continue
        set_xml_text(xml_root, path, extracted_item.text)
    return xml_root
def extract_structured_document_to_xml(structured_document):
    """Extract items from ``structured_document`` and convert them to XML.

    Passes the result of ``extract_from_annotated_document`` on to
    ``extracted_items_to_xml`` and returns the resulting XML root.
    """
    extracted_items = extract_from_annotated_document(structured_document)
    return extracted_items_to_xml(extracted_items)
def read_all(path, mode):
    """Return the full content of ``path``, opened via ``FileIO`` in ``mode``."""
    with FileIO(path, mode) as source:
        content = source.read()
    return content
def parse_args(argv=None):
    """Parse the command line arguments of this tool.

    Args:
        argv: list of argument strings; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        The parsed namespace with ``lxml_path``, ``output_path`` and
        ``debug`` attributes.
    """
    # The first positional parameter of ArgumentParser is ``prog``; the text
    # is a description, so it has to be passed by keyword.
    parser = argparse.ArgumentParser(
        description='Extract JATSy XML from annotated LXML'
    )
    parser.add_argument(
        '--lxml-path', type=str, required=True,
        help='path to lxml document'
    )
    parser.add_argument(
        '--output-path', type=str, required=True,
        help='output path to annotated document'
    )
    parser.add_argument(
        '--debug', action='store_true', default=False,
        help='enable debug output'
    )
    return parser.parse_args(argv)
def main(argv=None):
    """Run the tool: read the LXML document, extract XML and write it out.

    Args:
        argv: optional list of command line arguments, forwarded to
            ``parse_args``.
    """
    args = parse_args(argv)
    if args.debug:
        logging.getLogger().setLevel('DEBUG')

    # Load and parse the input document.
    lxml_bytes = read_all(args.lxml_path, 'rb')
    lxml_tree = etree.parse(BytesIO(lxml_bytes))
    structured_document = LxmlStructuredDocument(lxml_tree)

    xml_root = extract_structured_document_to_xml(structured_document)

    get_logger().info('writing result to: %s', args.output_path)
    with FileIO(args.output_path, 'w') as out_f:
        serialized_xml = etree.tostring(xml_root, pretty_print=True)
        out_f.write(serialized_xml)
# Script entry point: set up INFO-level logging, then run the tool.
if __name__ == '__main__':
    logging.basicConfig(level='INFO')
    main()