Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import argparse
import logging
from io import BytesIO
from lxml import etree
from lxml.builder import E
from sciencebeam_gym.utils.tf import (
FileIO
)
from sciencebeam_gym.structured_document.lxml import (
LxmlStructuredDocument
)
from sciencebeam_gym.inference_model.extract_from_annotated_document import (
extract_from_annotated_document
)
class Tags(object):
    """Annotation tag names this module knows how to map into the output XML."""

    # Tag value carried by extracted items that hold the manuscript title.
    TITLE = 'manuscript_title'
class XmlPaths(object):
    """Slash-separated element paths, relative to the output XML root."""

    # Where the title text is written below the root element.
    TITLE = 'front/article-meta/title-group/article-title'
def get_logger():
    """Return the logger named after this module (``__name__``)."""
    module_logger = logging.getLogger(__name__)
    return module_logger
def rsplit_xml_path(path):
    """Split ``path`` at its last ``/`` into ``(parent, base)``.

    When ``path`` contains no ``/`` the parent is ``None`` and the base is
    the whole path. A leading slash yields an empty-string parent.
    """
    head, separator, tail = path.rpartition('/')
    if not separator:
        # No slash at all: there is no parent part.
        return None, path
    return head, tail
def create_node_recursive(xml_root, path, exists_ok=False):
    """Create the element at ``path`` below ``xml_root``, adding missing ancestors.

    Args:
        xml_root: element that ``path`` is resolved against.
        path: slash-separated element path relative to ``xml_root``.
        exists_ok: when true, an element already present at ``path`` is
            returned instead of raising.

    Returns:
        The newly created element, or the existing one when ``exists_ok``
        is true and the path is already present.

    Raises:
        RuntimeError: if an element already exists at ``path`` and
            ``exists_ok`` is false.
    """
    node = xml_root.find(path)
    # Compare against None explicitly: an element without children evaluates
    # as false, so a plain truth test would treat an existing leaf node as
    # missing and silently append a duplicate.
    if node is not None:
        if not exists_ok:
            raise RuntimeError('xml node already exists: %s' % path)
        return node
    parent, base = rsplit_xml_path(path)
    if parent:
        # Ancestors may legitimately exist already, so reuse them.
        parent_node = create_node_recursive(xml_root, parent, exists_ok=True)
    else:
        parent_node = xml_root
    node = etree.Element(base)
    parent_node.append(node)
    return node
def set_xml_text(xml_root, path, text):
    """Create a new element at ``path`` below ``xml_root`` and set its text.

    The element is created via ``create_node_recursive`` with
    ``exists_ok=False``. Returns the created element.
    """
    created = create_node_recursive(xml_root, path, exists_ok=False)
    created.text = text
    return created
def extracted_items_to_xml(extracted_items):
    """Build an ``article`` element from the given extracted items.

    Each item is expected to expose ``tag`` and ``text`` attributes. Items
    with a falsy tag are skipped; items whose tag has no configured XML path
    are logged as a warning and skipped.
    """
    path_by_tag = {Tags.TITLE: XmlPaths.TITLE}
    article = E.article()
    for item in extracted_items:
        item_tag = item.tag
        if not item_tag:
            continue
        target_path = path_by_tag.get(item_tag)
        if target_path:
            set_xml_text(article, target_path, item.text)
        else:
            get_logger().warning('tag not configured: %s', item_tag)
    return article
def extract_structured_document_to_xml(structured_document):
    """Extract items from ``structured_document`` and convert them to XML.

    Feeds the result of ``extract_from_annotated_document`` into
    ``extracted_items_to_xml`` and returns the resulting root element.
    """
    extracted_items = extract_from_annotated_document(structured_document)
    return extracted_items_to_xml(extracted_items)
def read_all(path, mode):
    """Open ``path`` with ``FileIO`` in ``mode`` and return its full content."""
    with FileIO(path, mode) as source:
        content = source.read()
    return content
def parse_args(argv=None):
    """Parse the command line arguments of this script.

    Args:
        argv: list of argument strings; when ``None`` argparse falls back
            to ``sys.argv[1:]``.

    Returns:
        The parsed namespace with ``lxml_path``, ``output_path`` and
        ``debug`` attributes.
    """
    # The first positional parameter of ArgumentParser is ``prog``; pass the
    # text as ``description`` so the usage line keeps the real program name.
    parser = argparse.ArgumentParser(
        description='Extract JATSy XML from annotated LXML'
    )
    parser.add_argument(
        '--lxml-path', type=str, required=True,
        help='path to lxml document'
    )
    parser.add_argument(
        '--output-path', type=str, required=True,
        help='output path to annotated document'
    )
    parser.add_argument(
        '--debug', action='store_true', default=False,
        help='enable debug output'
    )
    return parser.parse_args(argv)
def main(argv=None):
    """Command line entry point: read annotated LXML, write the extracted XML.

    Parses ``argv``, optionally raises the root logger to DEBUG, loads the
    document from ``--lxml-path``, extracts the XML and writes it
    pretty-printed to ``--output-path``.
    """
    args = parse_args(argv)
    if args.debug:
        logging.getLogger().setLevel('DEBUG')

    lxml_bytes = read_all(args.lxml_path, 'rb')
    lxml_tree = etree.parse(BytesIO(lxml_bytes))
    document = LxmlStructuredDocument(lxml_tree)

    result_root = extract_structured_document_to_xml(document)

    get_logger().info('writing result to: %s', args.output_path)
    with FileIO(args.output_path, 'w') as out_f:
        # Serialise only once the output file is open, as before.
        out_f.write(etree.tostring(result_root, pretty_print=True))
# Script entry point: only runs when the module is executed directly.
if __name__ == '__main__':
    # Configure root logging at INFO before running; --debug raises it later.
    logging.basicConfig(level='INFO')
    main()