Skip to content
Snippets Groups Projects
extract_to_xml.py 2.84 KiB
Newer Older
import argparse
import logging
from io import BytesIO

from lxml import etree
from lxml.builder import E

from sciencebeam_gym.utils.tf import (
  FileIO
)

from sciencebeam_gym.structured_document.lxml import (
  LxmlStructuredDocument
)

from sciencebeam_gym.inference_model.extract_from_annotated_document import (
  extract_from_annotated_document
)

class Tags(object):
  # Tag value that extracted_items_to_xml maps to XmlPaths.TITLE.
  TITLE = 'manuscript_title'

class XmlPaths(object):
  # Slash-separated element path, relative to the root element, at which
  # extracted_items_to_xml stores the text of items tagged Tags.TITLE.
  TITLE = 'front/article-meta/title-group/article-title'

def get_logger():
  """Return the logger registered under this module's name."""
  module_logger = logging.getLogger(__name__)
  return module_logger

def rsplit_xml_path(path):
  """Split a slash-separated path at its last '/'.

  Returns a ``(parent, base)`` tuple. ``parent`` is everything before the
  last '/' and ``base`` everything after it. When the path contains no '/',
  ``parent`` is None and ``base`` is the whole path.
  """
  head, separator, tail = path.rpartition('/')
  if not separator:
    # no '/' present: there is no parent component
    return None, path
  return head, tail

def create_node_recursive(xml_root, path, exists_ok=False):
  """Create the element at `path` below `xml_root`, adding missing ancestors.

  Args:
    xml_root: element that `path` is resolved against.
    path: slash-separated element path, e.g. 'front/article-meta'.
    exists_ok: when True, an element already present at `path` is returned
      as-is; when False, an existing element is treated as an error.

  Returns:
    The existing or newly appended element at `path`.

  Raises:
    RuntimeError: if an element already exists at `path` and `exists_ok`
      is False.
  """
  node = xml_root.find(path)
  # Compare against None explicitly: an element without children is falsy,
  # so a plain truthiness test would treat an existing leaf as missing and
  # append a duplicate instead of honouring `exists_ok`.
  if node is not None:
    if not exists_ok:
      raise RuntimeError('xml node already exists: %s' % path)
    return node
  # Split off the last path component; `parent` is empty when `path` has
  # no '/' (i.e. the new node is a direct child of `xml_root`).
  parent, _, base = path.rpartition('/')
  if parent:
    # Ancestors may legitimately exist already (shared by sibling paths).
    parent_node = create_node_recursive(xml_root, parent, exists_ok=True)
  else:
    parent_node = xml_root
  node = etree.Element(base)
  parent_node.append(node)
  return node

def set_xml_text(xml_root, path, text):
  """Create the element at `path` below `xml_root` and set its text.

  The element is obtained via create_node_recursive with exists_ok=False.
  Returns the element whose text was set.
  """
  target = create_node_recursive(xml_root, path, exists_ok=False)
  target.text = text
  return target

def extracted_items_to_xml(extracted_items):
  """Build an `article` XML element from the given extracted items.

  Each item is expected to expose `tag` and `text` attributes. Items with a
  falsy tag are skipped. Items whose tag has no configured XML path are
  skipped after logging a warning. For all other items the text is written
  to the configured path.

  Returns the root `article` element.
  """
  path_by_tag = {Tags.TITLE: XmlPaths.TITLE}
  article = E.article()
  for item in extracted_items:
    item_tag = item.tag
    if not item_tag:
      continue
    target_path = path_by_tag.get(item_tag)
    if target_path:
      set_xml_text(article, target_path, item.text)
    else:
      get_logger().warning('tag not configured: %s', item_tag)
  return article

def extract_structured_document_to_xml(structured_document):
  """Extract items from `structured_document` and convert them to XML.

  Passes the result of extract_from_annotated_document on to
  extracted_items_to_xml and returns the resulting root element.
  """
  extracted_items = extract_from_annotated_document(structured_document)
  return extracted_items_to_xml(extracted_items)

def read_all(path, mode):
  """Open `path` with FileIO in the given `mode` and return its full content."""
  with FileIO(path, mode) as source:
    content = source.read()
  return content

def parse_args(argv=None):
  """Parse the command line arguments of this script.

  Args:
    argv: list of argument strings; None makes argparse read sys.argv.

  Returns:
    An argparse namespace with `lxml_path`, `output_path` and `debug`.
  """
  # The text is the program description; passing it positionally would set
  # `prog` and put the whole sentence into the usage line.
  parser = argparse.ArgumentParser(
    description='Extract JATSy XML from annotated LXML'
  )
  parser.add_argument(
    '--lxml-path', type=str, required=True,
    help='path to lxml document'
  )

  parser.add_argument(
    '--output-path', type=str, required=True,
    help='output path for the extracted xml'
  )

  parser.add_argument(
    '--debug', action='store_true', default=False,
    help='enable debug output'
  )

  return parser.parse_args(argv)

def main(argv=None):
  """Command line entry point: read annotated LXML, write extracted XML.

  Parses `argv` (sys.argv when None), optionally switches the root logger
  to DEBUG, reads and parses the document at --lxml-path, converts it to
  XML and writes the pretty-printed result to --output-path.
  """
  args = parse_args(argv)
  if args.debug:
    logging.getLogger().setLevel('DEBUG')

  lxml_bytes = read_all(args.lxml_path, 'rb')
  lxml_tree = etree.parse(BytesIO(lxml_bytes))
  xml_root = extract_structured_document_to_xml(
    LxmlStructuredDocument(lxml_tree)
  )

  get_logger().info('writing result to: %s', args.output_path)
  with FileIO(args.output_path, 'w') as out_f:
    serialized = etree.tostring(xml_root, pretty_print=True)
    out_f.write(serialized)

if __name__ == '__main__':
  # Configure logging at INFO level before running the command line
  # entry point; main() raises the root logger to DEBUG when --debug is set.
  logging.basicConfig(level='INFO')

  main()