From 2cbb04330c7ec1026b88fbf368bb9e41e28a5a15 Mon Sep 17 00:00:00 2001 From: Daniel Ecer <de-code@users.noreply.github.com> Date: Fri, 19 Jan 2018 12:00:34 +0000 Subject: [PATCH] moved target annotation and parsing to separate module --- .../annotation/matching_annotator.py | 270 +---------- .../annotation/matching_annotator_test.py | 443 +---------------- .../annotation/target_annotation.py | 272 +++++++++++ .../annotation/target_annotation_test.py | 453 ++++++++++++++++++ sciencebeam_gym/preprocess/lxml_to_svg.py | 5 +- .../preprocess/preprocessing_pipeline.py | 2 +- .../preprocess/preprocessing_utils.py | 5 +- sciencebeam_gym/utils/string.py | 9 + 8 files changed, 753 insertions(+), 706 deletions(-) create mode 100644 sciencebeam_gym/preprocess/annotation/target_annotation.py create mode 100644 sciencebeam_gym/preprocess/annotation/target_annotation_test.py create mode 100644 sciencebeam_gym/utils/string.py diff --git a/sciencebeam_gym/preprocess/annotation/matching_annotator.py b/sciencebeam_gym/preprocess/annotation/matching_annotator.py index 0fd68aa..688b5f0 100644 --- a/sciencebeam_gym/preprocess/annotation/matching_annotator.py +++ b/sciencebeam_gym/preprocess/annotation/matching_annotator.py @@ -1,25 +1,23 @@ from __future__ import division import logging -import re -import json import csv from builtins import str as text -from itertools import tee, chain, islice +from itertools import tee, islice from future.utils import python_2_unicode_compatible -import six from six.moves import zip_longest -from six.moves.configparser import ConfigParser # pylint: disable=E0401 - -from lxml import etree from sciencebeam_gym.utils.csv import ( csv_delimiter_by_filename, write_csv_row ) +from sciencebeam_gym.utils.string import ( + LazyStr +) + from sciencebeam_gym.alignment.align import ( LocalSequenceMatcher, SimpleScoring @@ -29,18 +27,10 @@ from sciencebeam_gym.alignment.WordSequenceMatcher import ( ) from sciencebeam_gym.utils.collection import ( - filter_truthy, - strip_all, iter_flatten, extract_from_dict ) -from sciencebeam_gym.utils.xml import ( - get_text_content, - get_text_content_list, - get_immediate_text -) - from sciencebeam_gym.preprocess.annotation.annotator import ( AbstractAnnotator ) @@ -70,27 +60,6 @@ def normalise_str_or_list(x): else: return normalise_str(x) -class XmlMappingSuffix(object): - REGEX = '.regex' - MATCH_MULTIPLE = '.match-multiple' - BONDING = '.bonding' - CHILDREN = '.children' - CHILDREN_CONCAT = '.children.concat' - CHILDREN_RANGE = '.children.range' - UNMATCHED_PARENT_TEXT = '.unmatched-parent-text' - PRIORITY = '.priority' - -@python_2_unicode_compatible -class TargetAnnotation(object): - def __init__(self, value, name, match_multiple=False, bonding=False): - self.value = value - self.name = name - self.match_multiple = match_multiple - self.bonding = bonding - - def __str__(self): - return u'{} (match_multiple={}): {}'.format(self.name, self.match_multiple, self.value) - class SequenceWrapper(object): def __init__(self, structured_document, tokens, str_filter_f=None): self.structured_document = structured_document @@ -173,14 +142,6 @@ class SequenceMatch(object): self.index2_range[1] ) -@python_2_unicode_compatible -class LazyStr(object): - def __init__(self, fn): - self.fn = fn - - def __str__(self): - return self.fn() - def len_index_range(index_range): return index_range[1] - index_range[0] @@ -679,227 +640,6 @@ def find_best_matches( LazyStr(lambda: ' '.join(str(choice.position) for choice in too_distant_choices)) ) -def parse_xml_mapping(xml_mapping_filename): - with open(xml_mapping_filename, 'r') as f: - config = ConfigParser() - if six.PY3: - config.read_file(f) - else: - config.readfp(f) - return { - k: dict(config.items(k)) - for k in config.sections() - } - -def apply_pattern(s, compiled_pattern): - m = compiled_pattern.match(s) - if m: - get_logger().debug('regex match: %s -> %s', compiled_pattern, m.groups()) - return m.group(1) - return s - -def iter_parents(children): - for child in children: - p = child.getparent() - if p is not None: - yield p - -def exclude_parents(children): - if not isinstance(children, list): - children = list(children) - all_parents = set(iter_parents(children)) - return [child for child in children if not child in all_parents] - -def extract_children_source_list(parent, children_source_list): - used_nodes = set() - values = [] - for children_source in children_source_list: - xpath = children_source.get('xpath') - if xpath: - matching_nodes = exclude_parents(parent.xpath(xpath)) - if not matching_nodes: - get_logger().debug( - 'child xpath does not match any item, skipping: xpath=%s (xml=%s)', - xpath, - LazyStr(lambda: str(etree.tostring(parent))) - ) - used_nodes = set() - values = [] - break - used_nodes |= set(matching_nodes) - value = ' '.join(get_text_content_list(matching_nodes)) - else: - value = children_source.get('value') - values.append(value or '') - return values, used_nodes - -def extract_children_concat(parent, children_concat): - used_nodes = set() - values = [] - get_logger().debug('children_concat: %s', children_concat) - for children_concat_item in children_concat: - temp_values, temp_used_nodes = extract_children_source_list( - parent, children_concat_item - ) - used_nodes |= temp_used_nodes - if temp_values: - values.append(''.join(temp_values)) - return values, used_nodes - -def extract_children_range(parent, children_range): - used_nodes = set() - values = [] - standalone_values = [] - get_logger().debug('children_range: %s', children_range) - for range_item in children_range: - temp_values, temp_used_nodes = extract_children_source_list( - parent, [range_item.get('min'), range_item.get('max')] - ) - if len(temp_values) == 2: - temp_values = strip_all(temp_values) - if all(s.isdigit() for s in temp_values): - num_values = [int(s) for s in temp_values] - range_values = [str(x) for x in range(num_values[0], num_values[1] + 1)] - if range_item.get('standalone'): - standalone_values.extend(range_values) - else: - values.extend(range_values) - used_nodes |= temp_used_nodes - else: - get_logger().info('values not integers: %s', temp_values) - return values, standalone_values, used_nodes - -def parse_xpaths(s): - return strip_all(s.strip().split('\n')) if s else None - -def match_xpaths(parent, xpaths): - return chain(*[parent.xpath(s) for s in xpaths]) - -def extract_children( - parent, children_xpaths, children_concat, children_range, unmatched_parent_text): - - concat_values_list, concat_used_nodes = extract_children_concat(parent, children_concat) - range_values_list, standalone_values, range_used_nodes = ( - extract_children_range(parent, children_range) - ) - used_nodes = concat_used_nodes | range_used_nodes - - other_child_nodes = [ - node for node in match_xpaths(parent, children_xpaths) - if not node in used_nodes - ] - other_child_nodes_excl_parents = exclude_parents(other_child_nodes) - text_content_list = filter_truthy(strip_all( - get_text_content_list(other_child_nodes_excl_parents) + - concat_values_list + range_values_list - )) - if len(other_child_nodes_excl_parents) != len(other_child_nodes): - other_child_nodes_excl_parents_set = set(other_child_nodes_excl_parents) - for child in other_child_nodes: - if child not in other_child_nodes_excl_parents_set: - text_values = filter_truthy(strip_all(get_immediate_text(child))) - text_content_list.extend(text_values) - if unmatched_parent_text: - value = get_text_content( - parent, - exclude=set(other_child_nodes) | used_nodes - ).strip() - if value and not value in text_content_list: - text_content_list.append(value) - return text_content_list, standalone_values - -def parse_json_with_default(s, default_value): - return json.loads(s) if s else default_value - -def xml_root_to_target_annotations(xml_root, xml_mapping): - if not xml_root.tag in xml_mapping: - raise Exception("unrecognised tag: {} (available: {})".format( - xml_root.tag, xml_mapping.sections()) - ) - - mapping = xml_mapping[xml_root.tag] - - field_names = [k for k in mapping.keys() if '.' not in k] - get_mapping_flag = lambda k, suffix: mapping.get(k + suffix) == 'true' - get_match_multiple = lambda k: get_mapping_flag(k, XmlMappingSuffix.MATCH_MULTIPLE) - get_bonding_flag = lambda k: get_mapping_flag(k, XmlMappingSuffix.BONDING) - get_unmatched_parent_text_flag = ( - lambda k: get_mapping_flag(k, XmlMappingSuffix.UNMATCHED_PARENT_TEXT) - ) - - get_logger().debug('fields: %s', field_names) - - target_annotations_with_pos = [] - xml_pos_by_node = {node: i for i, node in enumerate(xml_root.iter())} - for k in field_names: - match_multiple = get_match_multiple(k) - bonding = get_bonding_flag(k) - unmatched_parent_text = get_unmatched_parent_text_flag(k) - children_xpaths = parse_xpaths(mapping.get(k + XmlMappingSuffix.CHILDREN)) - children_concat = parse_json_with_default( - mapping.get(k + XmlMappingSuffix.CHILDREN_CONCAT), [] - ) - children_range = parse_json_with_default( - mapping.get(k + XmlMappingSuffix.CHILDREN_RANGE), [] - ) - re_pattern = mapping.get(k + XmlMappingSuffix.REGEX) - re_compiled_pattern = re.compile(re_pattern) if re_pattern else None - priority = int(mapping.get(k + XmlMappingSuffix.PRIORITY, '0')) - - xpaths = parse_xpaths(mapping[k]) - get_logger().debug('xpaths(%s): %s', k, xpaths) - for e in match_xpaths(xml_root, xpaths): - e_pos = xml_pos_by_node.get(e) - if children_xpaths: - text_content_list, standalone_values = extract_children( - e, children_xpaths, children_concat, children_range, unmatched_parent_text - ) - else: - text_content_list = filter_truthy(strip_all([get_text_content(e)])) - standalone_values = [] - if re_compiled_pattern: - text_content_list = filter_truthy([ - apply_pattern(s, re_compiled_pattern) for s in text_content_list - ]) - if text_content_list: - value = ( - text_content_list[0] - if len(text_content_list) == 1 - else sorted(text_content_list, key=lambda s: -len(s)) - ) - target_annotations_with_pos.append(( - (-priority, e_pos), - TargetAnnotation( - value, - k, - match_multiple=match_multiple, - bonding=bonding - ) - )) - if standalone_values: - for i, standalone_value in enumerate(standalone_values): - target_annotations_with_pos.append(( - (-priority, e_pos, i), - TargetAnnotation( - standalone_value, - k, - match_multiple=match_multiple, - bonding=bonding - ) - )) - target_annotations_with_pos = sorted( - target_annotations_with_pos, - key=lambda x: x[0] - ) - get_logger().debug('target_annotations_with_pos:\n%s', target_annotations_with_pos) - target_annotations = [ - x[1] for x in target_annotations_with_pos - ] - get_logger().debug('target_annotations:\n%s', '\n'.join([ - ' ' + str(a) for a in target_annotations - ])) - return target_annotations - class CsvMatchDetailReporter(object): def __init__(self, fp, filename=None, fields=None): self.fp = fp diff --git a/sciencebeam_gym/preprocess/annotation/matching_annotator_test.py b/sciencebeam_gym/preprocess/annotation/matching_annotator_test.py index 4f44d3c..11094a8 100644 --- a/sciencebeam_gym/preprocess/annotation/matching_annotator_test.py +++ b/sciencebeam_gym/preprocess/annotation/matching_annotator_test.py @@ -1,25 +1,22 @@ from __future__ import division -import json - -from lxml.builder import E - from sciencebeam_gym.structured_document import ( SimpleStructuredDocument, SimpleLine, SimpleToken ) +from sciencebeam_gym.preprocess.annotation.target_annotation import ( + TargetAnnotation +) + from sciencebeam_gym.preprocess.annotation.matching_annotator import ( MatchingAnnotator, - TargetAnnotation, - xml_root_to_target_annotations, FuzzyMatchResult, fuzzy_match, THIN_SPACE, EN_DASH, - EM_DASH, - XmlMappingSuffix + EM_DASH ) from sciencebeam_gym.utils.collection import ( @@ -191,436 +188,6 @@ class TestFuzzyMatchResult(object): assert fm_2.a_index_range() == (0, 1) assert fm_2.b_index_range() == (0, 1) -class TestXmlRootToTargetAnnotations(object): - def test_should_return_empty_target_annotations_for_empty_xml(self): - xml_root = E.article( - ) - xml_mapping = { - 'article': { - 'title': 'title' - } - } - target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) - assert target_annotations == [] - - def test_should_return_empty_target_annotations_for_no_matching_annotations(self): - xml_root = E.article( - E.other(SOME_VALUE) - ) - xml_mapping = { - 'article': { - TAG1: 'title' - } - } - target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) - assert target_annotations == [] - - def test_should_return_matching_target_annotations(self): - xml_root = E.article( - E.title(SOME_VALUE) - ) - xml_mapping = { - 'article': { - TAG1: 'title' - } - } - target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) - assert len(target_annotations) == 1 - assert target_annotations[0].name == TAG1 - assert target_annotations[0].value == SOME_VALUE - - def test_should_apply_regex_to_result(self): - xml_root = E.article( - E.title('1.1. ' + SOME_VALUE) - ) - xml_mapping = { - 'article': { - TAG1: 'title', - TAG1 + XmlMappingSuffix.REGEX: r'(?:\d+\.?)* ?(.*)' - } - } - target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) - assert len(target_annotations) == 1 - assert target_annotations[0].name == TAG1 - assert target_annotations[0].value == SOME_VALUE - - def test_should_apply_match_multiple_flag(self): - xml_root = E.article( - E.title(SOME_VALUE) - ) - xml_mapping = { - 'article': { - TAG1: 'title', - TAG1 + XmlMappingSuffix.MATCH_MULTIPLE: 'true' - } - } - target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) - assert [t.match_multiple for t in target_annotations] == [True] - - def test_should_not_apply_match_multiple_flag_if_not_set(self): - xml_root = E.article( - E.title(SOME_VALUE) - ) - xml_mapping = { - 'article': { - TAG1: 'title' - } - } - target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) - assert [t.match_multiple for t in target_annotations] == [False] - - def test_should_apply_match_bonding_flag(self): - xml_root = E.article( - E.title(SOME_VALUE) - ) - xml_mapping = { - 'article': { - TAG1: 'title', - TAG1 + XmlMappingSuffix.BONDING: 'true' - } - } - target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) - assert [t.bonding for t in target_annotations] == [True] - - def test_should_not_apply_match_bonding_flag_if_not_set(self): - xml_root = E.article( - E.title(SOME_VALUE) - ) - xml_mapping = { - 'article': { - TAG1: 'title' - } - } - target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) - assert [t.bonding for t in target_annotations] == [False] - - def test_should_use_multiple_xpaths(self): - xml_root = E.article( - E.entry( - E.child1(SOME_VALUE), - E.child2(SOME_VALUE_2) - ) - ) - xml_mapping = { - 'article': { - TAG1: '\n{}\n{}\n'.format( - 'entry/child1', - 'entry/child2' - ) - } - } - target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) - assert [(t.name, t.value) for t in target_annotations] == [ - (TAG1, SOME_VALUE), - (TAG1, SOME_VALUE_2) - ] - - def test_should_apply_children_xpaths_and_sort_by_value_descending(self): - xml_root = E.article( - E.entry( - E.child1(SOME_SHORTER_VALUE), - E.child2(SOME_LONGER_VALUE) - ), - E.entry( - E.child1(SOME_LONGER_VALUE) - ) - ) - xml_mapping = { - 'article': { - TAG1: 'entry', - TAG1 + XmlMappingSuffix.CHILDREN: './/*' - } - } - target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) - assert [(t.name, t.value) for t in target_annotations] == [ - (TAG1, [SOME_LONGER_VALUE, SOME_SHORTER_VALUE]), - (TAG1, SOME_LONGER_VALUE) - ] - - def test_should_apply_children_xpaths_and_exclude_parents(self): - xml_root = E.article( - E.entry( - E.parent( - E.child2(SOME_LONGER_VALUE), - E.child1(SOME_SHORTER_VALUE) - ) - ) - ) - xml_mapping = { - 'article': { - TAG1: 'entry', - TAG1 + XmlMappingSuffix.CHILDREN: './/*' - } - } - target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) - assert [(t.name, t.value) for t in target_annotations] == [ - (TAG1, [SOME_LONGER_VALUE, SOME_SHORTER_VALUE]) - ] - - def test_should_apply_children_xpaths_and_include_parent_text_between_matched_children(self): - xml_root = E.article( - E.entry( - E.parent( - E.child2(SOME_LONGER_VALUE), - SOME_VALUE, - E.child1(SOME_SHORTER_VALUE) - ) - ) - ) - xml_mapping = { - 'article': { - TAG1: 'entry', - TAG1 + XmlMappingSuffix.CHILDREN: './/*' - } - } - target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) - assert [(t.name, t.value) for t in target_annotations] == [ - (TAG1, [SOME_LONGER_VALUE, SOME_VALUE, SOME_SHORTER_VALUE]) - ] - - def test_should_apply_multiple_children_xpaths_and_include_parent_text_if_enabled(self): - xml_root = E.article( - E.entry( - E.child1(SOME_SHORTER_VALUE), - SOME_LONGER_VALUE - ) - ) - xml_mapping = { - 'article': { - TAG1: 'entry', - TAG1 + XmlMappingSuffix.CHILDREN: '\n{}\n{}\n'.format('.//*', '.'), - TAG1 + XmlMappingSuffix.UNMATCHED_PARENT_TEXT: 'true' - } - } - target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) - assert [(t.name, t.value) for t in target_annotations] == [ - (TAG1, [SOME_LONGER_VALUE, SOME_SHORTER_VALUE]) - ] - - def test_should_apply_concat_children(self): - num_values = ['101', '202'] - xml_root = E.article( - E.entry( - E.parent( - E.child1(SOME_VALUE), - E.fpage(num_values[0]), - E.lpage(num_values[1]) - ) - ) - ) - xml_mapping = { - 'article': { - TAG1: 'entry', - TAG1 + XmlMappingSuffix.CHILDREN: './/*', - TAG1 + XmlMappingSuffix.CHILDREN_CONCAT: json.dumps([[{ - 'xpath': './/fpage' - }, { - 'value': '-' - }, { - 'xpath': './/lpage' - }]]) - } - } - target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) - assert [(t.name, t.value) for t in target_annotations] == [ - (TAG1, [SOME_VALUE, '-'.join(num_values)]) - ] - - def test_should_not_apply_concat_children_if_one_node_was_not_found(self): - num_values = ['101', '202'] - xml_root = E.article( - E.entry( - E.parent( - E.child1(SOME_VALUE), - E.fpage(num_values[0]), - E.lpage(num_values[1]) - ) - ) - ) - xml_mapping = { - 'article': { - TAG1: 'entry', - TAG1 + XmlMappingSuffix.CHILDREN: './/*', - TAG1 + XmlMappingSuffix.CHILDREN_CONCAT: json.dumps([[{ - 'xpath': './/fpage' - }, { - 'value': '-' - }, { - 'xpath': './/unknown' - }]]) - } - } - target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) - assert [(t.name, t.value) for t in target_annotations] == [ - (TAG1, [SOME_VALUE, num_values[0], num_values[1]]) - ] - - def test_should_apply_range_children(self): - num_values = [101, 102, 103, 104, 105, 106, 107] - xml_root = E.article( - E.entry( - E.child1(SOME_VALUE), - E.fpage(str(min(num_values))), - E.lpage(str(max(num_values))) - ) - ) - xml_mapping = { - 'article': { - TAG1: 'entry', - TAG1 + XmlMappingSuffix.CHILDREN: 'fpage|lpage', - TAG1 + XmlMappingSuffix.CHILDREN_RANGE: json.dumps([{ - 'min': { - 'xpath': 'fpage' - }, - 'max': { - 'xpath': 'lpage' - } - }]) - } - } - target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) - assert [(t.name, t.value) for t in target_annotations] == [ - (TAG1, [str(x) for x in num_values]) - ] - - def test_should_apply_range_children_as_separate_target_annotations(self): - num_values = [101, 102, 103, 104, 105, 106, 107] - xml_root = E.article( - E.entry( - E.child1(SOME_VALUE), - E.fpage(str(min(num_values))), - E.lpage(str(max(num_values))) - ) - ) - xml_mapping = { - 'article': { - TAG1: 'entry', - TAG1 + XmlMappingSuffix.CHILDREN: 'fpage|lpage', - TAG1 + XmlMappingSuffix.CHILDREN_RANGE: json.dumps([{ - 'min': { - 'xpath': 'fpage' - }, - 'max': { - 'xpath': 'lpage' - }, - 'standalone': True - }]) - } - } - target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) - assert [(t.name, t.value) for t in target_annotations] == [ - (TAG1, str(x)) - for x in num_values - ] - - def test_should_not_apply_range_children_if_xpath_not_matching(self): - num_values = [101, 102, 103, 104, 105, 106, 107] - fpage = str(min(num_values)) - lpage = str(max(num_values)) - xml_root = E.article( - E.entry( - E.child1(SOME_VALUE), - E.fpage(fpage), - E.lpage(lpage) - ) - ) - xml_mapping = { - 'article': { - TAG1: 'entry', - TAG1 + XmlMappingSuffix.CHILDREN: 'fpage|unknown', - TAG1 + XmlMappingSuffix.CHILDREN_RANGE: json.dumps([{ - 'min': { - 'xpath': 'fpage' - }, - 'max': { - 'xpath': 'unknown' - } - }]) - } - } - target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) - assert [(t.name, t.value) for t in target_annotations] == [ - (TAG1, fpage) - ] - - def test_should_not_apply_range_children_if_value_is_not_integer(self): - fpage = 'abc' - lpage = 'xyz' - xml_root = E.article( - E.entry( - E.child1(SOME_VALUE), - E.fpage(fpage), - E.lpage(lpage) - ) - ) - xml_mapping = { - 'article': { - TAG1: 'entry', - TAG1 + XmlMappingSuffix.CHILDREN: 'fpage|lpage', - TAG1 + XmlMappingSuffix.CHILDREN_RANGE: json.dumps([{ - 'min': { - 'xpath': 'fpage' - }, - 'max': { - 'xpath': 'lpage' - } - }]) - } - } - target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) - assert [(t.name, t.value) for t in target_annotations] == [ - (TAG1, [fpage, lpage]) - ] - - def test_should_return_full_text(self): - xml_root = E.article( - E.title( - 'some ', - E.other('embedded'), - ' text' - ) - ) - xml_mapping = { - 'article': { - TAG1: 'title' - } - } - target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) - assert len(target_annotations) == 1 - assert target_annotations[0].name == TAG1 - assert target_annotations[0].value == 'some embedded text' - - def test_should_return_target_annotations_in_order_of_xml(self): - xml_root = E.article( - E.tag1('tag1.1'), E.tag2('tag2.1'), E.tag1('tag1.2'), E.tag2('tag2.2'), - ) - xml_mapping = { - 'article': { - TAG1: 'tag1', - TAG2: 'tag2' - } - } - target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) - assert [(ta.name, ta.value) for ta in target_annotations] == [ - (TAG1, 'tag1.1'), (TAG2, 'tag2.1'), (TAG1, 'tag1.2'), (TAG2, 'tag2.2') - ] - - def test_should_return_target_annotations_in_order_of_priority_first(self): - xml_root = E.article( - E.tag1('tag1.1'), E.tag2('tag2.1'), E.tag1('tag1.2'), E.tag2('tag2.2'), - ) - xml_mapping = { - 'article': { - TAG1: 'tag1', - TAG2: 'tag2', - TAG2 + XmlMappingSuffix.PRIORITY: '1' - } - } - target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) - assert [(ta.name, ta.value) for ta in target_annotations] == [ - (TAG2, 'tag2.1'), (TAG2, 'tag2.2'), (TAG1, 'tag1.1'), (TAG1, 'tag1.2') - ] - class TestMatchingAnnotator(object): def test_should_not_fail_on_empty_document(self): doc = SimpleStructuredDocument(lines=[]) diff --git a/sciencebeam_gym/preprocess/annotation/target_annotation.py b/sciencebeam_gym/preprocess/annotation/target_annotation.py new file mode 100644 index 0000000..5e6fcb9 --- /dev/null +++ b/sciencebeam_gym/preprocess/annotation/target_annotation.py @@ -0,0 +1,272 @@ +import logging +import json +import re +from itertools import chain + +from future.utils import python_2_unicode_compatible + +import six +from six.moves.configparser import ConfigParser # pylint: disable=E0401 + +from lxml import etree + +from sciencebeam_gym.utils.string import ( + LazyStr +) + +from sciencebeam_gym.utils.xml import ( + get_text_content, + get_text_content_list, + get_immediate_text +) + +from sciencebeam_gym.utils.collection import ( + filter_truthy, + strip_all +) + +def get_logger(): + return logging.getLogger(__name__) + +class XmlMappingSuffix(object): + REGEX = '.regex' + MATCH_MULTIPLE = '.match-multiple' + BONDING = '.bonding' + CHILDREN = '.children' + CHILDREN_CONCAT = '.children.concat' + CHILDREN_RANGE = '.children.range' + UNMATCHED_PARENT_TEXT = '.unmatched-parent-text' + PRIORITY = '.priority' + +@python_2_unicode_compatible +class TargetAnnotation(object): + def __init__(self, value, name, match_multiple=False, bonding=False): + self.value = value + self.name = name + self.match_multiple = match_multiple + self.bonding = bonding + + def __str__(self): + return u'{} (match_multiple={}): {}'.format(self.name, self.match_multiple, self.value) + + +def parse_xml_mapping(xml_mapping_filename): + with open(xml_mapping_filename, 'r') as f: + config = ConfigParser() + if six.PY3: + config.read_file(f) + else: + config.readfp(f) + return { + k: dict(config.items(k)) + for k in config.sections() + } + +def apply_pattern(s, compiled_pattern): + m = compiled_pattern.match(s) + if m: + get_logger().debug('regex match: %s -> %s', compiled_pattern, m.groups()) + return m.group(1) + return s + +def iter_parents(children): + for child in children: + p = child.getparent() + if p is not None: + yield p + +def exclude_parents(children): + if not isinstance(children, list): + children = list(children) + all_parents = set(iter_parents(children)) + return [child for child in children if not child in all_parents] + +def extract_children_source_list(parent, children_source_list): + used_nodes = set() + values = [] + for children_source in children_source_list: + xpath = children_source.get('xpath') + if xpath: + matching_nodes = exclude_parents(parent.xpath(xpath)) + if not matching_nodes: + get_logger().debug( + 'child xpath does not match any item, skipping: xpath=%s (xml=%s)', + xpath, + LazyStr(lambda: str(etree.tostring(parent))) + ) + used_nodes = set() + values = [] + break + used_nodes |= set(matching_nodes) + value = ' '.join(get_text_content_list(matching_nodes)) + else: + value = children_source.get('value') + values.append(value or '') + return values, used_nodes + +def extract_children_concat(parent, children_concat): + used_nodes = set() + values = [] + get_logger().debug('children_concat: %s', children_concat) + for children_concat_item in children_concat: + temp_values, temp_used_nodes = extract_children_source_list( + parent, children_concat_item + ) + used_nodes |= temp_used_nodes + if temp_values: + values.append(''.join(temp_values)) + return values, used_nodes + +def extract_children_range(parent, children_range): + used_nodes = set() + values = [] + standalone_values = [] + get_logger().debug('children_range: %s', children_range) + for range_item in children_range: + temp_values, temp_used_nodes = extract_children_source_list( + parent, [range_item.get('min'), range_item.get('max')] + ) + if len(temp_values) == 2: + temp_values = strip_all(temp_values) + if all(s.isdigit() for s in temp_values): + num_values = [int(s) for s in temp_values] + range_values = [str(x) for x in range(num_values[0], num_values[1] + 1)] + if range_item.get('standalone'): + standalone_values.extend(range_values) + else: + values.extend(range_values) + used_nodes |= temp_used_nodes + else: + get_logger().info('values not integers: %s', temp_values) + return values, standalone_values, used_nodes + +def parse_xpaths(s): + return strip_all(s.strip().split('\n')) if s else None + +def match_xpaths(parent, xpaths): + return chain(*[parent.xpath(s) for s in xpaths]) + +def extract_children( + parent, children_xpaths, children_concat, children_range, unmatched_parent_text): + + concat_values_list, concat_used_nodes = extract_children_concat(parent, children_concat) + range_values_list, standalone_values, range_used_nodes = ( + extract_children_range(parent, children_range) + ) + used_nodes = concat_used_nodes | range_used_nodes + + other_child_nodes = [ + node for node in match_xpaths(parent, children_xpaths) + if not node in used_nodes + ] + other_child_nodes_excl_parents = exclude_parents(other_child_nodes) + text_content_list = filter_truthy(strip_all( + get_text_content_list(other_child_nodes_excl_parents) + + concat_values_list + range_values_list + )) + if len(other_child_nodes_excl_parents) != len(other_child_nodes): + other_child_nodes_excl_parents_set = set(other_child_nodes_excl_parents) + for child in other_child_nodes: + if child not in other_child_nodes_excl_parents_set: + text_values = filter_truthy(strip_all(get_immediate_text(child))) + text_content_list.extend(text_values) + if unmatched_parent_text: + value = get_text_content( + parent, + exclude=set(other_child_nodes) | used_nodes + ).strip() + if value and not value in text_content_list: + text_content_list.append(value) + return text_content_list, standalone_values + +def parse_json_with_default(s, default_value): + return json.loads(s) if s else default_value + +def xml_root_to_target_annotations(xml_root, xml_mapping): + if not xml_root.tag in xml_mapping: + raise Exception("unrecognised tag: {} (available: {})".format( + xml_root.tag, xml_mapping.sections()) + ) + + mapping = xml_mapping[xml_root.tag] + + field_names = [k for k in mapping.keys() if '.' not in k] + get_mapping_flag = lambda k, suffix: mapping.get(k + suffix) == 'true' + get_match_multiple = lambda k: get_mapping_flag(k, XmlMappingSuffix.MATCH_MULTIPLE) + get_bonding_flag = lambda k: get_mapping_flag(k, XmlMappingSuffix.BONDING) + get_unmatched_parent_text_flag = ( + lambda k: get_mapping_flag(k, XmlMappingSuffix.UNMATCHED_PARENT_TEXT) + ) + + get_logger().debug('fields: %s', field_names) + + target_annotations_with_pos = [] + xml_pos_by_node = {node: i for i, node in enumerate(xml_root.iter())} + for k in field_names: + match_multiple = get_match_multiple(k) + bonding = get_bonding_flag(k) + unmatched_parent_text = get_unmatched_parent_text_flag(k) + children_xpaths = parse_xpaths(mapping.get(k + XmlMappingSuffix.CHILDREN)) + children_concat = parse_json_with_default( + mapping.get(k + XmlMappingSuffix.CHILDREN_CONCAT), [] + ) + children_range = parse_json_with_default( + mapping.get(k + XmlMappingSuffix.CHILDREN_RANGE), [] + ) + re_pattern = mapping.get(k + XmlMappingSuffix.REGEX) + re_compiled_pattern = re.compile(re_pattern) if re_pattern else None + priority = int(mapping.get(k + XmlMappingSuffix.PRIORITY, '0')) + + xpaths = parse_xpaths(mapping[k]) + get_logger().debug('xpaths(%s): %s', k, xpaths) + for e in match_xpaths(xml_root, xpaths): + e_pos = xml_pos_by_node.get(e) + if children_xpaths: + text_content_list, standalone_values = extract_children( + e, children_xpaths, children_concat, children_range, unmatched_parent_text + ) + else: + text_content_list = filter_truthy(strip_all([get_text_content(e)])) + standalone_values = [] + if re_compiled_pattern: + text_content_list = filter_truthy([ + apply_pattern(s, re_compiled_pattern) for s in text_content_list + ]) + if text_content_list: + value = ( + text_content_list[0] + if len(text_content_list) == 1 + else sorted(text_content_list, key=lambda s: -len(s)) + ) + target_annotations_with_pos.append(( + (-priority, e_pos), + TargetAnnotation( + value, + k, + match_multiple=match_multiple, + bonding=bonding + ) + )) + if standalone_values: + for i, standalone_value in enumerate(standalone_values): + target_annotations_with_pos.append(( + (-priority, e_pos, i), + TargetAnnotation( + standalone_value, + k, + match_multiple=match_multiple, + bonding=bonding + ) + )) + target_annotations_with_pos = sorted( + target_annotations_with_pos, + key=lambda x: x[0] + ) + get_logger().debug('target_annotations_with_pos:\n%s', target_annotations_with_pos) + target_annotations = [ + x[1] for x in target_annotations_with_pos + ] + get_logger().debug('target_annotations:\n%s', '\n'.join([ + ' ' + str(a) for a in target_annotations + ])) + return target_annotations diff --git a/sciencebeam_gym/preprocess/annotation/target_annotation_test.py b/sciencebeam_gym/preprocess/annotation/target_annotation_test.py new file mode 100644 index 0000000..38ec32a --- /dev/null +++ b/sciencebeam_gym/preprocess/annotation/target_annotation_test.py @@ -0,0 +1,453 @@ +from __future__ import division + +import json + +from lxml.builder import E + +from sciencebeam_gym.preprocess.annotation.target_annotation import ( + TargetAnnotation, + xml_root_to_target_annotations, + XmlMappingSuffix +) + +from sciencebeam_gym.utils.collection import ( + flatten +) + +TAG1 = 'tag1' +TAG2 = 'tag2' + +SOME_VALUE = 'some value' +SOME_VALUE_2 = 'some value2' +SOME_LONGER_VALUE = 'some longer value1' +SOME_SHORTER_VALUE = 'value1' + +class TestXmlRootToTargetAnnotations(object): + def test_should_return_empty_target_annotations_for_empty_xml(self): + xml_root = E.article( + ) + xml_mapping = { + 'article': { + 'title': 'title' + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert target_annotations == [] + + def test_should_return_empty_target_annotations_for_no_matching_annotations(self): + xml_root = E.article( + E.other(SOME_VALUE) + ) + xml_mapping = { + 'article': { + TAG1: 'title' + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert target_annotations == [] + + def test_should_return_matching_target_annotations(self): + xml_root = E.article( + E.title(SOME_VALUE) + ) + xml_mapping = { + 'article': { + TAG1: 'title' + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert len(target_annotations) == 1 + assert target_annotations[0].name == TAG1 + assert target_annotations[0].value == SOME_VALUE + + def test_should_apply_regex_to_result(self): + xml_root = E.article( + E.title('1.1. ' + SOME_VALUE) + ) + xml_mapping = { + 'article': { + TAG1: 'title', + TAG1 + XmlMappingSuffix.REGEX: r'(?:\d+\.?)* ?(.*)' + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert len(target_annotations) == 1 + assert target_annotations[0].name == TAG1 + assert target_annotations[0].value == SOME_VALUE + + def test_should_apply_match_multiple_flag(self): + xml_root = E.article( + E.title(SOME_VALUE) + ) + xml_mapping = { + 'article': { + TAG1: 'title', + TAG1 + XmlMappingSuffix.MATCH_MULTIPLE: 'true' + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert [t.match_multiple for t in target_annotations] == [True] + + def test_should_not_apply_match_multiple_flag_if_not_set(self): + xml_root = E.article( + E.title(SOME_VALUE) + ) + xml_mapping = { + 'article': { + TAG1: 'title' + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert [t.match_multiple for t in target_annotations] == [False] + + def test_should_apply_match_bonding_flag(self): + xml_root = E.article( + E.title(SOME_VALUE) + ) + xml_mapping = { + 'article': { + TAG1: 'title', + TAG1 + XmlMappingSuffix.BONDING: 'true' + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert [t.bonding for t in target_annotations] == [True] + + def test_should_not_apply_match_bonding_flag_if_not_set(self): + xml_root = E.article( + E.title(SOME_VALUE) + ) + xml_mapping = { + 'article': { + TAG1: 'title' + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert [t.bonding for t in target_annotations] == [False] + + def test_should_use_multiple_xpaths(self): + xml_root = E.article( + E.entry( + E.child1(SOME_VALUE), + E.child2(SOME_VALUE_2) + ) + ) + xml_mapping = { + 'article': { + TAG1: '\n{}\n{}\n'.format( + 'entry/child1', + 'entry/child2' + ) + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert [(t.name, t.value) for t in target_annotations] == [ + (TAG1, SOME_VALUE), + (TAG1, SOME_VALUE_2) + ] + + def test_should_apply_children_xpaths_and_sort_by_value_descending(self): + xml_root = E.article( + E.entry( + E.child1(SOME_SHORTER_VALUE), + E.child2(SOME_LONGER_VALUE) + ), + E.entry( + E.child1(SOME_LONGER_VALUE) + ) + ) + xml_mapping = { + 'article': { + TAG1: 'entry', + TAG1 + XmlMappingSuffix.CHILDREN: './/*' + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert [(t.name, t.value) for t in target_annotations] == [ + (TAG1, [SOME_LONGER_VALUE, SOME_SHORTER_VALUE]), + (TAG1, SOME_LONGER_VALUE) + ] + + def test_should_apply_children_xpaths_and_exclude_parents(self): + xml_root = E.article( + E.entry( + E.parent( + E.child2(SOME_LONGER_VALUE), + E.child1(SOME_SHORTER_VALUE) + ) + ) + ) + xml_mapping = { + 'article': { + TAG1: 'entry', + TAG1 + XmlMappingSuffix.CHILDREN: './/*' + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert [(t.name, t.value) for t in target_annotations] == [ + (TAG1, [SOME_LONGER_VALUE, SOME_SHORTER_VALUE]) + ] + + def test_should_apply_children_xpaths_and_include_parent_text_between_matched_children(self): + xml_root = E.article( + E.entry( + E.parent( + E.child2(SOME_LONGER_VALUE), + SOME_VALUE, + E.child1(SOME_SHORTER_VALUE) + ) + ) + ) + xml_mapping = { + 'article': { + TAG1: 'entry', + TAG1 + XmlMappingSuffix.CHILDREN: './/*' + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert [(t.name, t.value) for t in target_annotations] == [ + (TAG1, [SOME_LONGER_VALUE, SOME_VALUE, SOME_SHORTER_VALUE]) + ] + + def test_should_apply_multiple_children_xpaths_and_include_parent_text_if_enabled(self): + xml_root = E.article( + E.entry( + E.child1(SOME_SHORTER_VALUE), + SOME_LONGER_VALUE + ) + ) + xml_mapping = { + 'article': { + TAG1: 'entry', + TAG1 + XmlMappingSuffix.CHILDREN: '\n{}\n{}\n'.format('.//*', '.'), + TAG1 + XmlMappingSuffix.UNMATCHED_PARENT_TEXT: 'true' + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert [(t.name, t.value) for t in target_annotations] == [ + (TAG1, [SOME_LONGER_VALUE, SOME_SHORTER_VALUE]) + ] + + def test_should_apply_concat_children(self): + num_values = ['101', '202'] + xml_root = E.article( + E.entry( + E.parent( + E.child1(SOME_VALUE), + E.fpage(num_values[0]), + E.lpage(num_values[1]) + ) + ) + ) + xml_mapping = { + 'article': { + TAG1: 'entry', + TAG1 + XmlMappingSuffix.CHILDREN: './/*', + TAG1 + XmlMappingSuffix.CHILDREN_CONCAT: json.dumps([[{ + 'xpath': './/fpage' + }, { + 'value': '-' + }, { + 'xpath': './/lpage' + }]]) + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert [(t.name, t.value) for t in target_annotations] == [ + (TAG1, [SOME_VALUE, '-'.join(num_values)]) + ] + + def test_should_not_apply_concat_children_if_one_node_was_not_found(self): + num_values = ['101', '202'] + xml_root = E.article( + E.entry( + E.parent( + E.child1(SOME_VALUE), + E.fpage(num_values[0]), + E.lpage(num_values[1]) + ) + ) + ) + xml_mapping = { + 'article': { + TAG1: 'entry', + TAG1 + XmlMappingSuffix.CHILDREN: './/*', + TAG1 + XmlMappingSuffix.CHILDREN_CONCAT: json.dumps([[{ + 'xpath': './/fpage' + }, { + 'value': '-' + }, { + 'xpath': './/unknown' + }]]) + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert [(t.name, t.value) for t in target_annotations] == [ + (TAG1, [SOME_VALUE, num_values[0], num_values[1]]) + ] + + def test_should_apply_range_children(self): + num_values = [101, 102, 103, 104, 105, 106, 107] + xml_root = E.article( + E.entry( + E.child1(SOME_VALUE), + E.fpage(str(min(num_values))), + E.lpage(str(max(num_values))) + ) + ) + xml_mapping = { + 'article': { + TAG1: 'entry', + TAG1 + XmlMappingSuffix.CHILDREN: 'fpage|lpage', + TAG1 + XmlMappingSuffix.CHILDREN_RANGE: json.dumps([{ + 'min': { + 'xpath': 'fpage' + }, + 'max': { + 'xpath': 'lpage' + } + }]) + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert [(t.name, t.value) for t in target_annotations] == [ + (TAG1, [str(x) for x in num_values]) + ] + + def test_should_apply_range_children_as_separate_target_annotations(self): + num_values = [101, 102, 103, 104, 105, 106, 107] + xml_root = E.article( + E.entry( + E.child1(SOME_VALUE), + E.fpage(str(min(num_values))), + E.lpage(str(max(num_values))) + ) + ) + xml_mapping = { + 'article': { + TAG1: 'entry', + TAG1 + XmlMappingSuffix.CHILDREN: 'fpage|lpage', + TAG1 + XmlMappingSuffix.CHILDREN_RANGE: json.dumps([{ + 'min': { + 'xpath': 'fpage' + }, + 'max': { + 'xpath': 'lpage' + }, + 'standalone': True + }]) + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert [(t.name, t.value) for t in target_annotations] == [ + (TAG1, str(x)) + for x in num_values + ] + + def test_should_not_apply_range_children_if_xpath_not_matching(self): + num_values = [101, 102, 103, 104, 105, 106, 107] + fpage = str(min(num_values)) + lpage = str(max(num_values)) + xml_root = E.article( + E.entry( + E.child1(SOME_VALUE), + E.fpage(fpage), + E.lpage(lpage) + ) + ) + xml_mapping = { + 'article': { + TAG1: 'entry', + TAG1 + XmlMappingSuffix.CHILDREN: 'fpage|unknown', + TAG1 + XmlMappingSuffix.CHILDREN_RANGE: json.dumps([{ + 'min': { + 'xpath': 'fpage' + }, + 'max': { + 'xpath': 'unknown' + } + }]) + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert [(t.name, t.value) for t in target_annotations] == [ + (TAG1, fpage) + ] + + def test_should_not_apply_range_children_if_value_is_not_integer(self): + fpage = 'abc' + lpage = 'xyz' + xml_root = E.article( + E.entry( + E.child1(SOME_VALUE), + E.fpage(fpage), + E.lpage(lpage) + ) + ) + xml_mapping = { + 'article': { + TAG1: 'entry', + TAG1 + XmlMappingSuffix.CHILDREN: 'fpage|lpage', + TAG1 + XmlMappingSuffix.CHILDREN_RANGE: json.dumps([{ + 'min': { + 'xpath': 'fpage' + }, + 'max': { + 'xpath': 'lpage' + } + }]) + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert [(t.name, t.value) for t in target_annotations] == [ + (TAG1, [fpage, lpage]) + ] + + def test_should_return_full_text(self): + xml_root = E.article( + E.title( + 'some ', + E.other('embedded'), + ' text' + ) + ) + xml_mapping = { + 'article': { + TAG1: 'title' + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert len(target_annotations) == 1 + assert target_annotations[0].name == TAG1 + assert target_annotations[0].value == 'some embedded text' + + def test_should_return_target_annotations_in_order_of_xml(self): + xml_root = E.article( + E.tag1('tag1.1'), E.tag2('tag2.1'), E.tag1('tag1.2'), E.tag2('tag2.2'), + ) + xml_mapping = { + 'article': { + TAG1: 'tag1', + TAG2: 'tag2' + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert [(ta.name, ta.value) for ta in target_annotations] == [ + (TAG1, 'tag1.1'), (TAG2, 'tag2.1'), (TAG1, 'tag1.2'), (TAG2, 'tag2.2') + ] + + def test_should_return_target_annotations_in_order_of_priority_first(self): + xml_root = E.article( + E.tag1('tag1.1'), E.tag2('tag2.1'), E.tag1('tag1.2'), E.tag2('tag2.2'), + ) + xml_mapping = { + 'article': { + TAG1: 'tag1', + TAG2: 'tag2', + TAG2 + XmlMappingSuffix.PRIORITY: '1' + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert [(ta.name, ta.value) for ta in target_annotations] == [ + (TAG2, 'tag2.1'), (TAG2, 'tag2.2'), (TAG1, 'tag1.1'), (TAG1, 'tag1.2') + ] diff --git a/sciencebeam_gym/preprocess/lxml_to_svg.py b/sciencebeam_gym/preprocess/lxml_to_svg.py index ce75da2..ca472da 100644 --- a/sciencebeam_gym/preprocess/lxml_to_svg.py +++ b/sciencebeam_gym/preprocess/lxml_to_svg.py @@ -20,7 +20,10 @@ from sciencebeam_gym.preprocess.annotation.annotator import ( from sciencebeam_gym.preprocess.annotation.matching_annotator import ( MatchingAnnotator, - CsvMatchDetailReporter, + CsvMatchDetailReporter +) + +from sciencebeam_gym.preprocess.annotation.target_annotation import ( parse_xml_mapping, xml_root_to_target_annotations ) diff --git a/sciencebeam_gym/preprocess/preprocessing_pipeline.py b/sciencebeam_gym/preprocess/preprocessing_pipeline.py index 57c42ae..4229426 100644 --- a/sciencebeam_gym/preprocess/preprocessing_pipeline.py +++ b/sciencebeam_gym/preprocess/preprocessing_pipeline.py @@ -41,7 +41,7 @@ from sciencebeam_gym.structured_document.svg import ( SvgStructuredDocument ) -from sciencebeam_gym.preprocess.annotation.matching_annotator import ( +from sciencebeam_gym.preprocess.annotation.target_annotation import ( parse_xml_mapping ) diff --git a/sciencebeam_gym/preprocess/preprocessing_utils.py b/sciencebeam_gym/preprocess/preprocessing_utils.py index f1a6d7c..9af3ba4 100644 --- a/sciencebeam_gym/preprocess/preprocessing_utils.py +++ b/sciencebeam_gym/preprocess/preprocessing_utils.py @@ -52,7 +52,10 @@ from sciencebeam_gym.alignment.align import ( ) from sciencebeam_gym.preprocess.annotation.matching_annotator import ( - MatchingAnnotator, + MatchingAnnotator +) + +from sciencebeam_gym.preprocess.annotation.target_annotation import ( xml_root_to_target_annotations ) diff --git a/sciencebeam_gym/utils/string.py b/sciencebeam_gym/utils/string.py new file mode 100644 index 0000000..00aa95d --- /dev/null +++ b/sciencebeam_gym/utils/string.py @@ -0,0 +1,9 @@ +from future.utils import python_2_unicode_compatible + +@python_2_unicode_compatible +class LazyStr(object): + def __init__(self, fn): + self.fn = fn + + def __str__(self): + return self.fn() -- GitLab