diff --git a/annot-xml-front.conf b/annot-xml-front.conf index 04028376919ab4dbf5180fea5cf1c13fa0f1664e..f1eb8f2103e65880584ff1ba54572e092079e6b0 100644 --- a/annot-xml-front.conf +++ b/annot-xml-front.conf @@ -33,8 +33,10 @@ author_aff.children = .//* author_aff.unmatched-parent-text = true author_aff.bonding = true author_aff.match-multiple = true +author_aff.extract-regex = .*\b(\d+)\b.* author_aff.sub.sup = ./sup author_aff.sub.addrline = ./addr-line +author_aff.sub.addrline.extract-regex = .*\b(\d+)\b.* author_aff.sub.country = ./country author_aff.sub.extlink = ./ext-link diff --git a/annot-xml-full.conf b/annot-xml-full.conf index 166d7b03350992dd9be5097cd28d72a1591f221b..829ed72d6af601b1f8b6e452b0d0e06ca76589a7 100644 --- a/annot-xml-full.conf +++ b/annot-xml-full.conf @@ -33,8 +33,10 @@ author_aff.children = .//* author_aff.unmatched-parent-text = true author_aff.bonding = true author_aff.match-multiple = true +author_aff.extract-regex = .*\b(\d+)\b.* author_aff.sub.sup = ./sup author_aff.sub.addrline = ./addr-line +author_aff.sub.addrline.extract-regex = .*\b(\d+)\b.* author_aff.sub.country = ./country author_aff.sub.extlink = ./ext-link diff --git a/sciencebeam_gym/preprocess/annotation/target_annotation.py b/sciencebeam_gym/preprocess/annotation/target_annotation.py index 76ad6721a6c27af480aed939b3904ef03c813d68..073358a33540cfbfb522555c10ca5ea3314e3d36 100644 --- a/sciencebeam_gym/preprocess/annotation/target_annotation.py +++ b/sciencebeam_gym/preprocess/annotation/target_annotation.py @@ -31,6 +31,7 @@ def get_logger(): class XmlMappingSuffix(object): REGEX = '.regex' + EXTRACT_REGEX = '.extract-regex' MATCH_MULTIPLE = '.match-multiple' BONDING = '.bonding' REQUIRE_NEXT = '.require-next' @@ -99,6 +100,19 @@ def get_stripped_text_content(node, **kwargs): def get_stripped_text_content_list(nodes, **kwargs): return [get_stripped_text_content(node, **kwargs) for node in nodes] +def iter_flatten_if_nested(a): + for x in a: + if isinstance(x, list): + for y in iter_flatten_if_nested(x): + yield y + else: + yield x + +def flatten_if_nested(a): + if not a: + return a + return list(iter_flatten_if_nested(a)) + def apply_pattern(s, compiled_pattern): m = compiled_pattern.match(s) if m: @@ -227,17 +241,60 @@ def get_prefixed_dict_values(d, key_prefix): } def get_sub_mapping(mapping, tag): - return get_prefixed_dict_values(mapping, tag + XmlMappingSuffix.SUB + '.') + return { + k: v + for k, v in get_prefixed_dict_values(mapping, tag + XmlMappingSuffix.SUB + '.').items() + if '.' not in k + } -def extract_sub_annotations(parent_node, sub_xpaths): +def re_compile_or_none(pattern): + return re.compile(pattern) if pattern else None + +def extract_using_regex(s, compiled_pattern): + result = None + start = 0 + for m in compiled_pattern.finditer(s): + m_start = m.start(1) + m_end = m.end(1) + m_text = m.group(1) + get_logger().debug('extract match: %d:%d, %s', m_start, m_end, m_text) + if result is None: + result = [] + if start < m_start: + result.append(s[start:m_start].strip()) + result.append(m_text) + start = m_end + 1 + if result is None: + return s + if start < len(s): + result.append(s[start:].strip()) + if len(result) == 1: + return result[0] + # also include the full string + result.append(s) + return result + +def extract_sub_annotations(parent_node, sub_xpaths, mapping, parent_key): if not sub_xpaths: return sub_annotations = [] for sub_tag, sub_xpath in sub_xpaths.items(): + sub_key_prefix = parent_key + XmlMappingSuffix.SUB + '.' + sub_tag + extract_re_compiled_pattern = re_compile_or_none( + mapping.get(sub_key_prefix + XmlMappingSuffix.EXTRACT_REGEX) + ) + get_logger().debug('sub_key_prefix: %s', sub_key_prefix) + get_logger().debug( + 'extract_re_compiled_pattern (%s, %s): %s', + parent_key, sub_tag, extract_re_compiled_pattern + ) + for e in match_xpaths(parent_node, [sub_xpath]): value = get_stripped_text_content(e) if value: value = strip_whitespace(value).strip() + if extract_re_compiled_pattern is not None and value: + value = extract_using_regex(value, extract_re_compiled_pattern) if value: sub_annotations.append(TargetAnnotation(value, sub_tag)) return sub_annotations @@ -275,8 +332,14 @@ def xml_root_to_target_annotations(xml_root, xml_mapping): children_range = parse_json_with_default( mapping.get(k + XmlMappingSuffix.CHILDREN_RANGE), [] ) - re_pattern = mapping.get(k + XmlMappingSuffix.REGEX) - re_compiled_pattern = re.compile(re_pattern) if re_pattern else None + re_compiled_pattern = re_compile_or_none( + mapping.get(k + XmlMappingSuffix.REGEX) + ) + extract_re_compiled_pattern = re_compile_or_none( + mapping.get(k + XmlMappingSuffix.EXTRACT_REGEX) + ) + get_logger().debug('extract_re_compiled_pattern (%s): %s', k, extract_re_compiled_pattern) + priority = int(mapping.get(k + XmlMappingSuffix.PRIORITY, '0')) sub_xpaths = get_sub_mapping(mapping, k) get_logger().debug('sub_xpaths (%s): %s', k, sub_xpaths) @@ -286,7 +349,7 @@ def xml_root_to_target_annotations(xml_root, xml_mapping): for e in match_xpaths(xml_root, xpaths): e_pos = xml_pos_by_node.get(e) - sub_annotations = extract_sub_annotations(e, sub_xpaths) + sub_annotations = extract_sub_annotations(e, sub_xpaths, mapping, k) get_logger().debug('sub_annotations (%s): %s', k, sub_annotations) if children_xpaths: @@ -300,6 +363,11 @@ def xml_root_to_target_annotations(xml_root, xml_mapping): text_content_list = filter_truthy([ apply_pattern(s, re_compiled_pattern) for s in text_content_list ]) + if extract_re_compiled_pattern: + text_content_list = filter_truthy([ + extract_using_regex(s, extract_re_compiled_pattern) for s in text_content_list + ]) + text_content_list = flatten_if_nested(text_content_list) if text_content_list: value = ( text_content_list[0] diff --git a/sciencebeam_gym/preprocess/annotation/target_annotation_test.py b/sciencebeam_gym/preprocess/annotation/target_annotation_test.py index 9113352e0ee68b4de3e7087b4d9a6c47b2d7c5ad..a3fdadf1b2976551db0801f91e9a6a42c431f1b1 100644 --- a/sciencebeam_gym/preprocess/annotation/target_annotation_test.py +++ b/sciencebeam_gym/preprocess/annotation/target_annotation_test.py @@ -499,6 +499,77 @@ class TestXmlRootToTargetAnnotations(object): ('value', SOME_VALUE_2) ] + def test_should_extract_numbers_from_value_after_text(self): + xml_root = E.article(E.entry( + E.value(SOME_VALUE + ' 12345') + )) + xml_mapping = { + 'article': { + TAG1: 'entry', + TAG1 + XmlMappingSuffix.EXTRACT_REGEX: r'.*\b(\d+)\b.*' + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert len(target_annotations) == 1 + assert [(t.name, set(t.value)) for t in target_annotations] == [ + (TAG1, {SOME_VALUE + ' 12345', SOME_VALUE, '12345'}) + ] + + def test_should_extract_single_value_if_its_the_only_value(self): + xml_root = E.article(E.entry( + E.value('12345') + )) + xml_mapping = { + 'article': { + TAG1: 'entry', + TAG1 + XmlMappingSuffix.EXTRACT_REGEX: r'.*\b(\d+)\b.*' + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert len(target_annotations) == 1 + assert [(t.name, t.value) for t in target_annotations] == [ + (TAG1, '12345') + ] + + def test_should_unnest_extract_value_from_children(self): + xml_root = E.article(E.entry( + E.value(SOME_VALUE + ' 12345'), + E.value(SOME_VALUE_2 + ' 54321') + )) + xml_mapping = { + 'article': { + TAG1: 'entry', + TAG1 + XmlMappingSuffix.CHILDREN: r'.//*', + TAG1 + XmlMappingSuffix.EXTRACT_REGEX: r'.*\b(\d+)\b.*' + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert len(target_annotations) == 1 + assert [(t.name, set(t.value)) for t in target_annotations] == [ + (TAG1, { + SOME_VALUE + ' 12345', SOME_VALUE, '12345', + SOME_VALUE_2 + ' 54321', SOME_VALUE_2, '54321' + }) + ] + + def test_should_extract_numbers_from_sub_value_after_text(self): + xml_root = E.article(E.entry( + E.value(SOME_VALUE + ' 12345') + )) + sub_key = TAG1 + XmlMappingSuffix.SUB + '.value' + xml_mapping = { + 'article': { + TAG1: 'entry', + sub_key: './value', + sub_key + XmlMappingSuffix.EXTRACT_REGEX: r'.*\b(\d+)\b.*' + } + } + target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping) + assert len(target_annotations) == 1 + assert [(t.name, set(t.value)) for t in target_annotations[0].sub_annotations] == [ + ('value', {SOME_VALUE + ' 12345', SOME_VALUE, '12345'}) + ] + def test_should_return_full_text(self): xml_root = E.article( E.title(