From 2cbb04330c7ec1026b88fbf368bb9e41e28a5a15 Mon Sep 17 00:00:00 2001
From: Daniel Ecer <de-code@users.noreply.github.com>
Date: Fri, 19 Jan 2018 12:00:34 +0000
Subject: [PATCH] moved target annotation and parsing to separate module

---
 .../annotation/matching_annotator.py          | 270 +----------
 .../annotation/matching_annotator_test.py     | 443 +----------------
 .../annotation/target_annotation.py           | 272 +++++++++++
 .../annotation/target_annotation_test.py      | 453 ++++++++++++++++++
 sciencebeam_gym/preprocess/lxml_to_svg.py     |   5 +-
 .../preprocess/preprocessing_pipeline.py      |   2 +-
 .../preprocess/preprocessing_utils.py         |   5 +-
 sciencebeam_gym/utils/string.py               |   9 +
 8 files changed, 753 insertions(+), 706 deletions(-)
 create mode 100644 sciencebeam_gym/preprocess/annotation/target_annotation.py
 create mode 100644 sciencebeam_gym/preprocess/annotation/target_annotation_test.py
 create mode 100644 sciencebeam_gym/utils/string.py

diff --git a/sciencebeam_gym/preprocess/annotation/matching_annotator.py b/sciencebeam_gym/preprocess/annotation/matching_annotator.py
index 0fd68aa..688b5f0 100644
--- a/sciencebeam_gym/preprocess/annotation/matching_annotator.py
+++ b/sciencebeam_gym/preprocess/annotation/matching_annotator.py
@@ -1,25 +1,23 @@
 from __future__ import division
 
 import logging
-import re
-import json
 import csv
 from builtins import str as text
-from itertools import tee, chain, islice
+from itertools import tee, islice
 
 from future.utils import python_2_unicode_compatible
 
-import six
 from six.moves import zip_longest
-from six.moves.configparser import ConfigParser # pylint: disable=E0401
-
-from lxml import etree
 
 from sciencebeam_gym.utils.csv import (
   csv_delimiter_by_filename,
   write_csv_row
 )
 
+from sciencebeam_gym.utils.string import (
+  LazyStr
+)
+
 from sciencebeam_gym.alignment.align import (
   LocalSequenceMatcher,
   SimpleScoring
@@ -29,18 +27,10 @@ from sciencebeam_gym.alignment.WordSequenceMatcher import (
 )
 
 from sciencebeam_gym.utils.collection import (
-  filter_truthy,
-  strip_all,
   iter_flatten,
   extract_from_dict
 )
 
-from sciencebeam_gym.utils.xml import (
-  get_text_content,
-  get_text_content_list,
-  get_immediate_text
-)
-
 from sciencebeam_gym.preprocess.annotation.annotator import (
   AbstractAnnotator
 )
@@ -70,27 +60,6 @@ def normalise_str_or_list(x):
   else:
     return normalise_str(x)
 
-class XmlMappingSuffix(object):
-  REGEX = '.regex'
-  MATCH_MULTIPLE = '.match-multiple'
-  BONDING = '.bonding'
-  CHILDREN = '.children'
-  CHILDREN_CONCAT = '.children.concat'
-  CHILDREN_RANGE = '.children.range'
-  UNMATCHED_PARENT_TEXT = '.unmatched-parent-text'
-  PRIORITY = '.priority'
-
-@python_2_unicode_compatible
-class TargetAnnotation(object):
-  def __init__(self, value, name, match_multiple=False, bonding=False):
-    self.value = value
-    self.name = name
-    self.match_multiple = match_multiple
-    self.bonding = bonding
-
-  def __str__(self):
-    return u'{} (match_multiple={}): {}'.format(self.name, self.match_multiple, self.value)
-
 class SequenceWrapper(object):
   def __init__(self, structured_document, tokens, str_filter_f=None):
     self.structured_document = structured_document
@@ -173,14 +142,6 @@ class SequenceMatch(object):
       self.index2_range[1]
     )
 
-@python_2_unicode_compatible
-class LazyStr(object):
-  def __init__(self, fn):
-    self.fn = fn
-
-  def __str__(self):
-    return self.fn()
-
 def len_index_range(index_range):
   return index_range[1] - index_range[0]
 
@@ -679,227 +640,6 @@ def find_best_matches(
       LazyStr(lambda: ' '.join(str(choice.position) for choice in too_distant_choices))
     )
 
-def parse_xml_mapping(xml_mapping_filename):
-  with open(xml_mapping_filename, 'r') as f:
-    config = ConfigParser()
-    if six.PY3:
-      config.read_file(f)
-    else:
-      config.readfp(f)
-    return {
-      k: dict(config.items(k))
-      for k in config.sections()
-    }
-
-def apply_pattern(s, compiled_pattern):
-  m = compiled_pattern.match(s)
-  if m:
-    get_logger().debug('regex match: %s -> %s', compiled_pattern, m.groups())
-    return m.group(1)
-  return s
-
-def iter_parents(children):
-  for child in children:
-    p = child.getparent()
-    if p is not None:
-      yield p
-
-def exclude_parents(children):
-  if not isinstance(children, list):
-    children = list(children)
-  all_parents = set(iter_parents(children))
-  return [child for child in children if not child in all_parents]
-
-def extract_children_source_list(parent, children_source_list):
-  used_nodes = set()
-  values = []
-  for children_source in children_source_list:
-    xpath = children_source.get('xpath')
-    if xpath:
-      matching_nodes = exclude_parents(parent.xpath(xpath))
-      if not matching_nodes:
-        get_logger().debug(
-          'child xpath does not match any item, skipping: xpath=%s (xml=%s)',
-          xpath,
-          LazyStr(lambda: str(etree.tostring(parent)))
-        )
-        used_nodes = set()
-        values = []
-        break
-      used_nodes |= set(matching_nodes)
-      value = ' '.join(get_text_content_list(matching_nodes))
-    else:
-      value = children_source.get('value')
-    values.append(value or '')
-  return values, used_nodes
-
-def extract_children_concat(parent, children_concat):
-  used_nodes = set()
-  values = []
-  get_logger().debug('children_concat: %s', children_concat)
-  for children_concat_item in children_concat:
-    temp_values, temp_used_nodes = extract_children_source_list(
-      parent, children_concat_item
-    )
-    used_nodes |= temp_used_nodes
-    if temp_values:
-      values.append(''.join(temp_values))
-  return values, used_nodes
-
-def extract_children_range(parent, children_range):
-  used_nodes = set()
-  values = []
-  standalone_values = []
-  get_logger().debug('children_range: %s', children_range)
-  for range_item in children_range:
-    temp_values, temp_used_nodes = extract_children_source_list(
-      parent, [range_item.get('min'), range_item.get('max')]
-    )
-    if len(temp_values) == 2:
-      temp_values = strip_all(temp_values)
-      if all(s.isdigit() for s in temp_values):
-        num_values = [int(s) for s in temp_values]
-        range_values = [str(x) for x in range(num_values[0], num_values[1] + 1)]
-        if range_item.get('standalone'):
-          standalone_values.extend(range_values)
-        else:
-          values.extend(range_values)
-        used_nodes |= temp_used_nodes
-      else:
-        get_logger().info('values not integers: %s', temp_values)
-  return values, standalone_values, used_nodes
-
-def parse_xpaths(s):
-  return strip_all(s.strip().split('\n')) if s else None
-
-def match_xpaths(parent, xpaths):
-  return chain(*[parent.xpath(s) for s in xpaths])
-
-def extract_children(
-  parent, children_xpaths, children_concat, children_range, unmatched_parent_text):
-
-  concat_values_list, concat_used_nodes = extract_children_concat(parent, children_concat)
-  range_values_list, standalone_values, range_used_nodes = (
-    extract_children_range(parent, children_range)
-  )
-  used_nodes = concat_used_nodes | range_used_nodes
-
-  other_child_nodes = [
-    node for node in match_xpaths(parent, children_xpaths)
-    if not node in used_nodes
-  ]
-  other_child_nodes_excl_parents = exclude_parents(other_child_nodes)
-  text_content_list = filter_truthy(strip_all(
-    get_text_content_list(other_child_nodes_excl_parents) +
-    concat_values_list + range_values_list
-  ))
-  if len(other_child_nodes_excl_parents) != len(other_child_nodes):
-    other_child_nodes_excl_parents_set = set(other_child_nodes_excl_parents)
-    for child in other_child_nodes:
-      if child not in other_child_nodes_excl_parents_set:
-        text_values = filter_truthy(strip_all(get_immediate_text(child)))
-        text_content_list.extend(text_values)
-  if unmatched_parent_text:
-    value = get_text_content(
-      parent,
-      exclude=set(other_child_nodes) | used_nodes
-    ).strip()
-    if value and not value in text_content_list:
-      text_content_list.append(value)
-  return text_content_list, standalone_values
-
-def parse_json_with_default(s, default_value):
-  return json.loads(s) if s else default_value
-
-def xml_root_to_target_annotations(xml_root, xml_mapping):
-  if not xml_root.tag in xml_mapping:
-    raise Exception("unrecognised tag: {} (available: {})".format(
-      xml_root.tag, xml_mapping.sections())
-    )
-
-  mapping = xml_mapping[xml_root.tag]
-
-  field_names = [k for k in mapping.keys() if '.' not in k]
-  get_mapping_flag = lambda k, suffix: mapping.get(k + suffix) == 'true'
-  get_match_multiple = lambda k: get_mapping_flag(k, XmlMappingSuffix.MATCH_MULTIPLE)
-  get_bonding_flag = lambda k: get_mapping_flag(k, XmlMappingSuffix.BONDING)
-  get_unmatched_parent_text_flag = (
-    lambda k: get_mapping_flag(k, XmlMappingSuffix.UNMATCHED_PARENT_TEXT)
-  )
-
-  get_logger().debug('fields: %s', field_names)
-
-  target_annotations_with_pos = []
-  xml_pos_by_node = {node: i for i, node in enumerate(xml_root.iter())}
-  for k in field_names:
-    match_multiple = get_match_multiple(k)
-    bonding = get_bonding_flag(k)
-    unmatched_parent_text = get_unmatched_parent_text_flag(k)
-    children_xpaths = parse_xpaths(mapping.get(k + XmlMappingSuffix.CHILDREN))
-    children_concat = parse_json_with_default(
-      mapping.get(k + XmlMappingSuffix.CHILDREN_CONCAT), []
-    )
-    children_range = parse_json_with_default(
-      mapping.get(k + XmlMappingSuffix.CHILDREN_RANGE), []
-    )
-    re_pattern = mapping.get(k + XmlMappingSuffix.REGEX)
-    re_compiled_pattern = re.compile(re_pattern) if re_pattern else None
-    priority = int(mapping.get(k + XmlMappingSuffix.PRIORITY, '0'))
-
-    xpaths = parse_xpaths(mapping[k])
-    get_logger().debug('xpaths(%s): %s', k, xpaths)
-    for e in match_xpaths(xml_root, xpaths):
-      e_pos = xml_pos_by_node.get(e)
-      if children_xpaths:
-        text_content_list, standalone_values = extract_children(
-          e, children_xpaths, children_concat, children_range, unmatched_parent_text
-        )
-      else:
-        text_content_list = filter_truthy(strip_all([get_text_content(e)]))
-        standalone_values = []
-      if re_compiled_pattern:
-        text_content_list = filter_truthy([
-          apply_pattern(s, re_compiled_pattern) for s in text_content_list
-        ])
-      if text_content_list:
-        value = (
-          text_content_list[0]
-          if len(text_content_list) == 1
-          else sorted(text_content_list, key=lambda s: -len(s))
-        )
-        target_annotations_with_pos.append((
-          (-priority, e_pos),
-          TargetAnnotation(
-            value,
-            k,
-            match_multiple=match_multiple,
-            bonding=bonding
-          )
-        ))
-      if standalone_values:
-        for i, standalone_value in enumerate(standalone_values):
-          target_annotations_with_pos.append((
-            (-priority, e_pos, i),
-            TargetAnnotation(
-              standalone_value,
-              k,
-              match_multiple=match_multiple,
-              bonding=bonding
-            )
-          ))
-  target_annotations_with_pos = sorted(
-    target_annotations_with_pos,
-    key=lambda x: x[0]
-  )
-  get_logger().debug('target_annotations_with_pos:\n%s', target_annotations_with_pos)
-  target_annotations = [
-    x[1] for x in target_annotations_with_pos
-  ]
-  get_logger().debug('target_annotations:\n%s', '\n'.join([
-    ' ' + str(a) for a in target_annotations
-  ]))
-  return target_annotations
-
 class CsvMatchDetailReporter(object):
   def __init__(self, fp, filename=None, fields=None):
     self.fp = fp
diff --git a/sciencebeam_gym/preprocess/annotation/matching_annotator_test.py b/sciencebeam_gym/preprocess/annotation/matching_annotator_test.py
index 4f44d3c..11094a8 100644
--- a/sciencebeam_gym/preprocess/annotation/matching_annotator_test.py
+++ b/sciencebeam_gym/preprocess/annotation/matching_annotator_test.py
@@ -1,25 +1,22 @@
 from __future__ import division
 
-import json
-
-from lxml.builder import E
-
 from sciencebeam_gym.structured_document import (
   SimpleStructuredDocument,
   SimpleLine,
   SimpleToken
 )
 
+from sciencebeam_gym.preprocess.annotation.target_annotation import (
+  TargetAnnotation
+)
+
 from sciencebeam_gym.preprocess.annotation.matching_annotator import (
   MatchingAnnotator,
-  TargetAnnotation,
-  xml_root_to_target_annotations,
   FuzzyMatchResult,
   fuzzy_match,
   THIN_SPACE,
   EN_DASH,
-  EM_DASH,
-  XmlMappingSuffix
+  EM_DASH
 )
 
 from sciencebeam_gym.utils.collection import (
@@ -191,436 +188,6 @@ class TestFuzzyMatchResult(object):
     assert fm_2.a_index_range() == (0, 1)
     assert fm_2.b_index_range() == (0, 1)
 
-class TestXmlRootToTargetAnnotations(object):
-  def test_should_return_empty_target_annotations_for_empty_xml(self):
-    xml_root = E.article(
-    )
-    xml_mapping = {
-      'article': {
-        'title': 'title'
-      }
-    }
-    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
-    assert target_annotations == []
-
-  def test_should_return_empty_target_annotations_for_no_matching_annotations(self):
-    xml_root = E.article(
-      E.other(SOME_VALUE)
-    )
-    xml_mapping = {
-      'article': {
-        TAG1: 'title'
-      }
-    }
-    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
-    assert target_annotations == []
-
-  def test_should_return_matching_target_annotations(self):
-    xml_root = E.article(
-      E.title(SOME_VALUE)
-    )
-    xml_mapping = {
-      'article': {
-        TAG1: 'title'
-      }
-    }
-    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
-    assert len(target_annotations) == 1
-    assert target_annotations[0].name == TAG1
-    assert target_annotations[0].value == SOME_VALUE
-
-  def test_should_apply_regex_to_result(self):
-    xml_root = E.article(
-      E.title('1.1. ' + SOME_VALUE)
-    )
-    xml_mapping = {
-      'article': {
-        TAG1: 'title',
-        TAG1 + XmlMappingSuffix.REGEX: r'(?:\d+\.?)* ?(.*)'
-      }
-    }
-    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
-    assert len(target_annotations) == 1
-    assert target_annotations[0].name == TAG1
-    assert target_annotations[0].value == SOME_VALUE
-
-  def test_should_apply_match_multiple_flag(self):
-    xml_root = E.article(
-      E.title(SOME_VALUE)
-    )
-    xml_mapping = {
-      'article': {
-        TAG1: 'title',
-        TAG1 + XmlMappingSuffix.MATCH_MULTIPLE: 'true'
-      }
-    }
-    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
-    assert [t.match_multiple for t in target_annotations] == [True]
-
-  def test_should_not_apply_match_multiple_flag_if_not_set(self):
-    xml_root = E.article(
-      E.title(SOME_VALUE)
-    )
-    xml_mapping = {
-      'article': {
-        TAG1: 'title'
-      }
-    }
-    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
-    assert [t.match_multiple for t in target_annotations] == [False]
-
-  def test_should_apply_match_bonding_flag(self):
-    xml_root = E.article(
-      E.title(SOME_VALUE)
-    )
-    xml_mapping = {
-      'article': {
-        TAG1: 'title',
-        TAG1 + XmlMappingSuffix.BONDING: 'true'
-      }
-    }
-    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
-    assert [t.bonding for t in target_annotations] == [True]
-
-  def test_should_not_apply_match_bonding_flag_if_not_set(self):
-    xml_root = E.article(
-      E.title(SOME_VALUE)
-    )
-    xml_mapping = {
-      'article': {
-        TAG1: 'title'
-      }
-    }
-    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
-    assert [t.bonding for t in target_annotations] == [False]
-
-  def test_should_use_multiple_xpaths(self):
-    xml_root = E.article(
-      E.entry(
-        E.child1(SOME_VALUE),
-        E.child2(SOME_VALUE_2)
-      )
-    )
-    xml_mapping = {
-      'article': {
-        TAG1: '\n{}\n{}\n'.format(
-          'entry/child1',
-          'entry/child2'
-        )
-      }
-    }
-    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
-    assert [(t.name, t.value) for t in target_annotations] == [
-      (TAG1, SOME_VALUE),
-      (TAG1, SOME_VALUE_2)
-    ]
-
-  def test_should_apply_children_xpaths_and_sort_by_value_descending(self):
-    xml_root = E.article(
-      E.entry(
-        E.child1(SOME_SHORTER_VALUE),
-        E.child2(SOME_LONGER_VALUE)
-      ),
-      E.entry(
-        E.child1(SOME_LONGER_VALUE)
-      )
-    )
-    xml_mapping = {
-      'article': {
-        TAG1: 'entry',
-        TAG1 + XmlMappingSuffix.CHILDREN: './/*'
-      }
-    }
-    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
-    assert [(t.name, t.value) for t in target_annotations] == [
-      (TAG1, [SOME_LONGER_VALUE, SOME_SHORTER_VALUE]),
-      (TAG1, SOME_LONGER_VALUE)
-    ]
-
-  def test_should_apply_children_xpaths_and_exclude_parents(self):
-    xml_root = E.article(
-      E.entry(
-        E.parent(
-          E.child2(SOME_LONGER_VALUE),
-          E.child1(SOME_SHORTER_VALUE)
-        )
-      )
-    )
-    xml_mapping = {
-      'article': {
-        TAG1: 'entry',
-        TAG1 + XmlMappingSuffix.CHILDREN: './/*'
-      }
-    }
-    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
-    assert [(t.name, t.value) for t in target_annotations] == [
-      (TAG1, [SOME_LONGER_VALUE, SOME_SHORTER_VALUE])
-    ]
-
-  def test_should_apply_children_xpaths_and_include_parent_text_between_matched_children(self):
-    xml_root = E.article(
-      E.entry(
-        E.parent(
-          E.child2(SOME_LONGER_VALUE),
-          SOME_VALUE,
-          E.child1(SOME_SHORTER_VALUE)
-        )
-      )
-    )
-    xml_mapping = {
-      'article': {
-        TAG1: 'entry',
-        TAG1 + XmlMappingSuffix.CHILDREN: './/*'
-      }
-    }
-    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
-    assert [(t.name, t.value) for t in target_annotations] == [
-      (TAG1, [SOME_LONGER_VALUE, SOME_VALUE, SOME_SHORTER_VALUE])
-    ]
-
-  def test_should_apply_multiple_children_xpaths_and_include_parent_text_if_enabled(self):
-    xml_root = E.article(
-      E.entry(
-        E.child1(SOME_SHORTER_VALUE),
-        SOME_LONGER_VALUE
-      )
-    )
-    xml_mapping = {
-      'article': {
-        TAG1: 'entry',
-        TAG1 + XmlMappingSuffix.CHILDREN: '\n{}\n{}\n'.format('.//*', '.'),
-        TAG1 + XmlMappingSuffix.UNMATCHED_PARENT_TEXT: 'true'
-      }
-    }
-    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
-    assert [(t.name, t.value) for t in target_annotations] == [
-      (TAG1, [SOME_LONGER_VALUE, SOME_SHORTER_VALUE])
-    ]
-
-  def test_should_apply_concat_children(self):
-    num_values = ['101', '202']
-    xml_root = E.article(
-      E.entry(
-        E.parent(
-          E.child1(SOME_VALUE),
-          E.fpage(num_values[0]),
-          E.lpage(num_values[1])
-        )
-      )
-    )
-    xml_mapping = {
-      'article': {
-        TAG1: 'entry',
-        TAG1 + XmlMappingSuffix.CHILDREN: './/*',
-        TAG1 + XmlMappingSuffix.CHILDREN_CONCAT: json.dumps([[{
-          'xpath': './/fpage'
-        }, {
-          'value': '-'
-        }, {
-          'xpath': './/lpage'
-        }]])
-      }
-    }
-    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
-    assert [(t.name, t.value) for t in target_annotations] == [
-      (TAG1, [SOME_VALUE, '-'.join(num_values)])
-    ]
-
-  def test_should_not_apply_concat_children_if_one_node_was_not_found(self):
-    num_values = ['101', '202']
-    xml_root = E.article(
-      E.entry(
-        E.parent(
-          E.child1(SOME_VALUE),
-          E.fpage(num_values[0]),
-          E.lpage(num_values[1])
-        )
-      )
-    )
-    xml_mapping = {
-      'article': {
-        TAG1: 'entry',
-        TAG1 + XmlMappingSuffix.CHILDREN: './/*',
-        TAG1 + XmlMappingSuffix.CHILDREN_CONCAT: json.dumps([[{
-          'xpath': './/fpage'
-        }, {
-          'value': '-'
-        }, {
-          'xpath': './/unknown'
-        }]])
-      }
-    }
-    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
-    assert [(t.name, t.value) for t in target_annotations] == [
-      (TAG1, [SOME_VALUE, num_values[0], num_values[1]])
-    ]
-
-  def test_should_apply_range_children(self):
-    num_values = [101, 102, 103, 104, 105, 106, 107]
-    xml_root = E.article(
-      E.entry(
-        E.child1(SOME_VALUE),
-        E.fpage(str(min(num_values))),
-        E.lpage(str(max(num_values)))
-      )
-    )
-    xml_mapping = {
-      'article': {
-        TAG1: 'entry',
-        TAG1 + XmlMappingSuffix.CHILDREN: 'fpage|lpage',
-        TAG1 + XmlMappingSuffix.CHILDREN_RANGE: json.dumps([{
-          'min': {
-            'xpath': 'fpage'
-          },
-          'max': {
-            'xpath': 'lpage'
-          }
-        }])
-      }
-    }
-    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
-    assert [(t.name, t.value) for t in target_annotations] == [
-      (TAG1, [str(x) for x in num_values])
-    ]
-
-  def test_should_apply_range_children_as_separate_target_annotations(self):
-    num_values = [101, 102, 103, 104, 105, 106, 107]
-    xml_root = E.article(
-      E.entry(
-        E.child1(SOME_VALUE),
-        E.fpage(str(min(num_values))),
-        E.lpage(str(max(num_values)))
-      )
-    )
-    xml_mapping = {
-      'article': {
-        TAG1: 'entry',
-        TAG1 + XmlMappingSuffix.CHILDREN: 'fpage|lpage',
-        TAG1 + XmlMappingSuffix.CHILDREN_RANGE: json.dumps([{
-          'min': {
-            'xpath': 'fpage'
-          },
-          'max': {
-            'xpath': 'lpage'
-          },
-          'standalone': True
-        }])
-      }
-    }
-    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
-    assert [(t.name, t.value) for t in target_annotations] == [
-      (TAG1, str(x))
-      for x in num_values
-    ]
-
-  def test_should_not_apply_range_children_if_xpath_not_matching(self):
-    num_values = [101, 102, 103, 104, 105, 106, 107]
-    fpage = str(min(num_values))
-    lpage = str(max(num_values))
-    xml_root = E.article(
-      E.entry(
-        E.child1(SOME_VALUE),
-        E.fpage(fpage),
-        E.lpage(lpage)
-      )
-    )
-    xml_mapping = {
-      'article': {
-        TAG1: 'entry',
-        TAG1 + XmlMappingSuffix.CHILDREN: 'fpage|unknown',
-        TAG1 + XmlMappingSuffix.CHILDREN_RANGE: json.dumps([{
-          'min': {
-            'xpath': 'fpage'
-          },
-          'max': {
-            'xpath': 'unknown'
-          }
-        }])
-      }
-    }
-    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
-    assert [(t.name, t.value) for t in target_annotations] == [
-      (TAG1, fpage)
-    ]
-
-  def test_should_not_apply_range_children_if_value_is_not_integer(self):
-    fpage = 'abc'
-    lpage = 'xyz'
-    xml_root = E.article(
-      E.entry(
-        E.child1(SOME_VALUE),
-        E.fpage(fpage),
-        E.lpage(lpage)
-      )
-    )
-    xml_mapping = {
-      'article': {
-        TAG1: 'entry',
-        TAG1 + XmlMappingSuffix.CHILDREN: 'fpage|lpage',
-        TAG1 + XmlMappingSuffix.CHILDREN_RANGE: json.dumps([{
-          'min': {
-            'xpath': 'fpage'
-          },
-          'max': {
-            'xpath': 'lpage'
-          }
-        }])
-      }
-    }
-    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
-    assert [(t.name, t.value) for t in target_annotations] == [
-      (TAG1, [fpage, lpage])
-    ]
-
-  def test_should_return_full_text(self):
-    xml_root = E.article(
-      E.title(
-        'some ',
-        E.other('embedded'),
-        ' text'
-      )
-    )
-    xml_mapping = {
-      'article': {
-        TAG1: 'title'
-      }
-    }
-    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
-    assert len(target_annotations) == 1
-    assert target_annotations[0].name == TAG1
-    assert target_annotations[0].value == 'some embedded text'
-
-  def test_should_return_target_annotations_in_order_of_xml(self):
-    xml_root = E.article(
-      E.tag1('tag1.1'), E.tag2('tag2.1'), E.tag1('tag1.2'), E.tag2('tag2.2'),
-    )
-    xml_mapping = {
-      'article': {
-        TAG1: 'tag1',
-        TAG2: 'tag2'
-      }
-    }
-    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
-    assert [(ta.name, ta.value) for ta in target_annotations] == [
-      (TAG1, 'tag1.1'), (TAG2, 'tag2.1'), (TAG1, 'tag1.2'), (TAG2, 'tag2.2')
-    ]
-
-  def test_should_return_target_annotations_in_order_of_priority_first(self):
-    xml_root = E.article(
-      E.tag1('tag1.1'), E.tag2('tag2.1'), E.tag1('tag1.2'), E.tag2('tag2.2'),
-    )
-    xml_mapping = {
-      'article': {
-        TAG1: 'tag1',
-        TAG2: 'tag2',
-        TAG2 + XmlMappingSuffix.PRIORITY: '1'
-      }
-    }
-    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
-    assert [(ta.name, ta.value) for ta in target_annotations] == [
-      (TAG2, 'tag2.1'), (TAG2, 'tag2.2'), (TAG1, 'tag1.1'), (TAG1, 'tag1.2')
-    ]
-
 class TestMatchingAnnotator(object):
   def test_should_not_fail_on_empty_document(self):
     doc = SimpleStructuredDocument(lines=[])
diff --git a/sciencebeam_gym/preprocess/annotation/target_annotation.py b/sciencebeam_gym/preprocess/annotation/target_annotation.py
new file mode 100644
index 0000000..5e6fcb9
--- /dev/null
+++ b/sciencebeam_gym/preprocess/annotation/target_annotation.py
@@ -0,0 +1,272 @@
+import logging
+import json
+import re
+from itertools import chain
+
+from future.utils import python_2_unicode_compatible
+
+import six
+from six.moves.configparser import ConfigParser # pylint: disable=E0401
+
+from lxml import etree
+
+from sciencebeam_gym.utils.string import (
+  LazyStr
+)
+
+from sciencebeam_gym.utils.xml import (
+  get_text_content,
+  get_text_content_list,
+  get_immediate_text
+)
+
+from sciencebeam_gym.utils.collection import (
+  filter_truthy,
+  strip_all
+)
+
+def get_logger():
+  return logging.getLogger(__name__)
+
+class XmlMappingSuffix(object):
+  REGEX = '.regex'
+  MATCH_MULTIPLE = '.match-multiple'
+  BONDING = '.bonding'
+  CHILDREN = '.children'
+  CHILDREN_CONCAT = '.children.concat'
+  CHILDREN_RANGE = '.children.range'
+  UNMATCHED_PARENT_TEXT = '.unmatched-parent-text'
+  PRIORITY = '.priority'
+
+@python_2_unicode_compatible
+class TargetAnnotation(object):
+  def __init__(self, value, name, match_multiple=False, bonding=False):
+    self.value = value
+    self.name = name
+    self.match_multiple = match_multiple
+    self.bonding = bonding
+
+  def __str__(self):
+    return u'{} (match_multiple={}): {}'.format(self.name, self.match_multiple, self.value)
+
+
+def parse_xml_mapping(xml_mapping_filename):
+  with open(xml_mapping_filename, 'r') as f:
+    config = ConfigParser()
+    if six.PY3:
+      config.read_file(f)
+    else:
+      config.readfp(f)
+    return {
+      k: dict(config.items(k))
+      for k in config.sections()
+    }
+
+def apply_pattern(s, compiled_pattern):
+  m = compiled_pattern.match(s)
+  if m:
+    get_logger().debug('regex match: %s -> %s', compiled_pattern, m.groups())
+    return m.group(1)
+  return s
+
+def iter_parents(children):
+  for child in children:
+    p = child.getparent()
+    if p is not None:
+      yield p
+
+def exclude_parents(children):
+  if not isinstance(children, list):
+    children = list(children)
+  all_parents = set(iter_parents(children))
+  return [child for child in children if not child in all_parents]
+
+def extract_children_source_list(parent, children_source_list):
+  used_nodes = set()
+  values = []
+  for children_source in children_source_list:
+    xpath = children_source.get('xpath')
+    if xpath:
+      matching_nodes = exclude_parents(parent.xpath(xpath))
+      if not matching_nodes:
+        get_logger().debug(
+          'child xpath does not match any item, skipping: xpath=%s (xml=%s)',
+          xpath,
+          LazyStr(lambda: str(etree.tostring(parent)))
+        )
+        used_nodes = set()
+        values = []
+        break
+      used_nodes |= set(matching_nodes)
+      value = ' '.join(get_text_content_list(matching_nodes))
+    else:
+      value = children_source.get('value')
+    values.append(value or '')
+  return values, used_nodes
+
+def extract_children_concat(parent, children_concat):
+  used_nodes = set()
+  values = []
+  get_logger().debug('children_concat: %s', children_concat)
+  for children_concat_item in children_concat:
+    temp_values, temp_used_nodes = extract_children_source_list(
+      parent, children_concat_item
+    )
+    used_nodes |= temp_used_nodes
+    if temp_values:
+      values.append(''.join(temp_values))
+  return values, used_nodes
+
+def extract_children_range(parent, children_range):
+  used_nodes = set()
+  values = []
+  standalone_values = []
+  get_logger().debug('children_range: %s', children_range)
+  for range_item in children_range:
+    temp_values, temp_used_nodes = extract_children_source_list(
+      parent, [range_item.get('min'), range_item.get('max')]
+    )
+    if len(temp_values) == 2:
+      temp_values = strip_all(temp_values)
+      if all(s.isdigit() for s in temp_values):
+        num_values = [int(s) for s in temp_values]
+        range_values = [str(x) for x in range(num_values[0], num_values[1] + 1)]
+        if range_item.get('standalone'):
+          standalone_values.extend(range_values)
+        else:
+          values.extend(range_values)
+        used_nodes |= temp_used_nodes
+      else:
+        get_logger().info('values not integers: %s', temp_values)
+  return values, standalone_values, used_nodes
+
+def parse_xpaths(s):
+  return strip_all(s.strip().split('\n')) if s else None
+
+def match_xpaths(parent, xpaths):
+  return chain(*[parent.xpath(s) for s in xpaths])
+
+def extract_children(
+  parent, children_xpaths, children_concat, children_range, unmatched_parent_text):
+
+  concat_values_list, concat_used_nodes = extract_children_concat(parent, children_concat)
+  range_values_list, standalone_values, range_used_nodes = (
+    extract_children_range(parent, children_range)
+  )
+  used_nodes = concat_used_nodes | range_used_nodes
+
+  other_child_nodes = [
+    node for node in match_xpaths(parent, children_xpaths)
+    if not node in used_nodes
+  ]
+  other_child_nodes_excl_parents = exclude_parents(other_child_nodes)
+  text_content_list = filter_truthy(strip_all(
+    get_text_content_list(other_child_nodes_excl_parents) +
+    concat_values_list + range_values_list
+  ))
+  if len(other_child_nodes_excl_parents) != len(other_child_nodes):
+    other_child_nodes_excl_parents_set = set(other_child_nodes_excl_parents)
+    for child in other_child_nodes:
+      if child not in other_child_nodes_excl_parents_set:
+        text_values = filter_truthy(strip_all(get_immediate_text(child)))
+        text_content_list.extend(text_values)
+  if unmatched_parent_text:
+    value = get_text_content(
+      parent,
+      exclude=set(other_child_nodes) | used_nodes
+    ).strip()
+    if value and not value in text_content_list:
+      text_content_list.append(value)
+  return text_content_list, standalone_values
+
+def parse_json_with_default(s, default_value):
+  return json.loads(s) if s else default_value
+
+def xml_root_to_target_annotations(xml_root, xml_mapping):
+  if not xml_root.tag in xml_mapping:
+    raise Exception("unrecognised tag: {} (available: {})".format(
+      xml_root.tag, xml_mapping.keys())
+    )
+
+  mapping = xml_mapping[xml_root.tag]
+
+  field_names = [k for k in mapping.keys() if '.' not in k]
+  get_mapping_flag = lambda k, suffix: mapping.get(k + suffix) == 'true'
+  get_match_multiple = lambda k: get_mapping_flag(k, XmlMappingSuffix.MATCH_MULTIPLE)
+  get_bonding_flag = lambda k: get_mapping_flag(k, XmlMappingSuffix.BONDING)
+  get_unmatched_parent_text_flag = (
+    lambda k: get_mapping_flag(k, XmlMappingSuffix.UNMATCHED_PARENT_TEXT)
+  )
+
+  get_logger().debug('fields: %s', field_names)
+
+  target_annotations_with_pos = []
+  xml_pos_by_node = {node: i for i, node in enumerate(xml_root.iter())}
+  for k in field_names:
+    match_multiple = get_match_multiple(k)
+    bonding = get_bonding_flag(k)
+    unmatched_parent_text = get_unmatched_parent_text_flag(k)
+    children_xpaths = parse_xpaths(mapping.get(k + XmlMappingSuffix.CHILDREN))
+    children_concat = parse_json_with_default(
+      mapping.get(k + XmlMappingSuffix.CHILDREN_CONCAT), []
+    )
+    children_range = parse_json_with_default(
+      mapping.get(k + XmlMappingSuffix.CHILDREN_RANGE), []
+    )
+    re_pattern = mapping.get(k + XmlMappingSuffix.REGEX)
+    re_compiled_pattern = re.compile(re_pattern) if re_pattern else None
+    priority = int(mapping.get(k + XmlMappingSuffix.PRIORITY, '0'))
+
+    xpaths = parse_xpaths(mapping[k])
+    get_logger().debug('xpaths(%s): %s', k, xpaths)
+    for e in match_xpaths(xml_root, xpaths):
+      e_pos = xml_pos_by_node.get(e)
+      if children_xpaths:
+        text_content_list, standalone_values = extract_children(
+          e, children_xpaths, children_concat, children_range, unmatched_parent_text
+        )
+      else:
+        text_content_list = filter_truthy(strip_all([get_text_content(e)]))
+        standalone_values = []
+      if re_compiled_pattern:
+        text_content_list = filter_truthy([
+          apply_pattern(s, re_compiled_pattern) for s in text_content_list
+        ])
+      if text_content_list:
+        value = (
+          text_content_list[0]
+          if len(text_content_list) == 1
+          else sorted(text_content_list, key=lambda s: -len(s))
+        )
+        target_annotations_with_pos.append((
+          (-priority, e_pos),
+          TargetAnnotation(
+            value,
+            k,
+            match_multiple=match_multiple,
+            bonding=bonding
+          )
+        ))
+      if standalone_values:
+        for i, standalone_value in enumerate(standalone_values):
+          target_annotations_with_pos.append((
+            (-priority, e_pos, i),
+            TargetAnnotation(
+              standalone_value,
+              k,
+              match_multiple=match_multiple,
+              bonding=bonding
+            )
+          ))
+  target_annotations_with_pos = sorted(
+    target_annotations_with_pos,
+    key=lambda x: x[0]
+  )
+  get_logger().debug('target_annotations_with_pos:\n%s', target_annotations_with_pos)
+  target_annotations = [
+    x[1] for x in target_annotations_with_pos
+  ]
+  get_logger().debug('target_annotations:\n%s', '\n'.join([
+    ' ' + str(a) for a in target_annotations
+  ]))
+  return target_annotations
diff --git a/sciencebeam_gym/preprocess/annotation/target_annotation_test.py b/sciencebeam_gym/preprocess/annotation/target_annotation_test.py
new file mode 100644
index 0000000..38ec32a
--- /dev/null
+++ b/sciencebeam_gym/preprocess/annotation/target_annotation_test.py
@@ -0,0 +1,453 @@
+from __future__ import division
+
+import json
+
+from lxml.builder import E
+
+from sciencebeam_gym.preprocess.annotation.target_annotation import (
+  TargetAnnotation,
+  xml_root_to_target_annotations,
+  XmlMappingSuffix
+)
+
+from sciencebeam_gym.utils.collection import (
+  flatten
+)
+
+TAG1 = 'tag1'
+TAG2 = 'tag2'
+
+SOME_VALUE = 'some value'
+SOME_VALUE_2 = 'some value2'
+SOME_LONGER_VALUE = 'some longer value1'
+SOME_SHORTER_VALUE = 'value1'
+
+class TestXmlRootToTargetAnnotations(object):
+  def test_should_return_empty_target_annotations_for_empty_xml(self):
+    xml_root = E.article(
+    )
+    xml_mapping = {
+      'article': {
+        'title': 'title'
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert target_annotations == []
+
+  def test_should_return_empty_target_annotations_for_no_matching_annotations(self):
+    xml_root = E.article(
+      E.other(SOME_VALUE)
+    )
+    xml_mapping = {
+      'article': {
+        TAG1: 'title'
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert target_annotations == []
+
+  def test_should_return_matching_target_annotations(self):
+    xml_root = E.article(
+      E.title(SOME_VALUE)
+    )
+    xml_mapping = {
+      'article': {
+        TAG1: 'title'
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert len(target_annotations) == 1
+    assert target_annotations[0].name == TAG1
+    assert target_annotations[0].value == SOME_VALUE
+
+  def test_should_apply_regex_to_result(self):
+    xml_root = E.article(
+      E.title('1.1. ' + SOME_VALUE)
+    )
+    xml_mapping = {
+      'article': {
+        TAG1: 'title',
+        TAG1 + XmlMappingSuffix.REGEX: r'(?:\d+\.?)* ?(.*)'
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert len(target_annotations) == 1
+    assert target_annotations[0].name == TAG1
+    assert target_annotations[0].value == SOME_VALUE
+
+  def test_should_apply_match_multiple_flag(self):
+    xml_root = E.article(
+      E.title(SOME_VALUE)
+    )
+    xml_mapping = {
+      'article': {
+        TAG1: 'title',
+        TAG1 + XmlMappingSuffix.MATCH_MULTIPLE: 'true'
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert [t.match_multiple for t in target_annotations] == [True]
+
+  def test_should_not_apply_match_multiple_flag_if_not_set(self):
+    xml_root = E.article(
+      E.title(SOME_VALUE)
+    )
+    xml_mapping = {
+      'article': {
+        TAG1: 'title'
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert [t.match_multiple for t in target_annotations] == [False]
+
+  def test_should_apply_match_bonding_flag(self):
+    xml_root = E.article(
+      E.title(SOME_VALUE)
+    )
+    xml_mapping = {
+      'article': {
+        TAG1: 'title',
+        TAG1 + XmlMappingSuffix.BONDING: 'true'
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert [t.bonding for t in target_annotations] == [True]
+
+  def test_should_not_apply_match_bonding_flag_if_not_set(self):
+    xml_root = E.article(
+      E.title(SOME_VALUE)
+    )
+    xml_mapping = {
+      'article': {
+        TAG1: 'title'
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert [t.bonding for t in target_annotations] == [False]
+
+  def test_should_use_multiple_xpaths(self):
+    xml_root = E.article(
+      E.entry(
+        E.child1(SOME_VALUE),
+        E.child2(SOME_VALUE_2)
+      )
+    )
+    xml_mapping = {
+      'article': {
+        TAG1: '\n{}\n{}\n'.format(
+          'entry/child1',
+          'entry/child2'
+        )
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert [(t.name, t.value) for t in target_annotations] == [
+      (TAG1, SOME_VALUE),
+      (TAG1, SOME_VALUE_2)
+    ]
+
+  def test_should_apply_children_xpaths_and_sort_by_value_descending(self):
+    xml_root = E.article(
+      E.entry(
+        E.child1(SOME_SHORTER_VALUE),
+        E.child2(SOME_LONGER_VALUE)
+      ),
+      E.entry(
+        E.child1(SOME_LONGER_VALUE)
+      )
+    )
+    xml_mapping = {
+      'article': {
+        TAG1: 'entry',
+        TAG1 + XmlMappingSuffix.CHILDREN: './/*'
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert [(t.name, t.value) for t in target_annotations] == [
+      (TAG1, [SOME_LONGER_VALUE, SOME_SHORTER_VALUE]),
+      (TAG1, SOME_LONGER_VALUE)
+    ]
+
+  def test_should_apply_children_xpaths_and_exclude_parents(self):
+    xml_root = E.article(
+      E.entry(
+        E.parent(
+          E.child2(SOME_LONGER_VALUE),
+          E.child1(SOME_SHORTER_VALUE)
+        )
+      )
+    )
+    xml_mapping = {
+      'article': {
+        TAG1: 'entry',
+        TAG1 + XmlMappingSuffix.CHILDREN: './/*'
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert [(t.name, t.value) for t in target_annotations] == [
+      (TAG1, [SOME_LONGER_VALUE, SOME_SHORTER_VALUE])
+    ]
+
+  def test_should_apply_children_xpaths_and_include_parent_text_between_matched_children(self):
+    xml_root = E.article(
+      E.entry(
+        E.parent(
+          E.child2(SOME_LONGER_VALUE),
+          SOME_VALUE,
+          E.child1(SOME_SHORTER_VALUE)
+        )
+      )
+    )
+    xml_mapping = {
+      'article': {
+        TAG1: 'entry',
+        TAG1 + XmlMappingSuffix.CHILDREN: './/*'
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert [(t.name, t.value) for t in target_annotations] == [
+      (TAG1, [SOME_LONGER_VALUE, SOME_VALUE, SOME_SHORTER_VALUE])
+    ]
+
+  def test_should_apply_multiple_children_xpaths_and_include_parent_text_if_enabled(self):
+    xml_root = E.article(
+      E.entry(
+        E.child1(SOME_SHORTER_VALUE),
+        SOME_LONGER_VALUE
+      )
+    )
+    xml_mapping = {
+      'article': {
+        TAG1: 'entry',
+        TAG1 + XmlMappingSuffix.CHILDREN: '\n{}\n{}\n'.format('.//*', '.'),
+        TAG1 + XmlMappingSuffix.UNMATCHED_PARENT_TEXT: 'true'
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert [(t.name, t.value) for t in target_annotations] == [
+      (TAG1, [SOME_LONGER_VALUE, SOME_SHORTER_VALUE])
+    ]
+
+  def test_should_apply_concat_children(self):
+    num_values = ['101', '202']
+    xml_root = E.article(
+      E.entry(
+        E.parent(
+          E.child1(SOME_VALUE),
+          E.fpage(num_values[0]),
+          E.lpage(num_values[1])
+        )
+      )
+    )
+    xml_mapping = {
+      'article': {
+        TAG1: 'entry',
+        TAG1 + XmlMappingSuffix.CHILDREN: './/*',
+        TAG1 + XmlMappingSuffix.CHILDREN_CONCAT: json.dumps([[{
+          'xpath': './/fpage'
+        }, {
+          'value': '-'
+        }, {
+          'xpath': './/lpage'
+        }]])
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert [(t.name, t.value) for t in target_annotations] == [
+      (TAG1, [SOME_VALUE, '-'.join(num_values)])
+    ]
+
+  def test_should_not_apply_concat_children_if_one_node_was_not_found(self):
+    num_values = ['101', '202']
+    xml_root = E.article(
+      E.entry(
+        E.parent(
+          E.child1(SOME_VALUE),
+          E.fpage(num_values[0]),
+          E.lpage(num_values[1])
+        )
+      )
+    )
+    xml_mapping = {
+      'article': {
+        TAG1: 'entry',
+        TAG1 + XmlMappingSuffix.CHILDREN: './/*',
+        TAG1 + XmlMappingSuffix.CHILDREN_CONCAT: json.dumps([[{
+          'xpath': './/fpage'
+        }, {
+          'value': '-'
+        }, {
+          'xpath': './/unknown'
+        }]])
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert [(t.name, t.value) for t in target_annotations] == [
+      (TAG1, [SOME_VALUE, num_values[0], num_values[1]])
+    ]
+
+  def test_should_apply_range_children(self):
+    num_values = [101, 102, 103, 104, 105, 106, 107]
+    xml_root = E.article(
+      E.entry(
+        E.child1(SOME_VALUE),
+        E.fpage(str(min(num_values))),
+        E.lpage(str(max(num_values)))
+      )
+    )
+    xml_mapping = {
+      'article': {
+        TAG1: 'entry',
+        TAG1 + XmlMappingSuffix.CHILDREN: 'fpage|lpage',
+        TAG1 + XmlMappingSuffix.CHILDREN_RANGE: json.dumps([{
+          'min': {
+            'xpath': 'fpage'
+          },
+          'max': {
+            'xpath': 'lpage'
+          }
+        }])
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert [(t.name, t.value) for t in target_annotations] == [
+      (TAG1, [str(x) for x in num_values])
+    ]
+
+  def test_should_apply_range_children_as_separate_target_annotations(self):
+    num_values = [101, 102, 103, 104, 105, 106, 107]
+    xml_root = E.article(
+      E.entry(
+        E.child1(SOME_VALUE),
+        E.fpage(str(min(num_values))),
+        E.lpage(str(max(num_values)))
+      )
+    )
+    xml_mapping = {
+      'article': {
+        TAG1: 'entry',
+        TAG1 + XmlMappingSuffix.CHILDREN: 'fpage|lpage',
+        TAG1 + XmlMappingSuffix.CHILDREN_RANGE: json.dumps([{
+          'min': {
+            'xpath': 'fpage'
+          },
+          'max': {
+            'xpath': 'lpage'
+          },
+          'standalone': True
+        }])
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert [(t.name, t.value) for t in target_annotations] == [
+      (TAG1, str(x))
+      for x in num_values
+    ]
+
+  def test_should_not_apply_range_children_if_xpath_not_matching(self):
+    num_values = [101, 102, 103, 104, 105, 106, 107]
+    fpage = str(min(num_values))
+    lpage = str(max(num_values))
+    xml_root = E.article(
+      E.entry(
+        E.child1(SOME_VALUE),
+        E.fpage(fpage),
+        E.lpage(lpage)
+      )
+    )
+    xml_mapping = {
+      'article': {
+        TAG1: 'entry',
+        TAG1 + XmlMappingSuffix.CHILDREN: 'fpage|unknown',
+        TAG1 + XmlMappingSuffix.CHILDREN_RANGE: json.dumps([{
+          'min': {
+            'xpath': 'fpage'
+          },
+          'max': {
+            'xpath': 'unknown'
+          }
+        }])
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert [(t.name, t.value) for t in target_annotations] == [
+      (TAG1, fpage)
+    ]
+
+  def test_should_not_apply_range_children_if_value_is_not_integer(self):
+    fpage = 'abc'
+    lpage = 'xyz'
+    xml_root = E.article(
+      E.entry(
+        E.child1(SOME_VALUE),
+        E.fpage(fpage),
+        E.lpage(lpage)
+      )
+    )
+    xml_mapping = {
+      'article': {
+        TAG1: 'entry',
+        TAG1 + XmlMappingSuffix.CHILDREN: 'fpage|lpage',
+        TAG1 + XmlMappingSuffix.CHILDREN_RANGE: json.dumps([{
+          'min': {
+            'xpath': 'fpage'
+          },
+          'max': {
+            'xpath': 'lpage'
+          }
+        }])
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert [(t.name, t.value) for t in target_annotations] == [
+      (TAG1, [fpage, lpage])
+    ]
+
+  def test_should_return_full_text(self):
+    xml_root = E.article(
+      E.title(
+        'some ',
+        E.other('embedded'),
+        ' text'
+      )
+    )
+    xml_mapping = {
+      'article': {
+        TAG1: 'title'
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert len(target_annotations) == 1
+    assert target_annotations[0].name == TAG1
+    assert target_annotations[0].value == 'some embedded text'
+
+  def test_should_return_target_annotations_in_order_of_xml(self):
+    xml_root = E.article(
+      E.tag1('tag1.1'), E.tag2('tag2.1'), E.tag1('tag1.2'), E.tag2('tag2.2'),
+    )
+    xml_mapping = {
+      'article': {
+        TAG1: 'tag1',
+        TAG2: 'tag2'
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert [(ta.name, ta.value) for ta in target_annotations] == [
+      (TAG1, 'tag1.1'), (TAG2, 'tag2.1'), (TAG1, 'tag1.2'), (TAG2, 'tag2.2')
+    ]
+
+  def test_should_return_target_annotations_in_order_of_priority_first(self):
+    xml_root = E.article(
+      E.tag1('tag1.1'), E.tag2('tag2.1'), E.tag1('tag1.2'), E.tag2('tag2.2'),
+    )
+    xml_mapping = {
+      'article': {
+        TAG1: 'tag1',
+        TAG2: 'tag2',
+        TAG2 + XmlMappingSuffix.PRIORITY: '1'
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert [(ta.name, ta.value) for ta in target_annotations] == [
+      (TAG2, 'tag2.1'), (TAG2, 'tag2.2'), (TAG1, 'tag1.1'), (TAG1, 'tag1.2')
+    ]
diff --git a/sciencebeam_gym/preprocess/lxml_to_svg.py b/sciencebeam_gym/preprocess/lxml_to_svg.py
index ce75da2..ca472da 100644
--- a/sciencebeam_gym/preprocess/lxml_to_svg.py
+++ b/sciencebeam_gym/preprocess/lxml_to_svg.py
@@ -20,7 +20,10 @@ from sciencebeam_gym.preprocess.annotation.annotator import (
 
 from sciencebeam_gym.preprocess.annotation.matching_annotator import (
   MatchingAnnotator,
-  CsvMatchDetailReporter,
+  CsvMatchDetailReporter
+)
+
+from sciencebeam_gym.preprocess.annotation.target_annotation import (
   parse_xml_mapping,
   xml_root_to_target_annotations
 )
diff --git a/sciencebeam_gym/preprocess/preprocessing_pipeline.py b/sciencebeam_gym/preprocess/preprocessing_pipeline.py
index 57c42ae..4229426 100644
--- a/sciencebeam_gym/preprocess/preprocessing_pipeline.py
+++ b/sciencebeam_gym/preprocess/preprocessing_pipeline.py
@@ -41,7 +41,7 @@ from sciencebeam_gym.structured_document.svg import (
   SvgStructuredDocument
 )
 
-from sciencebeam_gym.preprocess.annotation.matching_annotator import (
+from sciencebeam_gym.preprocess.annotation.target_annotation import (
   parse_xml_mapping
 )
 
diff --git a/sciencebeam_gym/preprocess/preprocessing_utils.py b/sciencebeam_gym/preprocess/preprocessing_utils.py
index f1a6d7c..9af3ba4 100644
--- a/sciencebeam_gym/preprocess/preprocessing_utils.py
+++ b/sciencebeam_gym/preprocess/preprocessing_utils.py
@@ -52,7 +52,10 @@ from sciencebeam_gym.alignment.align import (
 )
 
 from sciencebeam_gym.preprocess.annotation.matching_annotator import (
-  MatchingAnnotator,
+  MatchingAnnotator
+)
+
+from sciencebeam_gym.preprocess.annotation.target_annotation import (
   xml_root_to_target_annotations
 )
 
diff --git a/sciencebeam_gym/utils/string.py b/sciencebeam_gym/utils/string.py
new file mode 100644
index 0000000..00aa95d
--- /dev/null
+++ b/sciencebeam_gym/utils/string.py
@@ -0,0 +1,9 @@
+from future.utils import python_2_unicode_compatible
+
+@python_2_unicode_compatible
+class LazyStr(object):
+  def __init__(self, fn):
+    self.fn = fn
+
+  def __str__(self):
+    return self.fn()
-- 
GitLab