diff --git a/annot-xml-front.conf b/annot-xml-front.conf
index 04028376919ab4dbf5180fea5cf1c13fa0f1664e..f1eb8f2103e65880584ff1ba54572e092079e6b0 100644
--- a/annot-xml-front.conf
+++ b/annot-xml-front.conf
@@ -33,8 +33,10 @@ author_aff.children = .//*
 author_aff.unmatched-parent-text = true
 author_aff.bonding = true
 author_aff.match-multiple = true
+author_aff.extract-regex = .*\b(\d+)\b.*
 author_aff.sub.sup = ./sup
 author_aff.sub.addrline = ./addr-line
+author_aff.sub.addrline.extract-regex = .*\b(\d+)\b.*
 author_aff.sub.country = ./country
 author_aff.sub.extlink = ./ext-link
 
diff --git a/annot-xml-full.conf b/annot-xml-full.conf
index 166d7b03350992dd9be5097cd28d72a1591f221b..829ed72d6af601b1f8b6e452b0d0e06ca76589a7 100644
--- a/annot-xml-full.conf
+++ b/annot-xml-full.conf
@@ -33,8 +33,10 @@ author_aff.children = .//*
 author_aff.unmatched-parent-text = true
 author_aff.bonding = true
 author_aff.match-multiple = true
+author_aff.extract-regex = .*\b(\d+)\b.*
 author_aff.sub.sup = ./sup
 author_aff.sub.addrline = ./addr-line
+author_aff.sub.addrline.extract-regex = .*\b(\d+)\b.*
 author_aff.sub.country = ./country
 author_aff.sub.extlink = ./ext-link
 
diff --git a/sciencebeam_gym/preprocess/annotation/target_annotation.py b/sciencebeam_gym/preprocess/annotation/target_annotation.py
index 76ad6721a6c27af480aed939b3904ef03c813d68..073358a33540cfbfb522555c10ca5ea3314e3d36 100644
--- a/sciencebeam_gym/preprocess/annotation/target_annotation.py
+++ b/sciencebeam_gym/preprocess/annotation/target_annotation.py
@@ -31,6 +31,7 @@ def get_logger():
 
 class XmlMappingSuffix(object):
   REGEX = '.regex'
+  EXTRACT_REGEX = '.extract-regex'
   MATCH_MULTIPLE = '.match-multiple'
   BONDING = '.bonding'
   REQUIRE_NEXT = '.require-next'
@@ -99,6 +100,19 @@ def get_stripped_text_content(node, **kwargs):
 def get_stripped_text_content_list(nodes, **kwargs):
   return [get_stripped_text_content(node, **kwargs) for node in nodes]
 
+def iter_flatten_if_nested(a):
+  """Yield the items of `a` depth-first, expanding nested lists of any depth."""
+  for item in a:
+    # only list instances are expanded; tuples, strings etc. are yielded as-is
+    children = iter_flatten_if_nested(item) if isinstance(item, list) else [item]
+    for child in children:
+      yield child
+
+def flatten_if_nested(a):
+  """Return the items of `a` as one flat list; a falsy `a` is returned as is."""
+  # the guard keeps None (and empty containers) untouched instead of iterating them
+  return list(iter_flatten_if_nested(a)) if a else a
+
 def apply_pattern(s, compiled_pattern):
   m = compiled_pattern.match(s)
   if m:
@@ -227,17 +241,60 @@ def get_prefixed_dict_values(d, key_prefix):
   }
 
 def get_sub_mapping(mapping, tag):
-  return get_prefixed_dict_values(mapping, tag + XmlMappingSuffix.SUB + '.')
+  # Only undotted keys name a sub tag; dotted ones (e.g. 'addrline.extract-regex')
+  # are settings of a sub tag, not xpaths.
+  prefix = tag + XmlMappingSuffix.SUB + '.'
+  sub_values = get_prefixed_dict_values(mapping, prefix)
+  return {k: v for k, v in sub_values.items() if '.' not in k}
 
-def extract_sub_annotations(parent_node, sub_xpaths):
+def re_compile_or_none(pattern):
+  return None if not pattern else re.compile(pattern)  # falsy pattern (None, '') means "no regex"
+
+def extract_using_regex(s, compiled_pattern):
+  """Split `s` around group 1 of every match of `compiled_pattern`.
+
+  Returns `s` unchanged when nothing is extracted and the single piece when there
+  is only one; otherwise a list of the stripped pieces in order of appearance,
+  followed by the full string `s`.
+  """
+  pieces = []
+  start = 0
+  for m in compiled_pattern.finditer(s):
+    if not m.group(1):
+      continue  # group 1 is empty or did not participate: nothing to extract
+    get_logger().debug('extract match: %d:%d, %s', m.start(1), m.end(1), m.group(1))
+    # resume right at the end of the group so no character of `s` is dropped
+    pieces += [s[start:m.start(1)].strip(), m.group(1)]
+    start = m.end(1)
+  if not pieces:
+    return s
+  # whitespace-only gaps strip to '' and must not become values
+  pieces = [p for p in pieces + [s[start:].strip()] if p]
+  if len(pieces) == 1:
+    return pieces[0]
+  return pieces + [s]  # also include the full string
+
+def extract_sub_annotations(parent_node, sub_xpaths, mapping, parent_key):
   if not sub_xpaths:
     return
   sub_annotations = []
   for sub_tag, sub_xpath in sub_xpaths.items():
+    sub_key_prefix = parent_key + XmlMappingSuffix.SUB + '.' + sub_tag  # e.g. 'author_aff.sub.addrline'
+    extract_re_compiled_pattern = re_compile_or_none(
+      mapping.get(sub_key_prefix + XmlMappingSuffix.EXTRACT_REGEX)  # optional per-sub-tag setting
+    )
+    get_logger().debug('sub_key_prefix: %s', sub_key_prefix)
+    get_logger().debug(
+      'extract_re_compiled_pattern (%s, %s): %s',
+      parent_key, sub_tag, extract_re_compiled_pattern
+    )
+
     for e in match_xpaths(parent_node, [sub_xpath]):
       value = get_stripped_text_content(e)
       if value:
         value = strip_whitespace(value).strip()
+      if extract_re_compiled_pattern is not None and value:  # extraction only when configured and there is text
+        value = extract_using_regex(value, extract_re_compiled_pattern)  # may turn value into a list of strings
       if value:
         sub_annotations.append(TargetAnnotation(value, sub_tag))
   return sub_annotations
@@ -275,8 +332,14 @@ def xml_root_to_target_annotations(xml_root, xml_mapping):
     children_range = parse_json_with_default(
       mapping.get(k + XmlMappingSuffix.CHILDREN_RANGE), []
     )
-    re_pattern = mapping.get(k + XmlMappingSuffix.REGEX)
-    re_compiled_pattern = re.compile(re_pattern) if re_pattern else None
+    re_compiled_pattern = re_compile_or_none(
+      mapping.get(k + XmlMappingSuffix.REGEX)
+    )
+    extract_re_compiled_pattern = re_compile_or_none(
+      mapping.get(k + XmlMappingSuffix.EXTRACT_REGEX)
+    )
+    get_logger().debug('extract_re_compiled_pattern (%s): %s', k, extract_re_compiled_pattern)
+
     priority = int(mapping.get(k + XmlMappingSuffix.PRIORITY, '0'))
     sub_xpaths = get_sub_mapping(mapping, k)
     get_logger().debug('sub_xpaths (%s): %s', k, sub_xpaths)
@@ -286,7 +349,7 @@ def xml_root_to_target_annotations(xml_root, xml_mapping):
     for e in match_xpaths(xml_root, xpaths):
       e_pos = xml_pos_by_node.get(e)
 
-      sub_annotations = extract_sub_annotations(e, sub_xpaths)
+      sub_annotations = extract_sub_annotations(e, sub_xpaths, mapping, k)
       get_logger().debug('sub_annotations (%s): %s', k, sub_annotations)
 
       if children_xpaths:
@@ -300,6 +363,11 @@ def xml_root_to_target_annotations(xml_root, xml_mapping):
         text_content_list = filter_truthy([
           apply_pattern(s, re_compiled_pattern) for s in text_content_list
         ])
+      if extract_re_compiled_pattern:
+        text_content_list = filter_truthy([
+          extract_using_regex(s, extract_re_compiled_pattern) for s in text_content_list
+        ])
+      text_content_list = flatten_if_nested(text_content_list)
       if text_content_list:
         value = (
           text_content_list[0]
diff --git a/sciencebeam_gym/preprocess/annotation/target_annotation_test.py b/sciencebeam_gym/preprocess/annotation/target_annotation_test.py
index 9113352e0ee68b4de3e7087b4d9a6c47b2d7c5ad..a3fdadf1b2976551db0801f91e9a6a42c431f1b1 100644
--- a/sciencebeam_gym/preprocess/annotation/target_annotation_test.py
+++ b/sciencebeam_gym/preprocess/annotation/target_annotation_test.py
@@ -499,6 +499,77 @@ class TestXmlRootToTargetAnnotations(object):
       ('value', SOME_VALUE_2)
     ]
 
+  def test_should_extract_numbers_from_value_after_text(self):
+    # the number, the text before it and the full text are all expected as values
+    full_text = SOME_VALUE + ' 12345'
+    xml_root = E.article(E.entry(E.value(full_text)))
+    xml_mapping = {
+      'article': {
+        TAG1: 'entry',
+        TAG1 + XmlMappingSuffix.EXTRACT_REGEX: r'.*\b(\d+)\b.*'
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert len(target_annotations) == 1
+    actual = [(t.name, set(t.value)) for t in target_annotations]
+    expected = [(TAG1, {full_text, SOME_VALUE, '12345'})]
+    assert actual == expected
+
+  def test_should_extract_single_value_if_its_the_only_value(self):
+    # the whole text is the number, so the value is expected to stay a plain string
+    xml_root = E.article(E.entry(E.value('12345')))
+    extract_regex_key = TAG1 + XmlMappingSuffix.EXTRACT_REGEX
+    xml_mapping = {
+      'article': {
+        TAG1: 'entry',
+        extract_regex_key: r'.*\b(\d+)\b.*'
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert len(target_annotations) == 1
+    actual = [(t.name, t.value) for t in target_annotations]
+    expected = [(TAG1, '12345')]
+    assert actual == expected
+
+  def test_should_unnest_extract_value_from_children(self):
+    # values extracted from several children are expected in one flat collection
+    full_text_1 = SOME_VALUE + ' 12345'
+    full_text_2 = SOME_VALUE_2 + ' 54321'
+    xml_root = E.article(E.entry(E.value(full_text_1), E.value(full_text_2)))
+    xml_mapping = {
+      'article': {
+        TAG1: 'entry',
+        TAG1 + XmlMappingSuffix.CHILDREN: r'.//*',
+        TAG1 + XmlMappingSuffix.EXTRACT_REGEX: r'.*\b(\d+)\b.*'
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert len(target_annotations) == 1
+    expected_values = {
+      full_text_1, SOME_VALUE, '12345',
+      full_text_2, SOME_VALUE_2, '54321'
+    }
+    actual = [(t.name, set(t.value)) for t in target_annotations]
+    assert actual == [(TAG1, expected_values)]
+
+  def test_should_extract_numbers_from_sub_value_after_text(self):
+    # same extraction as for the parent tag, but configured on the 'value' sub tag
+    full_text = SOME_VALUE + ' 12345'
+    xml_root = E.article(E.entry(E.value(full_text)))
+    sub_key = TAG1 + XmlMappingSuffix.SUB + '.value'
+    xml_mapping = {
+      'article': {
+        TAG1: 'entry',
+        sub_key: './value',
+        sub_key + XmlMappingSuffix.EXTRACT_REGEX: r'.*\b(\d+)\b.*'
+      }
+    }
+    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
+    assert len(target_annotations) == 1
+    sub_annotations = target_annotations[0].sub_annotations
+    actual = [(t.name, set(t.value)) for t in sub_annotations]
+    assert actual == [('value', {full_text, SOME_VALUE, '12345'})]
+
   def test_should_return_full_text(self):
     xml_root = E.article(
       E.title(