Check that the section_title match is actually close to the paragraph match found next (at most one position apart)

2dac94ce · Daniel Ecer · 4767e80e
Commit 2dac94ce authored 7 years ago by Daniel Ecer
--- a/sciencebeam_gym/preprocess/annotation/matching_annotator.py
+++ b/sciencebeam_gym/preprocess/annotation/matching_annotator.py
@@ -495,6 +495,18 @@ def sorted_matches_by_position(matches):
    key=lambda m: (m.seq2.position, m.index2_range)
  )

+def matches_position_range(matches):
+  positions = [m.seq2.position for m in matches]
+  return min(positions), max(positions)
+
+def distance_between_matches(matches1, matches2):
+  matches1_start, matches1_end = matches_position_range(matches1)
+  matches2_start, matches2_end = matches_position_range(matches2)
+  return min(
+    abs(matches2_start - matches1_end),
+    abs(matches1_start - matches2_end)
+  )
+
 def _apply_sub_annotations(
  target_annotation, structured_document, matching_tokens,
  match_detail_reporter, use_tag_begin_prefix):
@@ -632,7 +644,10 @@ class MatchingAnnotator(AbstractAnnotator):
          conditional_match = None
          break
        get_logger().info('matches: %s', matches)
-        if conditional_match:
+        if (
+          conditional_match and
+          distance_between_matches(matches, conditional_match['matches']) <= 1
+        ):
          _apply_annotations_to_matches(
            conditional_match['target_annotation'],
            structured_document,

--- a/sciencebeam_gym/preprocess/annotation/matching_annotator_test.py
+++ b/sciencebeam_gym/preprocess/annotation/matching_annotator_test.py
@@ -430,7 +430,7 @@ class TestMatchingAnnotator(object):
      [B_TAG_2] + [I_TAG_2] * (len(tag2_tokens) - 1)
    )

-  def test_should_annotate_short_section_title_followed_by_paragraph_on_same_line(self):
+  def test_should_annotate_short_section_title_followed_by_paragraph(self):
    section_title_text = 'section title'
    section_paragraph_text = 'paragraph text to come here.'
    section_title_tokens = _tokens_for_text(section_title_text + '.')
@@ -453,7 +453,29 @@ class TestMatchingAnnotator(object):
      ['section_paragraph'] * len(section_paragraph_tokens)
    )

-  def test_should_not_annotate_short_section_title_not_followed_by_paragraph_on_same_line(self):
+  def test_should_not_annotate_short_section_title_not_followed_by_paragraph(self):
+    section_title_text = 'section title'
+    section_title_tokens = _tokens_for_text(section_title_text + '.')
+    section_paragraph_text = 'paragraph text to come here.'
+    section_paragraph_tokens = _tokens_for_text(section_paragraph_text)
+    tokens_per_line = [
+      section_title_tokens + _tokens_for_text('other text to come here.'),
+      _tokens_for_text('more unrelated text.'),
+      _tokens_for_text('even more.'),
+      section_paragraph_tokens
+    ]
+    target_annotations = [
+      TargetAnnotation(section_title_text, 'section_title', require_next=True),
+      TargetAnnotation(section_paragraph_text, 'section_paragraph')
+    ]
+    doc = _document_for_tokens(tokens_per_line)
+    MatchingAnnotator(target_annotations).annotate(doc)
+    assert (
+      _get_tags_of_tokens(section_title_tokens) ==
+      [None] * len(section_title_tokens)
+    )
+
+  def test_should_not_annotate_short_section_title_if_paragraph_follows_later(self):
    section_title_text = 'section title'
    section_title_tokens = _tokens_for_text(section_title_text + '.')
    other_tokens = _tokens_for_text('other text to come here.')