Skip to content
Snippets Groups Projects
Commit 2dac94ce authored by Daniel Ecer
Browse files

check that section_title is actually close to the paragraph found next

parent 4767e80e
No related branches found
No related tags found
No related merge requests found
......@@ -495,6 +495,18 @@ def sorted_matches_by_position(matches):
key=lambda m: (m.seq2.position, m.index2_range)
)
def matches_position_range(matches):
    """Return the lowest and highest ``seq2.position`` found in *matches*.

    Args:
        matches: iterable of match objects, each exposing ``seq2.position``.

    Returns:
        A ``(min_position, max_position)`` tuple.

    Raises:
        ValueError: if *matches* contains no matches.
    """
    positions = [m.seq2.position for m in matches]
    if not positions:
        # min()/max() would raise ValueError here anyway; keep the exception
        # type but say which argument was at fault.
        raise ValueError('matches must contain at least one match')
    return min(positions), max(positions)
def distance_between_matches(matches1, matches2):
    """Return the gap between the position ranges of two groups of matches.

    Each group is reduced to its ``(min, max)`` range of ``seq2.position``
    values via ``matches_position_range``. The result is the distance from
    the end of one range to the start of the other, whichever way round the
    ranges are ordered.

    Args:
        matches1: non-empty iterable of match objects.
        matches2: non-empty iterable of match objects.

    Returns:
        ``0`` when the ranges touch or overlap, otherwise the positive
        difference between the nearer start and end positions.
    """
    matches1_start, matches1_end = matches_position_range(matches1)
    matches2_start, matches2_end = matches_position_range(matches2)
    # For two ranges at most one of these differences can be positive (the
    # real gap). If neither is, the ranges overlap and the distance is zero.
    # Taking min() of the absolute values instead would report a positive
    # distance for overlapping or nested ranges.
    return max(
        0,
        matches2_start - matches1_end,
        matches1_start - matches2_end
    )
def _apply_sub_annotations(
target_annotation, structured_document, matching_tokens,
match_detail_reporter, use_tag_begin_prefix):
......@@ -632,7 +644,10 @@ class MatchingAnnotator(AbstractAnnotator):
conditional_match = None
break
get_logger().info('matches: %s', matches)
if conditional_match:
if (
conditional_match and
distance_between_matches(matches, conditional_match['matches']) <= 1
):
_apply_annotations_to_matches(
conditional_match['target_annotation'],
structured_document,
......
......@@ -430,7 +430,7 @@ class TestMatchingAnnotator(object):
[B_TAG_2] + [I_TAG_2] * (len(tag2_tokens) - 1)
)
def test_should_annotate_short_section_title_followed_by_paragraph_on_same_line(self):
def test_should_annotate_short_section_title_followed_by_paragraph(self):
section_title_text = 'section title'
section_paragraph_text = 'paragraph text to come here.'
section_title_tokens = _tokens_for_text(section_title_text + '.')
......@@ -453,7 +453,29 @@ class TestMatchingAnnotator(object):
['section_paragraph'] * len(section_paragraph_tokens)
)
def test_should_not_annotate_short_section_title_not_followed_by_paragraph_on_same_line(self):
def test_should_not_annotate_short_section_title_not_followed_by_paragraph(self):
    """Title tokens stay untagged when the paragraph is not on a nearby line.

    The section title (declared with ``require_next=True``) shares the first
    line with unrelated text; the matching paragraph is only the fourth
    line of the document.
    """
    title_text = 'section title'
    title_tokens = _tokens_for_text(title_text + '.')
    paragraph_text = 'paragraph text to come here.'
    paragraph_tokens = _tokens_for_text(paragraph_text)

    # Title and unrelated text share the first line; two filler lines
    # separate it from the paragraph.
    first_line = title_tokens + _tokens_for_text('other text to come here.')
    second_line = _tokens_for_text('more unrelated text.')
    third_line = _tokens_for_text('even more.')
    document = _document_for_tokens(
        [first_line, second_line, third_line, paragraph_tokens]
    )

    title_annotation = TargetAnnotation(
        title_text, 'section_title', require_next=True
    )
    paragraph_annotation = TargetAnnotation(paragraph_text, 'section_paragraph')
    MatchingAnnotator([title_annotation, paragraph_annotation]).annotate(document)

    expected_title_tags = [None] * len(title_tokens)
    assert _get_tags_of_tokens(title_tokens) == expected_title_tags
def test_should_not_annotate_short_section_title_if_paragraph_follows_later(self):
section_title_text = 'section title'
section_title_tokens = _tokens_for_text(section_title_text + '.')
other_tokens = _tokens_for_text('other text to come here.')
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment