From 20b389c343d3ed5f0ccb2d8d4d9a7739a047e60d Mon Sep 17 00:00:00 2001 From: Daniel Ecer <de-code@users.noreply.github.com> Date: Thu, 1 Feb 2018 08:41:02 +0000 Subject: [PATCH] treat asterisk as junk; treat repeating characters as junk --- sciencebeam_gym/preprocess/annotation/fuzzy_match.py | 4 +++- sciencebeam_gym/preprocess/annotation/fuzzy_match_test.py | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/sciencebeam_gym/preprocess/annotation/fuzzy_match.py b/sciencebeam_gym/preprocess/annotation/fuzzy_match.py index 924e945..7f4f595 100644 --- a/sciencebeam_gym/preprocess/annotation/fuzzy_match.py +++ b/sciencebeam_gym/preprocess/annotation/fuzzy_match.py @@ -30,7 +30,9 @@ def len_index_range(index_range): # Treat space or comma after a dot, or a dot after a letter as junk DEFAULT_ISJUNK = lambda s, i: ( (i > 0 and s[i - 1] == '.' and (s[i] == ' ' or s[i] == ',')) or - (i > 0 and s[i - 1].isalpha() and s[i] == '.') + (i > 0 and s[i - 1].isalpha() and s[i] == '.') or + (i > 0 and s[i - 1] == s[i]) or + s[i] == '*' ) DOT_IS_JUNK = lambda s, i: s[i] == '.' diff --git a/sciencebeam_gym/preprocess/annotation/fuzzy_match_test.py b/sciencebeam_gym/preprocess/annotation/fuzzy_match_test.py index 2b6dd86..5b52a30 100644 --- a/sciencebeam_gym/preprocess/annotation/fuzzy_match_test.py +++ b/sciencebeam_gym/preprocess/annotation/fuzzy_match_test.py @@ -20,6 +20,12 @@ class TestRemoveJunk(object): def test_should_remove_dots_after_capitals(self): assert remove_junk('P.O. Box', DOT_IS_JUNK) == 'PO Box' + def test_should_remove_asterisk_after_capitals(self): + assert remove_junk('Mr Beam*') == 'Mr Beam' + + def test_should_remove_repeating_characters(self): + assert remove_junk('Mr Beeeam') == 'Mr Beam' + class TestInvertIndexRanges(object): def test_should_return_empty_for_empty_range(self): assert list(invert_index_ranges([], 0, 0)) == list([]) -- GitLab