From 20b389c343d3ed5f0ccb2d8d4d9a7739a047e60d Mon Sep 17 00:00:00 2001
From: Daniel Ecer <de-code@users.noreply.github.com>
Date: Thu, 1 Feb 2018 08:41:02 +0000
Subject: [PATCH] treat asterisk as junk; treat repeating characters as junk

---
 sciencebeam_gym/preprocess/annotation/fuzzy_match.py      | 4 +++-
 sciencebeam_gym/preprocess/annotation/fuzzy_match_test.py | 6 ++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/sciencebeam_gym/preprocess/annotation/fuzzy_match.py b/sciencebeam_gym/preprocess/annotation/fuzzy_match.py
index 924e945..7f4f595 100644
--- a/sciencebeam_gym/preprocess/annotation/fuzzy_match.py
+++ b/sciencebeam_gym/preprocess/annotation/fuzzy_match.py
@@ -30,7 +30,9 @@ def len_index_range(index_range):
-# Treat space or comma after a dot, or a dot after a letter as junk
+# Junk: space or comma after a dot, dot after a letter, char equal to previous one, asterisk
 DEFAULT_ISJUNK = lambda s, i: (
   (i > 0 and s[i - 1] == '.' and (s[i] == ' ' or s[i] == ',')) or
-  (i > 0 and s[i - 1].isalpha() and s[i] == '.')
+  (i > 0 and s[i - 1].isalpha() and s[i] == '.') or
+  (i > 0 and s[i - 1] == s[i]) or
+  s[i] == '*'
 )
 
 DOT_IS_JUNK = lambda s, i: s[i] == '.'
diff --git a/sciencebeam_gym/preprocess/annotation/fuzzy_match_test.py b/sciencebeam_gym/preprocess/annotation/fuzzy_match_test.py
index 2b6dd86..5b52a30 100644
--- a/sciencebeam_gym/preprocess/annotation/fuzzy_match_test.py
+++ b/sciencebeam_gym/preprocess/annotation/fuzzy_match_test.py
@@ -20,6 +20,12 @@ class TestRemoveJunk(object):
   def test_should_remove_dots_after_capitals(self):
     assert remove_junk('P.O. Box', DOT_IS_JUNK) == 'PO Box'
 
+  def test_should_remove_asterisk_after_capitals(self):
+    assert remove_junk('Mr Beam*') == 'Mr Beam'
+
+  def test_should_remove_repeating_characters(self):
+    assert remove_junk('Mr Beeeam') == 'Mr Beam'
+
 class TestInvertIndexRanges(object):
   def test_should_return_empty_for_empty_range(self):
     assert list(invert_index_ranges([], 0, 0)) == list([])
-- 
GitLab