Newer
Older
from sciencebeam_gym.preprocess.annotation.fuzzy_match import (
remove_junk,
fuzzy_match,
DOT_IS_JUNK
def setup_module():
    """Configure logging for this test module.

    The name follows pytest's xunit-style ``setup_module`` convention, so
    pytest is expected to call it once before the tests in this module run.
    It requests DEBUG-level root logging via ``logging.basicConfig``.
    """
    logging.basicConfig(level='DEBUG')
class TestRemoveJunk(object):
    """Tests for ``remove_junk``."""

    def test_should_keep_str_without_junk(self):
        assert remove_junk('abc', DOT_IS_JUNK) == 'abc'

    def test_should_remove_dots_after_capitals(self):
        assert remove_junk('P.O. Box', DOT_IS_JUNK) == 'PO Box'

    def test_should_remove_asterisk_after_capitals(self):
        # No junk predicate is passed, so this exercises whatever default
        # remove_junk applies for its second argument.
        assert remove_junk('Mr Beam*') == 'Mr Beam'

    def test_should_remove_repeating_characters(self):
        # Also uses remove_junk's default junk handling (single argument).
        assert remove_junk('Mr Beeeam') == 'Mr Beam'
class TestInvertIndexRanges(object):
    """Tests for ``invert_index_ranges``.

    Each test calls ``invert_index_ranges(ranges, start, end)`` and expects
    the ``(from, to)`` pairs inside ``start..end`` that are *not* covered by
    ``ranges``. The result is wrapped in ``list(...)`` before comparison, so
    any iterable return type is accepted.
    """

    def test_should_return_empty_for_empty_range(self):
        assert list(invert_index_ranges([], 0, 0)) == []

    def test_should_return_whole_range_for_empty_range_list(self):
        assert list(invert_index_ranges([], 0, 10)) == [(0, 10)]

    def test_should_exclude_range_in_the_beginning(self):
        assert list(invert_index_ranges([(0, 3)], 0, 10)) == [(3, 10)]

    def test_should_exclude_range_in_the_beginning_beyond_start(self):
        # The excluded range starts before the requested start index.
        assert list(invert_index_ranges([(0, 13)], 10, 20)) == [(13, 20)]

    def test_should_exclude_range_in_the_middle(self):
        assert list(invert_index_ranges([(4, 7)], 0, 10)) == [(0, 4), (7, 10)]

    def test_should_exclude_range_at_the_end(self):
        assert list(invert_index_ranges([(7, 10)], 0, 10)) == [(0, 7)]

    def test_should_exclude_range_at_the_end_beyond_end(self):
        # The excluded range extends past the requested end index.
        assert list(invert_index_ranges([(7, 100)], 0, 10)) == [(0, 7)]
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
class TestFuzzyMatch(object):
    """Tests for ``fuzzy_match``."""

    def test_match_count_should_be_the_same_independent_of_order(self):
        # Swapping the two arguments must not change the match count.
        s1 = 'this is a some sequence'
        choice = 'this is another sequence'
        fm_1 = fuzzy_match(s1, choice)
        fm_2 = fuzzy_match(choice, s1)
        assert fm_1.match_count() == fm_2.match_count()
def _is_dot_at(s, i):
    """Junk predicate shared by the tests below: True if ``s[i]`` is a dot."""
    return s[i] == '.'


class TestFuzzyMatchResult(object):
    """Tests for ``FuzzyMatchResult``.

    Results are constructed as ``FuzzyMatchResult(a, b, matching_blocks)``
    with an optional ``isjunk`` keyword. Judging by the fixtures, the
    matching blocks appear to be ``(a_index, b_index, size)`` triples in the
    style of difflib's matching blocks (to be confirmed against the
    implementation).
    """

    def test_exact_match(self):
        fm = FuzzyMatchResult('abc', 'abc', [(0, 0, 3)])
        assert fm.has_match()
        assert fm.match_count() == 3
        assert fm.ratio() == 1.0
        assert fm.a_ratio() == 1.0
        assert fm.b_ratio() == 1.0
        assert fm.b_gap_ratio() == 1.0
        assert fm.a_index_range() == (0, 3)
        assert fm.b_index_range() == (0, 3)

    def test_no_match(self):
        fm = FuzzyMatchResult('abc', 'xyz', [])
        assert not fm.has_match()
        assert fm.match_count() == 0

    def test_partial_match(self):
        fm = FuzzyMatchResult('abx', 'aby', [(0, 0, 2)])
        assert fm.has_match()
        assert fm.match_count() == 2
        assert fm.ratio() == 1.0
        assert fm.a_ratio() == 2 / 3
        assert fm.b_ratio() == 2 / 3
        assert fm.b_gap_ratio() == 2 / 3
        assert fm.a_index_range() == (0, 2)
        assert fm.b_index_range() == (0, 2)

    def test_partial_match_ignore_junk_at_the_end_of_a(self):
        fm = FuzzyMatchResult('ab.', 'ab', [(0, 0, 2)], isjunk=_is_dot_at)
        assert fm.has_match()
        assert fm.match_count() == 2
        assert fm.ratio() == 1.0
        assert fm.a_ratio() == 1.0
        assert fm.b_ratio() == 1.0
        assert fm.b_gap_ratio() == 1.0
        assert fm.a_index_range() == (0, 2)
        assert fm.b_index_range() == (0, 2)

    def test_partial_match_ignore_junk_at_the_end_of_b(self):
        fm = FuzzyMatchResult('ab', 'ab.', [(0, 0, 2)], isjunk=_is_dot_at)
        assert fm.has_match()
        assert fm.match_count() == 2
        assert fm.ratio() == 1.0
        assert fm.a_ratio() == 1.0
        assert fm.b_ratio() == 1.0
        assert fm.b_gap_ratio() == 1.0
        assert fm.a_index_range() == (0, 2)
        assert fm.b_index_range() == (0, 2)

    def test_partial_match_ignore_junk_in_the_middle_of_a(self):
        # Two one-character blocks on either side of the junk dot in `a`.
        fm = FuzzyMatchResult(
            'a.b', 'ab', [(0, 0, 1), (2, 1, 1)], isjunk=_is_dot_at
        )
        assert fm.has_match()
        assert fm.match_count() == 2
        assert fm.ratio() == 1.0
        assert fm.a_ratio() == 1.0
        assert fm.b_ratio() == 1.0
        assert fm.b_gap_ratio() == 1.0
        assert fm.a_index_range() == (0, 3)
        assert fm.b_index_range() == (0, 2)

    def test_partial_match_ignore_junk_in_the_middle_of_b(self):
        # Two one-character blocks on either side of the junk dot in `b`.
        fm = FuzzyMatchResult(
            'ab', 'a.b', [(0, 0, 1), (1, 2, 1)], isjunk=_is_dot_at
        )
        assert fm.has_match()
        assert fm.match_count() == 2
        assert fm.ratio() == 1.0
        assert fm.a_ratio() == 1.0
        assert fm.b_ratio() == 1.0
        assert fm.b_gap_ratio() == 1.0
        assert fm.a_index_range() == (0, 2)
        assert fm.b_index_range() == (0, 3)

    def test_should_not_double_count_matching_junk(self):
        # The dot is junk but is also part of the single matching block.
        fm = FuzzyMatchResult('a.b', 'a.b', [(0, 0, 3)], isjunk=_is_dot_at)
        assert fm.has_match()
        assert fm.match_count() == 3
        assert fm.ratio() == 1.0
        assert fm.a_ratio() == 1.0
        assert fm.b_ratio() == 1.0
        assert fm.b_gap_ratio() == 1.0
        assert fm.a_index_range() == (0, 3)
        assert fm.b_index_range() == (0, 3)

    def test_a_split_no_match(self):
        fm = FuzzyMatchResult('abc', 'xyz', [])
        fm_1, fm_2 = fm.a_split_at(2)

        # Only `a` is split; both halves keep the full `b`.
        assert not fm_1.has_match()
        assert fm_1.a == 'ab'
        assert fm_1.b == 'xyz'

        assert not fm_2.has_match()
        assert fm_2.a == 'c'
        assert fm_2.b == 'xyz'

    def test_b_split_no_match(self):
        fm = FuzzyMatchResult('abc', 'xyz', [])
        fm_1, fm_2 = fm.b_split_at(2)

        # Only `b` is split; both halves keep the full `a`.
        assert not fm_1.has_match()
        assert fm_1.a == 'abc'
        assert fm_1.b == 'xy'

        assert not fm_2.has_match()
        assert fm_2.a == 'abc'
        assert fm_2.b == 'z'

    def test_a_split_exact_match(self):
        fm = FuzzyMatchResult('abc', 'abc', [(0, 0, 3)])
        fm_1, fm_2 = fm.a_split_at(2)

        # First half: 'ab' matched against the unsplit 'abc'.
        assert fm_1.a == 'ab'
        assert fm_1.b == 'abc'
        assert fm_1.has_match()
        assert fm_1.ratio() == 1.0
        assert fm_1.a_ratio() == 1.0
        assert fm_1.b_ratio() == 2 / 3
        assert fm_1.b_gap_ratio() == 2 / 3
        assert fm_1.a_index_range() == (0, 2)
        assert fm_1.b_index_range() == (0, 2)

        # Second half: 'c' matched against the unsplit 'abc'.
        assert fm_2.a == 'c'
        assert fm_2.b == 'abc'
        assert fm_2.has_match()
        assert fm_2.ratio() == 1.0
        assert fm_2.a_ratio() == 1.0
        assert fm_2.b_ratio() == 1 / 3
        assert fm_2.b_gap_ratio() == 1 / 3
        assert fm_2.a_index_range() == (0, 1)
        assert fm_2.b_index_range() == (0, 1)