Skip to content
Snippets Groups Projects
Commit 20b389c3 authored by Daniel Ecer
Browse files

treat asterisk as junk; treat repeating characters as junk

parent 6bbdc0df
No related branches found
No related tags found
No related merge requests found
......@@ -30,7 +30,9 @@ def len_index_range(index_range):
def DEFAULT_ISJUNK(s, i):
    """Return True when the character at ``s[i]`` should be treated as junk.

    A character is junk when any of the following holds:

    - it is a space or a comma directly after a dot,
    - it is a dot directly after a letter,
    - it is identical to the character directly before it,
    - it is an asterisk.

    Args:
        s: the string being inspected.
        i: index of the character within ``s``.

    Returns:
        bool: whether the character at index ``i`` is junk.
    """
    current = s[i]
    if current == '*':
        return True
    if i <= 0:
        # Every remaining rule compares against the preceding character.
        return False
    previous = s[i - 1]
    return (
        (previous == '.' and current in (' ', ','))
        or (previous.isalpha() and current == '.')
        or previous == current
    )
def DOT_IS_JUNK(s, i):
    """Return True when the character at index ``i`` of ``s`` is a dot."""
    return s[i] == '.'
......
......@@ -20,6 +20,12 @@ class TestRemoveJunk(object):
def test_should_remove_dots_after_capitals(self):
    # With DOT_IS_JUNK as the junk rule, both dots are expected to be dropped
    # while the letters and the space are kept.
    cleaned = remove_junk('P.O. Box', DOT_IS_JUNK)
    assert cleaned == 'PO Box'
def test_should_remove_asterisk_after_capitals(self):
    # No junk rule is passed, so this exercises remove_junk's default
    # (presumably DEFAULT_ISJUNK - confirm against remove_junk's signature).
    cleaned = remove_junk('Mr Beam*')
    assert cleaned == 'Mr Beam'
def test_should_remove_repeating_characters(self):
    # The run 'eee' is expected to collapse to a single 'e' under the
    # default junk rule (no explicit rule is passed here).
    cleaned = remove_junk('Mr Beeeam')
    assert cleaned == 'Mr Beam'
class TestInvertIndexRanges(object):
def test_should_return_empty_for_empty_range(self):
    # Materialise the result so it can be compared against a plain list.
    inverted = list(invert_index_ranges([], 0, 0))
    assert inverted == []
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment