Commit 060fd63

add tests for extra-phrase removal in index_tokenizer_with_stopwords
Signed-off-by: Alok Kumar <[email protected]>
1 parent 490a081

2 files changed: +50 -2 lines changed

src/licensedcode/tokenize.py

Lines changed: 6 additions & 1 deletion
@@ -82,10 +82,13 @@ def query_lines(
 required_phrase_splitter = re.compile(required_phrase_pattern, re.UNICODE).findall
 
 
-extra_phrase_pattern = r'(?:' + query_pattern + r'|\[\[|\]\])'
+extra_phrase_pattern = '(?:' + query_pattern + r'|\[\[|\]\])'
 extra_phrase_splitter = re.compile(extra_phrase_pattern, re.UNICODE).findall
 
 
+# pattern to match and remove extra phrases like [[1]], [[4]], etc. from the text
+extra_phrase_removal_pattern = re.compile(r'\[\[\d+\]\]')
+
 REQUIRED_PHRASE_OPEN = '{{'
 REQUIRED_PHRASE_CLOSE = '}}'
 

@@ -349,6 +352,8 @@ def index_tokenizer_with_stopwords(text, stopwords=STOPWORDS):
     """
     if not text:
         return [], {}
+
+    text = extra_phrase_removal_pattern.sub('', text)
 
     tokens = []
     tokens_append = tokens.append
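
As a quick illustration of the new removal step, here is a minimal standalone sketch; the pattern and the sample text are taken verbatim from this commit, and running it outside the codebase is purely for demonstration:

import re

# The pattern added above in src/licensedcode/tokenize.py:
extra_phrase_removal_pattern = re.compile(r'\[\[\d+\]\]')

# Sample text from the new tests below; the [[3]] marker is stripped
# before the text reaches the tokenizer.
text = 'Neither the name of [[3]] nor the names of its'
print(extra_phrase_removal_pattern.sub('', text))
# -> 'Neither the name of  nor the names of its'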

tests/licensedcode/test_tokenize.py

Lines changed: 44 additions & 1 deletion
@@ -21,6 +21,7 @@
 from licensedcode.tokenize import get_existing_required_phrase_spans
 from licensedcode.tokenize import get_extra_phrase_spans
 from licensedcode.tokenize import index_tokenizer
+from licensedcode.tokenize import index_tokenizer_with_stopwords
 from licensedcode.tokenize import InvalidRuleRequiredPhrase
 from licensedcode.tokenize import matched_query_text_tokenizer
 from licensedcode.tokenize import ngrams
@@ -640,7 +641,49 @@ def test_get_extra_phrase_spans_ignores_unclosed_opening_bracket(self):
 
     def test_get_extra_phrase_spans_ignores_unopened_closing_bracket(self):
         text = 'Neither the name of 3]] nor the names of its'
-        assert get_extra_phrase_spans(text) == []
+        assert get_extra_phrase_spans(text) == []
+
+
+class TestIndexTokenizerWithStopwords(FileBasedTesting):
+    test_data_dir = TEST_DATA_DIR
+
+    def test_index_tokenizer_with_stopwords_empty_input(self):
+        toks, stops = index_tokenizer_with_stopwords('')
+        assert toks == []
+        assert stops == {}
+
+    def test_index_tokenizer_with_stopwords_removes_extra_phrase(self):
+        text = 'Neither the name of [[3]] nor the names of its'
+        toks, stops = index_tokenizer_with_stopwords(text)
+        assert toks == ['neither', 'the', 'name', 'of', 'nor', 'the', 'names', 'of', 'its']
+        assert stops == {}
+
+    def test_index_tokenizer_with_stopwords_removes_curly_phrase(self):
+        text = '{{Hi}}some {{}}Text with{{junk}}spAces!'
+        toks, stops = index_tokenizer_with_stopwords(text)
+        assert toks == ['hi', 'some', 'text', 'with', 'junk', 'spaces']
+        assert stops == {}
+
+    def test_index_tokenizer_with_custom_stopwords(self):
+        stops_set = set(['is', 'a'])
+        text = 'This is a test'
+        toks, stops = index_tokenizer_with_stopwords(text, stopwords=stops_set)
+        assert toks == ['this', 'test']
+        assert stops == {0: 2}
+
+    def test_index_tokenizer_with_leading_stopwords(self):
+        stops_set = set(['is', 'a', 'the'])
+        text = 'The is a test with result'
+        toks, stops = index_tokenizer_with_stopwords(text, stopwords=stops_set)
+        assert toks == ['test', 'with', 'result']
+        assert stops == {-1: 3}
+
+    def test_index_tokenizer_with_embedded_stopwords_after_position(self):
+        stops_set = set(['markup', 'lt', 'gt', 'quot'])
+        text = 'some &quot&lt markup &gt&quot'
+        toks, stops = index_tokenizer_with_stopwords(text, stopwords=stops_set)
+        assert toks == ['some']
+        assert stops == {0: 5}
 
 
 class TestNgrams(FileBasedTesting):
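
A note on the second return value of index_tokenizer_with_stopwords, inferred from the assertions above rather than stated anywhere in this diff: the dict appears to map the index of the last kept token to the number of stopwords that immediately follow it, with the key -1 used for stopwords that precede the first kept token. A standalone sketch using the same data as the custom-stopwords test:

from licensedcode.tokenize import index_tokenizer_with_stopwords

# 'is' and 'a' follow the kept token at index 0 ('this'), so the map is
# {0: 2}; in the leading-stopwords test all three stopwords come before
# any kept token, hence the key -1 in {-1: 3}.
toks, stops = index_tokenizer_with_stopwords('This is a test', stopwords={'is', 'a'})
assert toks == ['this', 'test']
assert stops == {0: 2}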
