 from licensedcode.tokenize import get_existing_required_phrase_spans
 from licensedcode.tokenize import get_extra_phrase_spans
 from licensedcode.tokenize import index_tokenizer
+from licensedcode.tokenize import index_tokenizer_with_stopwords
 from licensedcode.tokenize import InvalidRuleRequiredPhrase
 from licensedcode.tokenize import matched_query_text_tokenizer
 from licensedcode.tokenize import ngrams
@@ -640,7 +641,49 @@ def test_get_extra_phrase_spans_ignores_unclosed_opening_bracket(self): |

     def test_get_extra_phrase_spans_ignores_unopened_closing_bracket(self):
         text = 'Neither the name of 3]] nor the names of its'
-        assert get_extra_phrase_spans(text) == []
+        assert get_extra_phrase_spans(text) == []
+
+
+class TestIndexTokenizerWithStopwords(FileBasedTesting):
+    test_data_dir = TEST_DATA_DIR
+
+    def test_index_tokenizer_with_stopwords_empty_input(self):
+        toks, stops = index_tokenizer_with_stopwords('')
+        assert toks == []
+        assert stops == {}
+
+    def test_index_tokenizer_with_stopwords_removes_extra_phrase(self):
+        text = 'Neither the name of [[3]] nor the names of its'
+        toks, stops = index_tokenizer_with_stopwords(text)
+        assert toks == ['neither', 'the', 'name', 'of', 'nor', 'the', 'names', 'of', 'its']
+        assert stops == {}
+
+    def test_index_tokenizer_with_stopwords_splits_on_curly_braces(self):
+        text = '{{Hi}}some {{}}Text with{{junk}}spAces!'
+        toks, stops = index_tokenizer_with_stopwords(text)
+        assert toks == ['hi', 'some', 'text', 'with', 'junk', 'spaces']
+        assert stops == {}
+
+    def test_index_tokenizer_with_custom_stopwords(self):
+        stops_set = set(['is', 'a'])
+        text = 'This is a test'
+        toks, stops = index_tokenizer_with_stopwords(text, stopwords=stops_set)
+        assert toks == ['this', 'test']
+        assert stops == {0: 2}
+
+    def test_index_tokenizer_with_leading_stopwords(self):
+        stops_set = set(['is', 'a', 'the'])
+        text = 'The is a test with result'
+        toks, stops = index_tokenizer_with_stopwords(text, stopwords=stops_set)
+        assert toks == ['test', 'with', 'result']
+        assert stops == {-1: 3}
+
+    def test_index_tokenizer_with_embedded_stopwords_after_position(self):
+        stops_set = set(['markup', 'lt', 'gt', 'quot'])
+        text = 'some &quot;&lt; markup &gt;&quot;'
+        toks, stops = index_tokenizer_with_stopwords(text, stopwords=stops_set)
+        assert toks == ['some']
+        assert stops == {0: 5}


 class TestNgrams(FileBasedTesting):
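
Note for reviewers: the contract these tests pin down is that `index_tokenizer_with_stopwords(text, stopwords)` returns a `(tokens, stopwords_by_pos)` pair, where `tokens` is the list of lowercased non-stopword tokens and `stopwords_by_pos` maps the position of the kept token that precedes a run of stopwords to the length of that run, with a run before the first kept token keyed at `-1`. Below is a minimal sketch of that behavior inferred only from the assertions above; the regex, the `[[n]]` extra-phrase-marker handling, and the `_sketch` name are assumptions, not the actual scancode-toolkit implementation.

```python
import re


def index_tokenizer_with_stopwords_sketch(text, stopwords=frozenset()):
    """
    Illustrative only: return a (tokens, stopwords_by_pos) tuple matching
    the assertions in TestIndexTokenizerWithStopwords. Not the real
    scancode-toolkit code.
    """
    # Drop [[n]] extra-phrase markers entirely, including the number,
    # as test_index_tokenizer_with_stopwords_removes_extra_phrase expects.
    text = re.sub(r'\[\[\d+\]\]', ' ', text or '')
    tokens = []
    stopwords_by_pos = {}
    # Split on any non-word character and underscores, lowercasing;
    # this also splits {{...}} templates into plain words.
    for word in re.findall(r'[^\W_]+', text.lower()):
        if word in stopwords:
            # Attribute the stopword to the last kept token, or to
            # position -1 when no token has been kept yet.
            pos = len(tokens) - 1
            stopwords_by_pos[pos] = stopwords_by_pos.get(pos, 0) + 1
        else:
            tokens.append(word)
    return tokens, stopwords_by_pos
```

For example, `index_tokenizer_with_stopwords_sketch('The is a test with result', {'is', 'a', 'the'})` returns `(['test', 'with', 'result'], {-1: 3})`, matching `test_index_tokenizer_with_leading_stopwords`.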
|