Skip to content

Commit 451fffb

Browse files
committed
Refactor required phrase spans collection
* This is best moved into tokenize and is also used in models.
* The code is simpler as it does not collect the texts, only the spans, which are what is used for indexing, matching and required phrases tagging.

Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent 05f7981 commit 451fffb

File tree

2 files changed

+160
-1
lines changed

2 files changed

+160
-1
lines changed

src/licensedcode/tokenize.py

Lines changed: 97 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,11 @@
1010

1111
import re
1212

13-
from collections import defaultdict
1413
from binascii import crc32
14+
from collections import defaultdict
1515
from itertools import islice
1616

17+
from licensedcode.spans import Span
1718
from licensedcode.stopwords import STOPWORDS
1819
from textcode.analysis import numbered_text_lines
1920

@@ -118,6 +119,101 @@ def required_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False):
118119
yield token
119120

120121

122+
def get_existing_required_phrase_spans(text):
    """
    Return a list of token position Spans, one per {{tagged}} required phrase
    found in ``text``, in the order the phrases occur.

    The spans are collected eagerly from get_phrase_spans(), therefore an
    InvalidRuleRequiredPhrase raised there for an invalid markup is raised by
    this function too.

    For example:

    >>> text = 'This is enclosed in {{double curly braces}}'
    >>> #       0    1  2        3    4      5     6
    >>> x = get_existing_required_phrase_spans(text)
    >>> assert x == [Span(4, 6)], x

    >>> text = 'This is {{enclosed}} a {{double curly braces}} or not'
    >>> #       0    1    2          SW  3      4     5        6  7
    >>> x = get_existing_required_phrase_spans(text)
    >>> assert x == [Span(2), Span(3, 5)], x

    >>> text = 'This {{is}} enclosed a {{double curly braces}} or not'
    >>> #       0      1    2        SW  3      4     5        6  7
    >>> x = get_existing_required_phrase_spans(text)
    >>> assert x == [Span([1]), Span([3, 4, 5])], x

    >>> text = '{{AGPL-3.0  GNU Affero General Public License v3.0}}'
    >>> #         0    1 2  3   4      5       6      7       8  9
    >>> x = get_existing_required_phrase_spans(text)
    >>> assert x == [Span(0, 9)], x

    >>> assert get_existing_required_phrase_spans('{This}') == []

    >>> def check_exception(text):
    ...     try:
    ...         return get_existing_required_phrase_spans(text)
    ...     except InvalidRuleRequiredPhrase:
    ...         pass

    >>> check_exception('This {{is')
    >>> check_exception('This }}is')
    >>> check_exception('{{This }}is{{')
    >>> check_exception('This }}is{{')
    >>> check_exception('{{}}')
    >>> check_exception('{{This is')
    >>> check_exception('{{This is{{')
    >>> check_exception('{{This is{{ }}')
    >>> check_exception('{{{{This}}}}')
    >>> check_exception('}}This {{is}}')
    >>> check_exception('This }} {{is}}')
    >>> check_exception('{{This}}')
    [Span(0)]
    >>> check_exception('{This}')
    []
    >>> check_exception('{{{This}}}')
    [Span(0)]
    """
    # get_phrase_spans() is a generator: exhaust it here so that callers get a
    # plain list and any markup error surfaces at call time.
    spans = get_phrase_spans(text)
    return list(spans)
175+
176+
177+
class InvalidRuleRequiredPhrase(Exception):
    """
    Raised when the {{required phrase}} markup of a rule text is invalid, such
    as with nested, empty or unbalanced double curly braces.
    """
179+
180+
181+
182+
def get_phrase_spans(text):
    """
    Yield position Spans for each tagged required phrase found in ``text``.

    The ``text`` is tokenized with required_phrase_tokenizer(). Each token that
    is neither REQUIRED_PHRASE_OPEN nor REQUIRED_PHRASE_CLOSE is assigned a
    zero-based position in the order it is returned; the two marker tokens do
    not take a position. One Span is yielded for each run of tokens enclosed
    between an opening and a closing marker.

    Raise an InvalidRuleRequiredPhrase with a message and the ``text`` as
    arguments when:
    - an opening marker is found inside an already opened required phrase,
    - a closing marker is found without a matching opening marker,
    - a required phrase contains no token,
    - the text ends while a required phrase is still opened.

    Since this is a generator, these exceptions are raised when iterating and
    not when calling this function.
    """
    ipos = 0
    in_required_phrase = False
    current_phrase_positions = []

    for token in required_phrase_tokenizer(text):
        if token == REQUIRED_PHRASE_OPEN:
            if in_required_phrase:
                raise InvalidRuleRequiredPhrase(
                    'Invalid rule with nested required phrase {{ {{ braces', text)
            in_required_phrase = True

        elif token == REQUIRED_PHRASE_CLOSE:
            if not in_required_phrase:
                # a closing marker without a matching opening marker
                raise InvalidRuleRequiredPhrase(
                    'Invalid rule with dangling required phrase missing opening braces', text)

            if not current_phrase_positions:
                raise InvalidRuleRequiredPhrase(
                    'Invalid rule with empty required phrase {{}} braces', text)

            yield Span(current_phrase_positions)
            # rebind to a new list rather than clearing the one passed to Span
            current_phrase_positions = []
            in_required_phrase = False

        else:
            # regular token: it always takes a position, and this position is
            # tracked only when inside a required phrase
            if in_required_phrase:
                current_phrase_positions.append(ipos)
            ipos += 1

    # Positions are only ever collected inside an opened required phrase and
    # are reset when closing it: checking the flag alone is enough to detect a
    # phrase left opened at the end of the text.
    if in_required_phrase:
        raise InvalidRuleRequiredPhrase(
            'Invalid rule with dangling required phrase missing final closing braces', text)
213+
214+
215+
216+
121217
def index_tokenizer(text, stopwords=STOPWORDS, preserve_case=False):
122218
"""
123219
Return an iterable of tokens from a rule or query ``text`` using index

tests/licensedcode/test_tokenize.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,10 @@
1717

1818
from commoncode.testcase import FileBasedTesting
1919

20+
from licensedcode.spans import Span
21+
from licensedcode.tokenize import get_existing_required_phrase_spans
2022
from licensedcode.tokenize import index_tokenizer
23+
from licensedcode.tokenize import InvalidRuleRequiredPhrase
2124
from licensedcode.tokenize import matched_query_text_tokenizer
2225
from licensedcode.tokenize import ngrams
2326
from licensedcode.tokenize import query_lines
@@ -406,6 +409,10 @@ def test_index_tokenizer_lines_on_html_like_texts_2(self, regen=REGEN_TEST_FIXTU
406409
result = [list(index_tokenizer(line)) for _ln, line in lines]
407410
check_results(result, expected_file, regen=regen)
408411

412+
413+
class TestRequirePhraseTokenizer(FileBasedTesting):
414+
test_data_dir = TEST_DATA_DIR
415+
409416
def test_required_phrase_tokenizer_on_html_like_texts(self, regen=REGEN_TEST_FIXTURES):
410417
test_file = self.get_test_loc('tokenize/htmlish.txt')
411418
expected_file = test_file + '.expected.required_phrase_tokenizer.json'
@@ -521,6 +528,62 @@ def test_required_phrase_tokenizer_ignores_invalid_required_phrase_markup(self):
521528
'i', 'am', 'afraid'
522529
]
523530

531+
def test_get_existing_required_phrase_spans_returns_spans(self):
    """
    Check that one Span is returned for each of the two tagged phrases of a
    two-sentence text.
    """
    text = (
        'This released software is {{released}} by under {{the MIT license}}. '
        'Which is a license originating at Massachusetts Institute of Technology (MIT).'
    )
    expected = [Span(4), Span(7, 9)]

    assert get_existing_required_phrase_spans(text) == expected
539+
540+
def test_get_existing_required_phrase_spans_raises_exception_if_markup_is_not_closed(self):
    """
    Check that an opening {{ without a closing }} raises an
    InvalidRuleRequiredPhrase.
    """
    text = 'This software is {{released by under the MIT license.'
    try:
        list(get_existing_required_phrase_spans(text))
    except InvalidRuleRequiredPhrase:
        # this is the expected outcome
        pass
    else:
        raise Exception('Exception should be raised')
547+
548+
def test_get_existing_required_phrase_spans_ignores_stopwords_in_positions(self):
    """
    Check that stopwords found before a required phrase do not count towards
    the token positions of this phrase.
    """
    text = 'The word comma is a stop word so comma does not increase the span position {{MIT license}}.'
    expected = [Span(11, 12)]

    assert get_existing_required_phrase_spans(text) == expected
552+
553+
def test_get_existing_required_phrase_spans_yields_spans_without_stop_words(self):
    """
    Check that stopwords found inside a required phrase are not part of the
    returned Spans.
    """
    # NOTE(review): "span" and "quot" are presumably stopwords, given the
    # expected positions below. Confirm against STOPWORDS.
    text = 'This released software is {{released span}} by under {{the MIT quot license}}.'
    expected = [Span(4), Span(7, 9)]

    assert get_existing_required_phrase_spans(text) == expected
557+
558+
def test_get_existing_required_phrase_spans_does_not_yield_empty_spans(self):
    """
    Check that a text with an empty required phrase raises an
    InvalidRuleRequiredPhrase rather than returning an empty Span.
    """
    text = 'This released software {{comma}} is {{}} by under {{the MIT license}}.'
    try:
        list(get_existing_required_phrase_spans(text))
    except InvalidRuleRequiredPhrase:
        # this is the expected outcome
        pass
    else:
        raise Exception('Exception should be raised')
565+
566+
def test_get_existing_required_phrase_spans_only_considers_outer_required_phrase_markup(self):
    """
    Check that a phrase wrapped in triple curly braces is returned as a
    single Span.
    """
    text = 'This released {{{software under the MIT}}} license.'
    expected = [Span(2, 5)]

    assert get_existing_required_phrase_spans(text) == expected
570+
571+
def test_get_existing_required_phrase_spans_ignores_nested_required_phrase_markup(self):
    """
    Check that a required phrase nested inside another required phrase is
    rejected with an InvalidRuleRequiredPhrase.
    """
    text = 'This released {{software {{under the}} MIT}} license.'
    try:
        list(get_existing_required_phrase_spans(text))
    except InvalidRuleRequiredPhrase:
        # this is the expected outcome
        pass
    else:
        raise Exception('Exception should be raised')
578+
579+
def test_get_existing_required_phrase_spans_with_markup(self):
    """
    Check the Span returned for a required phrase enclosed in an HTML-like
    link tag.
    """
    text = (
        "Lua is free software distributed under the terms of the"
        "<A HREF='http://www.opensource.org/licenses/mit-license.html'>{{MIT license}}</A>"
        "reproduced below;"
    )
    expected = [Span(18, 19)]

    assert get_existing_required_phrase_spans(text=text) == expected
586+
524587

525588
class TestNgrams(FileBasedTesting):
526589
test_data_dir = TEST_DATA_DIR

0 commit comments

Comments
 (0)