|
10 | 10 |
|
11 | 11 | import re |
12 | 12 |
|
13 | | -from collections import defaultdict |
14 | 13 | from binascii import crc32 |
| 14 | +from collections import defaultdict |
15 | 15 | from itertools import islice |
16 | 16 |
|
| 17 | +from licensedcode.spans import Span |
17 | 18 | from licensedcode.stopwords import STOPWORDS |
18 | 19 | from textcode.analysis import numbered_text_lines |
19 | 20 |
|
@@ -118,6 +119,101 @@ def required_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False): |
118 | 119 | yield token |
119 | 120 |
|
120 | 121 |
|
def get_existing_required_phrase_spans(text):
    """
    Return a list of token position Spans, one for each {{tagged}} required phrase found in ``text``.

    Positions count real tokens only: the {{ and }} markers and stopwords
    (marked SW below) do not advance the position counter.

    For example:

    >>> text = 'This is enclosed in {{double curly braces}}'
    >>> #       0    1  2        3    4      5     6
    >>> x = get_existing_required_phrase_spans(text)
    >>> assert x == [Span(4, 6)], x

    >>> text = 'This is {{enclosed}} a {{double curly braces}} or not'
    >>> #       0    1    2          SW   3      4     5       6  7
    >>> x = get_existing_required_phrase_spans(text)
    >>> assert x == [Span(2), Span(3, 5)], x

    >>> text = 'This {{is}} enclosed a {{double curly braces}} or not'
    >>> #       0      1    2        SW   3      4     5       6  7
    >>> x = get_existing_required_phrase_spans(text)
    >>> assert x == [Span([1]), Span([3, 4, 5])], x

    >>> text = '{{AGPL-3.0  GNU Affero General Public License v3.0}}'
    >>> #         0    1 2  3   4      5       6      7       8  9
    >>> x = get_existing_required_phrase_spans(text)
    >>> assert x == [Span(0, 9)], x

    >>> assert get_existing_required_phrase_spans('{This}') == []

    >>> def check_exception(text):
    ...     try:
    ...         return get_existing_required_phrase_spans(text)
    ...     except InvalidRuleRequiredPhrase:
    ...         pass

    >>> check_exception('This {{is')
    >>> check_exception('This }}is')
    >>> check_exception('{{This }}is{{')
    >>> check_exception('This }}is{{')
    >>> check_exception('{{}}')
    >>> check_exception('{{This is')
    >>> check_exception('{{This is{{')
    >>> check_exception('{{This is{{ }}')
    >>> check_exception('{{{{This}}}}')
    >>> check_exception('}}This {{is}}')
    >>> check_exception('This }} {{is}}')
    >>> check_exception('{{This}}')
    [Span(0)]
    >>> check_exception('{This}')
    []
    >>> check_exception('{{{This}}}')
    [Span(0)]
    """
    return list(get_phrase_spans(text))
| 175 | + |
| 176 | + |
class InvalidRuleRequiredPhrase(Exception):
    """
    Raised when a rule text contains malformed {{required phrase}} markers:
    nested, empty, unopened or unclosed braces.
    """
| 179 | + |
| 180 | + |
| 181 | + |
def get_phrase_spans(text):
    """
    Yield one position Span for each {{tagged}} required phrase found in ``text``.

    Positions count only real tokens: the {{ and }} markers are not tokens
    and do not advance the position counter.

    Raise InvalidRuleRequiredPhrase when the braces are malformed: nested
    opens, an empty {{}} phrase, a close without an open, or a phrase left
    open at the end of ``text``.
    """
    # Fixes over the prior revision: dropped the no-op ``f`` prefixes on
    # placeholder-free messages (F541), removed a redundant trailing
    # ``continue``, and flattened the CLOSE branch into guard clauses.
    ipos = 0
    in_required_phrase = False
    current_phrase_positions = []

    for token in required_phrase_tokenizer(text):
        if token == REQUIRED_PHRASE_OPEN:
            if in_required_phrase:
                raise InvalidRuleRequiredPhrase('Invalid rule with nested required phrase {{ {{ braces', text)
            in_required_phrase = True

        elif token == REQUIRED_PHRASE_CLOSE:
            if not in_required_phrase:
                raise InvalidRuleRequiredPhrase('Invalid rule with dangling required phrase missing closing braces', text)
            if not current_phrase_positions:
                raise InvalidRuleRequiredPhrase('Invalid rule with empty required phrase {{}} braces', text)
            yield Span(current_phrase_positions)
            current_phrase_positions = []
            in_required_phrase = False

        else:
            # A plain token: collect its position if we are inside a phrase,
            # then advance the token counter either way.
            if in_required_phrase:
                current_phrase_positions.append(ipos)
            ipos += 1

    if current_phrase_positions or in_required_phrase:
        raise InvalidRuleRequiredPhrase('Invalid rule with dangling required phrase missing final closing braces', text)
| 213 | + |
| 214 | + |
| 215 | + |
| 216 | + |
121 | 217 | def index_tokenizer(text, stopwords=STOPWORDS, preserve_case=False): |
122 | 218 | """ |
123 | 219 | Return an iterable of tokens from a rule or query ``text`` using index |
|
0 commit comments