Skip to content

Commit 490a081

Browse files
committed
get only extra-phrase-spans
Signed-off-by: Alok Kumar <[email protected]>
1 parent 61284f4 commit 490a081

File tree

3 files changed: +4 additions, -20 deletions

src/licensedcode/match.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1106,15 +1106,14 @@ def is_extra_words_position_valid(match):
11061106
# Count of extra phrase markers
11071107
extra_phrase_count = 0
11081108

1109-
for span, allowed_extra_words in extra_phrase_spans:
1110-
rule_index = span.start - extra_phrase_count - 1
1111-
allowed_extra_words = allowed_extra_words
1109+
for span, allowed_extra_word in extra_phrase_spans:
1110+
rule_index = span.start
11121111

11131112
matched_index = span.start + matched_count - extra_phrase_count
11141113
extra_words_count = 0
11151114

11161115
# return false if token before `extra-words` in `matched_token` is not same as token before `extra-phrases` in `rule_tokens`
1117-
if(matched_tokens[matched_index-1] != rule_tokens[rule_index]):
1116+
if(matched_tokens[matched_index-1] != rule_tokens[rule_index-1]):
11181117
return False
11191118

11201119
# Count how many tokens in `matched_text` do not match the next rule token
@@ -1124,7 +1123,7 @@ def is_extra_words_position_valid(match):
11241123
matched_count += 1
11251124
extra_words_count += 1
11261125

1127-
if extra_words_count > allowed_extra_words:
1126+
if extra_words_count > allowed_extra_word:
11281127
return False
11291128

11301129
extra_phrase_count += 1

src/licensedcode/models.py

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
99

10-
import re
1110
import os
1211
import sys
1312
import traceback
@@ -2333,9 +2332,6 @@ def tokens(self):
23332332

23342333
# identify and capture the spans of extra phrases specified within the rule
23352334
self.extra_phrase_spans = list(self.extra_phrases())
2336-
2337-
# remove extra_phrase marker from rules
2338-
self.text = remove_extra_phrase(self.text)
23392335

23402336
text = self.text
23412337
# We tag this rule as being a bare URL if it starts with a scheme and is
@@ -2600,13 +2596,6 @@ def from_match_data(license_match_mapping):
26002596
return get_index().rules_by_id[rule_identifier]
26012597

26022598

2603-
def remove_extra_phrase(text):
2604-
"""
2605-
Remove extra phrase markers like [[n]], where the n is a digit.
2606-
"""
2607-
pattern = r'\[\[\d+\]\]'
2608-
return re.sub(pattern, '', text)
2609-
26102599
def compute_relevance(length):
26112600
"""
26122601
Return a computed ``relevance`` given a ``length`` and a threshold.

src/licensedcode/tokenize.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,6 @@ def query_lines(
8686
extra_phrase_splitter = re.compile(extra_phrase_pattern, re.UNICODE).findall
8787

8888

89-
extra_phrase_removal_pattern = re.compile(r'\[\[\d+\]\]')
90-
9189
REQUIRED_PHRASE_OPEN = '{{'
9290
REQUIRED_PHRASE_CLOSE = '}}'
9391

@@ -351,8 +349,6 @@ def index_tokenizer_with_stopwords(text, stopwords=STOPWORDS):
351349
"""
352350
if not text:
353351
return [], {}
354-
355-
text = extra_phrase_removal_pattern.sub('', text)
356352

357353
tokens = []
358354
tokens_append = tokens.append

0 commit comments

Comments (0)