Skip to content

Commit 490a081

Browse files
committed
get only extra-phrase-spans
Signed-off-by: Alok Kumar <[email protected]>
1 parent 61284f4 commit 490a081

File tree

3 files changed: +4 additions, -20 deletions

src/licensedcode/match.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1106,15 +1106,14 @@ def is_extra_words_position_valid(match):
11061106
# Count of extra phrase markers
11071107
extra_phrase_count = 0
11081108

1109-
for span, allowed_extra_words in extra_phrase_spans:
1110-
rule_index = span.start - extra_phrase_count - 1
1111-
allowed_extra_words = allowed_extra_words
1109+
for span, allowed_extra_word in extra_phrase_spans:
1110+
rule_index = span.start
11121111

11131112
matched_index = span.start + matched_count - extra_phrase_count
11141113
extra_words_count = 0
11151114

11161115
# return false if token before `extra-words` in `matched_token` is not same as token before `extra-phrases` in `rule_tokens`
1117-
if(matched_tokens[matched_index-1] != rule_tokens[rule_index]):
1116+
if(matched_tokens[matched_index-1] != rule_tokens[rule_index-1]):
11181117
return False
11191118

11201119
# Count how many tokens in `matched_text` do not match the next rule token
@@ -1124,7 +1123,7 @@ def is_extra_words_position_valid(match):
11241123
matched_count += 1
11251124
extra_words_count += 1
11261125

1127-
if extra_words_count > allowed_extra_words:
1126+
if extra_words_count > allowed_extra_word:
11281127
return False
11291128

11301129
extra_phrase_count += 1

src/licensedcode/models.py

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
99

10-
import re
1110
import os
1211
import sys
1312
import traceback
@@ -2333,9 +2332,6 @@ def tokens(self):
23332332

23342333
# identify and capture the spans of extra phrases specified within the rule
23352334
self.extra_phrase_spans = list(self.extra_phrases())
2336-
2337-
# remove extra_phrase marker from rules
2338-
self.text = remove_extra_phrase(self.text)
23392335

23402336
text = self.text
23412337
# We tag this rule as being a bare URL if it starts with a scheme and is
@@ -2600,13 +2596,6 @@ def from_match_data(license_match_mapping):
26002596
return get_index().rules_by_id[rule_identifier]
26012597

26022598

2603-
def remove_extra_phrase(text):
2604-
"""
2605-
Remove extra phrase markers like [[n]], where the n is a digit.
2606-
"""
2607-
pattern = r'\[\[\d+\]\]'
2608-
return re.sub(pattern, '', text)
2609-
26102599
def compute_relevance(length):
26112600
"""
26122601
Return a computed ``relevance`` given a ``length`` and a threshold.

src/licensedcode/tokenize.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,6 @@ def query_lines(
8686
extra_phrase_splitter = re.compile(extra_phrase_pattern, re.UNICODE).findall
8787

8888

89-
extra_phrase_removal_pattern = re.compile(r'\[\[\d+\]\]')
90-
9189
REQUIRED_PHRASE_OPEN = '{{'
9290
REQUIRED_PHRASE_CLOSE = '}}'
9391

@@ -351,8 +349,6 @@ def index_tokenizer_with_stopwords(text, stopwords=STOPWORDS):
351349
"""
352350
if not text:
353351
return [], {}
354-
355-
text = extra_phrase_removal_pattern.sub('', text)
356352

357353
tokens = []
358354
tokens_append = tokens.append

0 commit comments

Comments (0)