Commit 25849d2

Add new phrase for `extra-words` in rules
Add a new kind of phrase, `extra_phrase`, specifically for extra-words handling. The phrase is written in the format [[n]], where n indicates the maximum number of extra words allowed at that position in the rule. If extra words appear at the marked position and their count does not exceed the allowed limit n, the match score is raised to 100. Signed-off-by: Alok Kumar <[email protected]>
1 parent 36a5bc2 commit 25849d2
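
For example (this mirrors the new tests in tests/licensedcode/test_match.py below), the rule text

    Neither the name of [[3]] nor the names of its

now matches the query

    Neither the name of XXX YYY ZZZ nor the names of its

with a score of 100, because the 3 extra words sit at the marked position and stay within the limit; a fourth extra word at that position means the match no longer scores 100.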

File tree

9 files changed (+252, -12 lines)


src/licensedcode/data/rules/bsd-new_158.RULE

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in the
 documentation and/or other materials provided with the distribution.

-Neither the name of nor the names of its
+Neither the name of [[3]] nor the names of its
 contributors may be used to endorse or promote products derived from
 this software without specific prior written permission.

src/licensedcode/index.py

Lines changed: 3 additions & 0 deletions
@@ -391,6 +391,9 @@ def _add_rules(
     # "weak" rules can only be matched with an automaton exactly.
     is_weak = True

+    # identify and capture the spans of extra phrases specified within the rule
+    rule.extra_phrase_spans = list(rule.extra_phrases())
+
     for rts in rule.tokens():
         rule_tokens_append(rts)
         rtid = dictionary_get(rts)

src/licensedcode/match.py

Lines changed: 53 additions & 0 deletions
@@ -598,6 +598,12 @@ def score(self):
     in the matched range (including unknowns and unmatched) and the matched
     rule relevance.
     """
+
+    # Check whether extra words in the matched text appear in allowed positions,
+    # and do not exceed the maximum allowed word count at those positions.
+    if is_extra_words_position_valid(self):
+        return 100
+
     # relevance is a number between 0 and 100. Divide by 100
     relevance = self.rule.relevance / 100
     if not relevance:

@@ -1071,6 +1077,53 @@ def merge_matches(matches, max_dist=None, trace=TRACE_MERGE):
 # early from the loops: trying to check containment on wildly separated matches
 # does not make sense

+def is_extra_words_position_valid(match):
+    """
+    Return True if the extra words appear in valid positions and do not exceed
+    the maximum allowed word count at those positions. Otherwise, return False.
+    """
+    rule_spans = match.ispan.subspans()
+
+    # If there are multiple subspans, not all required tokens are contiguous.
+    if len(rule_spans) > 1:
+        return False
+
+    matched_tokens = list(index_tokenizer(match.matched_text(whole_lines=False, highlight=False)))
+    rule_tokens = list(index_tokenizer(match.rule.text))
+    extra_phrase_spans = match.rule.extra_phrase_spans
+
+    if not extra_phrase_spans:
+        return False
+
+    # count of extra-word tokens inserted in ``matched_tokens``
+    matched_count = 0
+
+    # count of extra phrase markers seen so far
+    extra_phrase_count = 0
+
+    for span, allowed_extra_words in extra_phrase_spans:
+        rule_index = span.start
+        matched_index = rule_index + matched_count - extra_phrase_count
+        extra_words_count = 0
+
+        # Count how many tokens in the matched text do not match the next rule token.
+        while (matched_index < len(matched_tokens) and
+                matched_tokens[matched_index] != rule_tokens[rule_index + 1]):
+            matched_index += 1
+            matched_count += 1
+            extra_words_count += 1
+
+        extra_phrase_count += 1
+
+        if extra_words_count > allowed_extra_words:
+            return False
+
+    return True
+

 def filter_contained_matches(
     matches,
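
To make the index arithmetic in is_extra_words_position_valid easier to follow, here is a minimal standalone sketch of the same walk over plain token lists, with LicenseMatch, Span, and index_tokenizer stripped away; all names below are illustrative and not part of this commit:

    def extra_words_within_limits(rule_tokens, matched_tokens, extra_phrase_spans):
        """
        rule_tokens: tokens of the raw rule text, where each [[n]] marker
            contributes its digit token (as index_tokenizer does).
        matched_tokens: tokens of the matched text, possibly with extra words.
        extra_phrase_spans: (rule_index, allowed) pairs, where rule_index is
            the position of the digit token in rule_tokens.
        """
        matched_count = 0       # extra words consumed across all markers
        extra_phrase_count = 0  # markers consumed so far

        for rule_index, allowed in extra_phrase_spans:
            # Map the rule position onto the matched text: shift right by one
            # for each extra word already consumed, left by one for each digit
            # marker token that has no counterpart in the matched text.
            matched_index = rule_index + matched_count - extra_phrase_count
            next_rule_token = rule_tokens[rule_index + 1]
            extra_words = 0
            # Count tokens until the matched text realigns with the rule.
            while (matched_index < len(matched_tokens)
                   and matched_tokens[matched_index] != next_rule_token):
                matched_index += 1
                matched_count += 1
                extra_words += 1
            extra_phrase_count += 1
            if extra_words > allowed:
                return False
        return True

    rule = 'neither the name of 3 nor the names of its'.split()
    ok = 'neither the name of xxx yyy zzz nor the names of its'.split()
    too_many = 'neither the name of xxx yyy zzz aaa nor the names of its'.split()
    assert extra_words_within_limits(rule, ok, [(4, 3)])
    assert not extra_words_within_limits(rule, too_many, [(4, 3)])

The `- extra_phrase_count` term compensates for the digit token that each [[n]] marker leaves behind in the tokenized rule text but that never appears in the matched text.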

src/licensedcode/models.py

Lines changed: 34 additions & 1 deletion
@@ -7,6 +7,7 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
 #

+import re
 import os
 import sys
 import traceback

@@ -43,6 +44,7 @@
 from licensedcode.tokenize import index_tokenizer
 from licensedcode.tokenize import index_tokenizer_with_stopwords
 from licensedcode.tokenize import query_lines
+from licensedcode.tokenize import get_extra_phrase_spans
 from scancode.api import SCANCODE_LICENSEDB_URL
 from scancode.api import SCANCODE_LICENSE_URL
 from scancode.api import SCANCODE_RULE_URL

@@ -1683,6 +1685,17 @@ class BasicRule:
         )
     )

+    extra_phrase_spans = attr.ib(
+        default=attr.Factory(list),
+        repr=False,
+        metadata=dict(
+            help='List of tuples `(Span, int)` representing extra phrases for this rule. '
+                 'Each tuple contains a Span of token positions in the rule text and an '
+                 'integer indicating the maximum number of extra tokens allowed at that '
+                 'position. Extra phrases are enclosed in [[double square brackets]] in '
+                 'the rule text.'
+        )
+    )

     source = attr.ib(
         default=None,
         repr=False,

@@ -2317,8 +2330,10 @@ def tokens(self):
     "is_continuous", "minimum_coverage" and "stopword_by_pos" are
     recomputed as a side effect.
     """
+
+    # remove the extra_phrase markers from the rule text
+    text = remove_extra_phrase(self.text)

-    text = self.text
     # We tag this rule as being a bare URL if it starts with a scheme and is
     # on one line: this is used to determine a matching approach

@@ -2353,6 +2368,17 @@ def _set_continuous(self):
     ):
         self.is_continuous = True

+    def extra_phrases(self):
+        """
+        Return an iterable of `(Span, int)` tuples marking the positions of
+        extra phrases in the rule text. Each tuple consists of:
+        - a `Span` object representing the position in the tokenized rule text, and
+        - an integer `n` indicating how many extra tokens are allowed at that position.
+        """
+        if self.text:
+            yield from get_extra_phrase_spans(self.text)

     def build_required_phrase_spans(self):
         """
         Return a list of Spans marking required phrases token positions of that must

@@ -2570,6 +2596,13 @@ def from_match_data(license_match_mapping):
     return get_index().rules_by_id[rule_identifier]


+def remove_extra_phrase(text):
+    """
+    Remove extra phrase markers like [[n]], where n is a digit.
+    """
+    pattern = r'\[\[\d+\]\]'
+    return re.sub(pattern, '', text)
+
 def compute_relevance(length):
     """
     Return a computed ``relevance`` given a ``length`` and a threshold.
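
As a quick sanity check of the new remove_extra_phrase helper, a doctest-style sketch (the expected output follows from the regex above; the double space left where the marker was is harmless, since downstream tokenization splits on runs of non-word characters):

    >>> from licensedcode.models import remove_extra_phrase
    >>> remove_extra_phrase('Neither the name of [[3]] nor the names of its')
    'Neither the name of  nor the names of its'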

src/licensedcode/tokenize.py

Lines changed: 66 additions & 0 deletions
@@ -81,12 +81,78 @@ def query_lines(
 required_phrase_pattern = '(?:' + query_pattern + '|\\{\\{|\\}\\})'
 required_phrase_splitter = re.compile(required_phrase_pattern, re.UNICODE).findall

+extra_phrase_pattern = r'(?:' + query_pattern + r'|\[\[|\]\])'
+extra_phrase_splitter = re.compile(extra_phrase_pattern, re.UNICODE).findall
+
 REQUIRED_PHRASE_OPEN = '{{'
 REQUIRED_PHRASE_CLOSE = '}}'

+EXTRA_PHRASE_OPEN = '[['
+EXTRA_PHRASE_CLOSE = ']]'
+
 # FIXME: this should be folded in a single pass tokenization with the index_tokenizer


+def extra_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False):
+    """
+    Yield tokens from a rule ``text``, including extra phrase [[n]] markers,
+    where ``n`` denotes the maximum number of extra words that are valid at
+    that position. This works the same way as ``required_phrase_tokenizer``.
+    """
+    if not text:
+        return
+    if not preserve_case:
+        text = text.lower()
+
+    for token in extra_phrase_splitter(text):
+        if token and token not in stopwords:
+            yield token
+
+
+def get_extra_phrase_spans(text):
+    """
+    Return a list of tuples `(Span, int)`, one for each [[n]] extra phrase
+    found in ``text``, where `n` must always be a single digit token inside
+    the extra phrase brackets.
+
+    Example:
+    >>> text = 'Neither the name [[3]] of nor the names of its'
+    >>> #           0    1    2     3   4   5   6    7   8   9
+    >>> x = get_extra_phrase_spans(text)
+    >>> assert x == [(Span([3]), 3)], x
+    """
+    ipos = 0
+    in_extra_phrase = False
+    current_phrase_value = []
+    extra_phrase_spans = []
+
+    for token in extra_phrase_tokenizer(text):
+        if token == EXTRA_PHRASE_OPEN:
+            in_extra_phrase = True
+            current_phrase_value = []
+            continue
+
+        elif token == EXTRA_PHRASE_CLOSE:
+            if in_extra_phrase:
+                # the token must be a digit enclosed in double square brackets
+                # ``[[token]]``, and there must be exactly one token between
+                # the extra phrase brackets
+                if len(current_phrase_value) == 1 and current_phrase_value[0].isdigit():
+                    extra_phrase_spans.append((Span([ipos - 1]), int(current_phrase_value[0])))
+
+            in_extra_phrase = False
+            current_phrase_value = []
+            continue
+
+        if in_extra_phrase:
+            # consider only the first token after the double open square bracket ``[[``
+            if len(current_phrase_value) == 0:
+                current_phrase_value.append(token)
+
+        ipos += 1
+
+    return extra_phrase_spans
+
+
 def required_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False):
     """
     Yield tokens from a rule ``text`` including required phrases {{brace}} markers.
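
A doctest-style sketch of how the two new helpers behave on a hypothetical text with more than one marker (positions derived from the counting logic above, assuming none of these words is in STOPWORDS):

    >>> from licensedcode.spans import Span
    >>> from licensedcode.tokenize import extra_phrase_tokenizer
    >>> from licensedcode.tokenize import get_extra_phrase_spans
    >>> text = 'name of [[2]] and [[4]] others'
    >>> list(extra_phrase_tokenizer(text))
    ['name', 'of', '[[', '2', ']]', 'and', '[[', '4', ']]', 'others']
    >>> spans = get_extra_phrase_spans(text)
    >>> assert spans == [(Span([2]), 2), (Span([4]), 4)], spans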

tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json

Lines changed: 6 additions & 10 deletions
@@ -1,13 +1,11 @@
 {
   "license_detections": [
     {
-      "identifier": "bsd_new-fbfc5955-0c63-4c98-2ce9-08e1e1796f50",
+      "identifier": "bsd_new-f3efb258-e9c3-604d-9bbd-147fe74af22e",
       "license_expression": "bsd-new",
       "license_expression_spdx": "BSD-3-Clause",
       "detection_count": 1,
-      "detection_log": [
-        "extra-words"
-      ],
+      "detection_log": [],
       "reference_matches": [
         {
           "license_expression": "bsd-new",

@@ -16,7 +14,7 @@
           "start_line": 4,
           "end_line": 27,
           "matcher": "2-aho",
-          "score": 99.53,
+          "score": 100,
           "matched_length": 210,
           "match_coverage": 100.0,
           "rule_relevance": 100,

@@ -46,7 +44,7 @@
           "start_line": 4,
           "end_line": 27,
           "matcher": "2-aho",
-          "score": 99.53,
+          "score": 100,
           "matched_length": 210,
           "match_coverage": 100.0,
           "rule_relevance": 100,

@@ -56,10 +54,8 @@
           "matched_text_diagnostics": "Redistribution and use in source and binary forms, with or without\r\nmodification, are permitted provided that the following conditions are met:\r\n\r\n* Redistributions of source code must retain the above copyright notice, this\r\n list of conditions and the following disclaimer.\r\n\r\n* Redistributions in binary form must reproduce the above copyright notice,\r\n this list of conditions and the following disclaimer in the documentation\r\n and/or other materials provided with the distribution.\r\n\r\n* Neither the name of [filesize] nor the names of its\r\n contributors may be used to endorse or promote products derived from\r\n this software without specific prior written permission.\r\n\r\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\r\nAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r\nIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\r\nDISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\r\nFOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\r\nSERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\r\nCAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r\nOR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r\nOF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE."
         }
       ],
-      "detection_log": [
-        "extra-words"
-      ],
-      "identifier": "bsd_new-fbfc5955-0c63-4c98-2ce9-08e1e1796f50"
+      "detection_log": [],
+      "identifier": "bsd_new-f3efb258-e9c3-604d-9bbd-147fe74af22e"
     }
   ],
   "license_clues": [],

tests/licensedcode/test_license_models.py

Lines changed: 8 additions & 0 deletions
@@ -591,6 +591,14 @@ def test_key_phrases_yields_spans(self):
         key_phrase_spans = list(rule.build_required_phrase_spans())
         assert key_phrase_spans == [Span(4), Span(7, 9)]

+    def test_extra_phrases_yields_spans(self):
+        rule_text = (
+            'Neither the name of [[3]] nor the names of its'
+        )
+        rule = models.Rule(license_expression='bsd-new', text=rule_text)
+        extra_phrase_spans = list(rule.extra_phrases())
+        assert extra_phrase_spans == [(Span(4), 3)]

     def test_key_phrases_raises_exception_when_markup_is_not_closed(self):
         rule_text = (
             'This released software is {{released}} by under {{the MIT license. '

tests/licensedcode/test_match.py

Lines changed: 23 additions & 0 deletions
@@ -1381,6 +1381,29 @@ def test_LicenseMatch_score_100_non_contiguous(self):
         m1 = LicenseMatch(rule=r1, qspan=Span(0, 19) | Span(30, 51), ispan=Span(0, 41))
         assert m1.score() == 80.77

+    def test_LicenseMatch_matches_score_100_for_extra_words_within_limit(self):
+        rule_text = 'Neither the name of [[3]] nor the names of its'
+        rule = create_rule_from_text_and_expression(license_expression='bsd_new', text=rule_text)
+        idx = index.LicenseIndex([rule])
+
+        # The query includes 3 extra words, which is within the allowed limit.
+        query = 'Neither the name of XXX YYY ZZZ nor the names of its'
+        matches = idx.match(query_string=query, _skip_hash_match=True)
+        match = matches[0]
+        score = match.score()
+        assert score == 100
+
+    def test_LicenseMatch_matches_score_not_100_for_extra_words_exceed_limit(self):
+        rule_text = 'Neither the name of [[3]] nor the names of its'
+        rule = create_rule_from_text_and_expression(license_expression='bsd_new', text=rule_text)
+        idx = index.LicenseIndex([rule])
+
+        # The query includes 4 extra words, one more than the allowed 3.
+        query = 'Neither the name of XXX YYY ZZZ AAA nor the names of its'
+        matches = idx.match(query_string=query, _skip_hash_match=True)
+        match = matches[0]
+        score = match.score()
+        assert score != 100

     def test_LicenseMatch_stopwords_are_treated_as_unknown_2484(self):
         rules_dir = self.get_test_loc('stopwords/index/rules')
         lics_dir = self.get_test_loc('stopwords/index/licenses')
