
Commit b956432

Add new phrase for `extra-words` in rules

Add a new `extra_phrase` marker that is specific to extra-words. The marker is written as [[n]], where n is the maximum number of extra words allowed at that position in the rule. If extra words appear at the marked position and their count does not exceed the allowed limit n, the match score is raised to 100.

Signed-off-by: Alok Kumar <[email protected]>
1 parent 3e5c913 commit b956432
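
As a minimal illustration of the marker semantics (the rule text, the extra words, and the snippet below are made up for this example):

    import re

    # Hypothetical rule text using the new [[n]] marker: up to 6 extra words
    # may appear at this position in the matched text.
    rule_text = "Neither the name of [[6]] nor the names of its"

    # A matched text with 3 extra words ("the University of") at the marked
    # position still counts as a full match and scores 100 under this change.
    matched_text = "Neither the name of the University of nor the names of its"

    # The marker is stripped from the rule text before indexing, as the
    # remove_extra_phrase() helper added in this commit does:
    print(re.sub(r'\[\[\d+\]\]', '', rule_text))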

10 files changed: +389 -9 lines changed

src/licensedcode/data/rules/bsd-new_158.RULE

Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@ Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in the
 documentation and/or other materials provided with the distribution.
 
-Neither the name of nor the names of its
+Neither the name of [[6]] nor the names of its
 contributors may be used to endorse or promote products derived from
 this software without specific prior written permission.

src/licensedcode/data/rules/bsd-new_578.RULE

Lines changed: 4 additions & 2 deletions

@@ -6,7 +6,9 @@ minimum_coverage: 99
 
 Software License Agreement (BSD License)
 
-Redistribution and use in source and binary forms, with or without
+[[15]]
+
+Redistribution and use [[4]] in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:
@@ -16,7 +18,7 @@ are met:
 copyright notice, this list of conditions and the following
 disclaimer in the documentation and/or other materials provided
 with the distribution.
-* Neither the name of nor the names of its
+* Neither the name of [[6]] nor the names of its
 contributors may be used to endorse or promote products derived
 from this software without specific prior written permission.

src/licensedcode/detection.py

Lines changed: 24 additions & 0 deletions

@@ -30,6 +30,7 @@
 from licensedcode.cache import get_licensing
 from licensedcode.match import LicenseMatch
 from licensedcode.match import set_matched_lines
+from licensedcode.match import is_extra_words_position_valid
 from licensedcode.models import compute_relevance
 from licensedcode.models import Rule
 from licensedcode.models import UnDetectedRule
@@ -110,6 +111,7 @@ class DetectionCategory(Enum):
     PACKAGE_ADD_FROM_SIBLING_FILE = 'from-package-sibling-file'
     PACKAGE_ADD_FROM_FILE = 'from-package-file'
     EXTRA_WORDS = 'extra-words'
+    EXTRA_WORDS_PERMITTED = 'extra-words-permitted-in-rule'
     UNKNOWN_MATCH = 'unknown-match'
     LICENSE_CLUES = 'license-clues'
     LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'
@@ -129,6 +131,7 @@ class DetectionRule(Enum):
     """
     UNKNOWN_MATCH = 'unknown-match'
     EXTRA_WORDS = 'extra-words'
+    EXTRA_WORDS_PERMITTED = 'extra-words-permitted-in-rule'
     LICENSE_CLUES = 'license-clues'
     LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'
     IMPERFECT_COVERAGE = 'imperfect-match-coverage'
@@ -1072,6 +1075,7 @@ def is_correct_detection_non_unknown(license_matches):
         is_correct_detection(license_matches)
         and not has_unknown_matches(license_matches)
         and not has_extra_words(license_matches)
+        and not is_extra_words_at_valid_positions(license_matches)
     )
 
 
@@ -1159,6 +1163,16 @@ def has_low_rule_relevance(license_matches):
     )
 
 
+def is_extra_words_at_valid_positions(license_matches):
+    """
+    Return True if any of the matches in the ``license_matches`` list of
+    LicenseMatch has extra words at valid positions.
+    """
+    return any(
+        is_extra_words_position_valid(license_match)
+        for license_match in license_matches
+    )
+
 def is_false_positive(license_matches, package_license=False):
     """
     Return True if all of the matches in ``license_matches`` List of LicenseMatch
@@ -1570,6 +1584,12 @@ def get_detected_license_expression(
             detection_log.append(DetectionRule.LOW_QUALITY_MATCH_FRAGMENTS.value)
         return detection_log, combined_expression
 
+    elif analysis == DetectionCategory.EXTRA_WORDS_PERMITTED.value:
+        if TRACE_ANALYSIS:
+            logger_debug(f'analysis {DetectionCategory.EXTRA_WORDS_PERMITTED.value}')
+        matches_for_expression = license_matches
+        detection_log.append(DetectionRule.EXTRA_WORDS_PERMITTED.value)
+
    elif analysis == DetectionCategory.EXTRA_WORDS.value:
         if TRACE_ANALYSIS:
             logger_debug(f'analysis {DetectionCategory.EXTRA_WORDS.value}')
@@ -1807,6 +1827,10 @@ def analyze_detection(license_matches, package_license=False):
         threshold=IMPERFECT_MATCH_COVERAGE_THR,
     ):
         return DetectionCategory.IMPERFECT_COVERAGE.value
+
+    # Case where `extra-words` are at valid positions
+    elif is_extra_words_at_valid_positions(license_matches=license_matches):
+        return DetectionCategory.EXTRA_WORDS_PERMITTED.value
 
     # Case where at least one of the matches has extra words
     elif has_extra_words(license_matches=license_matches):
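
For clarity, a simplified, self-contained sketch of the routing order added to analyze_detection(); the predicates and the fallback value are hypothetical stand-ins for the real functions:

    from typing import Callable, List

    def classify(matches: List[object],
                 extra_words_valid: Callable[[List[object]], bool],
                 has_extra_words: Callable[[List[object]], bool]) -> str:
        # The permitted check runs first, so matches whose extra words all sit
        # at [[n]]-marked positions are not downgraded to plain 'extra-words'.
        if extra_words_valid(matches):
            return 'extra-words-permitted-in-rule'
        if has_extra_words(matches):
            return 'extra-words'
        return 'other-analysis'  # stand-in for the remaining cases

    print(classify([], lambda m: True, lambda m: True))   # extra-words-permitted-in-rule
    print(classify([], lambda m: False, lambda m: True))  # extra-words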

src/licensedcode/match.py

Lines changed: 57 additions & 0 deletions

@@ -598,6 +598,12 @@ def score(self):
         in the matched range (including unknowns and unmatched) and the matched
         rule relevance.
         """
+
+        # Check whether extra words in the matched text appear in allowed positions
+        # and do not exceed the maximum allowed word count at those positions.
+        if is_extra_words_position_valid(match=self):
+            return 100
+
         # relevance is a number between 0 and 100. Divide by 100
         relevance = self.rule.relevance / 100
         if not relevance:
@@ -1071,6 +1077,57 @@ def merge_matches(matches, max_dist=None, trace=TRACE_MERGE):
     # early from the loops: trying to check containment on wildly separated matches
     # does not make sense
 
+def is_extra_words_position_valid(match):
+    """
+    Return True if the extra words appear in valid positions and do not exceed
+    the maximum allowed word count at those positions. Otherwise, return False.
+    """
+    rule_spans = match.ispan.subspans()
+
+    # If there are multiple subspans, not all required tokens are contiguous.
+    if len(rule_spans) > 1:
+        return False
+
+    matched_tokens = list(index_tokenizer(match.matched_text(whole_lines=False, highlight=False)))
+    rule_tokens = list(index_tokenizer(match.rule.text))
+    extra_phrase_spans = match.rule.extra_phrase_spans
+
+    if not extra_phrase_spans:
+        return False
+
+    # Count of `extra-words` tokens inserted in `matched_tokens`
+    matched_count = 0
+
+    # Count of extra phrase markers seen so far
+    extra_phrase_count = 0
+
+    for span, allowed_extra_words in extra_phrase_spans:
+        rule_index = span.start - extra_phrase_count - 1
+        matched_index = span.start + matched_count - extra_phrase_count
+        extra_words_count = 0
+
+        # Return False if the token before the extra words in `matched_tokens`
+        # differs from the token before the extra phrase marker in `rule_tokens`.
+        if matched_tokens[matched_index - 1] != rule_tokens[rule_index]:
+            return False
+
+        # Count how many tokens in `matched_tokens` do not match the next rule token.
+        while (matched_index < len(matched_tokens) and
+               matched_tokens[matched_index] != rule_tokens[rule_index + 1]):
+            matched_index += 1
+            matched_count += 1
+            extra_words_count += 1
+
+        if extra_words_count > allowed_extra_words:
+            return False
+
+        extra_phrase_count += 1
+
+    return True
 
 
 def filter_contained_matches(
     matches,
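
The index arithmetic above is easier to follow on plain token lists. Below is a minimal standalone sketch of the same counting logic; extra_words_ok and its inputs are illustrative names, not part of this commit:

    def extra_words_ok(matched_tokens, rule_tokens, extra_spans):
        """
        ``extra_spans`` lists (marker_pos, allowed) pairs, where marker_pos is
        the position of the [[n]] digit token in the marker-aware rule token
        stream and allowed is n. ``rule_tokens`` has the markers stripped.
        """
        matched_shift = 0  # extra words consumed so far in matched_tokens
        markers_seen = 0   # [[n]] markers consumed so far
        for marker_pos, allowed in extra_spans:
            rule_index = marker_pos - markers_seen - 1  # token before the marker
            matched_index = marker_pos + matched_shift - markers_seen
            # The anchor token just before the marker must match in both streams.
            if matched_tokens[matched_index - 1] != rule_tokens[rule_index]:
                return False
            extra = 0
            # Count inserted tokens until the next rule token reappears.
            while (matched_index < len(matched_tokens)
                   and matched_tokens[matched_index] != rule_tokens[rule_index + 1]):
                matched_index += 1
                matched_shift += 1
                extra += 1
            if extra > allowed:
                return False
            markers_seen += 1
        return True

    rule = "neither the name of nor the names of its".split()
    matched = "neither the name of the university of nor the names of its".split()
    print(extra_words_ok(matched, rule, [(4, 6)]))  # True: 3 extra words, 6 allowed
    print(extra_words_ok(matched, rule, [(4, 2)]))  # False: 3 extra words exceed 2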

src/licensedcode/models.py

Lines changed: 41 additions & 0 deletions

@@ -7,6 +7,7 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
 #
 
+import re
 import os
 import sys
 import traceback
@@ -43,6 +44,7 @@
 from licensedcode.tokenize import index_tokenizer
 from licensedcode.tokenize import index_tokenizer_with_stopwords
 from licensedcode.tokenize import query_lines
+from licensedcode.tokenize import get_extra_phrase_spans
 from scancode.api import SCANCODE_LICENSEDB_URL
 from scancode.api import SCANCODE_LICENSE_URL
 from scancode.api import SCANCODE_RULE_URL
@@ -1683,6 +1685,17 @@ class BasicRule:
         )
     )
 
+    extra_phrase_spans = attr.ib(
+        default=attr.Factory(list),
+        repr=False,
+        metadata=dict(
+            help='List of tuples `(Span, int)` representing extra phrases for this rule. '
+                 'Each tuple contains a Span of token positions in the rule text and an integer '
+                 'indicating the maximum number of extra tokens allowed at that position. '
+                 'Extra phrases are enclosed in [[double square brackets]] in the rule text.'
+        )
+    )
+
     source = attr.ib(
         default=None,
         repr=False,
@@ -2306,6 +2319,9 @@ def load_data(self, rule_file):
         except Exception:
             trace = traceback.format_exc()
             raise InvalidRule(f'While loading: file://{rule_file}\n{trace}')
+
+        # Remove extra phrase markers from the rule text
+        self.text = remove_extra_phrase(self.text)
 
         return self
 
@@ -2317,8 +2333,15 @@ def tokens(self):
         "is_continuous", "minimum_coverage" and "stopword_by_pos" are
         recomputed as a side effect.
         """
+
+        # Identify and capture the spans of extra phrases specified within the rule
+        self.extra_phrase_spans = list(self.extra_phrases())
+
+        # Remove extra phrase markers from the rule text
+        self.text = remove_extra_phrase(self.text)
 
         text = self.text
+
         # We tag this rule as being a bare URL if it starts with a scheme and is
         # on one line: this is used to determine a matching approach
@@ -2353,6 +2376,17 @@ def _set_continuous(self):
         ):
             self.is_continuous = True
 
+    def extra_phrases(self):
+        """
+        Return an iterable of `(Span, int)` tuples marking the positions of
+        extra phrases in the rule text.
+
+        Each tuple consists of:
+        - a `Span` object representing the position in the tokenized rule text, and
+        - an integer `n` indicating how many extra tokens are allowed at that position.
+        """
+        if self.text:
+            yield from get_extra_phrase_spans(self.text)
+
     def build_required_phrase_spans(self):
         """
         Return a list of Spans marking required phrases token positions that must
@@ -2570,6 +2604,13 @@ def from_match_data(license_match_mapping):
         return get_index().rules_by_id[rule_identifier]
 
 
+def remove_extra_phrase(text):
+    """
+    Remove extra phrase markers like [[n]], where n is a digit.
+    """
+    pattern = r'\[\[\d+\]\]'
+    return re.sub(pattern, '', text)
+
 def compute_relevance(length):
     """
     Return a computed ``relevance`` given a ``length`` and a threshold.
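
A quick sketch of the stripping behavior; the doubled whitespace left behind is harmless since the tokenizers ignore whitespace:

    import re

    def remove_extra_phrase(text):
        pattern = r'\[\[\d+\]\]'
        return re.sub(pattern, '', text)

    print(remove_extra_phrase("Neither the name of [[6]] nor the names of its"))
    # -> "Neither the name of  nor the names of its" (note the doubled space)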

src/licensedcode/tokenize.py

Lines changed: 66 additions & 0 deletions

@@ -81,12 +81,78 @@ def query_lines(
 required_phrase_pattern = '(?:' + query_pattern + '|\\{\\{|\\}\\})'
 required_phrase_splitter = re.compile(required_phrase_pattern, re.UNICODE).findall
 
+
+extra_phrase_pattern = r'(?:' + query_pattern + r'|\[\[|\]\])'
+extra_phrase_splitter = re.compile(extra_phrase_pattern, re.UNICODE).findall
+
 REQUIRED_PHRASE_OPEN = '{{'
 REQUIRED_PHRASE_CLOSE = '}}'
 
+EXTRA_PHRASE_OPEN = '[['
+EXTRA_PHRASE_CLOSE = ']]'
+
 # FIXME: this should be folded in a single pass tokenization with the index_tokenizer
 
 
+def extra_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False):
+    """
+    Yield tokens from a rule ``text``, including extra phrase [[n]] markers,
+    where n denotes the maximum number of extra words valid at that position.
+    This works the same way as ``required_phrase_tokenizer``.
+    """
+    if not text:
+        return
+    if not preserve_case:
+        text = text.lower()
+
+    for token in extra_phrase_splitter(text):
+        if token and token not in stopwords:
+            yield token
+
+
+def get_extra_phrase_spans(text):
+    """
+    Return a list of `(Span, int)` tuples, one for each [[n]] extra phrase
+    found in ``text``, where `n` must be a digit token inside the extra
+    phrase brackets.
+
+    Example:
+    >>> text = 'Neither the name [[3]] of nor the names of its'
+    >>> #        0      1    2     3  4  5   6   7    8   9
+    >>> x = get_extra_phrase_spans(text)
+    >>> assert x == [(Span([3]), 3)], x
+    """
+    ipos = 0
+    in_extra_phrase = False
+    current_phrase_value = []
+    extra_phrase_spans = []
+
+    for token in extra_phrase_tokenizer(text):
+        if token == EXTRA_PHRASE_OPEN:
+            in_extra_phrase = True
+            current_phrase_value = []
+            continue
+
+        elif token == EXTRA_PHRASE_CLOSE:
+            if in_extra_phrase:
+                # The token between the double square brackets ``[[token]]``
+                # must be a single digit token.
+                if len(current_phrase_value) == 1 and current_phrase_value[0].isdigit():
+                    extra_phrase_spans.append((Span([ipos - 1]), int(current_phrase_value[0])))
+
+            in_extra_phrase = False
+            current_phrase_value = []
+            continue
+
+        if in_extra_phrase:
+            # Consider only the first token after the double open square bracket ``[[``
+            if len(current_phrase_value) == 0:
+                current_phrase_value.append(token)
+
+        ipos += 1
+
+    return extra_phrase_spans
+
+
 def required_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False):
     """
     Yield tokens from a rule ``text`` including required phrases {{brace}} markers.
tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json

Lines changed: 6 additions & 6 deletions

@@ -1,12 +1,12 @@
 {
   "license_detections": [
     {
-      "identifier": "bsd_new-fbfc5955-0c63-4c98-2ce9-08e1e1796f50",
+      "identifier": "bsd_new-f3efb258-e9c3-604d-9bbd-147fe74af22e",
       "license_expression": "bsd-new",
       "license_expression_spdx": "BSD-3-Clause",
       "detection_count": 1,
       "detection_log": [
-        "extra-words"
+        "extra-words-permitted-in-rule"
       ],
       "reference_matches": [
         {
@@ -16,7 +16,7 @@
           "start_line": 4,
           "end_line": 27,
           "matcher": "2-aho",
-          "score": 99.53,
+          "score": 100,
           "matched_length": 210,
           "match_coverage": 100.0,
           "rule_relevance": 100,
@@ -46,7 +46,7 @@
           "start_line": 4,
           "end_line": 27,
           "matcher": "2-aho",
-          "score": 99.53,
+          "score": 100,
           "matched_length": 210,
           "match_coverage": 100.0,
           "rule_relevance": 100,
@@ -57,9 +57,9 @@
         }
       ],
       "detection_log": [
-        "extra-words"
+        "extra-words-permitted-in-rule"
      ],
-      "identifier": "bsd_new-fbfc5955-0c63-4c98-2ce9-08e1e1796f50"
+      "identifier": "bsd_new-f3efb258-e9c3-604d-9bbd-147fe74af22e"
     }
   ],
   "license_clues": [],
