add test for correct position of extra-words and enhance detection_log

alok1304 · alok1304 · commit 467e233ceca7 · 2025-06-19T16:22:20.000+05:30
Add a test that checks whether `extra-words` are in the correct position according to the `extra-phrases` markers present in rules.

If the `extra-words` are found in the right place, the score is set to `100`.
The `detection_log` also records why the score was increased, to keep track of this.

Signed-off-by: Alok Kumar &lt;alokkumarjipura9973@gmail.com&gt;
diff --git a/src/licensedcode/data/rules/bsd-new_158.RULE b/src/licensedcode/data/rules/bsd-new_158.RULE
@@ -14,7 +14,7 @@ Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in the
 documentation and/or other materials provided with the distribution.
 
-Neither the name of [[3]] nor the names of its
+Neither the name of [[6]] nor the names of its
 contributors may be used to endorse or promote products derived from
 this software without specific prior written permission.
 
diff --git a/src/licensedcode/index.py b/src/licensedcode/index.py
@@ -391,9 +391,6 @@ def _add_rules(
             # "weak" rules can only be matched with an automaton exactly.
             is_weak = True
 
-            # identify and capture the spans of extra phrases specified within the rule
-            rule.extra_phrase_spans = list(rule.extra_phrases())
-         
             for rts in rule.tokens():
                 rule_tokens_append(rts)
                 rtid = dictionary_get(rts)
diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py
@@ -601,7 +601,7 @@ def score(self):
 
         # Check whether extra words in the matched text appear in allowed positions,
         # and do not exceed the maximum allowed word count at those positions.
-        if is_extra_words_position_valid(self):
+        if is_extra_words_position_valid(match=self):
             return 100
         
         # relevance is a number between 0 and 100. Divide by 100
@@ -1104,26 +1104,30 @@ def is_extra_words_position_valid(match):
     extra_phrase_count = 0
 
     for span, allowed_extra_words in extra_phrase_spans:
-        rule_index = span.start
+        rule_index = span.start - extra_phrase_count - 1
         allowed_extra_words = allowed_extra_words
 
-        matched_index = rule_index + matched_count - extra_phrase_count
+        matched_index = span.start + matched_count - extra_phrase_count
         extra_words_count = 0
 
-        # Count how many tokens in matched_text do not match the next rule token
+        # return false if token before `extra-words` in `matched_token` is not same as token before `extra-phrases` in `rule_tokens`
+        if(matched_tokens[matched_index-1] != rule_tokens[rule_index]):
+            return False 
+
+        # Count how many tokens in `matched_text` do not match the next rule token
         while (matched_index < len(matched_tokens) and
                matched_tokens[matched_index] != rule_tokens[rule_index + 1]):
             matched_index += 1
             matched_count += 1
             extra_words_count += 1
 
-        extra_phrase_count += 1
+            if extra_words_count > allowed_extra_words:
+               return False
 
-        if extra_words_count > allowed_extra_words:
-            return False
+        extra_phrase_count += 1
 
     return True
-        
+
 
 def filter_contained_matches(
     matches,
diff --git a/src/licensedcode/models.py b/src/licensedcode/models.py
@@ -2334,8 +2334,13 @@ def tokens(self):
         recomputed as a side effect.
         """
         
+        # identify and capture the spans of extra phrases specified within the rule
+        self.extra_phrase_spans = list(self.extra_phrases())
+        
         # remove extra_phrase marker from rules
-        text = remove_extra_phrase(self.text)
+        self.text = remove_extra_phrase(self.text)
+
+        text = self.text
 
         # We tag this rule as being a bare URL if it starts with a scheme and is
         # on one line: this is used to determine a matching approach
diff --git a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json
@@ -5,7 +5,9 @@
       "license_expression": "bsd-new",
       "license_expression_spdx": "BSD-3-Clause",
       "detection_count": 1,
-      "detection_log": [],
+      "detection_log": [
+        "extra-words-permitted-in-rule"
+      ],
       "reference_matches": [
         {
           "license_expression": "bsd-new",
@@ -54,7 +56,9 @@
               "matched_text_diagnostics": "Redistribution and use in source and binary forms, with or without\r\nmodification, are permitted provided that the following conditions are met:\r\n\r\n* Redistributions of source code must retain the above copyright notice, this\r\n  list of conditions and the following disclaimer.\r\n\r\n* Redistributions in binary form must reproduce the above copyright notice,\r\n  this list of conditions and the following disclaimer in the documentation\r\n  and/or other materials provided with the distribution.\r\n\r\n* Neither the name of [filesize] nor the names of its\r\n  contributors may be used to endorse or promote products derived from\r\n  this software without specific prior written permission.\r\n\r\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\r\nAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r\nIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\r\nDISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\r\nFOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\r\nSERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\r\nCAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r\nOR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r\nOF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE."
             }
           ],
-          "detection_log": [],
+          "detection_log": [
+            "extra-words-permitted-in-rule"
+          ],
           "identifier": "bsd_new-f3efb258-e9c3-604d-9bbd-147fe74af22e"
         }
       ],
diff --git a/tests/licensedcode/test_tokenize.py b/tests/licensedcode/test_tokenize.py
@@ -616,9 +616,9 @@ def test_get_extra_phrase_spans_simple(self):
         assert spans == [(Span([2]), 2)]
 
     def test_get_extra_phrase_spans_multiple(self):
-        text = 'Some [[1]] text [[3]] with multiple markers.'
+        text = 'Some [[4]] text [[6]] with multiple markers.'
         spans = get_extra_phrase_spans(text)
-        assert spans == [(Span([1]), 1), (Span([3]), 3)]
+        assert spans == [(Span([1]), 4), (Span([3]), 6)]
 
     def test_get_extra_phrase_spans_returns_nothing_if_none_found(self):
         text = 'Just some normal text.'