Skip to content

Commit 467e233

Browse files
committed
add test for correct position of extra-words and enhance detection_log
Add a test that checks whether `extra-words` are in the correct position according to the `extra-phrases` present in the rule. If the `extra-words` are in the right place, we set the score to `100`. Also record in `detection_log` why the score was increased, so that this can be tracked. Signed-off-by: Alok Kumar <[email protected]>
1 parent 7937593 commit 467e233

File tree

6 files changed

+27
-17
lines changed

6 files changed

+27
-17
lines changed

src/licensedcode/data/rules/bsd-new_158.RULE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ Redistributions in binary form must reproduce the above copyright
1414
notice, this list of conditions and the following disclaimer in the
1515
documentation and/or other materials provided with the distribution.
1616

17-
Neither the name of [[3]] nor the names of its
17+
Neither the name of [[6]] nor the names of its
1818
contributors may be used to endorse or promote products derived from
1919
this software without specific prior written permission.
2020

src/licensedcode/index.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -391,9 +391,6 @@ def _add_rules(
391391
# "weak" rules can only be matched with an automaton exactly.
392392
is_weak = True
393393

394-
# identify and capture the spans of extra phrases specified within the rule
395-
rule.extra_phrase_spans = list(rule.extra_phrases())
396-
397394
for rts in rule.tokens():
398395
rule_tokens_append(rts)
399396
rtid = dictionary_get(rts)

src/licensedcode/match.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -601,7 +601,7 @@ def score(self):
601601

602602
# Check whether extra words in the matched text appear in allowed positions,
603603
# and do not exceed the maximum allowed word count at those positions.
604-
if is_extra_words_position_valid(self):
604+
if is_extra_words_position_valid(match=self):
605605
return 100
606606

607607
# relevance is a number between 0 and 100. Divide by 100
@@ -1104,26 +1104,30 @@ def is_extra_words_position_valid(match):
11041104
extra_phrase_count = 0
11051105

11061106
for span, allowed_extra_words in extra_phrase_spans:
1107-
rule_index = span.start
1107+
rule_index = span.start - extra_phrase_count - 1
11081108
allowed_extra_words = allowed_extra_words
11091109

1110-
matched_index = rule_index + matched_count - extra_phrase_count
1110+
matched_index = span.start + matched_count - extra_phrase_count
11111111
extra_words_count = 0
11121112

1113-
# Count how many tokens in matched_text do not match the next rule token
1113+
# return false if token before `extra-words` in `matched_token` is not same as token before `extra-phrases` in `rule_tokens`
1114+
if(matched_tokens[matched_index-1] != rule_tokens[rule_index]):
1115+
return False
1116+
1117+
# Count how many tokens in `matched_text` do not match the next rule token
11141118
while (matched_index < len(matched_tokens) and
11151119
matched_tokens[matched_index] != rule_tokens[rule_index + 1]):
11161120
matched_index += 1
11171121
matched_count += 1
11181122
extra_words_count += 1
11191123

1120-
extra_phrase_count += 1
1124+
if extra_words_count > allowed_extra_words:
1125+
return False
11211126

1122-
if extra_words_count > allowed_extra_words:
1123-
return False
1127+
extra_phrase_count += 1
11241128

11251129
return True
1126-
1130+
11271131

11281132
def filter_contained_matches(
11291133
matches,

src/licensedcode/models.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2334,8 +2334,13 @@ def tokens(self):
23342334
recomputed as a side effect.
23352335
"""
23362336

2337+
# identify and capture the spans of extra phrases specified within the rule
2338+
self.extra_phrase_spans = list(self.extra_phrases())
2339+
23372340
# remove extra_phrase marker from rules
2338-
text = remove_extra_phrase(self.text)
2341+
self.text = remove_extra_phrase(self.text)
2342+
2343+
text = self.text
23392344

23402345
# We tag this rule as being a bare URL if it starts with a scheme and is
23412346
# on one line: this is used to determine a matching approach

tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55
"license_expression": "bsd-new",
66
"license_expression_spdx": "BSD-3-Clause",
77
"detection_count": 1,
8-
"detection_log": [],
8+
"detection_log": [
9+
"extra-words-permitted-in-rule"
10+
],
911
"reference_matches": [
1012
{
1113
"license_expression": "bsd-new",
@@ -54,7 +56,9 @@
5456
"matched_text_diagnostics": "Redistribution and use in source and binary forms, with or without\r\nmodification, are permitted provided that the following conditions are met:\r\n\r\n* Redistributions of source code must retain the above copyright notice, this\r\n list of conditions and the following disclaimer.\r\n\r\n* Redistributions in binary form must reproduce the above copyright notice,\r\n this list of conditions and the following disclaimer in the documentation\r\n and/or other materials provided with the distribution.\r\n\r\n* Neither the name of [filesize] nor the names of its\r\n contributors may be used to endorse or promote products derived from\r\n this software without specific prior written permission.\r\n\r\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\r\nAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r\nIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\r\nDISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\r\nFOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\r\nSERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\r\nCAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r\nOR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r\nOF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE."
5557
}
5658
],
57-
"detection_log": [],
59+
"detection_log": [
60+
"extra-words-permitted-in-rule"
61+
],
5862
"identifier": "bsd_new-f3efb258-e9c3-604d-9bbd-147fe74af22e"
5963
}
6064
],

tests/licensedcode/test_tokenize.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -616,9 +616,9 @@ def test_get_extra_phrase_spans_simple(self):
616616
assert spans == [(Span([2]), 2)]
617617

618618
def test_get_extra_phrase_spans_multiple(self):
619-
text = 'Some [[1]] text [[3]] with multiple markers.'
619+
text = 'Some [[4]] text [[6]] with multiple markers.'
620620
spans = get_extra_phrase_spans(text)
621-
assert spans == [(Span([1]), 1), (Span([3]), 3)]
621+
assert spans == [(Span([1]), 4), (Span([3]), 6)]
622622

623623
def test_get_extra_phrase_spans_returns_nothing_if_none_found(self):
624624
text = 'Just some normal text.'

0 commit comments

Comments
 (0)