Skip to content

Commit 7937593

Browse files
committed
Add new phrase for `extra-words` in rules
Add a new kind of phrase, `extra_phrase`, dedicated to extra-words. It is written in a rule as [[n]], where n is the maximum number of extra-words allowed at that position in the rule. If extra-words appear at the correct position and their count does not exceed the allowed limit `n`, the score is raised to `100`. Signed-off-by: Alok Kumar <[email protected]>
1 parent 008c7d2 commit 7937593

File tree

6 files changed

+17
-27
lines changed

6 files changed

+17
-27
lines changed

src/licensedcode/data/rules/bsd-new_158.RULE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ Redistributions in binary form must reproduce the above copyright
1414
notice, this list of conditions and the following disclaimer in the
1515
documentation and/or other materials provided with the distribution.
1616

17-
Neither the name of [[6]] nor the names of its
17+
Neither the name of [[3]] nor the names of its
1818
contributors may be used to endorse or promote products derived from
1919
this software without specific prior written permission.
2020

src/licensedcode/index.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,9 @@ def _add_rules(
391391
# "weak" rules can only be matched with an automaton exactly.
392392
is_weak = True
393393

394+
# identify and capture the spans of extra phrases specified within the rule
395+
rule.extra_phrase_spans = list(rule.extra_phrases())
396+
394397
for rts in rule.tokens():
395398
rule_tokens_append(rts)
396399
rtid = dictionary_get(rts)

src/licensedcode/match.py

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -601,7 +601,7 @@ def score(self):
601601

602602
# Check whether extra words in the matched text appear in allowed positions,
603603
# and do not exceed the maximum allowed word count at those positions.
604-
if is_extra_words_position_valid(match=self):
604+
if is_extra_words_position_valid(self):
605605
return 100
606606

607607
# relevance is a number between 0 and 100. Divide by 100
@@ -1104,30 +1104,26 @@ def is_extra_words_position_valid(match):
11041104
extra_phrase_count = 0
11051105

11061106
for span, allowed_extra_words in extra_phrase_spans:
1107-
rule_index = span.start - extra_phrase_count - 1
1107+
rule_index = span.start
11081108
allowed_extra_words = allowed_extra_words
11091109

1110-
matched_index = span.start + matched_count - extra_phrase_count
1110+
matched_index = rule_index + matched_count - extra_phrase_count
11111111
extra_words_count = 0
11121112

1113-
# return false if token before `extra-words` in `matched_token` is not same as token before `extra-phrases` in `rule_tokens`
1114-
if(matched_tokens[matched_index-1] != rule_tokens[rule_index]):
1115-
return False
1116-
1117-
# Count how many tokens in `matched_text` do not match the next rule token
1113+
# Count how many tokens in matched_text do not match the next rule token
11181114
while (matched_index < len(matched_tokens) and
11191115
matched_tokens[matched_index] != rule_tokens[rule_index + 1]):
11201116
matched_index += 1
11211117
matched_count += 1
11221118
extra_words_count += 1
11231119

1124-
if extra_words_count > allowed_extra_words:
1125-
return False
1126-
11271120
extra_phrase_count += 1
11281121

1129-
return True
1122+
if extra_words_count > allowed_extra_words:
1123+
return False
11301124

1125+
return True
1126+
11311127

11321128
def filter_contained_matches(
11331129
matches,

src/licensedcode/models.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2334,13 +2334,8 @@ def tokens(self):
23342334
recomputed as a side effect.
23352335
"""
23362336

2337-
# identify and capture the spans of extra phrases specified within the rule
2338-
self.extra_phrase_spans = list(self.extra_phrases())
2339-
23402337
# remove extra_phrase marker from rules
2341-
self.text = remove_extra_phrase(self.text)
2342-
2343-
text = self.text
2338+
text = remove_extra_phrase(self.text)
23442339

23452340
# We tag this rule as being a bare URL if it starts with a scheme and is
23462341
# on one line: this is used to determine a matching approach

tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,7 @@
55
"license_expression": "bsd-new",
66
"license_expression_spdx": "BSD-3-Clause",
77
"detection_count": 1,
8-
"detection_log": [
9-
"extra-words-permitted-in-rule"
10-
],
8+
"detection_log": [],
119
"reference_matches": [
1210
{
1311
"license_expression": "bsd-new",
@@ -56,9 +54,7 @@
5654
"matched_text_diagnostics": "Redistribution and use in source and binary forms, with or without\r\nmodification, are permitted provided that the following conditions are met:\r\n\r\n* Redistributions of source code must retain the above copyright notice, this\r\n list of conditions and the following disclaimer.\r\n\r\n* Redistributions in binary form must reproduce the above copyright notice,\r\n this list of conditions and the following disclaimer in the documentation\r\n and/or other materials provided with the distribution.\r\n\r\n* Neither the name of [filesize] nor the names of its\r\n contributors may be used to endorse or promote products derived from\r\n this software without specific prior written permission.\r\n\r\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\r\nAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r\nIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\r\nDISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\r\nFOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\r\nSERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\r\nCAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r\nOR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r\nOF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE."
5755
}
5856
],
59-
"detection_log": [
60-
"extra-words-permitted-in-rule"
61-
],
57+
"detection_log": [],
6258
"identifier": "bsd_new-f3efb258-e9c3-604d-9bbd-147fe74af22e"
6359
}
6460
],

tests/licensedcode/test_tokenize.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -616,9 +616,9 @@ def test_get_extra_phrase_spans_simple(self):
616616
assert spans == [(Span([2]), 2)]
617617

618618
def test_get_extra_phrase_spans_multiple(self):
619-
text = 'Some [[4]] text [[6]] with multiple markers.'
619+
text = 'Some [[1]] text [[3]] with multiple markers.'
620620
spans = get_extra_phrase_spans(text)
621-
assert spans == [(Span([1]), 4), (Span([3]), 6)]
621+
assert spans == [(Span([1]), 1), (Span([3]), 3)]
622622

623623
def test_get_extra_phrase_spans_returns_nothing_if_none_found(self):
624624
text = 'Just some normal text.'

0 commit comments

Comments
 (0)