Skip to content

Commit 060946d

Browse files
committed
add test for correct position of extra-words and enhance detection_log
Add tests that check whether `extra-words` appear at the correct positions according to the `extra-phrases` markers present in the rules. If the `extra-words` are in the right place, we set the score to `100`. We also record in `detection_log` why the score was increased, to keep track of this.
1 parent 547c03b commit 060946d

File tree

10 files changed

+165
-25
lines changed

10 files changed

+165
-25
lines changed

src/licensedcode/data/rules/bsd-new_158.RULE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ Redistributions in binary form must reproduce the above copyright
1414
notice, this list of conditions and the following disclaimer in the
1515
documentation and/or other materials provided with the distribution.
1616

17-
Neither the name of [[3]] nor the names of its
17+
Neither the name of [[6]] nor the names of its
1818
contributors may be used to endorse or promote products derived from
1919
this software without specific prior written permission.
2020

src/licensedcode/data/rules/bsd-new_578.RULE

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@ minimum_coverage: 99
66

77
Software License Agreement (BSD License)
88

9-
Redistribution and use in source and binary forms, with or without
9+
[[15]]
10+
11+
Redistribution and use [[4]] in source and binary forms, with or without
1012
modification, are permitted provided that the following conditions
1113
are met:
1214

@@ -16,7 +18,7 @@ are met:
1618
copyright notice, this list of conditions and the following
1719
disclaimer in the documentation and/or other materials provided
1820
with the distribution.
19-
* Neither the name of nor the names of its
21+
* Neither the name of [[6]] nor the names of its
2022
contributors may be used to endorse or promote products derived
2123
from this software without specific prior written permission.
2224

src/licensedcode/detection.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from licensedcode.cache import get_licensing
3131
from licensedcode.match import LicenseMatch
3232
from licensedcode.match import set_matched_lines
33+
from licensedcode.match import is_extra_words_position_valid
3334
from licensedcode.models import compute_relevance
3435
from licensedcode.models import Rule
3536
from licensedcode.models import UnDetectedRule
@@ -110,6 +111,7 @@ class DetectionCategory(Enum):
110111
PACKAGE_ADD_FROM_SIBLING_FILE = 'from-package-sibling-file'
111112
PACKAGE_ADD_FROM_FILE = 'from-package-file'
112113
EXTRA_WORDS = 'extra-words'
114+
EXTRA_WORDS_PERMITTED = 'extra-words-permitted-in-rule'
113115
UNKNOWN_MATCH = 'unknown-match'
114116
LICENSE_CLUES = 'license-clues'
115117
LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'
@@ -129,6 +131,7 @@ class DetectionRule(Enum):
129131
"""
130132
UNKNOWN_MATCH = 'unknown-match'
131133
EXTRA_WORDS = 'extra-words'
134+
EXTRA_WORDS_PERMITTED = 'extra-words-permitted-in-rule'
132135
LICENSE_CLUES = 'license-clues'
133136
LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'
134137
IMPERFECT_COVERAGE = 'imperfect-match-coverage'
@@ -1072,6 +1075,7 @@ def is_correct_detection_non_unknown(license_matches):
10721075
is_correct_detection(license_matches)
10731076
and not has_unknown_matches(license_matches)
10741077
and not has_extra_words(license_matches)
1078+
and not is_extra_words_at_valid_positions(license_matches)
10751079
)
10761080

10771081

@@ -1159,6 +1163,16 @@ def has_low_rule_relevance(license_matches):
11591163
)
11601164

11611165

1166+
def is_extra_words_at_valid_positions(license_matches):
1167+
"""
1168+
Return True if any of the matches in ``license_matches`` List of LicenseMatch
1169+
has extra words that are in the correct place.
1170+
"""
1171+
return any(
1172+
is_extra_words_position_valid(license_match)
1173+
for license_match in license_matches
1174+
)
1175+
11621176
def is_false_positive(license_matches, package_license=False):
11631177
"""
11641178
Return True if all of the matches in ``license_matches`` List of LicenseMatch
@@ -1570,6 +1584,12 @@ def get_detected_license_expression(
15701584
detection_log.append(DetectionRule.LOW_QUALITY_MATCH_FRAGMENTS.value)
15711585
return detection_log, combined_expression
15721586

1587+
elif analysis == DetectionCategory.EXTRA_WORDS_PERMITTED.value:
1588+
if TRACE_ANALYSIS:
1589+
logger_debug(f'analysis {DetectionCategory.EXTRA_WORDS_PERMITTED.value}')
1590+
matches_for_expression = license_matches
1591+
detection_log.append(DetectionRule.EXTRA_WORDS_PERMITTED.value)
1592+
15731593
elif analysis == DetectionCategory.EXTRA_WORDS.value:
15741594
if TRACE_ANALYSIS:
15751595
logger_debug(f'analysis {DetectionCategory.EXTRA_WORDS.value}')
@@ -1807,6 +1827,10 @@ def analyze_detection(license_matches, package_license=False):
18071827
threshold=IMPERFECT_MATCH_COVERAGE_THR,
18081828
):
18091829
return DetectionCategory.IMPERFECT_COVERAGE.value
1830+
1831+
# Case where `extra-words` are in the right place
1832+
elif is_extra_words_at_valid_positions(license_matches=license_matches):
1833+
return DetectionCategory.EXTRA_WORDS_PERMITTED.value
18101834

18111835
# Case where at least one of the matches has extra words
18121836
elif has_extra_words(license_matches=license_matches):

src/licensedcode/index.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -391,9 +391,6 @@ def _add_rules(
391391
# "weak" rules can only be matched with an automaton exactly.
392392
is_weak = True
393393

394-
# identify and capture the spans of extra phrases specified within the rule
395-
rule.extra_phrase_spans = list(rule.extra_phrases())
396-
397394
for rts in rule.tokens():
398395
rule_tokens_append(rts)
399396
rtid = dictionary_get(rts)

src/licensedcode/match.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -601,7 +601,7 @@ def score(self):
601601

602602
# Check whether extra words in the matched text appear in allowed positions,
603603
# and do not exceed the maximum allowed word count at those positions.
604-
if is_extra_words_position_valid(self):
604+
if is_extra_words_position_valid(match=self):
605605
return 100
606606

607607
# relevance is a number between 0 and 100. Divide by 100
@@ -1104,26 +1104,30 @@ def is_extra_words_position_valid(match):
11041104
extra_phrase_count = 0
11051105

11061106
for span, allowed_extra_words in extra_phrase_spans:
1107-
rule_index = span.start
1107+
rule_index = span.start - extra_phrase_count - 1
11081108
allowed_extra_words = allowed_extra_words
11091109

1110-
matched_index = rule_index + matched_count - extra_phrase_count
1110+
matched_index = span.start + matched_count - extra_phrase_count
11111111
extra_words_count = 0
11121112

1113-
# Count how many tokens in matched_text do not match the next rule token
1113+
# Return False if the token before the extra words in `matched_tokens` is not the same as the token before the extra phrase in `rule_tokens`
1114+
if(matched_tokens[matched_index-1] != rule_tokens[rule_index]):
1115+
return False
1116+
1117+
# Count how many tokens in `matched_text` do not match the next rule token
11141118
while (matched_index < len(matched_tokens) and
11151119
matched_tokens[matched_index] != rule_tokens[rule_index + 1]):
11161120
matched_index += 1
11171121
matched_count += 1
11181122
extra_words_count += 1
11191123

1120-
extra_phrase_count += 1
1124+
if extra_words_count > allowed_extra_words:
1125+
return False
11211126

1122-
if extra_words_count > allowed_extra_words:
1123-
return False
1127+
extra_phrase_count += 1
11241128

11251129
return True
1126-
1130+
11271131

11281132
def filter_contained_matches(
11291133
matches,

src/licensedcode/models.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2319,6 +2319,9 @@ def load_data(self, rule_file):
23192319
except Exception:
23202320
trace = traceback.format_exc()
23212321
raise InvalidRule(f'While loading: file://{rule_file}\n{trace}')
2322+
2323+
# remove extra_phrase marker from rules
2324+
self.text = remove_extra_phrase(self.text)
23222325

23232326
return self
23242327

@@ -2331,8 +2334,13 @@ def tokens(self):
23312334
recomputed as a side effect.
23322335
"""
23332336

2337+
# identify and capture the spans of extra phrases specified within the rule
2338+
self.extra_phrase_spans = list(self.extra_phrases())
2339+
23342340
# remove extra_phrase marker from rules
2335-
text = remove_extra_phrase(self.text)
2341+
self.text = remove_extra_phrase(self.text)
2342+
2343+
text = self.text
23362344

23372345
# We tag this rule as being a bare URL if it starts with a scheme and is
23382346
# on one line: this is used to determine a matching approach

tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55
"license_expression": "bsd-new",
66
"license_expression_spdx": "BSD-3-Clause",
77
"detection_count": 1,
8-
"detection_log": [],
8+
"detection_log": [
9+
"extra-words-permitted-in-rule"
10+
],
911
"reference_matches": [
1012
{
1113
"license_expression": "bsd-new",
@@ -54,7 +56,9 @@
5456
"matched_text_diagnostics": "Redistribution and use in source and binary forms, with or without\r\nmodification, are permitted provided that the following conditions are met:\r\n\r\n* Redistributions of source code must retain the above copyright notice, this\r\n list of conditions and the following disclaimer.\r\n\r\n* Redistributions in binary form must reproduce the above copyright notice,\r\n this list of conditions and the following disclaimer in the documentation\r\n and/or other materials provided with the distribution.\r\n\r\n* Neither the name of [filesize] nor the names of its\r\n contributors may be used to endorse or promote products derived from\r\n this software without specific prior written permission.\r\n\r\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\r\nAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r\nIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\r\nDISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\r\nFOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\r\nSERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\r\nCAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r\nOR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r\nOF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE."
5557
}
5658
],
57-
"detection_log": [],
59+
"detection_log": [
60+
"extra-words-permitted-in-rule"
61+
],
5862
"identifier": "bsd_new-f3efb258-e9c3-604d-9bbd-147fe74af22e"
5963
}
6064
],

tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license.expected.json

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
{
22
"license_detections": [
33
{
4-
"identifier": "bsd_new-95249a8d-f533-e7c7-159a-9b6e173cba42",
4+
"identifier": "bsd_new-f327c6f9-6086-8bd5-22c6-3aee3b99acf2",
55
"license_expression": "bsd-new",
66
"license_expression_spdx": "BSD-3-Clause",
77
"detection_count": 1,
88
"detection_log": [
9-
"extra-words"
9+
"extra-words-permitted-in-rule"
1010
],
1111
"reference_matches": [
1212
{
@@ -16,7 +16,7 @@
1616
"start_line": 1,
1717
"end_line": 31,
1818
"matcher": "3-seq",
19-
"score": 93.89,
19+
"score": 100,
2020
"matched_length": 215,
2121
"match_coverage": 100.0,
2222
"rule_relevance": 100,
@@ -46,7 +46,7 @@
4646
"start_line": 1,
4747
"end_line": 31,
4848
"matcher": "3-seq",
49-
"score": 93.89,
49+
"score": 100,
5050
"matched_length": 215,
5151
"match_coverage": 100.0,
5252
"rule_relevance": 100,
@@ -57,9 +57,9 @@
5757
}
5858
],
5959
"detection_log": [
60-
"extra-words"
60+
"extra-words-permitted-in-rule"
6161
],
62-
"identifier": "bsd_new-95249a8d-f533-e7c7-159a-9b6e173cba42"
62+
"identifier": "bsd_new-f327c6f9-6086-8bd5-22c6-3aee3b99acf2"
6363
}
6464
],
6565
"license_clues": [],

tests/licensedcode/test_match.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from licensedcode.match import filter_overlapping_matches
2121
from licensedcode.match import get_full_matched_text
2222
from licensedcode.match import get_matching_regions
23+
from licensedcode.match import is_extra_words_position_valid
2324
from licensedcode.match import LicenseMatch
2425
from licensedcode.match import merge_matches
2526
from licensedcode.match import reportable_tokens
@@ -1321,6 +1322,106 @@ def test_get_matching_regions_3_lines_enough(self):
13211322
assert matches[5].qspan in regions[1]
13221323

13231324

1325+
class TestExtraWordsPosition(FileBasedTesting):
1326+
test_data_dir = TEST_DATA_DIR
1327+
1328+
def test_valid_extra_words_within_limit(self):
1329+
rule_text = """
1330+
Redistribution and use [[4]] in source and binary forms are permitted.
1331+
"""
1332+
rule = create_rule_from_text_and_expression(
1333+
license_expression='extra-words',
1334+
text=rule_text
1335+
)
1336+
idx = index.LicenseIndex([rule])
1337+
1338+
query = """
1339+
Redistribution and use of this software in source and binary forms are permitted.
1340+
"""
1341+
match = idx.match(query_string=query, _skip_hash_match=True)[0]
1342+
assert is_extra_words_position_valid(match) is True
1343+
1344+
def test_invalid_extra_words_exceed_limit(self):
1345+
rule_text = """
1346+
Redistribution and use [[2]] in source and binary forms are permitted.
1347+
"""
1348+
rule = create_rule_from_text_and_expression(
1349+
license_expression='extra-words',
1350+
text=rule_text
1351+
)
1352+
idx = index.LicenseIndex([rule])
1353+
1354+
query = """
1355+
Redistribution and use of this software in source and binary forms are permitted.
1356+
"""
1357+
match = idx.match(query_string=query, _skip_hash_match=True)[0]
1358+
assert is_extra_words_position_valid(match) is False
1359+
1360+
def test_no_extra_words_allowed(self):
1361+
rule_text = """
1362+
Redistribution and use in source and binary forms are permitted.
1363+
"""
1364+
rule = create_rule_from_text_and_expression(
1365+
license_expression='extra-words',
1366+
text=rule_text
1367+
)
1368+
idx = index.LicenseIndex([rule])
1369+
1370+
query = """
1371+
Redistribution and use of software in source and binary forms are permitted.
1372+
"""
1373+
match = idx.match(query_string=query, _skip_hash_match=True)[0]
1374+
assert is_extra_words_position_valid(match) is False
1375+
1376+
def test_multiple_extra_spans_valid(self):
1377+
rule_text = """
1378+
Redistribution [[2]] and use [[1]] in source and binary forms are permitted.
1379+
"""
1380+
rule = create_rule_from_text_and_expression(
1381+
license_expression='extra-words',
1382+
text=rule_text
1383+
)
1384+
idx = index.LicenseIndex([rule])
1385+
1386+
query = """
1387+
Redistribution of content and use again in source and binary forms are permitted.
1388+
"""
1389+
match = idx.match(query_string=query, _skip_hash_match=True)[0]
1390+
assert is_extra_words_position_valid(match) is True
1391+
1392+
def test_extra_words_at_wrong_position(self):
1393+
rule_text = """
1394+
Redistribution and use [[2]] in source and binary forms are permitted.
1395+
"""
1396+
rule = create_rule_from_text_and_expression(
1397+
license_expression='extra-words',
1398+
text=rule_text
1399+
)
1400+
idx = index.LicenseIndex([rule])
1401+
1402+
query = """
1403+
Redistribution and amazing use in great source and binary forms are permitted.
1404+
"""
1405+
match = idx.match(query_string=query, _skip_hash_match=True)[0]
1406+
assert is_extra_words_position_valid(match) is False
1407+
1408+
def test_exact_match_without_extra_markers(self):
1409+
rule_text = """
1410+
Redistribution and use in source and binary forms are permitted.
1411+
"""
1412+
rule = create_rule_from_text_and_expression(
1413+
license_expression='extra-words',
1414+
text=rule_text
1415+
)
1416+
idx = index.LicenseIndex([rule])
1417+
1418+
query = """
1419+
Redistribution and use in source and binary forms are permitted.
1420+
"""
1421+
match = idx.match(query_string=query, _skip_hash_match=True)[0]
1422+
assert is_extra_words_position_valid(match) is False
1423+
1424+
13241425
class TestLicenseMatchScore(FileBasedTesting):
13251426
test_data_dir = TEST_DATA_DIR
13261427

tests/licensedcode/test_tokenize.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -616,9 +616,9 @@ def test_get_extra_phrase_spans_simple(self):
616616
assert spans == [(Span([2]), 2)]
617617

618618
def test_get_extra_phrase_spans_multiple(self):
619-
text = 'Some [[1]] text [[3]] with multiple markers.'
619+
text = 'Some [[4]] text [[6]] with multiple markers.'
620620
spans = get_extra_phrase_spans(text)
621-
assert spans == [(Span([1]), 1), (Span([3]), 3)]
621+
assert spans == [(Span([1]), 4), (Span([3]), 6)]
622622

623623
def test_get_extra_phrase_spans_returns_nothing_if_none_found(self):
624624
text = 'Just some normal text.'

0 commit comments

Comments
 (0)