Skip to content

Commit 3b7a1a5

Browse files
Add misc license detection improvements
Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
1 parent 64bf18c commit 3b7a1a5

File tree

11 files changed

+317
-65
lines changed

11 files changed

+317
-65
lines changed

src/licensedcode/data/rules/lead-in_unknown_43.RULE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
---
22
license_expression: unknown-license-reference
3-
is_license_tag: yes
3+
is_license_clue: yes
44
relevance: 60
55
notes: Creative commons tag seen in RDF or XML documents
66
---

src/licensedcode/detection.py

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
from commoncode.resource import clean_path
2525
from commoncode.text import python_safe_name
26+
from commoncode.fileutils import as_posixpath
2627
from licensedcode.cache import build_spdx_license_expression
2728
from licensedcode.cache import get_cache
2829
from licensedcode.cache import get_index
@@ -130,6 +131,7 @@ class DetectionRule(Enum):
130131
EXTRA_WORDS = 'extra-words'
131132
LICENSE_CLUES = 'license-clues'
132133
LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'
134+
IMPERFECT_COVERAGE = 'imperfect-match-coverage'
133135
FALSE_POSITIVE = 'possible-false-positive'
134136
NOT_LICENSE_CLUES = 'not-license-clues-as-more-detections-present'
135137
UNKNOWN_REFERENCE_TO_LOCAL_FILE = 'unknown-reference-to-local-file'
@@ -1139,10 +1141,10 @@ def has_extra_words(license_matches):
11391141

11401142
def has_low_rule_relevance(license_matches):
11411143
"""
1142-
Return True if any on the matches in ``license_matches`` List of LicenseMatch
1144+
Return True if all of the matches in the ``license_matches`` list of LicenseMatch
11431145
objects have a low score because of low rule relevance.
11441146
"""
1145-
return any(
1147+
return all(
11461148
license_match.rule.relevance < LOW_RELEVANCE_THRESHOLD
11471149
for license_match in license_matches
11481150
)
@@ -1238,11 +1240,16 @@ def has_unknown_matches(license_matches):
12381240

12391241
def is_unknown_intro(license_match):
12401242
"""
1241-
Return True if the LicenseMatch is an unknown license intro.
1243+
Return True if the LicenseMatch is unknown and can be considered
1244+
as a license intro to other license matches.
1245+
I.e. this is not an unknown when followed by other proper matches.
12421246
"""
12431247
return (
12441248
license_match.rule.has_unknown and
1245-
license_match.rule.is_license_intro
1249+
(
1250+
license_match.rule.is_license_intro or license_match.rule.is_license_clue or
1251+
license_match.rule.license_expression == 'free-unknown'
1252+
)
12461253
)
12471254

12481255

@@ -1338,7 +1345,10 @@ def is_license_intro(license_match):
13381345
from licensedcode.match_aho import MATCH_AHO_EXACT
13391346

13401347
return (
1341-
license_match.rule.is_license_intro
1348+
(
1349+
license_match.rule.is_license_intro or license_match.rule.is_license_clue or
1350+
license_match.rule.license_expression == 'free-unknown'
1351+
)
13421352
and (
13431353
license_match.matcher == MATCH_AHO_EXACT
13441354
or license_match.coverage() == 100
@@ -1554,10 +1564,16 @@ def get_detected_license_expression(
15541564
elif analysis == DetectionCategory.EXTRA_WORDS.value:
15551565
if TRACE_ANALYSIS:
15561566
logger_debug(f'analysis {DetectionCategory.EXTRA_WORDS.value}')
1557-
# Apply filtering or handling logic if needed
1567+
# TODO: Fix score if extra words are allowed in rules
15581568
matches_for_expression = license_matches
15591569
detection_log.append(DetectionRule.EXTRA_WORDS.value)
15601570

1571+
elif analysis == DetectionCategory.IMPERFECT_COVERAGE.value:
1572+
if TRACE_ANALYSIS:
1573+
logger_debug(f'analysis {DetectionCategory.IMPERFECT_COVERAGE.value}')
1574+
matches_for_expression = license_matches
1575+
detection_log.append(DetectionRule.IMPERFECT_COVERAGE.value)
1576+
15611577
else:
15621578
if TRACE_ANALYSIS:
15631579
logger_debug(f'analysis not-combined')
@@ -1903,7 +1919,11 @@ def find_referenced_resource_from_package(referenced_filename, resource, codebas
19031919

19041920
datafile_paths = datafile_paths_by_package_uid.get(package_uid)
19051921
for path in datafile_paths:
1906-
datafile_path = posixpath.join(root_path, path)
1922+
# support strip_root and normal cases
1923+
if not as_posixpath(path).startswith(f"{as_posixpath(root_path)}/"):
1924+
datafile_path = posixpath.join(root_path, path)
1925+
else:
1926+
datafile_path = path
19071927
datafile_resource = codebase.get_resource(path=datafile_path)
19081928
if not datafile_resource or not datafile_resource.parent_path():
19091929
continue
@@ -1941,8 +1961,6 @@ def find_referenced_resource(referenced_filename, resource, codebase, **kwargs):
19411961
return resource
19421962

19431963
# Also look at codebase root for referenced file
1944-
# TODO: look at project root identified by key-files
1945-
# instead of codebase scan root
19461964
root_path = codebase.root.path
19471965
path = posixpath.join(root_path, referenced_filename)
19481966
resource = codebase.get_resource(path=path)

tests/licensedcode/data/plugin_license/text/scan-diag.expected.json

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55
"license_expression": "fsf-ap",
66
"license_expression_spdx": "FSFAP",
77
"detection_count": 1,
8-
"detection_log": [],
8+
"detection_log": [
9+
"imperfect-match-coverage"
10+
],
911
"reference_matches": [
1012
{
1113
"license_expression": "fsf-ap",
@@ -114,7 +116,9 @@
114116
"matched_text_diagnostics": "and distribution of this file, with or without modification, are\npermitted in any medium without [royalties] provided the copyright notice\nand this notice are preserved. This file is offered as-is, without any"
115117
}
116118
],
117-
"detection_log": [],
119+
"detection_log": [
120+
"imperfect-match-coverage"
121+
],
118122
"identifier": "fsf_ap-49ad9aab-c91b-eeb7-e90f-dc3f959b1c36"
119123
}
120124
],

tests/licensedcode/data/plugin_license/text/scan.expected.json

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55
"license_expression": "fsf-ap",
66
"license_expression_spdx": "FSFAP",
77
"detection_count": 1,
8-
"detection_log": [],
8+
"detection_log": [
9+
"imperfect-match-coverage"
10+
],
911
"reference_matches": [
1012
{
1113
"license_expression": "fsf-ap",
@@ -110,7 +112,9 @@
110112
"matched_text": "Reproduction and distribution of this file, with or without modification, are\npermitted in any medium without royalties provided the copyright notice\nand this notice are preserved. This file is offered as-is, without any warranties."
111113
}
112114
],
113-
"detection_log": [],
115+
"detection_log": [
116+
"imperfect-match-coverage"
117+
],
114118
"identifier": "fsf_ap-49ad9aab-c91b-eeb7-e90f-dc3f959b1c36"
115119
}
116120
],
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
{
2+
"license_detections": [
3+
{
4+
"identifier": "mit-5bfe2e00-00c1-6e00-d2a6-379ee31e7c0f",
5+
"license_expression": "mit",
6+
"license_expression_spdx": "MIT",
7+
"detection_count": 1,
8+
"detection_log": [
9+
"unknown-intro-followed-by-match"
10+
],
11+
"reference_matches": [
12+
{
13+
"license_expression": "free-unknown",
14+
"license_expression_spdx": "LicenseRef-scancode-free-unknown",
15+
"from_file": "scan-free-unknown-intro/README.md",
16+
"start_line": 3,
17+
"end_line": 5,
18+
"matcher": "2-aho",
19+
"score": 50.0,
20+
"matched_length": 3,
21+
"match_coverage": 100.0,
22+
"rule_relevance": 50,
23+
"rule_identifier": "free-unknown_88.RULE",
24+
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/free-unknown_88.RULE",
25+
"matched_text": "This plugin was created in [Realytics](https://www.realytics.io/) in 2017. Thank you for supporting Open Source.\n\n## License",
26+
"matched_text_diagnostics": "Open Source.\n\n## License"
27+
},
28+
{
29+
"license_expression": "mit",
30+
"license_expression_spdx": "MIT",
31+
"from_file": "scan-free-unknown-intro/README.md",
32+
"start_line": 5,
33+
"end_line": 7,
34+
"matcher": "2-aho",
35+
"score": 100.0,
36+
"matched_length": 3,
37+
"match_coverage": 100.0,
38+
"rule_relevance": 100,
39+
"rule_identifier": "mit_31.RULE",
40+
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/mit_31.RULE",
41+
"matched_text": "## License\n\nMIT License",
42+
"matched_text_diagnostics": "License\n\nMIT License"
43+
}
44+
]
45+
}
46+
],
47+
"files": [
48+
{
49+
"path": "README.md",
50+
"type": "file",
51+
"detected_license_expression": "mit",
52+
"detected_license_expression_spdx": "MIT",
53+
"license_detections": [
54+
{
55+
"license_expression": "mit",
56+
"license_expression_spdx": "MIT",
57+
"matches": [
58+
{
59+
"license_expression": "free-unknown",
60+
"license_expression_spdx": "LicenseRef-scancode-free-unknown",
61+
"from_file": "scan-free-unknown-intro/README.md",
62+
"start_line": 3,
63+
"end_line": 5,
64+
"matcher": "2-aho",
65+
"score": 50.0,
66+
"matched_length": 3,
67+
"match_coverage": 100.0,
68+
"rule_relevance": 50,
69+
"rule_identifier": "free-unknown_88.RULE",
70+
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/free-unknown_88.RULE",
71+
"matched_text": "This plugin was created in [Realytics](https://www.realytics.io/) in 2017. Thank you for supporting Open Source.\n\n## License",
72+
"matched_text_diagnostics": "Open Source.\n\n## License"
73+
},
74+
{
75+
"license_expression": "mit",
76+
"license_expression_spdx": "MIT",
77+
"from_file": "scan-free-unknown-intro/README.md",
78+
"start_line": 5,
79+
"end_line": 7,
80+
"matcher": "2-aho",
81+
"score": 100.0,
82+
"matched_length": 3,
83+
"match_coverage": 100.0,
84+
"rule_relevance": 100,
85+
"rule_identifier": "mit_31.RULE",
86+
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/mit_31.RULE",
87+
"matched_text": "## License\n\nMIT License",
88+
"matched_text_diagnostics": "License\n\nMIT License"
89+
}
90+
],
91+
"detection_log": [
92+
"unknown-intro-followed-by-match"
93+
],
94+
"identifier": "mit-5bfe2e00-00c1-6e00-d2a6-379ee31e7c0f"
95+
}
96+
],
97+
"license_clues": [],
98+
"percentage_of_license_text": 22.73,
99+
"scan_errors": []
100+
}
101+
]
102+
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
## Credits
2+
3+
This plugin was created in [Realytics](https://www.realytics.io/) in 2017. Thank you for supporting Open Source.
4+
5+
## License
6+
7+
MIT License

0 commit comments

Comments
 (0)