Skip to content

Commit 0d8095e

Browse files
Enhance legal file handling #57
Changes issue classification for legal files and issue grouping in files with license texts. Adds test expectations. Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
1 parent b9cdbf6 commit 0d8095e

File tree

3 files changed

+216
-6
lines changed

3 files changed

+216
-6
lines changed

src/results_analyze/analyzer.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,10 @@ def from_license_matches(
358358
if not license_matches:
359359
return []
360360

361-
groups_of_license_matches = group_matches(license_matches)
361+
if not is_license_text:
362+
groups_of_license_matches = group_matches(license_matches)
363+
else:
364+
groups_of_license_matches = [license_matches]
362365
return analyze_matches(
363366
groups_of_license_matches, path, is_license_text, is_legal
364367
)
@@ -590,13 +593,13 @@ def get_license_text_issue_type(is_license_text, is_legal):
590593
Classifies the license detection issue into one of ISSUE_TYPES_BY_CLASSIFICATION,
591594
where it is a license text.
592595
"""
593-
if is_license_text:
594-
if is_legal:
596+
if is_legal:
597+
if is_license_text:
595598
return "text-legal-lic-files"
596599
else:
597-
return "text-non-legal-lic-files"
600+
return "text-lic-text-fragments"
598601
else:
599-
return "text-lic-text-fragments"
602+
return "text-non-legal-lic-files"
600603

601604

602605
def get_license_notice_issue_type(license_matches, issue_category):
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
{
2+
"path": "timescaledb-master/LICENSE",
3+
"type": "file",
4+
"name": "LICENSE",
5+
"base_name": "LICENSE",
6+
"extension": "",
7+
"size": 807,
8+
"date": "2021-01-18",
9+
"sha1": "90e62feb3ac6ca475f86e2efd18ce686c4034f2b",
10+
"md5": "a6919544b6c76fc2e0debe84e8dfc7a5",
11+
"sha256": "0378e0948feefd85f579319c74d6e2b671194037f550c7176ef26649d94c895b",
12+
"mime_type": "text/plain",
13+
"file_type": "ASCII text",
14+
"programming_language": null,
15+
"is_binary": false,
16+
"is_text": true,
17+
"is_archive": false,
18+
"is_media": false,
19+
"is_source": false,
20+
"is_script": false,
21+
"licenses": [
22+
{
23+
"key": "apache-2.0",
24+
"score": 49.5,
25+
"name": "Apache License 2.0",
26+
"short_name": "Apache 2.0",
27+
"category": "Permissive",
28+
"is_exception": false,
29+
"owner": "Apache Software Foundation",
30+
"homepage_url": "http://www.apache.org/licenses/",
31+
"text_url": "http://www.apache.org/licenses/LICENSE-2.0",
32+
"reference_url": "https://scancode-licensedb.aboutcode.org/apache-2.0",
33+
"scancode_text_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/apache-2.0.LICENSE",
34+
"scancode_data_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/apache-2.0.yml",
35+
"spdx_license_key": "Apache-2.0",
36+
"spdx_url": "https://spdx.org/licenses/Apache-2.0",
37+
"start_line": 1,
38+
"end_line": 2,
39+
"matched_rule": {
40+
"identifier": "apache-2.0_81.RULE",
41+
"license_expression": "apache-2.0",
42+
"licenses": [
43+
"apache-2.0"
44+
],
45+
"is_license_text": false,
46+
"is_license_notice": false,
47+
"is_license_reference": true,
48+
"is_license_tag": false,
49+
"is_license_intro": false,
50+
"matcher": "3-seq",
51+
"rule_length": 10,
52+
"matched_length": 9,
53+
"match_coverage": 90.0,
54+
"rule_relevance": 55
55+
},
56+
"matched_text": "Source code in this repository is variously licensed under the Apache License\nVersion 2.0, an Apache compatible license, or the Timescale License."
57+
},
58+
{
59+
"key": "apache-2.0",
60+
"score": 100.0,
61+
"name": "Apache License 2.0",
62+
"short_name": "Apache 2.0",
63+
"category": "Permissive",
64+
"is_exception": false,
65+
"owner": "Apache Software Foundation",
66+
"homepage_url": "http://www.apache.org/licenses/",
67+
"text_url": "http://www.apache.org/licenses/LICENSE-2.0",
68+
"reference_url": "https://scancode-licensedb.aboutcode.org/apache-2.0",
69+
"scancode_text_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/apache-2.0.LICENSE",
70+
"scancode_data_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/apache-2.0.yml",
71+
"spdx_license_key": "Apache-2.0",
72+
"spdx_url": "https://spdx.org/licenses/Apache-2.0",
73+
"start_line": 7,
74+
"end_line": 8,
75+
"matched_rule": {
76+
"identifier": "apache-2.0_152.RULE",
77+
"license_expression": "apache-2.0",
78+
"licenses": [
79+
"apache-2.0"
80+
],
81+
"is_license_text": false,
82+
"is_license_notice": true,
83+
"is_license_reference": false,
84+
"is_license_tag": false,
85+
"is_license_intro": false,
86+
"matcher": "2-aho",
87+
"rule_length": 9,
88+
"matched_length": 9,
89+
"match_coverage": 100.0,
90+
"rule_relevance": 100
91+
},
92+
"matched_text": "* Outside of the \"tsl\" directory, source code in a given file is licensed\n under the Apache License Version 2.0, unless otherwise noted (e.g., an"
93+
},
94+
{
95+
"key": "unknown-license-reference",
96+
"score": 16.0,
97+
"name": "Unknown License file reference",
98+
"short_name": "Unknown License reference",
99+
"category": "Unstated License",
100+
"is_exception": false,
101+
"owner": "Unspecified",
102+
"homepage_url": null,
103+
"text_url": "",
104+
"reference_url": "https://scancode-licensedb.aboutcode.org/unknown-license-reference",
105+
"scancode_text_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.LICENSE",
106+
"scancode_data_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.yml",
107+
"spdx_license_key": "LicenseRef-scancode-unknown-license-reference",
108+
"spdx_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.LICENSE",
109+
"start_line": 11,
110+
"end_line": 11,
111+
"matched_rule": {
112+
"identifier": "license-intro_26.RULE",
113+
"license_expression": "unknown-license-reference",
114+
"licenses": [
115+
"unknown-license-reference"
116+
],
117+
"is_license_text": false,
118+
"is_license_notice": false,
119+
"is_license_reference": false,
120+
"is_license_tag": false,
121+
"is_license_intro": true,
122+
"matcher": "2-aho",
123+
"rule_length": 3,
124+
"matched_length": 3,
125+
"match_coverage": 100.0,
126+
"rule_relevance": 16
127+
},
128+
"matched_text": "* Within the \"tsl\" folder, source code in a given file is licensed under the"
129+
},
130+
{
131+
"key": "unknown-license-reference",
132+
"score": 16.0,
133+
"name": "Unknown License file reference",
134+
"short_name": "Unknown License reference",
135+
"category": "Unstated License",
136+
"is_exception": false,
137+
"owner": "Unspecified",
138+
"homepage_url": null,
139+
"text_url": "",
140+
"reference_url": "https://scancode-licensedb.aboutcode.org/unknown-license-reference",
141+
"scancode_text_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.LICENSE",
142+
"scancode_data_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.yml",
143+
"spdx_license_key": "LicenseRef-scancode-unknown-license-reference",
144+
"spdx_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.LICENSE",
145+
"start_line": 16,
146+
"end_line": 16,
147+
"matched_rule": {
148+
"identifier": "license-intro_26.RULE",
149+
"license_expression": "unknown-license-reference",
150+
"licenses": [
151+
"unknown-license-reference"
152+
],
153+
"is_license_text": false,
154+
"is_license_notice": false,
155+
"is_license_reference": false,
156+
"is_license_tag": false,
157+
"is_license_intro": true,
158+
"matcher": "2-aho",
159+
"rule_length": 3,
160+
"matched_length": 3,
161+
"match_coverage": 100.0,
162+
"rule_relevance": 16
163+
},
164+
"matched_text": "that contain `-tsl` in their name are licensed under the Timescale License."
165+
}
166+
],
167+
"license_expressions": [
168+
"apache-2.0",
169+
"apache-2.0",
170+
"unknown-license-reference",
171+
"unknown-license-reference"
172+
],
173+
"percentage_of_license_text": 19.51,
174+
"copyrights": [],
175+
"holders": [],
176+
"authors": [],
177+
"packages": [],
178+
"emails": [],
179+
"urls": [],
180+
"is_legal": true,
181+
"is_manifest": false,
182+
"is_readme": false,
183+
"is_top_level": true,
184+
"is_key_file": true,
185+
"is_license_text": false,
186+
"files_count": 0,
187+
"dirs_count": 0,
188+
"size_count": 0,
189+
"scan_errors": []
190+
}

tests/test_analyzer.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -506,7 +506,7 @@ def test_analyzer_group_matches_notice_reference_fragments(self):
506506
assert len(list(grouped_matches)) == 2
507507

508508

509-
class TestLicenseMatchErrorResult(FileBasedTesting):
509+
class TestFromLicenseMatches(FileBasedTesting):
510510
test_data_dir = os.path.join(os.path.dirname(__file__), "data/analyzer/")
511511

512512
def test_analyze_license_matches_return_empty_list_with_none_matches(self):
@@ -517,6 +517,23 @@ def test_analyze_license_matches_return_empty_list_with_empty_matches(self):
517517
results = analyzer.LicenseDetectionIssue.from_license_matches([])
518518
assert results == []
519519

520+
def test_dont_group_license_matches_in_high_license_text_files(self):
521+
test_file = self.get_test_loc("dont_group_matches_in_legal_file.json")
522+
file_scan_result = load_json(test_file)
523+
matched_licences = LicenseMatch.from_files_licenses(
524+
file_scan_result["licenses"]
525+
)
526+
is_license_text = True
527+
is_legal = file_scan_result["is_legal"]
528+
issues = analyzer.LicenseDetectionIssue.from_license_matches(
529+
license_matches=matched_licences,
530+
path="path/to/dont_group_matches_in_legal_file.json",
531+
is_license_text=is_license_text,
532+
is_legal=is_legal,
533+
)
534+
assert len(list(issues)) == 1
535+
536+
520537
def test_group_license_matches_by_location_and_analyze(self):
521538
# TODO: Add Explanation for all creation of test Files from scancode scan results
522539
test_file = self.get_test_loc("group_matches_by_location_analyze.json")

0 commit comments

Comments
 (0)