Skip to content

Commit c7d9b5c

Browse files
Merge pull request #55 from AyanSinhaMahapatra/improve-heuristics
Add heuristic improvements
2 parents 904f597 + 0d8095e commit c7d9b5c

File tree

7 files changed

+717
-14
lines changed

7 files changed

+717
-14
lines changed

src/results_analyze/analyzer.py

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
import attr
1111
from collections import Counter
1212

13+
from licensedcode.tokenize import query_tokenizer
14+
1315
# All values of match_coverage less than this value are taken as
1416
# `near-perfect-match-coverage` cases
1517
NEAR_PERFECT_MATCH_COVERAGE_THR = 100
@@ -197,6 +199,15 @@ class IssueType:
197199
),
198200
analysis_confidence="medium",
199201
),
202+
"intro-unknown-match": IssueType(
203+
is_license_reference=True,
204+
classification_id="intro-unknown-match",
205+
classification_description=(
206+
"A piece of common introduction to a license text/notice/reference is "
207+
"detected."
208+
),
209+
analysis_confidence="medium",
210+
),
200211
}
201212

202213

@@ -263,6 +274,24 @@ def identifier(self):
263274
data.append(identifier)
264275

265276
return tuple(data)
277+
278+
@property
279+
def identifier_for_unknown_intro(self):
280+
"""
281+
This is an identifier for an issue, which is an unknown license intro,
282+
based on its underlying license matches.
283+
"""
284+
data = []
285+
for license_match in self.original_licenses:
286+
tokenized_matched_text = tuple(query_tokenizer(license_match.matched_text))
287+
identifier = (
288+
license_match.rule_identifier,
289+
license_match.match_coverage,
290+
tokenized_matched_text,
291+
)
292+
data.append(identifier)
293+
294+
return tuple(data)
266295

267296
@staticmethod
268297
def format_analysis_result(issue_category, issue_type, license_matches, path):
@@ -329,7 +358,10 @@ def from_license_matches(
329358
if not license_matches:
330359
return []
331360

332-
groups_of_license_matches = group_matches(license_matches)
361+
if not is_license_text:
362+
groups_of_license_matches = group_matches(license_matches)
363+
else:
364+
groups_of_license_matches = [license_matches]
333365
return analyze_matches(
334366
groups_of_license_matches, path, is_license_text, is_legal
335367
)
@@ -561,13 +593,13 @@ def get_license_text_issue_type(is_license_text, is_legal):
561593
Classifies the license detection issue into one of ISSUE_TYPES_BY_CLASSIFICATION,
562594
where it is a license text.
563595
"""
564-
if is_license_text:
565-
if is_legal:
596+
if is_legal:
597+
if is_license_text:
566598
return "text-legal-lic-files"
567599
else:
568-
return "text-non-legal-lic-files"
600+
return "text-lic-text-fragments"
569601
else:
570-
return "text-lic-text-fragments"
602+
return "text-non-legal-lic-files"
571603

572604

573605
def get_license_notice_issue_type(license_matches, issue_category):

src/results_analyze/analyzer_summary.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -197,22 +197,22 @@ def get_unique_issues(license_issues):
197197
:param license_issues: list of LicenseDetectionIssue
198198
:returns UniqueLicenseIssues: list of UniqueIssue
199199
"""
200-
all_identifiers = (issue.identifier for issue in license_issues)
201-
unique_issue_category_counts = dict(Counter(all_identifiers))
200+
identifiers = get_identifiers(license_issues)
201+
unique_issue_category_counts = dict(Counter(identifiers))
202202

203203
unique_license_issues = []
204-
for issue_number, (unique_issue_categoryentifier, counts) in enumerate(
204+
for issue_number, (unique_issue_identifier, counts) in enumerate(
205205
unique_issue_category_counts.items(), start=1,
206206
):
207207
file_regions = (
208208
issue.file_regions.pop()
209209
for issue in license_issues
210-
if issue.identifier == unique_issue_categoryentifier
210+
if unique_issue_identifier in [issue.identifier, issue.identifier_for_unknown_intro]
211211
)
212212
all_issues = (
213213
issue
214214
for issue in license_issues
215-
if issue.identifier == unique_issue_categoryentifier
215+
if unique_issue_identifier in [issue.identifier, issue.identifier_for_unknown_intro]
216216
)
217217
unique_license_issues.append(
218218
UniqueIssue.get_formatted_unique_issue(
@@ -223,3 +223,18 @@ def get_unique_issues(license_issues):
223223
)
224224

225225
return unique_license_issues
226+
227+
228+
def get_identifiers(license_issues):
229+
"""
230+
Get identifiers for all license detection issues.
231+
232+
:param license_issues: list of LicenseDetectionIssue
233+
:returns identifiers: generator of identifier tuples
234+
"""
235+
identifiers = (
236+
issue.identifier if issue.issue_category != "unknown-match"
237+
else issue.identifier_for_unknown_intro
238+
for issue in license_issues
239+
)
240+
return identifiers
Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
{
2+
"files": [
3+
{
4+
"path": "Issues",
5+
"type": "directory",
6+
"name": "scan-files",
7+
"base_name": "scan-files",
8+
"extension": "",
9+
"size": 0,
10+
"date": null,
11+
"sha1": null,
12+
"md5": null,
13+
"sha256": null,
14+
"mime_type": null,
15+
"file_type": null,
16+
"programming_language": null,
17+
"is_binary": false,
18+
"is_text": false,
19+
"is_archive": false,
20+
"is_media": false,
21+
"is_source": false,
22+
"is_script": false,
23+
"licenses": [],
24+
"license_expressions": [],
25+
"is_legal": false,
26+
"is_manifest": false,
27+
"is_readme": false,
28+
"is_top_level": true,
29+
"is_key_file": false,
30+
"is_license_text": false,
31+
"files_count": 4,
32+
"dirs_count": 0,
33+
"size_count": 78046,
34+
"scan_errors": []
35+
},
36+
{
37+
"path": "Issues/timescaledb-master-LICENSE",
38+
"type": "file",
39+
"name": "xio-creat.h",
40+
"base_name": "xio-creat",
41+
"extension": ".h",
42+
"size": 306,
43+
"date": "2010-06-20",
44+
"sha1": "72545320afeb44d39c63cac4cea68e96b43a5d96",
45+
"md5": "3c80e4fb0336f80f34db2eef99bdfa31",
46+
"mime_type": "text/x-c",
47+
"file_type": "C source, ASCII text",
48+
"programming_language": "C++",
49+
"is_binary": false,
50+
"is_text": true,
51+
"is_archive": false,
52+
"is_media": false,
53+
"is_source": true,
54+
"is_script": false,
55+
"licenses": [
56+
{
57+
"key": "unknown-license-reference",
58+
"score": 16.0,
59+
"name": "Unknown License file reference",
60+
"short_name": "Unknown License reference",
61+
"category": "Unstated License",
62+
"is_exception": false,
63+
"owner": "Unspecified",
64+
"homepage_url": null,
65+
"text_url": "",
66+
"reference_url": "https://scancode-licensedb.aboutcode.org/unknown-license-reference",
67+
"scancode_text_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.LICENSE",
68+
"scancode_data_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.yml",
69+
"spdx_license_key": "LicenseRef-scancode-unknown-license-reference",
70+
"spdx_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.LICENSE",
71+
"start_line": 11,
72+
"end_line": 11,
73+
"matched_rule": {
74+
"identifier": "license-intro_26.RULE",
75+
"license_expression": "unknown-license-reference",
76+
"licenses": [
77+
"unknown-license-reference"
78+
],
79+
"is_license_text": false,
80+
"is_license_notice": false,
81+
"is_license_reference": false,
82+
"is_license_tag": false,
83+
"is_license_intro": true,
84+
"matcher": "2-aho",
85+
"rule_length": 3,
86+
"matched_length": 3,
87+
"match_coverage": 100.0,
88+
"rule_relevance": 16
89+
},
90+
"matched_text": "* Within the \"tsl\" folder, source code in a given file is licensed under the"
91+
},
92+
{
93+
"key": "unknown-license-reference",
94+
"score": 16.0,
95+
"name": "Unknown License file reference",
96+
"short_name": "Unknown License reference",
97+
"category": "Unstated License",
98+
"is_exception": false,
99+
"owner": "Unspecified",
100+
"homepage_url": null,
101+
"text_url": "",
102+
"reference_url": "https://scancode-licensedb.aboutcode.org/unknown-license-reference",
103+
"scancode_text_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.LICENSE",
104+
"scancode_data_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.yml",
105+
"spdx_license_key": "LicenseRef-scancode-unknown-license-reference",
106+
"spdx_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.LICENSE",
107+
"start_line": 16,
108+
"end_line": 16,
109+
"matched_rule": {
110+
"identifier": "license-intro_26.RULE",
111+
"license_expression": "unknown-license-reference",
112+
"licenses": [
113+
"unknown-license-reference"
114+
],
115+
"is_license_text": false,
116+
"is_license_notice": false,
117+
"is_license_reference": false,
118+
"is_license_tag": false,
119+
"is_license_intro": true,
120+
"matcher": "2-aho",
121+
"rule_length": 3,
122+
"matched_length": 3,
123+
"match_coverage": 100.0,
124+
"rule_relevance": 16
125+
},
126+
"matched_text": "that contain `-tsl` in their name are licensed under the Timescale License."
127+
}
128+
],
129+
"license_expressions": [
130+
"unknown-license-reference",
131+
"unknown-license-reference"
132+
],
133+
"is_legal": true,
134+
"is_manifest": false,
135+
"is_readme": false,
136+
"is_top_level": false,
137+
"is_key_file": false,
138+
"is_license_text": false,
139+
"files_count": 0,
140+
"dirs_count": 0,
141+
"size_count": 0,
142+
"scan_errors": []
143+
},
144+
{
145+
"path": "Issues/timescaledb-master-scripts-c_license_header-timescale.h",
146+
"type": "file",
147+
"name": "iptables-xml.c",
148+
"base_name": "iptables-xml",
149+
"extension": ".c",
150+
"size": 20302,
151+
"date": "2010-10-29",
152+
"sha1": "6c574556090271b9802c6e10ee1163fbf157bcb5",
153+
"md5": "266a65603e63fb1678e926f6adf48d64",
154+
"mime_type": "text/x-c",
155+
"file_type": "C source, ASCII text",
156+
"programming_language": "C++",
157+
"is_binary": false,
158+
"is_text": true,
159+
"is_archive": false,
160+
"is_media": false,
161+
"is_source": true,
162+
"is_script": false,
163+
"licenses": [
164+
{
165+
"key": "unknown-license-reference",
166+
"score": 16.0,
167+
"name": "Unknown License file reference",
168+
"short_name": "Unknown License reference",
169+
"category": "Unstated License",
170+
"is_exception": false,
171+
"owner": "Unspecified",
172+
"homepage_url": null,
173+
"text_url": "",
174+
"reference_url": "https://scancode-licensedb.aboutcode.org/unknown-license-reference",
175+
"scancode_text_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.LICENSE",
176+
"scancode_data_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.yml",
177+
"spdx_license_key": "LicenseRef-scancode-unknown-license-reference",
178+
"spdx_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.LICENSE",
179+
"start_line": 2,
180+
"end_line": 2,
181+
"matched_rule": {
182+
"identifier": "license-intro_26.RULE",
183+
"license_expression": "unknown-license-reference",
184+
"licenses": [
185+
"unknown-license-reference"
186+
],
187+
"is_license_text": false,
188+
"is_license_notice": false,
189+
"is_license_reference": false,
190+
"is_license_tag": false,
191+
"is_license_intro": true,
192+
"matcher": "2-aho",
193+
"rule_length": 3,
194+
"matched_length": 3,
195+
"match_coverage": 100.0,
196+
"rule_relevance": 16
197+
},
198+
"matched_text": " * This file and its contents are licensed under the Timescale License."
199+
}
200+
],
201+
"license_expressions": [
202+
"unknown-license-reference"
203+
],
204+
"is_legal": false,
205+
"is_manifest": false,
206+
"is_readme": false,
207+
"is_top_level": false,
208+
"is_key_file": false,
209+
"is_license_text": false,
210+
"files_count": 0,
211+
"dirs_count": 0,
212+
"size_count": 0,
213+
"scan_errors": []
214+
}
215+
]
216+
}

0 commit comments

Comments
 (0)