Skip to content

Commit d30669c

Browse files
committed
Load v32 and later of ScanCode
1 parent ab27d13 commit d30669c

File tree

3 files changed

+146
-18
lines changed

3 files changed

+146
-18
lines changed

src/fosslight_source/_license_matched.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
logger = logging.getLogger(constant.LOGGER_NAME)
99
HEADER = ['No', 'Category', 'License',
1010
'Matched Text', 'File Count', 'Files']
11+
HEADER_32_LATER = ['No', 'License', 'Matched Text',
12+
'File Count', 'Files']
1113
LOW_PRIORITY = ['Permissive', 'Public Domain']
1214

1315

@@ -44,14 +46,18 @@ def set_category(self, value):
4446
def set_matched_text(self, value):
4547
self.matched_text = value
4648

47-
def get_row_to_print(self, result_for_32_earlier=True):
    """Return this matched license as one table row (list of cells).

    :param result_for_32_earlier: when true, the row starts with the
        license category; otherwise the category column is left out.
    :return: [category,] license, matched text, file count (as a string)
        and the comma-joined file list.
    """
    # Cells shared by both row layouts.
    common_cells = [self.license, self.matched_text, str(len(self.files)), ','.join(self.files)]
    if not result_for_32_earlier:
        return common_cells
    return [self.category] + common_cells
5055

5156

5257
def get_license_list_to_print(license_list):
    """Build the printable table (header row first) for the matched licenses.

    :param license_list: dict whose values expose ``priority``, ``category``,
        ``license`` and ``get_row_to_print()``.
    :return: list of rows; the first row is ``HEADER`` when at least one item
        carries a category, otherwise ``HEADER_32_LATER`` (no category column).
    """
    license_items = list(license_list.values())
    # A non-empty category on any item selects the layout with a category column.
    # The generator lets any() stop at the first hit.
    result_for_32_earlier = any(item.category for item in license_items)
    license_items.sort(key=lambda row: (row.priority, row.category, row.license))
    header = HEADER if result_for_32_earlier else HEADER_32_LATER
    return [header] + [item.get_row_to_print(result_for_32_earlier) for item in license_items]

src/fosslight_source/_parsing_scancode_file_item.py

Lines changed: 135 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,15 @@
1919
_exclude_directory = [os.path.sep + dir_name +
2020
os.path.sep for dir_name in _exclude_directory]
2121
_exclude_directory.append("/.")
22-
remove_license = ["warranty-disclaimer"]
23-
22+
REMOVE_LICENSE = ["warranty-disclaimer"]
23+
regex = re.compile(r'licenseref-(\S+)', re.IGNORECASE)
24+
find_word = re.compile(rb"SPDX-PackageDownloadLocation\s*:\s*(\S+)", re.IGNORECASE)
25+
KEYWORD_SPDX_ID = r'SPDX-License-Identifier\s*[\S]+'
26+
KEYWORD_DOWNLOAD_LOC = r'DownloadLocation\s*[\S]+'
27+
KEYWORD_SCANCODE_UNKNOWN = "unknown-spdx"
28+
SPDX_REPLACE_WORDS = ["(", ")"]
29+
KEY_AND = "and"
30+
KEY_OR = "or"
2431

2532
def get_error_from_header(header_item):
2633
has_error = False
@@ -41,17 +48,13 @@ def get_error_from_header(header_item):
4148
return has_error, str_error
4249

4350

44-
def parsing_file_item(scancode_file_list, has_error, path_to_scan, need_matched_license=False):
45-
51+
def parsing_scancode_32_earlier(scancode_file_list, path_to_scan, has_error=False):
4652
rc = True
53+
msg = []
4754
scancode_file_item = []
4855
license_list = {} # Key :[license]+[matched_text], value: MatchedLicense()
49-
msg = []
50-
5156
prev_dir = ""
5257
prev_dir_value = False
53-
regex = re.compile(r'licenseref-(\S+)', re.IGNORECASE)
54-
find_word = re.compile(rb"SPDX-PackageDownloadLocation\s*:\s*(\S+)", re.IGNORECASE)
5558

5659
if scancode_file_list:
5760
for file in scancode_file_list:
@@ -100,8 +103,8 @@ def parsing_file_item(scancode_file_list, has_error, path_to_scan, need_matched_
100103
copyright_data = x.get("value", "")
101104
if copyright_data:
102105
try:
103-
copyright_data = re.sub(r'SPDX-License-Identifier\s*[\S]+', '', copyright_data, flags=re.I)
104-
copyright_data = re.sub(r'DownloadLocation\s*[\S]+', '', copyright_data, flags=re.I).strip()
106+
copyright_data = re.sub(KEYWORD_SPDX_ID, '', copyright_data, flags=re.I)
107+
copyright_data = re.sub(KEYWORD_DOWNLOAD_LOC, '', copyright_data, flags=re.I).strip()
105108
except Exception:
106109
pass
107110
copyright_value_list.append(copyright_data)
@@ -122,7 +125,7 @@ def parsing_file_item(scancode_file_list, has_error, path_to_scan, need_matched_
122125
for lic_item in licenses:
123126
license_value = ""
124127
key = lic_item.get("key", "")
125-
if key in remove_license:
128+
if key in REMOVE_LICENSE:
126129
if key in license_expression_list:
127130
license_expression_list.remove(key)
128131
continue
@@ -139,7 +142,7 @@ def parsing_file_item(scancode_file_list, has_error, path_to_scan, need_matched_
139142
license_value = spdx.lower()
140143

141144
if license_value != "":
142-
if key == "unknown-spdx":
145+
if key == KEYWORD_SCANCODE_UNKNOWN:
143146
try:
144147
matched_txt = lic_item.get("matched_text", "")
145148
matched = regex.search(matched_txt)
@@ -154,7 +157,7 @@ def parsing_file_item(scancode_file_list, has_error, path_to_scan, need_matched_
154157
license_detected.append(license_value)
155158

156159
# Add matched licenses
157-
if need_matched_license and "category" in lic_item:
160+
if "category" in lic_item:
158161
lic_category = lic_item["category"]
159162
if "matched_text" in lic_item:
160163
lic_matched_text = lic_item["matched_text"]
@@ -184,3 +187,122 @@ def parsing_file_item(scancode_file_list, has_error, path_to_scan, need_matched_
184187
rc = False
185188
msg = list(set(msg))
186189
return rc, scancode_file_item, msg, license_list
190+
191+
192+
def split_spdx_expression(spdx_string):
    """Split a license expression into its individual license keys.

    Parentheses are removed and the expression is cut at the ``and`` / ``or``
    operators. Only operators that stand alone (surrounded by whitespace) are
    treated as separators, so keys that merely contain those letters, such as
    ``historical`` or ``standard-ml-nj``, are kept intact.

    :param spdx_string: license expression, e.g. ``"(mit or bsd-new) and apache-2.0"``.
    :return: list of stripped, non-empty license keys in order of appearance.
    """
    for bracket in ("(", ")"):
        spdx_string = spdx_string.replace(bracket, "")
    # Operators are matched case-insensitively so SPDX-style "AND"/"OR" split too.
    pieces = re.split(r"\s+(?:and|or)\s+", spdx_string, flags=re.IGNORECASE)
    return [piece.strip() for piece in pieces if piece.strip()]
199+
200+
201+
def parsing_scancode_32_later(scancode_file_list, path_to_scan, has_error=False):
    """Parse file entries that use the ``license_detections`` result layout.

    :param scancode_file_list: iterable of per-file dicts from the scan result
        (may be empty or None).
    :param path_to_scan: root directory joined with each entry's ``path`` when
        the file has to be re-read for SPDX-PackageDownloadLocation values.
    :param has_error: when true, entries with ``scan_errors`` are recorded with
        the errors as their comment and not parsed any further.
    :return: tuple ``(rc, scancode_file_item, msg, license_list)`` where
        ``scancode_file_item`` is the list of ScanItem objects, ``msg`` collects
        non-fatal problems and ``license_list`` maps license + matched text to
        a MatchedLicense.
    """
    rc = True
    msg = []
    scancode_file_item = []
    license_list = {}  # Key :[license]+[matched_text], value: MatchedLicense()

    for file in scancode_file_list or []:
        file_path = file.get("path", "")
        is_binary = file.get("is_binary", False)
        is_dir = file.get("type", "") == "directory"
        if (not file_path) or is_binary or is_dir:
            continue

        result_item = ScanItem(file_path)

        if has_error:
            error_msg = file.get("scan_errors", [])
            if error_msg:
                result_item.comment = ",".join(error_msg)
                scancode_file_item.append(result_item)
                continue

        url_list = []
        if file.get("urls", []):
            # Re-read the file itself to pick up SPDX-PackageDownloadLocation values.
            # A missing, unreadable, empty or non-UTF-8 file must not abort the
            # whole parse, so the problem is reported through msg instead.
            try:
                with open(os.path.join(path_to_scan, file_path), "rb") as f:
                    with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mmap_obj:
                        url_list = [word.decode('utf-8') for word in find_word.findall(mmap_obj)]
            except (OSError, ValueError) as ex:
                msg.append("Failed to read download location from {}: {}".format(file_path, ex))
        result_item.download_location = url_list

        copyright_value_list = []
        for x in file.get("copyrights", []):
            copyright_data = x.get("copyright", "")
            if copyright_data:
                try:
                    # Drop SPDX tags that were captured as part of the copyright text.
                    copyright_data = re.sub(KEYWORD_SPDX_ID, '', copyright_data, flags=re.I)
                    copyright_data = re.sub(KEYWORD_DOWNLOAD_LOC, '', copyright_data, flags=re.I).strip()
                except TypeError:
                    # Non-string value: keep it as reported.
                    pass
                copyright_value_list.append(copyright_data)
        result_item.copyright = copyright_value_list

        license_detected = []
        licenses = file.get("license_detections", [])
        if not licenses:
            # NOTE(review): files without any license detection are not added to
            # the result at all, so their copyright/download location is dropped.
            # Kept as in the original; confirm this is intended.
            continue
        for lic in licenses:
            for matched_lic in lic.get("matches", []):
                found_lic_list = matched_lic.get("license_expression", "")
                matched_txt = matched_lic.get("matched_text", "")
                if not found_lic_list:
                    continue
                for found_lic in split_spdx_expression(found_lic_list.lower()):
                    # Strip before the emptiness check so a whitespace-only
                    # fragment is never recorded as an empty license.
                    found_lic = found_lic.strip()
                    if not found_lic or found_lic in REMOVE_LICENSE:
                        continue
                    if found_lic == KEYWORD_SCANCODE_UNKNOWN:
                        # Prefer the "licenseref-..." token found in the matched text.
                        try:
                            matched = regex.search(matched_txt.lower())
                            if matched:
                                found_lic = str(matched.group())
                        except (AttributeError, TypeError):
                            # matched_text missing or not a string: keep the key.
                            pass
                    for word in replace_word:
                        found_lic = found_lic.replace(word, "")
                    if matched_txt:
                        lic_matched_key = found_lic + matched_txt
                        if lic_matched_key in license_list:
                            license_list[lic_matched_key].set_files(file_path)
                        else:
                            # Category is left empty for this result layout.
                            license_list[lic_matched_key] = MatchedLicense(found_lic, "", matched_txt, file_path)
                    license_detected.append(found_lic)

        result_item.licenses = license_detected
        if len(license_detected) > 1:
            # With several licenses, keep the file-level expression as the comment,
            # preferring the SPDX form when it is present.
            license_expression = (file.get("detected_license_expression_spdx", "")
                                  or file.get("detected_license_expression", ""))
            if license_expression:
                result_item.comment = license_expression

        result_item.exclude = is_exclude_file(file_path)
        result_item.is_license_text = file.get("percentage_of_license_text", 0) > 90
        scancode_file_item.append(result_item)

    return rc, scancode_file_item, msg, license_list
291+
292+
293+
def parsing_file_item(scancode_file_list, has_error, path_to_scan, need_matched_license=False):
    """Parse scan results, choosing the parser that fits the result layout.

    The layout is detected from the first entry: a ``licenses`` key selects
    ``parsing_scancode_32_earlier``; anything else goes to
    ``parsing_scancode_32_later``.

    :param scancode_file_list: list of per-file dicts (may be empty or None).
    :param has_error: forwarded to the selected parser.
    :param path_to_scan: root directory of the scanned files.
    :param need_matched_license: when false, the matched-license dict is
        replaced with an empty dict in the returned tuple.
    :return: tuple ``(rc, scancode_file_item, msg, license_list)``.
    """
    first_item = next(iter(scancode_file_list or []), None)
    # An empty/None list has no first entry; guard against None so the
    # membership test cannot raise TypeError. The 32+ parser then simply
    # returns an empty result for the empty input.
    if first_item is not None and "licenses" in first_item:
        parser = parsing_scancode_32_earlier
    else:
        parser = parsing_scancode_32_later
    rc, scancode_file_item, msg, license_list = parser(scancode_file_list, path_to_scan, has_error)

    if not need_matched_license:
        license_list = {}
    return rc, scancode_file_item, msg, license_list
308+

src/fosslight_source/_scan_item.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import fosslight_util.constant as constant
99

1010
logger = logging.getLogger(constant.LOGGER_NAME)
11-
replace_word = ["-only", "-old-style", "-or-later", "licenseref-scancode-"]
11+
replace_word = ["-only", "-old-style", "-or-later", "licenseref-scancode-", "licenseref-"]
1212
_exclude_filename = ["changelog", "config.guess", "config.sub",
1313
"config.h.in", "changes", "ltmain.sh",
1414
"aclocal.m4", "configure", "configure.ac",

0 commit comments

Comments
 (0)