Skip to content

Commit b491f9d

Browse files
Merge pull request #131 from fosslight/32
Load v32 and later of ScanCode
2 parents 84a006b + 88a323a commit b491f9d

File tree

3 files changed

+150
-18
lines changed

3 files changed

+150
-18
lines changed

src/fosslight_source/_license_matched.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
logger = logging.getLogger(constant.LOGGER_NAME)
99
HEADER = ['No', 'Category', 'License',
1010
'Matched Text', 'File Count', 'Files']
11+
HEADER_32_LATER = ['No', 'License', 'Matched Text',
12+
'File Count', 'Files']
1113
LOW_PRIORITY = ['Permissive', 'Public Domain']
1214

1315

@@ -44,14 +46,18 @@ def set_category(self, value):
4446
def set_matched_text(self, value):
4547
self.matched_text = value
4648

47-
def get_row_to_print(self):
48-
print_rows = [self.category, self.license, self.matched_text, str(len(self.files)), ','.join(self.files)]
49+
def get_row_to_print(self, result_for_32_earlier=True):
    """Return this matched license as a list of printable cell values.

    The row always holds the license, the matched text, the number of files
    (as a string) and the comma-joined file list, in that order. When
    ``result_for_32_earlier`` is true the category is placed in front of
    those columns; otherwise the category column is left out.
    """
    file_count = str(len(self.files))
    joined_files = ','.join(self.files)
    row = [self.license, self.matched_text, file_count, joined_files]
    if result_for_32_earlier:
        # The category column is only emitted for the older result layout.
        row.insert(0, self.category)
    return row
5055

5156

5257
def get_license_list_to_print(license_list):
    """Convert the matched-license mapping into printable rows.

    :param license_list: mapping whose values expose ``priority``,
        ``category``, ``license`` and ``get_row_to_print()``.
    :return: list of rows; the first row is the header, followed by one row
        per license sorted by (priority, category, license).

    The layout with a 'Category' column (``HEADER``) is used when at least
    one license has a non-empty category; otherwise the shorter
    ``HEADER_32_LATER`` layout is used.
    """
    license_items = sorted(license_list.values(),
                           key=lambda row: (row.priority, row.category, row.license))
    # Generator form: stops at the first license that carries a category.
    result_for_32_earlier = any(lic_item.category for lic_item in license_items)
    header = HEADER if result_for_32_earlier else HEADER_32_LATER
    # Copy the header so that callers editing the returned rows cannot
    # modify the shared module-level constant.
    license_rows = [list(header)]
    license_rows.extend(lic_item.get_row_to_print(result_for_32_earlier)
                        for lic_item in license_items)
    return license_rows

src/fosslight_source/_parsing_scancode_file_item.py

Lines changed: 139 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,15 @@
1919
_exclude_directory = [os.path.sep + dir_name +
2020
os.path.sep for dir_name in _exclude_directory]
2121
_exclude_directory.append("/.")
22-
remove_license = ["warranty-disclaimer"]
22+
REMOVE_LICENSE = ["warranty-disclaimer"]
23+
regex = re.compile(r'licenseref-(\S+)', re.IGNORECASE)
24+
find_word = re.compile(rb"SPDX-PackageDownloadLocation\s*:\s*(\S+)", re.IGNORECASE)
25+
KEYWORD_SPDX_ID = r'SPDX-License-Identifier\s*[\S]+'
26+
KEYWORD_DOWNLOAD_LOC = r'DownloadLocation\s*[\S]+'
27+
KEYWORD_SCANCODE_UNKNOWN = "unknown-spdx"
28+
SPDX_REPLACE_WORDS = ["(", ")"]
29+
KEY_AND = "and"
30+
KEY_OR = "or"
2331

2432

2533
def get_error_from_header(header_item):
@@ -41,17 +49,13 @@ def get_error_from_header(header_item):
4149
return has_error, str_error
4250

4351

44-
def parsing_file_item(scancode_file_list, has_error, path_to_scan, need_matched_license=False):
45-
52+
def parsing_scancode_32_earlier(scancode_file_list, path_to_scan, has_error=False):
4653
rc = True
54+
msg = []
4755
scancode_file_item = []
4856
license_list = {} # Key :[license]+[matched_text], value: MatchedLicense()
49-
msg = []
50-
5157
prev_dir = ""
5258
prev_dir_value = False
53-
regex = re.compile(r'licenseref-(\S+)', re.IGNORECASE)
54-
find_word = re.compile(rb"SPDX-PackageDownloadLocation\s*:\s*(\S+)", re.IGNORECASE)
5559

5660
if scancode_file_list:
5761
for file in scancode_file_list:
@@ -100,8 +104,8 @@ def parsing_file_item(scancode_file_list, has_error, path_to_scan, need_matched_
100104
copyright_data = x.get("value", "")
101105
if copyright_data:
102106
try:
103-
copyright_data = re.sub(r'SPDX-License-Identifier\s*[\S]+', '', copyright_data, flags=re.I)
104-
copyright_data = re.sub(r'DownloadLocation\s*[\S]+', '', copyright_data, flags=re.I).strip()
107+
copyright_data = re.sub(KEYWORD_SPDX_ID, '', copyright_data, flags=re.I)
108+
copyright_data = re.sub(KEYWORD_DOWNLOAD_LOC, '', copyright_data, flags=re.I).strip()
105109
except Exception:
106110
pass
107111
copyright_value_list.append(copyright_data)
@@ -122,7 +126,7 @@ def parsing_file_item(scancode_file_list, has_error, path_to_scan, need_matched_
122126
for lic_item in licenses:
123127
license_value = ""
124128
key = lic_item.get("key", "")
125-
if key in remove_license:
129+
if key in REMOVE_LICENSE:
126130
if key in license_expression_list:
127131
license_expression_list.remove(key)
128132
continue
@@ -139,9 +143,9 @@ def parsing_file_item(scancode_file_list, has_error, path_to_scan, need_matched_
139143
license_value = spdx.lower()
140144

141145
if license_value != "":
142-
if key == "unknown-spdx":
146+
if key == KEYWORD_SCANCODE_UNKNOWN:
143147
try:
144-
matched_txt = lic_item.get("matched_text", "")
148+
matched_txt = lic_item.get("matched_text", "").lower()
145149
matched = regex.search(matched_txt)
146150
if matched:
147151
license_value = str(matched.group())
@@ -154,7 +158,7 @@ def parsing_file_item(scancode_file_list, has_error, path_to_scan, need_matched_
154158
license_detected.append(license_value)
155159

156160
# Add matched licenses
157-
if need_matched_license and "category" in lic_item:
161+
if "category" in lic_item:
158162
lic_category = lic_item["category"]
159163
if "matched_text" in lic_item:
160164
lic_matched_text = lic_item["matched_text"]
@@ -184,3 +188,125 @@ def parsing_file_item(scancode_file_list, has_error, path_to_scan, need_matched_
184188
rc = False
185189
msg = list(set(msg))
186190
return rc, scancode_file_item, msg, license_list
191+
192+
193+
def split_spdx_expression(spdx_string):
    """Split a license expression into its individual license keys.

    Parentheses are discarded and the expression is split on the lowercase
    ``and`` / ``or`` operators. An operator is only recognised when it is
    surrounded by whitespace, so keys that merely contain those letters
    (e.g. ``historical``, ``bsd-original``, ``standard-ml-nj``,
    ``gpl-2.0-or-later``) are kept intact instead of being cut apart.

    :param spdx_string: license expression, expected in lowercase.
    :return: list of stripped, non-empty license keys in expression order;
        an empty list for an empty expression.
    """
    if not spdx_string:
        return []
    # Parentheses become spaces so "(a)and(b)" still exposes the operator.
    flattened = spdx_string.replace("(", " ").replace(")", " ")
    tokens = re.split(r"\s+(?:and|or)\s+", flattened)
    return [token.strip() for token in tokens if token.strip()]
200+
201+
202+
def parsing_scancode_32_later(scancode_file_list, path_to_scan, has_error=False):
    """Parse file entries that use the ``license_detections`` result layout.

    :param scancode_file_list: iterable of per-file result dicts (may be
        empty or None).
    :param path_to_scan: root directory joined with each entry's ``path``
        when the file has to be re-read for download locations.
    :param has_error: when true, an entry with ``scan_errors`` is recorded
        with the joined errors as its comment and not parsed further.
    :return: tuple ``(rc, scancode_file_item, msg, license_list)`` where
        ``rc`` is False if any entry raised while being parsed,
        ``scancode_file_item`` is the list of ScanItem results, ``msg`` is
        the list of distinct error messages and ``license_list`` maps
        ``license + matched_text`` to a MatchedLicense.
    """
    rc = True
    msg = []
    scancode_file_item = []
    license_list = {}  # Key :[license]+[matched_text], value: MatchedLicense()

    for file in scancode_file_list or []:
        try:
            file_path = file.get("path", "")
            is_binary = file.get("is_binary", False)
            is_dir = file.get("type", "") == "directory"
            if (not file_path) or is_binary or is_dir:
                continue

            result_item = ScanItem(file_path)

            if has_error:
                error_msg = file.get("scan_errors", [])
                if error_msg:
                    result_item.comment = ",".join(error_msg)
                    scancode_file_item.append(result_item)
                    continue

            url_list = []
            if file.get("urls", []):
                # Only the descriptor is needed for mmap, and the pattern is a
                # bytes regex, so the file is opened in binary mode.
                with open(os.path.join(path_to_scan, file_path), "rb") as f:
                    with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mmap_obj:
                        url_list = [word.decode('utf-8') for word in find_word.findall(mmap_obj)]
            result_item.download_location = url_list

            copyright_value_list = []
            for x in file.get("copyrights", []):
                copyright_data = x.get("copyright", "")
                if not copyright_data:
                    continue
                try:
                    # Drop SPDX identifier / download location residue that
                    # ended up inside the copyright statement.
                    copyright_data = re.sub(KEYWORD_SPDX_ID, '', copyright_data, flags=re.I)
                    copyright_data = re.sub(KEYWORD_DOWNLOAD_LOC, '', copyright_data, flags=re.I).strip()
                except Exception:
                    # Best effort: keep the statement as it is when it cannot be cleaned.
                    pass
                copyright_value_list.append(copyright_data)
            result_item.copyright = copyright_value_list

            license_detected = []
            licenses = file.get("license_detections", [])
            if not licenses:
                # NOTE(review): an entry without license detections is skipped
                # entirely, so the copyrights and download locations gathered
                # above are not reported for it - confirm this is intended.
                continue
            for lic in licenses:
                for matched_lic in lic.get("matches", []):
                    found_lic_list = matched_lic.get("license_expression", "")
                    matched_txt = matched_lic.get("matched_text", "")
                    if not found_lic_list:
                        continue
                    for found_lic in split_spdx_expression(found_lic_list.lower()):
                        if not found_lic:
                            continue
                        found_lic = found_lic.strip()
                        if found_lic in REMOVE_LICENSE:
                            continue
                        if found_lic == KEYWORD_SCANCODE_UNKNOWN:
                            try:
                                # Prefer the "licenseref-..." name found in the matched text.
                                matched = regex.search(matched_txt.lower())
                                if matched:
                                    found_lic = str(matched.group())
                            except Exception:
                                # Best effort: keep the generic key when the text cannot be searched.
                                pass
                        for word in replace_word:
                            found_lic = found_lic.replace(word, "")
                        if matched_txt:
                            lic_matched_key = found_lic + matched_txt
                            if lic_matched_key in license_list:
                                license_list[lic_matched_key].set_files(file_path)
                            else:
                                license_list[lic_matched_key] = MatchedLicense(found_lic, "", matched_txt, file_path)
                        license_detected.append(found_lic)
            result_item.licenses = license_detected

            if len(license_detected) > 1:
                # With several licenses, record the whole expression as a
                # comment, preferring the SPDX form when it is available.
                license_expression = (file.get("detected_license_expression_spdx", "")
                                      or file.get("detected_license_expression", ""))
                if license_expression:
                    result_item.comment = license_expression

            result_item.exclude = is_exclude_file(file_path)
            result_item.is_license_text = file.get("percentage_of_license_text", 0) > 90
            scancode_file_item.append(result_item)
        except Exception as ex:
            msg.append(f"Error Parsing item: {ex}")
            rc = False

    # Report each distinct error once (first-seen order), as the parser for
    # the older layout also de-duplicates its messages.
    msg = list(dict.fromkeys(msg))
    return rc, scancode_file_item, msg, license_list
296+
297+
298+
def parsing_file_item(scancode_file_list, has_error, path_to_scan, need_matched_license=False):
    """Parse scan results with the parser matching their layout.

    The layout is detected from the first entry: if it has a ``licenses``
    key the pre-v32 parser is used, otherwise the v32-and-later parser.

    :param scancode_file_list: iterable of per-file result dicts (may be
        empty or None).
    :param has_error: forwarded to the parser as its ``has_error`` flag.
    :param path_to_scan: forwarded to the parser as the scan root.
    :param need_matched_license: when false, the matched-license mapping is
        replaced by an empty dict in the result.
    :return: tuple ``(rc, scancode_file_item, msg, license_list)``.
    """
    first_item = next(iter(scancode_file_list or []), None)
    if first_item is None:
        # Nothing to parse; without this guard the membership test below
        # would raise TypeError on None.
        return True, [], [], {}

    if "licenses" in first_item:
        parser = parsing_scancode_32_earlier
    else:
        parser = parsing_scancode_32_later
    rc, scancode_file_item, msg, license_list = parser(scancode_file_list, path_to_scan, has_error)

    if not need_matched_license:
        license_list = {}
    return rc, scancode_file_item, msg, license_list

src/fosslight_source/_scan_item.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import fosslight_util.constant as constant
99

1010
logger = logging.getLogger(constant.LOGGER_NAME)
11-
replace_word = ["-only", "-old-style", "-or-later", "licenseref-scancode-"]
11+
replace_word = ["-only", "-old-style", "-or-later", "licenseref-scancode-", "licenseref-"]
1212
_exclude_filename = ["changelog", "config.guess", "config.sub",
1313
"config.h.in", "changes", "ltmain.sh",
1414
"aclocal.m4", "configure", "configure.ac",

0 commit comments

Comments
 (0)