Add code demonstrating how to retrieve file-level license and copyright data and convert the info into CSV format

Glenn Snyder · Glenn Snyder · commit c807197f9a1a · 2020-06-15T15:55:52.000-04:00
diff --git a/examples/convert_bom_component_origin_info_to_csv.py b/examples/convert_bom_component_origin_info_to_csv.py
@@ -0,0 +1,108 @@
+"""Convert the JSON produced by get_bom_component_origin_info.py into CSV.
+
+Reads the JSON document from stdin (or the file given with -f) and writes one
+CSV row per matched file and, with -u, one row per un-matched file.
+"""
+import argparse
+import csv
+import logging
+import json
+import os.path
+import sys
+import urllib.parse
+
+parser = argparse.ArgumentParser("Process the JSON output from get_bom_component_origin_info.py to create CSV output format")
+parser.add_argument("-f", "--origin_info", help="By default, program reads JSON doc from stdin, but you can alternatively give a file name")
+parser.add_argument("-u", "--un_matched_files", action="store_true", help="Include un-matched files in the output")
+parser.add_argument("output_file")
+
+args = parser.parse_args()
+
+logging.basicConfig(format='%(asctime)s%(levelname)s:%(message)s', stream=sys.stderr, level=logging.DEBUG)
+logging.getLogger("requests").setLevel(logging.WARNING)
+logging.getLogger("urllib3").setLevel(logging.WARNING)
+
+if args.origin_info:
+    # Use a context manager so the input file handle is closed once parsed.
+    with open(args.origin_info, 'r') as origin_info_file:
+        origin_info = json.load(origin_info_file)
+else:
+    origin_info = json.load(sys.stdin)
+
+# newline='' is what the csv module asks for when opening the output file;
+# without it, platforms that write '\r\n' line endings add an extra '\r' per row.
+with open(args.output_file, 'w', newline='') as csv_file:
+    columns = [
+        'component',
+        'component modified',
+        'file path',
+        'file name',
+        'archive context',
+        'usage(s)',
+        'license(s)',
+        'match type(s)',
+        'scan (code location)',
+    ]
+    writer = csv.DictWriter(csv_file, fieldnames=columns)
+    writer.writeheader()
+
+    for component, component_info in origin_info.items():
+        if component == 'un_matched_files':
+            # Not a component entry; un-matched files are written after this loop.
+            continue
+        logging.debug(f"Writing info for {component}")
+        matched_files = component_info.get('matched_files', [])
+        if not matched_files:
+            continue
+
+        # These values are the same for every file of a component, so build them once.
+        bom_component_info = component_info['bom_component_info']
+        component_modified = bom_component_info.get('componentModified', None)
+        licenses = ",".join(lic['licenseDisplay'] for lic in bom_component_info['licenses'])
+        match_types = ",".join(bom_component_info['matchTypes'])
+
+        for matched_file_info in matched_files:
+            file_path_info = matched_file_info['filePath']
+            writer.writerow({
+                'component': component,
+                'component modified': component_modified,
+                'file path': file_path_info['path'],
+                'file name': file_path_info['fileName'],
+                'archive context': file_path_info['archiveContext'],
+                'usage(s)': ",".join(matched_file_info['usages']),
+                'license(s)': licenses,
+                'match type(s)': match_types,
+                'scan (code location)': matched_file_info.get('scan', {}).get('name', 'unknown'),
+            })
+
+    if args.un_matched_files:
+        # The key may be absent (e.g. the JSON was generated without un-matched
+        # file info); fall back to an empty list so -u does not iterate over None.
+        for un_matched_file in origin_info.get('un_matched_files') or []:
+            uri = urllib.parse.unquote(un_matched_file['uri'])
+            parsed = urllib.parse.urlparse(uri)
+            if parsed.scheme == 'zip':
+                # For zip URIs the fragment is the file path and the URI path is the archive.
+                file_path = parsed.fragment
+                file_name = os.path.basename(parsed.fragment)
+                archive_context = parsed.path
+            elif parsed.scheme == 'file':
+                file_path = parsed.path
+                file_name = os.path.basename(parsed.path)
+                archive_context = None
+            else:
+                file_path = "unrecognized"
+                file_name = "unrecognized"
+                archive_context = "unrecognized scheme"
+
+            writer.writerow({
+                'component': None,
+                'component modified': None,
+                'file path': file_path,
+                'file name': file_name,
+                'archive context': archive_context,
+                'usage(s)': None,
+                'license(s)': None,
+                'match type(s)': "Un-matched/Un-identified",
+                'scan (code location)': un_matched_file.get('scan', {}).get('name', 'unknown'),
+            })
diff --git a/examples/get_bom_component_origin_info.py b/examples/get_bom_component_origin_info.py
@@ -11,6 +11,11 @@
 parser = argparse.ArgumentParser("Retreive BOM component license information for the given project and version")
 parser.add_argument("project_name")
 parser.add_argument("version")
+parser.add_argument("-l", "--deep_license_info", action="store_true")
+parser.add_argument("-c", "--copyright_info", action="store_true")
+parser.add_argument("-m", "--matched_files", action="store_true")
+parser.add_argument("-u", "--un_matched_files", action="store_true")
+
 
 args = parser.parse_args()
 
@@ -20,7 +25,7 @@
 project = hub.get_project_by_name(args.project_name)
 version = hub.get_version_by_name(project, args.version)
 
-bom_components = hub.get_version_components(version)
+bom_components = hub.get_version_components(version).get('items', [])
 
 all_origins = dict()
 
@@ -30,32 +35,89 @@
 
 all_origin_info = {}
 
-for bom_component in bom_components['items']:
-	component_url = bom_component['component']
-	response = hub.execute_get(component_url)
+scan_cache = {}
+
+for bom_component in bom_components:
+	if 'componentVersionName' in bom_component:
+		bom_component_name = f"{bom_component['componentName']}:{bom_component['componentVersionName']}"
+	else:
+		bom_component_name = f"{bom_component['componentName']}"
 
 	# Component details include the home page url and additional home pages
-	logging.debug("Retrieving component home page info for {}:{}".format(
-		bom_component['componentName'], bom_component['componentVersionName']))
-	component_details = None
-	if response.status_code == 200:
-		component_details = response.json()
+	component_url = bom_component['component']
+	component_details = hub.execute_get(component_url).json()
 
+	#
+	# Grab origin info, file-level license info, and file-level copyright info
+	#
+	all_origin_details = list()
 	for origin in bom_component.get('origins', []):
-		logging.debug("Retrieving origin details for origin {}".format(origin['name']))
+		logging.debug(f"Retrieving origin details for {bom_component_name} and origin {origin['name']}")
 		origin_url = hub.get_link(origin, 'origin')
-		response = hub.execute_get(origin_url)
-		origin_details = None
-		if response.status_code == 200:
-			origin_details = response.json()
-
-		all_origin_info.update({
-				"{}:{}".format(bom_component['componentName'], bom_component['componentVersionName']): {
-					"component_details": component_details,
-					"component_home_page": component_details.get("url"),
-					"additional_home_pages": component_details.get("additionalHomepages"),
-					"origin_details": origin_details,
-				}
+		origin_details = hub.execute_get(origin_url).json()
+
+		#
+		# Add deep license info and copyright info, as appropriate
+		#
+		info_to_get = []
+		if args.deep_license_info:
+			info_to_get.extend([
+					("file-licenses", "file_licenses"),
+					("file-licenses-fuzzy", "file_licenses_fuzzy")
+				])
+
+		if args.copyright_info:
+			info_to_get.extend([
+					("file-copyrights", "file_copyrights"),
+					("component-origin-copyrights", "component_origin_copyrights")
+				])
+		for link_t in info_to_get:
+			link_name = link_t[0]
+			k = link_t[1]
+			logging.debug(f"Retrieving {link_name} for {bom_component_name}")
+			url = hub.get_link(origin_details, link_name)
+			info = hub.execute_get(url).json().get('items', [])
+			origin_details[k] = info
+
+		all_origin_details.append(origin_details)
+
+	all_origin_info.update({
+			bom_component_name: {
+				"bom_component_info": bom_component,
+				"component_details": component_details,
+				"component_home_page": component_details.get("url"),
+				"additional_home_pages": component_details.get("additionalHomepages"),
+				"all_origin_details": all_origin_details,
+			}
+		})
+
+	if args.matched_files:
+		logging.debug(f"Retrieving matched files for {bom_component_name}")
+		matched_files_url = hub.get_link(bom_component, "matched-files") + "?limit=99999"
+		matched_files = hub.execute_get(matched_files_url).json().get('items', [])
+		# Get scan info
+		for matched_file in matched_files:
+			scan_url = hub.get_link(matched_file, "codelocations")
+			if scan_url in scan_cache:
+				scan = scan_cache[scan_url]
+			else:
+				scan = hub.execute_get(scan_url).json()
+				scan_cache[scan_url] = scan
+			matched_file['scan'] = scan
+		all_origin_info[bom_component_name].update({
+				'matched_files': matched_files
 			})
 
+if args.un_matched_files:
+	# TODO: Probably need to loop on this with smaller page sizes to handle very large
+	# project-versions with many (signature) scans mapped to it
+	#
+	logging.debug(f"Retrieving un-matched files for project {project['name']}, version {version['versionName']}")
+	un_matched_files_url = f"{version['_meta']['href']}/matched-files?limit=99999&filter=bomMatchType:unmatched"
+	un_matched_files = hub.execute_get(un_matched_files_url).json().get('items', [])
+	logging.debug(f"Adding {len(un_matched_files)} un-matched files to the output")
+	all_origin_info.update({
+			'un_matched_files': un_matched_files
+		})
+
 print(json.dumps(all_origin_info))