adding license string search results, if present

Glenn Snyder · Glenn Snyder · commit 63a24eaf3928 · 2020-06-17T15:53:36.000-04:00
diff --git a/examples/convert_bom_component_origin_info_to_csv.py b/examples/convert_bom_component_origin_info_to_csv.py
@@ -9,12 +9,17 @@
 parser = argparse.ArgumentParser("Process the JSON output from get_bom_component_origin_info.py to create CSV output format")
 parser.add_argument("-f", "--origin_info", help="By default, program reads JSON doc from stdin, but you can alternatively give a file name")
 parser.add_argument("-u", "--un_matched_files", action="store_true", help="Include un-matched files in the output")
-parser.add_argument("-l", "--file_level_license", action="store_true", help="Include file level license data, if present")
-parser.add_argument("-c", "--file_level_copyright", action="store_true", help="Include file level copyright data, if present")
+parser.add_argument("-l", "--file_level_license", action="store_true", help="Include file level license data (aka deep license data from the Black Duck KB), if present")
+parser.add_argument("-c", "--file_level_copyright", action="store_true", help="Include file level copyright data (aka copyright data from the Black Duck KB), if present")
+parser.add_argument("-s", "--string_search", action="store_true", help="Include any licenses found via string search (i.e. --detect.blackduck.signature.scanner.license.search=true)")
+parser.add_argument("-a", "--all", action="store_true", help="Shortcut for including everything (i.e. all of it)")
 parser.add_argument("output_file")
 
 args = parser.parse_args()
 
+if args.all:
+    args.un_matched_files = args.file_level_license = args.file_level_copyright = args.string_search = True
+
 logging.basicConfig(format='%(asctime)s%(levelname)s:%(message)s', stream=sys.stderr, level=logging.DEBUG)
 logging.getLogger("requests").setLevel(logging.WARNING)
 logging.getLogger("urllib3").setLevel(logging.WARNING)
@@ -35,17 +40,22 @@
         'source',
         'origin(s)',
         'origin_id(s)',
-        'copyright'
+        'copyright',
+        'match type(s)',
+        'codelocation'
     ]
     writer = csv.DictWriter(csv_file, fieldnames=columns)
     writer.writeheader()
 
     for component, component_info in origin_info.items():
-        if component == 'un_matched_files':
-            # ignore, skip un_matched_files
+        if component in ['un_matched_files', 'license_search_results']:
+            # ignore, skip un_matched_files and license search results
+            # since they are not components but other sections of the JSON doc
+            #
             continue
         logging.debug(f"Writing info for {component}")
         for matched_file_info in component_info.get('matched_files', []):
+            # one CSV row is written per matched file, attributed to its component
             row = {
                 'component': component,
                 'file path': matched_file_info['filePath']['path'],
@@ -57,6 +67,8 @@
                 'origin(s)': ",".join([o['externalNamespace'] for o in component_info['bom_component_info']['origins']]),
                 'origin_id(s)': ",".join([o.get('externalId', "") for o in component_info['bom_component_info']['origins']]),
                 'copyright': None,
+                'match type(s)': ",".join(component_info['bom_component_info'].get('matchTypes', [])),
+                'codelocation': matched_file_info['scan']['name'],
             }
             writer.writerow(row)
 
@@ -75,7 +87,9 @@
                             'source': 'KB',
                             'origin(s)': origin.get('originName'),
                             'origin_id(s)': origin.get('originId'),
-                            'copyright': None
+                            'copyright': None,
+                            'match type(s)': 'From KB',
+                            'codelocation': None,
                         }
                         writer.writerow(row)
 
@@ -93,11 +107,13 @@
                             'origin(s)': origin.get('originName'),
                             'origin_id(s)': origin.get('originId'),
                             'copyright': copyright['matchData'].replace('\n', ''),
+                            'match type(s)': 'From KB',
+                            'codelocation': None
                         }
                         writer.writerow(row)
 
     if args.un_matched_files:
-        for un_matched_file in origin_info.get('un_matched_files'):
+        for un_matched_file in origin_info.get('un_matched_files', []):
             uri = urllib.parse.unquote(un_matched_file['uri'])
             parsed = urllib.parse.urlparse(uri)
             if parsed.scheme == 'zip':
@@ -115,13 +131,50 @@
 
             row = {
                 'component': None,
-                'component modified': None,
                 'file path': file_path,
                 'file name': file_name,
                 'archive context': archive_context,
                 'usage(s)': None,
                 'license(s)': None,
-                'match type(s)': "Un-matched/Un-identified",
-                'scan (code location)': un_matched_file.get('scan', {}).get('name', 'unknown')
+                'source': 'customers source',
+                'origin(s)': None,
+                'origin_id(s)': None,
+                'copyright': None,
+                'match type(s)': 'Not matched (un-identified)',
+                'codelocation': None
             }
-            writer.writerow(row)
+            writer.writerow(row)
+
+    if args.string_search:
+        for codelocation, codelocation_info in origin_info.get("license_search_results", {}).items():
+            for scan in codelocation_info.get("scans", []):
+                for file_bom_entry in scan.get("file_bom_entries", []):
+                    row = {
+                        'component': None,
+                        'file path': file_bom_entry.get('uri'),
+                        'file name': file_bom_entry.get('name'),
+                        'archive context': file_bom_entry.get('compositePath', {}).get('archiveContext'),
+                        'usage(s)': None,
+                        'license(s)': None,
+                        'source': 'customers source',
+                        'origin(s)': None,
+                        'origin_id(s)': None,
+                        'copyright': None,
+                        'match type(s)': 'License Search',
+                        'codelocation': codelocation
+                    }
+                    writer.writerow(row)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/examples/get_bom_component_origin_info.py b/examples/get_bom_component_origin_info.py
@@ -8,17 +8,21 @@
 from blackduck.HubRestApi import HubInstance
 
 
-parser = argparse.ArgumentParser("Retreive BOM component license information for the given project and version")
+parser = argparse.ArgumentParser("Retrieve BOM component origin information, and additional information, for the given project and version")
 parser.add_argument("project_name")
 parser.add_argument("version")
-parser.add_argument("-l", "--deep_license_info", action="store_true")
-parser.add_argument("-c", "--copyright_info", action="store_true")
-parser.add_argument("-m", "--matched_files", action="store_true")
-parser.add_argument("-u", "--un_matched_files", action="store_true")
+parser.add_argument("-l", "--deep_license_info", action="store_true", help="Include deep license (aka embedded license) information from the Black Duck KB for (KB) components in the BOM")
+parser.add_argument("-c", "--copyright_info", action="store_true", help="Include copyright info from the Black Duck KB for (KB) components in the BOM")
+parser.add_argument("-m", "--matched_files", action="store_true", help="Include a list of the matched (aka identified) files and the components they belong to.")
+parser.add_argument("-u", "--un_matched_files", action="store_true", help="Include a list of un-matched (un-identified) files")
+parser.add_argument("-s", "--string_search", action="store_true", help="Include any licenses found via string search (i.e. --detect.blackduck.signature.scanner.license.search=true)")
+parser.add_argument("-a", "--all", action="store_true", help="Shortcut for including everything (i.e. all of it)")
 
 
 args = parser.parse_args()
 
+if args.all:
+    args.deep_license_info = args.copyright_info = args.matched_files = args.un_matched_files = args.string_search = True
 
 hub = HubInstance()
 
@@ -120,4 +124,63 @@
 			'un_matched_files': un_matched_files
 		})
 
+if args.string_search:
+    #
+    # Gathering the information on additional licenses/files identified using the
+    # string search features requires we iterate over all the file system scans
+    # to retrieve any additional licenses/files "discovered" (i.e. in GUI they are
+    # displayed as "discoveries")
+    #
+    version_id = version['_meta']['href'].split("/")[-1]
+    codelocations_url = hub.get_link(version, "codelocations")
+    codelocations = hub.execute_get(codelocations_url).json().get('items', [])
+
+    # all the results will be stored here using the code location
+    # name as the key and the value will include all the licenses, files
+    # found to have license info in them
+    #
+    license_search_results = {}
+
+    for codeloc in codelocations:
+        license_search_results.update({
+                codeloc['name']: {
+                    'codeloc_info': codeloc
+                }
+            })
+
+        codeloc_id = codeloc['_meta']['href'].split("/")[-1]
+        scans_url = hub.get_link(codeloc, "scans")
+        scans = hub.execute_get(scans_url).json().get('items', [])
+        latest_scan_url = hub.get_link(codeloc, "latest-scan")
+        latest_scan = hub.execute_get(latest_scan_url).json()
+
+        all_scans = []
+
+        # TODO: Do I need to trim to the latest FS scan? Leaving it as list for now
+        fs_scans = list(filter(lambda s: s['scanType'] == "FS", scans))
+
+        for fs_scan in fs_scans:
+            scan_id = fs_scan['_meta']['href'].split("/")[-1]
+            lic_summary_url = version['_meta']['href'] + f"/scans/{scan_id}/license-search-summary"
+            custom_headers = {'Accept':'*/*'}
+            lic_search_summary = hub.execute_get(lic_summary_url, custom_headers=custom_headers).json().get('items', [])
+
+            file_bom_entries = []
+            for license_d in lic_search_summary:
+                logging.debug(f"Getting {license_d['fileCount']} files where {license_d['licenseName']} was referenced.")
+                file_bom_entries_url = hub.get_apibase() + f"/internal/releases/{version_id}/scans/{scan_id}/nodes/0/file-bom-entries?offset=0&limit=100&sort=&allDescendants=true&filter=stringSearchLicense:{license_d['vsl']}"
+                file_bom_entries.extend(hub.execute_get(file_bom_entries_url).json().get('items', []))
+            all_scans.append({
+                    'scan_info': fs_scan,
+                    'lic_search_summary': lic_search_summary,
+                    'file_bom_entries': file_bom_entries
+                })
+        license_search_results[codeloc['name']].update({
+                'scans': all_scans
+            })
+
+    all_origin_info.update({
+            'license_search_results': license_search_results
+        })
+
 print(json.dumps(all_origin_info))
diff --git a/examples/get_license_search_results.py b/examples/get_license_search_results.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+
+import argparse
+import json
+import logging
+import sys
+
+from blackduck.HubRestApi import HubInstance
+
+
+parser = argparse.ArgumentParser("Retrieve license search results, i.e. --detect.blackduck.signature.scanner.license.search=true")
+parser.add_argument("project_name")
+parser.add_argument("version")
+
+args = parser.parse_args()
+
+logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', stream=sys.stderr, level=logging.DEBUG)
+logging.getLogger("requests").setLevel(logging.WARNING)
+logging.getLogger("urllib3").setLevel(logging.WARNING)
+
+hub = HubInstance()
+
+project = hub.get_project_by_name(args.project_name)
+version = hub.get_version_by_name(project, args.version)
+
+version_id = version['_meta']['href'].split("/")[-1]
+
+codelocations_url = hub.get_link(version, "codelocations")
+codelocations = hub.execute_get(codelocations_url).json().get('items', [])
+
+# all the results will be stored here using the code location
+# name as the key and the value will include all the licenses, files
+# found to have license info in them
+#
+license_search_results = {}
+
+for codeloc in codelocations:
+	license_search_results.update({
+			codeloc['name']: {
+				'codeloc_info': codeloc
+			}
+		})
+
+	codeloc_id = codeloc['_meta']['href'].split("/")[-1]
+	scans_url = hub.get_link(codeloc, "scans")
+	scans = hub.execute_get(scans_url).json().get('items', [])
+	latest_scan_url = hub.get_link(codeloc, "latest-scan")
+	latest_scan = hub.execute_get(latest_scan_url).json()
+
+	all_scans = []
+
+	# TODO: Do I need to trim to the latest FS scan? Leaving it as list for now
+	fs_scans = list(filter(lambda s: s['scanType'] == "FS", scans))
+
+	for fs_scan in fs_scans:
+		scan_id = fs_scan['_meta']['href'].split("/")[-1]
+		lic_summary_url = version['_meta']['href'] + f"/scans/{scan_id}/license-search-summary"
+		custom_headers = {'Accept':'*/*'}
+		lic_search_summary = hub.execute_get(lic_summary_url, custom_headers=custom_headers).json().get('items', [])
+
+		file_bom_entries = []
+		for license_d in lic_search_summary:
+			logging.debug(f"Getting {license_d['fileCount']} files where {license_d['licenseName']} was referenced.")
+			file_bom_entries_url = hub.get_apibase() + f"/internal/releases/{version_id}/scans/{scan_id}/nodes/0/file-bom-entries?offset=0&limit=100&sort=&allDescendants=true&filter=stringSearchLicense:{license_d['vsl']}"
+			file_bom_entries.extend(hub.execute_get(file_bom_entries_url).json().get('items', []))
+		all_scans.append({
+				'scan_info': fs_scan,
+				'lic_search_summary': lic_search_summary,
+				'file_bom_entries': file_bom_entries
+			})
+	license_search_results[codeloc['name']].update({
+			'scans': all_scans
+		})
+
+print(json.dumps(license_search_results))
+
+
+
+
+
+