Skip to content

Commit 1fea769

Browse files
committed
Add match license rules details to CSV. Fix #667
* also refactored tests for simplicity and other minor refinements Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent 62550c1 commit 1fea769

File tree

10 files changed

+7309
-5925
lines changed

10 files changed

+7309
-5925
lines changed

etc/scripts/json2csv.py

Lines changed: 66 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,13 @@
4545

4646
def load_scan(json_input):
4747
"""
48-
Return a list of scan results loaded from a json_input, either in ScanCode
49-
standard JSON format or the data.json html-app format.
48+
Return a list of scan results loaded from a json_input, either in
49+
ScanCode standard JSON format or the data.json html-app format.
5050
"""
5151
with codecs.open(json_input, 'rb', encoding='utf-8') as jsonf:
5252
scan = jsonf.read()
5353

54-
# strip the leading data padding if any (used in the html-app JSON)
54+
# strip the leading JSON data padding if any (used in the html-app JSON)
5555
html_app_lead = 'data='
5656
is_html_app_json = scan.startswith(html_app_lead)
5757
if is_html_app_json:
@@ -78,28 +78,28 @@ def json_scan_to_csv(json_input, csv_output, prefix_path=False):
7878
('email', []),
7979
('url', []),
8080
('package', []),
81-
])
81+
])
8282

83-
if prefix_path:
84-
rows = list(flatten_scan(scan_results, headers, True))
85-
else:
86-
rows = list(flatten_scan(scan_results, headers))
83+
# note: FIXME: headers are collected as a side effect and this is not great
84+
rows = list(flatten_scan(scan_results, headers, prefix_path))
8785

8886
ordered_headers = []
8987
for key_group in headers.values():
9088
ordered_headers.extend(key_group)
9189

9290
w = unicodecsv.DictWriter(csv_output, ordered_headers)
9391
w.writeheader()
92+
9493
for r in rows:
9594
w.writerow(r)
9695

9796

9897
def flatten_scan(scan, headers, prefix_path=False):
9998
"""
100-
Yield ordered dictionaries of key/values flattening the data and
101-
keying always by path, given a ScanCode scan results list.
102-
Update the headers mapping list with seen keys as a side effect.
99+
Yield ordered dictionaries of key/values flattening the sequence
100+
data in a single line-separated value and keying always by path,
101+
given a ScanCode `scan` results list. Update the `headers` mapping
102+
sequences with seen keys as a side effect.
103103
"""
104104
seen = set()
105105

@@ -113,34 +113,44 @@ def collect_keys(mapping, key_group):
113113
path = scanned_file.pop('path')
114114

115115
# always use a root slash
116-
path = path if path.startswith('/') else '/' + path
116+
if not path.startswith('/'):
117+
path = '/' + path
117118

118-
# always use a trailing slash for directories
119-
if scanned_file.get('type', '') == 'directory':
120-
if not path.endswith('/'):
121-
path = path + '/'
119+
# use a trailing slash for directories
120+
if scanned_file.get('type') == 'directory' and not path.endswith('/'):
121+
path += '/'
122122

123123
if prefix_path:
124-
# Create a root directory if option is selected
125124
path = '/code' + path
126125

127126
errors = scanned_file.pop('scan_errors', [])
128127

129-
file_info = OrderedDict()
130-
file_info['Resource'] = path
131-
# info are NOT lists: lists are the actual scans
132-
file_info.update(((k, v) for k, v in scanned_file.items() if not isinstance(v, list)))
128+
file_info = OrderedDict(Resource=path)
129+
file_info.update(((k, v) for k, v in scanned_file.items()
130+
# FIXME: info are NOT lists: lists are the actual scans
131+
if not isinstance(v, list)))
133132
# Scan errors are joined in a single multi-line value
134133
file_info['scan_errors'] = '\n'.join(errors)
135134
collect_keys(file_info, 'info')
136135
yield file_info
137136

138137
for licensing in scanned_file.get('licenses', []):
139-
lic = OrderedDict()
140-
lic['Resource'] = path
138+
lic = OrderedDict(Resource=path)
141139
for k, val in licensing.items():
142-
# do not include matched rule details for now.
140+
# do not include matched text for now.
141+
if k == 'matched_text':
142+
continue
143143
if k == 'matched_rule':
144+
for mrk, mrv in val.items():
145+
mrk = 'matched_rule__' + mrk
146+
if mrk == 'license_choice':
147+
mrv = 'y' if mrv else ''
148+
if mrk == 'licenses':
149+
mrv = ' '.join(mrv)
150+
if mrk in ('match_coverage', 'rule_relevance'):
151+
# normalize the string representation of this number
152+
mrv = '{:.2f}'.format(mrv)
153+
lic[mrk] = mrv
144154
continue
145155

146156
if k == 'score':
@@ -155,7 +165,7 @@ def collect_keys(mapping, key_group):
155165
collect_keys(lic, 'license')
156166
yield lic
157167

158-
key_to_header_mapping = [
168+
copyright_key_to_column_name = [
159169
('statements', 'copyright'),
160170
('holders', 'copyright_holder'),
161171
('authors', 'author')
@@ -164,26 +174,23 @@ def collect_keys(mapping, key_group):
164174
start_line = copy_info['start_line']
165175
end_line = copy_info['end_line']
166176
# rename some keys to a different column header
167-
for key, header in key_to_header_mapping:
177+
for key, header in copyright_key_to_column_name:
168178
for cop in copy_info.get(key, []):
169-
inf = OrderedDict()
170-
inf['Resource'] = path
179+
inf = OrderedDict(Resource=path)
171180
inf[header] = cop
172181
inf['start_line'] = start_line
173182
inf['end_line'] = end_line
174183
collect_keys(inf, 'copyright')
175184
yield inf
176185

177186
for email in scanned_file.get('emails', []):
178-
email_info = OrderedDict()
179-
email_info['Resource'] = path
187+
email_info = OrderedDict(Resource=path)
180188
email_info.update(email)
181189
collect_keys(email_info, 'email')
182190
yield email_info
183191

184192
for url in scanned_file.get('urls', []):
185-
url_info = OrderedDict()
186-
url_info['Resource'] = path
193+
url_info = OrderedDict(Resource=path)
187194
url_info.update(url)
188195
collect_keys(url_info, 'url')
189196
yield url_info
@@ -203,41 +210,45 @@ def collect_keys(mapping, key_group):
203210
}
204211

205212
for package in scanned_file.get('packages', []):
206-
pack = OrderedDict()
207-
pack['Resource'] = path
213+
pack = OrderedDict(Resource=path)
208214
for k, val in package.items():
209215
# prefix columns with "package__"
210216
nk = 'package__' + k
211217

212-
# keep all non-excluded plain string values
213-
if k not in excluded_package_columns and not isinstance(val, (list, dict, OrderedDict)):
218+
if k in excluded_package_columns:
219+
continue
220+
221+
# process plain string values
222+
if not isinstance(val, (list, dict, OrderedDict)):
214223
# prefix versions with a v to prevent spreadsheet tools from mistaking
215224
# a version for a number or date.
216225
if k == 'version' and val:
217226
val = 'v ' + val
218227
pack[nk] = val
219228

220-
# FIXME: we only keep for now some of the value lists
221-
elif k in ('authors', 'download_urls', 'copyrights', 'asserted_licenses'):
222-
pack[nk] = ''
223-
if val and len(val):
224-
if k == 'authors':
225-
# FIXME: we only keep the first author name for now
226-
pack[nk] = val[0]['name']
229+
# FIXME: we only keep for now some of the value collections
230+
elif not val or k not in ('authors', 'download_urls', 'copyrights', 'asserted_licenses'):
231+
continue
232+
233+
pack[nk] = ''
234+
if k == 'authors':
235+
# FIXME: we only keep the first author name for now
236+
pack[nk] = val[0]['name']
227237

228-
if k == 'download_urls':
229-
# FIXME: we only keep the first URL for now
230-
pack[nk] = val[0]
238+
elif k == 'download_urls':
239+
# FIXME: we only keep the first URL for now
240+
pack[nk] = val[0]
231241

232-
if k == 'copyrights':
233-
# All copyright statements are joined in a single multiline value
234-
pack[nk] = '\n'.join(val)
242+
elif k == 'copyrights':
243+
# All copyright statements are joined in a single multiline value
244+
pack[nk] = '\n'.join(val)
235245

236-
if k == 'asserted_licenses':
237-
# All licenses are joined in a single multi-line value
238-
licenses = [license_info.get('license') for license_info in val]
239-
licenses = [lic for lic in licenses if lic]
240-
pack[nk] = '\n'.join(licenses)
246+
elif k == 'asserted_licenses':
247+
# FIXME: we only keep some license data for now
248+
# All licenses are joined in a single multi-line value
249+
licenses = [license_info.get('license') for license_info in val]
250+
licenses = [lic for lic in licenses if lic]
251+
pack[nk] = '\n'.join(licenses)
241252

242253
collect_keys(pack, 'package')
243254
yield pack
@@ -247,7 +258,7 @@ def collect_keys(mapping, key_group):
247258
@click.argument('json_input', type=click.Path(exists=True, readable=True))
248259
@click.argument('csv_output', type=click.File('wb', lazy=False))
249260
@click.help_option('-h', '--help')
250-
@click.option('--prefix_path', is_flag=True, default=False, help='Prefix a root directory to resource paths')
261+
@click.option('--prefix_path', is_flag=True, default=False, help='Add a "/code" directory prefix to all paths.')
251262
def cli(json_input, csv_output, prefix_path=False):
252263
"""
253264
Convert a ScanCode JSON scan file to a nexb-toolkit-like CSV.

0 commit comments

Comments
 (0)