11#!/usr/bin/python2
22#
3- # Copyright (c) 2016 nexB Inc. and others. All rights reserved.
3+ # Copyright (c) 2017 nexB Inc. and others. All rights reserved.
44# http://nexb.com and https://github.com/nexB/scancode-toolkit/
55# The ScanCode software is licensed under the Apache License version 2.0.
66# Data generated with ScanCode require an acknowledgment.
2323# ScanCode is a free software code scanning tool from nexB Inc. and others.
2424# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
2525
26- from __future__ import print_function , absolute_import
26+ from __future__ import print_function
27+ from __future__ import absolute_import
28+ from __future__ import unicode_literals
2729
30+ import codecs
2831from collections import OrderedDict
2932import json
3033import os
@@ -44,7 +47,7 @@ def load_scan(json_input):
4447 Return a list of scan results loaded from a json_input, either in ScanCode
4548 standard JSON format or the data.json html-app format.
4649 """
47- with open (json_input ) as jsonf :
50+ with codecs . open (json_input , 'rb' , encoding = 'utf-8' ) as jsonf :
4851 scan = jsonf .read ()
4952
5053 # strip the leading data padding if any (used in the html-app JSON)
@@ -68,7 +71,7 @@ def json_scan_to_csv(json_input, csv_output):
6871 scan_results = load_scan (json_input )
6972 rows = list (flatten_scan (scan_results ))
7073 headers = collect_header_keys (rows )
71- with open (csv_output , 'wb' ) as output :
74+ with codecs . open (csv_output , 'wb' , encoding = 'utf-8 ' ) as output :
7275 w = unicodecsv .DictWriter (output , headers )
7376 w .writeheader ()
7477 for r in rows :
@@ -82,91 +85,126 @@ def flatten_scan(scan):
8285 """
8386 for scanned_file in scan :
8487 path = scanned_file ['path' ]
88+
89+ # always use a root slash
8590 path = path if path .startswith ('/' ) else '/' + path
8691
92+ # always use a trailing slash for directories
8793 if scanned_file .get ('type' , '' ) == 'directory' :
8894 if not path .endswith ('/' ):
8995 path = path + '/'
9096
97+ # always create a root directory
9198 path = '/code' + path
9299
93- file_info = OrderedDict (Resource = path )
94- info_details = OrderedDict (((k , v ) for k , v in scanned_file .items () if k != 'path' and not isinstance (v , list )))
100+ file_info = OrderedDict ()
101+ file_info ['Resource' ] = path
102+ info_details = ((k , v ) for k , v in scanned_file .items () if k != 'path' and not isinstance (v , list ))
95103 file_info .update (info_details )
96- # Scan errors are to be joined in a multi-line cell
104+ # Scan errors are joined in a single multi-line value
97105 file_info ['scan_errors' ] = '\n ' .join (scanned_file .get ('scan_errors' , []))
98106 yield file_info
99107
100108 for licensing in scanned_file .get ('licenses' , []):
101- lic = OrderedDict (Resource = path )
109+ lic = OrderedDict ()
110+ lic ['Resource' ] = path
102111 for k , val in licensing .items ():
112+ # do not include matched rule details for now.
103113 if k == 'matched_rule' :
104114 continue
115+
116+ if k == 'score' :
117+ # normalize the string representation of this number
118+ val = '{:.2f}' .format (val )
119+
120+ # line numbers are present in multiple scan types: keep their column names generic (not scan-specific)
121+ # Prefix other columns with license__
105122 if k not in ('start_line' , 'end_line' ,):
106123 k = 'license__' + k
107124 lic [k ] = val
108125 yield lic
109126
127+ key_to_header_mapping = [
128+ ('statements' , 'copyright' ),
129+ ('holders' , 'copyright_holder' ),
130+ ('authors' , 'author' )
131+ ]
110132 for copy_info in scanned_file .get ('copyrights' , []):
111133 start_line = copy_info ['start_line' ]
112134 end_line = copy_info ['end_line' ]
113- for key , header in (('statements' , 'copyright' ), ('holders' , 'copyright_holder' ), ('authors' , 'author' )):
135+ # rename some keys to a different column header
136+ for key , header in key_to_header_mapping :
114137 for cop in copy_info .get (key , []):
115- inf = OrderedDict (Resource = path )
138+ inf = OrderedDict ()
139+ inf ['Resource' ] = path
116140 inf [header ] = cop
117141 inf ['start_line' ] = start_line
118142 inf ['end_line' ] = end_line
119143 yield inf
120144
121145 for email in scanned_file .get ('emails' , []):
122- email_info = OrderedDict (Resource = path )
123- for k , val in email . items ():
124- email_info [ k ] = val
146+ email_info = OrderedDict ()
147+ email_info [ 'Resource' ] = path
148+ email_info . update ( email )
125149 yield email_info
126150
127151 for url in scanned_file .get ('urls' , []):
128- url_info = OrderedDict (Resource = path )
129- for k , val in url . items ():
130- url_info [ k ] = val
152+ url_info = OrderedDict ()
153+ url_info [ 'Resource' ] = path
154+ url_info . update ( url )
131155 yield url_info
132156
133- excluded_columns = ('packaging' ,
134- 'payload_type' ,
135- 'keywords_doc_url' ,
136- 'download_sha1' ,
137- 'download_sha256' ,
138- 'download_md5' ,
139- 'code_view_url' ,
140- 'vcs_tool' ,
141- 'vcs_revision' ,
142- 'license_expression' )
157+ # exclude some columns from the packages for now
158+ excluded_package_columns = {
159+ 'packaging' ,
160+ 'payload_type' ,
161+ 'keywords_doc_url' ,
162+ 'download_sha1' ,
163+ 'download_sha256' ,
164+ 'download_md5' ,
165+ 'code_view_url' ,
166+ 'vcs_tool' ,
167+ 'vcs_revision' ,
168+ 'license_expression'
169+ }
143170
144171 for package in scanned_file .get ('packages' , []):
145- pack = OrderedDict (Resource = path )
172+ pack = OrderedDict ()
173+ pack ['Resource' ] = path
146174 for k , val in package .items ():
175+ # prefix columns with "package__"
147176 nk = 'package__' + k
148- if not isinstance (val , (list , dict , OrderedDict )):
149- if k not in excluded_columns :
150- if k == 'version' and val :
151- val = 'v ' + val
152- pack [nk ] = val
177+
178+ # keep all non-excluded plain string values
179+ if k not in excluded_package_columns and not isinstance (val , (list , dict , OrderedDict )):
180+ # prefix versions with a v to avoid spreadsheet tools mistaking
181+ # a version for a number or date.
182+ if k == 'version' and val :
183+ val = 'v ' + val
184+ pack [nk ] = val
185+
186+ # FIXME: we only keep some of the value lists for now
153187 elif k in ('authors' , 'download_urls' , 'copyrights' , 'asserted_licenses' ):
154- if len (val ) > 0 :
188+ pack [nk ] = ''
189+ if val and len (val ):
155190 if k == 'authors' :
156- # We only want the first author
191+ # FIXME: we only keep the first author name for now
157192 pack [nk ] = val [0 ]['name' ]
193+
158194 if k == 'download_urls' :
159- # We only want the first URL
195+ # FIXME: we only keep the first URL for now
160196 pack [nk ] = val [0 ]
197+
161198 if k == 'copyrights' :
162- # All copyright statements are to be joined in a multi-line cell
199+ # All copyright statements are joined in a single multiline value
163200 pack [nk ] = '\n ' .join (val )
201+
164202 if k == 'asserted_licenses' :
165- # All license names are to be joined in a multi-line cell
166- licenses = [license_info .get ('license' ) or '' for license_info in val ]
203+ # All licenses are joined in a single multi-line value
204+ licenses = [license_info .get ('license' ) for license_info in val ]
205+ licenses = [lic for lic in licenses if lic ]
167206 pack [nk ] = '\n ' .join (licenses )
168- else :
169- pack [nk ] = ''
207+
170208 yield pack
171209
172210
@@ -192,7 +230,7 @@ def cli(json_input, csv_output):
192230
193231 JSON_INPUT is either a ScanCode json format scan or the data.json file from a ScanCode html-app format scan.
194232
195- Resource path will be prefixed with \' \\ code\ ' to provide a common base directory for scanned resources.
233+ Paths will be prefixed with '/ code/ ' to provide a common base directory for scanned resources.
196234 """
197235 json_input = os .path .abspath (os .path .expanduser (json_input ))
198236 csv_output = os .path .abspath (os .path .expanduser (csv_output ))
0 commit comments