4545
4646def load_scan (json_input ):
4747 """
48- Return a list of scan results loaded from a json_input, either in ScanCode
49- standard JSON format or the data.json html-app format.
48+ Return a list of scan results loaded from a json_input, either in
49+ ScanCode standard JSON format or the data.json html-app format.
5050 """
5151 with codecs .open (json_input , 'rb' , encoding = 'utf-8' ) as jsonf :
5252 scan = jsonf .read ()
5353
54- # strip the leading data padding if any (used in the html-app JSON)
54+ # strip the leading JSON data padding if any (used in the html-app JSON)
5555 html_app_lead = 'data='
5656 is_html_app_json = scan .startswith (html_app_lead )
5757 if is_html_app_json :
@@ -78,28 +78,28 @@ def json_scan_to_csv(json_input, csv_output, prefix_path=False):
7878 ('email' , []),
7979 ('url' , []),
8080 ('package' , []),
81- ])
81+ ])
8282
83- if prefix_path :
84- rows = list (flatten_scan (scan_results , headers , True ))
85- else :
86- rows = list (flatten_scan (scan_results , headers ))
83+ # note: FIXME: headers are collected as a side effect and this is not great
84+ rows = list (flatten_scan (scan_results , headers , prefix_path ))
8785
8886 ordered_headers = []
8987 for key_group in headers .values ():
9088 ordered_headers .extend (key_group )
9189
9290 w = unicodecsv .DictWriter (csv_output , ordered_headers )
9391 w .writeheader ()
92+
9493 for r in rows :
9594 w .writerow (r )
9695
9796
9897def flatten_scan (scan , headers , prefix_path = False ):
9998 """
100- Yield ordered dictionaries of key/values flattening the data and
101- keying always by path, given a ScanCode scan results list.
102- Update the headers mapping list with seen keys as a side effect.
99+ Yield ordered dictionaries of key/values flattening the sequence
100+ data in a single line-separated value and keying always by path,
101+ given a ScanCode `scan` results list. Update the `headers` mapping
102+ sequences with seen keys as a side effect.
103103 """
104104 seen = set ()
105105
@@ -113,34 +113,44 @@ def collect_keys(mapping, key_group):
113113 path = scanned_file .pop ('path' )
114114
115115 # always use a root slash
116- path = path if path .startswith ('/' ) else '/' + path
116+ if not path .startswith ('/' ):
117+ path = '/' + path
117118
118- # alway use a trailing slash for directories
119- if scanned_file .get ('type' , '' ) == 'directory' :
120- if not path .endswith ('/' ):
121- path = path + '/'
119+ # use a trailing slash for directories
120+ if scanned_file .get ('type' ) == 'directory' and not path .endswith ('/' ):
121+ path += '/'
122122
123123 if prefix_path :
124- # Create a root directory if option is selected
125124 path = '/code' + path
126125
127126 errors = scanned_file .pop ('scan_errors' , [])
128127
129- file_info = OrderedDict ()
130- file_info [ 'Resource' ] = path
131- # info are NOT lists: lists are the actual scans
132- file_info . update ((( k , v ) for k , v in scanned_file . items () if not isinstance (v , list )))
128+ file_info = OrderedDict (Resource = path )
129+ file_info . update ((( k , v ) for k , v in scanned_file . items ()
130+ # FIXME: info are NOT lists: lists are the actual scans
131+ if not isinstance (v , list )))
133132 # Scan errors are joined in a single multi-line value
134133 file_info ['scan_errors' ] = '\n ' .join (errors )
135134 collect_keys (file_info , 'info' )
136135 yield file_info
137136
138137 for licensing in scanned_file .get ('licenses' , []):
139- lic = OrderedDict ()
140- lic ['Resource' ] = path
138+ lic = OrderedDict (Resource = path )
141139 for k , val in licensing .items ():
142- # do not include matched rule details for now.
140+ # do not include matched text for now.
141+ if k == 'matched_text' :
142+ continue
143143 if k == 'matched_rule' :
144+ for mrk , mrv in val .items ():
145+ mrk = 'matched_rule__' + mrk
146+ if mrk == 'license_choice' :
147+ mrv = 'y' if mrv else ''
148+ if mrk == 'licenses' :
149+ mrv = ' ' .join (mrv )
150+ if mrk in ('match_coverage' , 'rule_relevance' ):
151+ # normalize the string representation of this number
152+ mrv = '{:.2f}' .format (mrv )
153+ lic [mrk ] = mrv
144154 continue
145155
146156 if k == 'score' :
@@ -155,7 +165,7 @@ def collect_keys(mapping, key_group):
155165 collect_keys (lic , 'license' )
156166 yield lic
157167
158- key_to_header_mapping = [
168+ copyright_key_to_column_name = [
159169 ('statements' , 'copyright' ),
160170 ('holders' , 'copyright_holder' ),
161171 ('authors' , 'author' )
@@ -164,26 +174,23 @@ def collect_keys(mapping, key_group):
164174 start_line = copy_info ['start_line' ]
165175 end_line = copy_info ['end_line' ]
166176 # rename some keys to a different column header
167- for key , header in key_to_header_mapping :
177+ for key , header in copyright_key_to_column_name :
168178 for cop in copy_info .get (key , []):
169- inf = OrderedDict ()
170- inf ['Resource' ] = path
179+ inf = OrderedDict (Resource = path )
171180 inf [header ] = cop
172181 inf ['start_line' ] = start_line
173182 inf ['end_line' ] = end_line
174183 collect_keys (inf , 'copyright' )
175184 yield inf
176185
177186 for email in scanned_file .get ('emails' , []):
178- email_info = OrderedDict ()
179- email_info ['Resource' ] = path
187+ email_info = OrderedDict (Resource = path )
180188 email_info .update (email )
181189 collect_keys (email_info , 'email' )
182190 yield email_info
183191
184192 for url in scanned_file .get ('urls' , []):
185- url_info = OrderedDict ()
186- url_info ['Resource' ] = path
193+ url_info = OrderedDict (Resource = path )
187194 url_info .update (url )
188195 collect_keys (url_info , 'url' )
189196 yield url_info
@@ -203,41 +210,45 @@ def collect_keys(mapping, key_group):
203210 }
204211
205212 for package in scanned_file .get ('packages' , []):
206- pack = OrderedDict ()
207- pack ['Resource' ] = path
213+ pack = OrderedDict (Resource = path )
208214 for k , val in package .items ():
209215 # prefix columns with "package__"
210216 nk = 'package__' + k
211217
212- # keep all non-excluded plain string values
213- if k not in excluded_package_columns and not isinstance (val , (list , dict , OrderedDict )):
218+ if k in excluded_package_columns :
219+ continue
220+
221+ # process plain string values
222+ if not isinstance (val , (list , dict , OrderedDict )):
214223 # prefix versions with a v to prevent spreadsheet tools from
215224 # mistaking a version for a number or date.
216225 if k == 'version' and val :
217226 val = 'v ' + val
218227 pack [nk ] = val
219228
220- # FIXME: we only keep for now some of the value lists
221- elif k in ('authors' , 'download_urls' , 'copyrights' , 'asserted_licenses' ):
222- pack [nk ] = ''
223- if val and len (val ):
224- if k == 'authors' :
225- # FIXME: we only keep the first author name for now
226- pack [nk ] = val [0 ]['name' ]
229+ # FIXME: we only keep for now some of the value collections
230+ elif not val or k not in ('authors' , 'download_urls' , 'copyrights' , 'asserted_licenses' ):
231+ continue
232+
233+ pack [nk ] = ''
234+ if k == 'authors' :
235+ # FIXME: we only keep the first author name for now
236+ pack [nk ] = val [0 ]['name' ]
227237
228- if k == 'download_urls' :
229- # FIXME: we only keep the first URL for now
230- pack [nk ] = val [0 ]
238+ elif k == 'download_urls' :
239+ # FIXME: we only keep the first URL for now
240+ pack [nk ] = val [0 ]
231241
232- if k == 'copyrights' :
233- # All copyright statements are joined in a single multiline value
234- pack [nk ] = '\n ' .join (val )
242+ elif k == 'copyrights' :
243+ # All copyright statements are joined in a single multiline value
244+ pack [nk ] = '\n ' .join (val )
235245
236- if k == 'asserted_licenses' :
237- # All licenses are joined in a single multi-line value
238- licenses = [license_info .get ('license' ) for license_info in val ]
239- licenses = [lic for lic in licenses if lic ]
240- pack [nk ] = '\n ' .join (licenses )
246+ elif k == 'asserted_licenses' :
247+ # FIXME: we only keep some license data for now
248+ # All licenses are joined in a single multi-line value
249+ licenses = [license_info .get ('license' ) for license_info in val ]
250+ licenses = [lic for lic in licenses if lic ]
251+ pack [nk ] = '\n ' .join (licenses )
241252
242253 collect_keys (pack , 'package' )
243254 yield pack
@@ -247,7 +258,7 @@ def collect_keys(mapping, key_group):
247258@click .argument ('json_input' , type = click .Path (exists = True , readable = True ))
248259@click .argument ('csv_output' , type = click .File ('wb' , lazy = False ))
249260@click .help_option ('-h' , '--help' )
250- @click .option ('--prefix_path' , is_flag = True , default = False , help = 'Prefix a root directory to resource paths' )
261+ @click .option ('--prefix_path' , is_flag = True , default = False , help = 'Add a "/code" directory prefix to all paths. ' )
251262def cli (json_input , csv_output , prefix_path = False ):
252263 """
253264 Convert a ScanCode JSON scan file to a nexb-toolkit-like CSV.
0 commit comments