Skip to content

Commit 1fea769

Browse files
committed
Add match license rules details to CSV. Fix #667
* also refactored tests for simplicity and other minor refinements Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent 62550c1 commit 1fea769

File tree

10 files changed

+7309
-5925
lines changed

10 files changed

+7309
-5925
lines changed

etc/scripts/json2csv.py

Lines changed: 66 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,13 @@
4545

4646
def load_scan(json_input):
4747
"""
48-
Return a list of scan results loaded from a json_input, either in ScanCode
49-
standard JSON format or the data.json html-app format.
48+
Return a list of scan results loaded from a json_input, either in
49+
ScanCode standard JSON format or the data.json html-app format.
5050
"""
5151
with codecs.open(json_input, 'rb', encoding='utf-8') as jsonf:
5252
scan = jsonf.read()
5353

54-
# strip the leading data padding if any (used in the html-app JSON)
54+
# strip the leading JSON data padding if any (used in the html-app JSON)
5555
html_app_lead = 'data='
5656
is_html_app_json = scan.startswith(html_app_lead)
5757
if is_html_app_json:
@@ -78,28 +78,28 @@ def json_scan_to_csv(json_input, csv_output, prefix_path=False):
7878
('email', []),
7979
('url', []),
8080
('package', []),
81-
])
81+
])
8282

83-
if prefix_path:
84-
rows = list(flatten_scan(scan_results, headers, True))
85-
else:
86-
rows = list(flatten_scan(scan_results, headers))
83+
# note: FIXME: headers are collected as a side effect and this is not great
84+
rows = list(flatten_scan(scan_results, headers, prefix_path))
8785

8886
ordered_headers = []
8987
for key_group in headers.values():
9088
ordered_headers.extend(key_group)
9189

9290
w = unicodecsv.DictWriter(csv_output, ordered_headers)
9391
w.writeheader()
92+
9493
for r in rows:
9594
w.writerow(r)
9695

9796

9897
def flatten_scan(scan, headers, prefix_path=False):
9998
"""
100-
Yield ordered dictionaries of key/values flattening the data and
101-
keying always by path, given a ScanCode scan results list.
102-
Update the headers mapping list with seen keys as a side effect.
99+
Yield ordered dictionaries of key/values flattening the sequence
100+
data in a single line-separated value and keying always by path,
101+
given a ScanCode `scan` results list. Update the `headers` mapping
102+
sequences with seen keys as a side effect.
103103
"""
104104
seen = set()
105105

@@ -113,34 +113,44 @@ def collect_keys(mapping, key_group):
113113
path = scanned_file.pop('path')
114114

115115
# always use a root slash
116-
path = path if path.startswith('/') else '/' + path
116+
if not path.startswith('/'):
117+
path = '/' + path
117118

118-
# always use a trailing slash for directories
119-
if scanned_file.get('type', '') == 'directory':
120-
if not path.endswith('/'):
121-
path = path + '/'
119+
# use a trailing slash for directories
120+
if scanned_file.get('type') == 'directory' and not path.endswith('/'):
121+
path += '/'
122122

123123
if prefix_path:
124-
# Create a root directory if option is selected
125124
path = '/code' + path
126125

127126
errors = scanned_file.pop('scan_errors', [])
128127

129-
file_info = OrderedDict()
130-
file_info['Resource'] = path
131-
# info are NOT lists: lists are the actual scans
132-
file_info.update(((k, v) for k, v in scanned_file.items() if not isinstance(v, list)))
128+
file_info = OrderedDict(Resource=path)
129+
file_info.update(((k, v) for k, v in scanned_file.items()
130+
# FIXME: info are NOT lists: lists are the actual scans
131+
if not isinstance(v, list)))
133132
# Scan errors are joined in a single multi-line value
134133
file_info['scan_errors'] = '\n'.join(errors)
135134
collect_keys(file_info, 'info')
136135
yield file_info
137136

138137
for licensing in scanned_file.get('licenses', []):
139-
lic = OrderedDict()
140-
lic['Resource'] = path
138+
lic = OrderedDict(Resource=path)
141139
for k, val in licensing.items():
142-
# do not include matched rule details for now.
140+
# do not include matched text for now.
141+
if k == 'matched_text':
142+
continue
143143
if k == 'matched_rule':
144+
for mrk, mrv in val.items():
145+
mrk = 'matched_rule__' + mrk
146+
if mrk == 'license_choice':
147+
mrv = 'y' if mrv else ''
148+
if mrk == 'licenses':
149+
mrv = ' '.join(mrv)
150+
if mrk in ('match_coverage', 'rule_relevance'):
151+
# normalize the string representation of this number
152+
mrv = '{:.2f}'.format(mrv)
153+
lic[mrk] = mrv
144154
continue
145155

146156
if k == 'score':
@@ -155,7 +165,7 @@ def collect_keys(mapping, key_group):
155165
collect_keys(lic, 'license')
156166
yield lic
157167

158-
key_to_header_mapping = [
168+
copyright_key_to_column_name = [
159169
('statements', 'copyright'),
160170
('holders', 'copyright_holder'),
161171
('authors', 'author')
@@ -164,26 +174,23 @@ def collect_keys(mapping, key_group):
164174
start_line = copy_info['start_line']
165175
end_line = copy_info['end_line']
166176
# rename some keys to a different column header
167-
for key, header in key_to_header_mapping:
177+
for key, header in copyright_key_to_column_name:
168178
for cop in copy_info.get(key, []):
169-
inf = OrderedDict()
170-
inf['Resource'] = path
179+
inf = OrderedDict(Resource=path)
171180
inf[header] = cop
172181
inf['start_line'] = start_line
173182
inf['end_line'] = end_line
174183
collect_keys(inf, 'copyright')
175184
yield inf
176185

177186
for email in scanned_file.get('emails', []):
178-
email_info = OrderedDict()
179-
email_info['Resource'] = path
187+
email_info = OrderedDict(Resource=path)
180188
email_info.update(email)
181189
collect_keys(email_info, 'email')
182190
yield email_info
183191

184192
for url in scanned_file.get('urls', []):
185-
url_info = OrderedDict()
186-
url_info['Resource'] = path
193+
url_info = OrderedDict(Resource=path)
187194
url_info.update(url)
188195
collect_keys(url_info, 'url')
189196
yield url_info
@@ -203,41 +210,45 @@ def collect_keys(mapping, key_group):
203210
}
204211

205212
for package in scanned_file.get('packages', []):
206-
pack = OrderedDict()
207-
pack['Resource'] = path
213+
pack = OrderedDict(Resource=path)
208214
for k, val in package.items():
209215
# prefix columns with "package__"
210216
nk = 'package__' + k
211217

212-
# keep all non-excluded plain string values
213-
if k not in excluded_package_columns and not isinstance(val, (list, dict, OrderedDict)):
218+
if k in excluded_package_columns:
219+
continue
220+
221+
# process plain string values
222+
if not isinstance(val, (list, dict, OrderedDict)):
214223
# prefix versions with a v to prevent spreadsheet tools from mistaking
215224
# a version for a number or date.
216225
if k == 'version' and val:
217226
val = 'v ' + val
218227
pack[nk] = val
219228

220-
# FIXME: we only keep for now some of the value lists
221-
elif k in ('authors', 'download_urls', 'copyrights', 'asserted_licenses'):
222-
pack[nk] = ''
223-
if val and len(val):
224-
if k == 'authors':
225-
# FIXME: we only keep the first author name for now
226-
pack[nk] = val[0]['name']
229+
# FIXME: we only keep for now some of the value collections
230+
elif not val or k not in ('authors', 'download_urls', 'copyrights', 'asserted_licenses'):
231+
continue
232+
233+
pack[nk] = ''
234+
if k == 'authors':
235+
# FIXME: we only keep the first author name for now
236+
pack[nk] = val[0]['name']
227237

228-
if k == 'download_urls':
229-
# FIXME: we only keep the first URL for now
230-
pack[nk] = val[0]
238+
elif k == 'download_urls':
239+
# FIXME: we only keep the first URL for now
240+
pack[nk] = val[0]
231241

232-
if k == 'copyrights':
233-
# All copyright statements are joined in a single multiline value
234-
pack[nk] = '\n'.join(val)
242+
elif k == 'copyrights':
243+
# All copyright statements are joined in a single multiline value
244+
pack[nk] = '\n'.join(val)
235245

236-
if k == 'asserted_licenses':
237-
# All licenses are joined in a single multi-line value
238-
licenses = [license_info.get('license') for license_info in val]
239-
licenses = [lic for lic in licenses if lic]
240-
pack[nk] = '\n'.join(licenses)
246+
elif k == 'asserted_licenses':
247+
# FIXME: we only keep some license data for now
248+
# All licenses are joined in a single multi-line value
249+
licenses = [license_info.get('license') for license_info in val]
250+
licenses = [lic for lic in licenses if lic]
251+
pack[nk] = '\n'.join(licenses)
241252

242253
collect_keys(pack, 'package')
243254
yield pack
@@ -247,7 +258,7 @@ def collect_keys(mapping, key_group):
247258
@click.argument('json_input', type=click.Path(exists=True, readable=True))
248259
@click.argument('csv_output', type=click.File('wb', lazy=False))
249260
@click.help_option('-h', '--help')
250-
@click.option('--prefix_path', is_flag=True, default=False, help='Prefix a root directory to resource paths')
261+
@click.option('--prefix_path', is_flag=True, default=False, help='Add a "/code" directory prefix to all paths.')
251262
def cli(json_input, csv_output, prefix_path=False):
252263
"""
253264
Convert a ScanCode JSON scan file to a nexb-toolkit-like CSV.

0 commit comments

Comments
 (0)