Skip to content

Commit e29deef

Browse files
committed
Improve json2csv
* use unicode throughout * fix incorrect expectations * improve tests * add comments and other minor improvements Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent 3a1d54a commit e29deef

File tree

7 files changed

+1539
-1476
lines changed

7 files changed

+1539
-1476
lines changed

etc/scripts/json2csv.py

Lines changed: 79 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/python2
22
#
3-
# Copyright (c) 2016 nexB Inc. and others. All rights reserved.
3+
# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
44
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
55
# The ScanCode software is licensed under the Apache License version 2.0.
66
# Data generated with ScanCode require an acknowledgment.
@@ -23,8 +23,11 @@
2323
# ScanCode is a free software code scanning tool from nexB Inc. and others.
2424
# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
2525

26-
from __future__ import print_function, absolute_import
26+
from __future__ import print_function
27+
from __future__ import absolute_import
28+
from __future__ import unicode_literals
2729

30+
import codecs
2831
from collections import OrderedDict
2932
import json
3033
import os
@@ -44,7 +47,7 @@ def load_scan(json_input):
4447
Return a list of scan results loaded from a json_input, either in ScanCode
4548
standard JSON format or the data.json html-app format.
4649
"""
47-
with open(json_input) as jsonf:
50+
with codecs.open(json_input, 'rb', encoding='utf-8') as jsonf:
4851
scan = jsonf.read()
4952

5053
# strip the leading data padding if any (used in the html-app JSON)
@@ -68,7 +71,7 @@ def json_scan_to_csv(json_input, csv_output):
6871
scan_results = load_scan(json_input)
6972
rows = list(flatten_scan(scan_results))
7073
headers = collect_header_keys(rows)
71-
with open(csv_output, 'wb') as output:
74+
with codecs.open(csv_output, 'wb', encoding='utf-8') as output:
7275
w = unicodecsv.DictWriter(output, headers)
7376
w.writeheader()
7477
for r in rows:
@@ -82,91 +85,126 @@ def flatten_scan(scan):
8285
"""
8386
for scanned_file in scan:
8487
path = scanned_file['path']
88+
89+
# always use a root slash
8590
path = path if path.startswith('/') else '/' + path
8691

92+
# always use a trailing slash for directories
8793
if scanned_file.get('type', '') == 'directory':
8894
if not path.endswith('/'):
8995
path = path + '/'
9096

97+
# always create a root directory
9198
path = '/code' + path
9299

93-
file_info = OrderedDict(Resource=path)
94-
info_details = OrderedDict(((k, v) for k, v in scanned_file.items() if k != 'path' and not isinstance(v, list)))
100+
file_info = OrderedDict()
101+
file_info['Resource'] = path
102+
info_details = ((k, v) for k, v in scanned_file.items() if k != 'path' and not isinstance(v, list))
95103
file_info.update(info_details)
96-
# Scan errors are to be joined in a multi-line cell
104+
# Scan errors are joined in a single multi-line value
97105
file_info['scan_errors'] = '\n'.join(scanned_file.get('scan_errors', []))
98106
yield file_info
99107

100108
for licensing in scanned_file.get('licenses', []):
101-
lic = OrderedDict(Resource=path)
109+
lic = OrderedDict()
110+
lic['Resource'] = path
102111
for k, val in licensing.items():
112+
# do not include matched rule details for now.
103113
if k == 'matched_rule':
104114
continue
115+
116+
if k == 'score':
117+
# normalize the string representation of this number
118+
val = '{:.2f}'.format(val)
119+
120+
# lines are present in multiple scans: keep their column name as not scan-specific
121+
# Prefix other columns with license__
105122
if k not in ('start_line', 'end_line',):
106123
k = 'license__' + k
107124
lic[k] = val
108125
yield lic
109126

127+
key_to_header_mapping = [
128+
('statements', 'copyright'),
129+
('holders', 'copyright_holder'),
130+
('authors', 'author')
131+
]
110132
for copy_info in scanned_file.get('copyrights', []):
111133
start_line = copy_info['start_line']
112134
end_line = copy_info['end_line']
113-
for key, header in (('statements', 'copyright'), ('holders', 'copyright_holder'), ('authors', 'author')):
135+
# rename some keys to a different column header
136+
for key, header in key_to_header_mapping:
114137
for cop in copy_info.get(key, []):
115-
inf = OrderedDict(Resource=path)
138+
inf = OrderedDict()
139+
inf['Resource'] = path
116140
inf[header] = cop
117141
inf['start_line'] = start_line
118142
inf['end_line'] = end_line
119143
yield inf
120144

121145
for email in scanned_file.get('emails', []):
122-
email_info = OrderedDict(Resource=path)
123-
for k, val in email.items():
124-
email_info[k] = val
146+
email_info = OrderedDict()
147+
email_info['Resource'] = path
148+
email_info.update(email)
125149
yield email_info
126150

127151
for url in scanned_file.get('urls', []):
128-
url_info = OrderedDict(Resource=path)
129-
for k, val in url.items():
130-
url_info[k] = val
152+
url_info = OrderedDict()
153+
url_info['Resource'] = path
154+
url_info.update(url)
131155
yield url_info
132156

133-
excluded_columns = ('packaging',
134-
'payload_type',
135-
'keywords_doc_url',
136-
'download_sha1',
137-
'download_sha256',
138-
'download_md5',
139-
'code_view_url',
140-
'vcs_tool',
141-
'vcs_revision',
142-
'license_expression')
157+
# exclude some columns from the packages for now
158+
excluded_package_columns = {
159+
'packaging',
160+
'payload_type',
161+
'keywords_doc_url',
162+
'download_sha1',
163+
'download_sha256',
164+
'download_md5',
165+
'code_view_url',
166+
'vcs_tool',
167+
'vcs_revision',
168+
'license_expression'
169+
}
143170

144171
for package in scanned_file.get('packages', []):
145-
pack = OrderedDict(Resource=path)
172+
pack = OrderedDict()
173+
pack['Resource'] = path
146174
for k, val in package.items():
175+
# prefix columns with "package__"
147176
nk = 'package__' + k
148-
if not isinstance(val, (list, dict, OrderedDict)):
149-
if k not in excluded_columns:
150-
if k == 'version' and val:
151-
val = 'v ' + val
152-
pack[nk] = val
177+
178+
# keep all non-excluded plain string values
179+
if k not in excluded_package_columns and not isinstance(val, (list, dict, OrderedDict)):
180+
# prefix versions with a v so that spreadsheet tools do not mistake
181+
# a version for a number or date.
182+
if k == 'version' and val:
183+
val = 'v ' + val
184+
pack[nk] = val
185+
186+
# FIXME: we only keep for now some of the value lists
153187
elif k in ('authors', 'download_urls', 'copyrights', 'asserted_licenses'):
154-
if len(val) > 0:
188+
pack[nk] = ''
189+
if val and len(val):
155190
if k == 'authors':
156-
# We only want the first author
191+
# FIXME: we only keep the first author name for now
157192
pack[nk] = val[0]['name']
193+
158194
if k == 'download_urls':
159-
# We only want the first URL
195+
# FIXME: we only keep the first URL for now
160196
pack[nk] = val[0]
197+
161198
if k == 'copyrights':
162-
# All copyright statements are to be joined in a multi-line cell
199+
# All copyright statements are joined in a single multiline value
163200
pack[nk] = '\n'.join(val)
201+
164202
if k == 'asserted_licenses':
165-
# All license names are to be joined in a multi-line cell
166-
licenses = [license_info.get('license') or '' for license_info in val]
203+
# All licenses are joined in a single multi-line value
204+
licenses = [license_info.get('license') for license_info in val]
205+
licenses = [lic for lic in licenses if lic]
167206
pack[nk] = '\n'.join(licenses)
168-
else:
169-
pack[nk] = ''
207+
170208
yield pack
171209

172210

@@ -192,7 +230,7 @@ def cli(json_input, csv_output):
192230
193231
JSON_INPUT is either a ScanCode json format scan or the data.json file from a ScanCode html-app format scan.
194232
195-
Resource path will be prefixed with \'\\code\' to provide a common base directory for scanned resources.
233+
Paths will be prefixed with '/code/' to provide a common base directory for scanned resources.
196234
"""
197235
json_input = os.path.abspath(os.path.expanduser(json_input))
198236
csv_output = os.path.abspath(os.path.expanduser(csv_output))

etc/scripts/test_json2csv.py

Lines changed: 38 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2016 nexB Inc. and others. All rights reserved.
1+
# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
22
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
33
# The ScanCode software is licensed under the Apache License version 2.0.
44
# Data generated with ScanCode require an acknowledgment.
@@ -21,20 +21,49 @@
2121
# ScanCode is a free software code scanning tool from nexB Inc. and others.
2222
# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
2323

24-
from __future__ import print_function, absolute_import
25-
26-
from __future__ import absolute_import, print_function
24+
from __future__ import print_function
25+
from __future__ import absolute_import
26+
from __future__ import unicode_literals
2727

2828
import codecs
2929
from collections import OrderedDict
3030
import json
3131
import os
3232

33+
import unicodecsv
34+
3335
from commoncode.testcase import FileBasedTesting
3436

3537
import json2csv
3638

3739

40+
def load_csv(location):
41+
"""
42+
Load a CSV file at location and return a tuple of (field names, list of rows as
43+
mappings field->value)
44+
"""
45+
with codecs.open(location, 'rb', encoding='utf-8') as csvin:
46+
reader = unicodecsv.DictReader(csvin)
47+
fields = reader.fieldnames
48+
values = list(reader)
49+
return fields, values
50+
51+
52+
def check_csvs(result_file, expected_file, regen=False):
53+
"""
54+
Load and compare two CSVs.
55+
"""
56+
result_fields, results = load_csv(result_file)
57+
if regen:
58+
import shutil
59+
shutil.copy2(result_file, expected_file)
60+
expected_fields, expected = load_csv(expected_file)
61+
assert expected_fields == result_fields
62+
# then check results line by line for more compact results
63+
for exp, res in zip(expected,results):
64+
assert exp == res
65+
66+
3867
class TestJson2CSV(FileBasedTesting):
3968
test_data_dir = os.path.join(os.path.dirname(__file__), 'testdata')
4069

@@ -43,34 +72,30 @@ def test_scanc_as_list_minimal(self):
4372
scan = json2csv.load_scan(test_json)
4473
result = list(json2csv.flatten_scan(scan))
4574
expected = self.get_test_loc('json2csv/minimal.json-expected')
46-
expected = json.load(open(expected), object_pairs_hook=OrderedDict)
75+
expected = json.load(codecs.open(expected, encoding='utf-8'), object_pairs_hook=OrderedDict)
4776
assert expected == result
4877

4978
def test_scanc_as_list_full(self):
5079
test_json = self.get_test_loc('json2csv/full.json')
5180
scan = json2csv.load_scan(test_json)
5281
result = list(json2csv.flatten_scan(scan))
5382
expected = self.get_test_loc('json2csv/full.json-expected')
54-
expected = json.load(open(expected), object_pairs_hook=OrderedDict)
83+
expected = json.load(codecs.open(expected, encoding='utf-8'), object_pairs_hook=OrderedDict)
5584
assert expected == result
5685

5786
def test_json2csv_minimal(self):
5887
test_json = self.get_test_loc('json2csv/minimal.json')
5988
result_file = self.get_temp_file('.csv')
6089
json2csv.json_scan_to_csv(test_json, result_file)
6190
expected_file = self.get_test_loc('json2csv/minimal.csv')
62-
expected = codecs.open(expected_file, 'rb', encoding='utf-8').read()
63-
result = codecs.open(result_file, 'rb', encoding='utf-8').read()
64-
assert expected == result
91+
check_csvs(result_file, expected_file)
6592

6693
def test_json2csv_full(self):
6794
test_json = self.get_test_loc('json2csv/full.json')
6895
result_file = self.get_temp_file('.csv')
6996
json2csv.json_scan_to_csv(test_json, result_file)
7097
expected_file = self.get_test_loc('json2csv/full.csv')
71-
expected = codecs.open(expected_file, 'rb', encoding='utf-8').read()
72-
result = codecs.open(result_file, 'rb', encoding='utf-8').read()
73-
assert expected == result
98+
check_csvs(result_file, expected_file)
7499

75100
def test_key_ordering(self):
76101
test_json = self.get_test_loc('json2csv/key_order.json')
@@ -162,5 +187,5 @@ def test_can_process_package_license_when_license_value_is_null(self):
162187
scan = json2csv.load_scan(test_json)
163188
result = list(json2csv.flatten_scan(scan))
164189
expected = self.get_test_loc('json2csv/package_license_value_null.json-expected')
165-
expected = json.load(open(expected), object_pairs_hook=OrderedDict)
190+
expected = json.load(codecs.open(expected, encoding='utf-8'), object_pairs_hook=OrderedDict)
166191
assert expected == result

0 commit comments

Comments
 (0)