Commit dd5eae9

Merge pull request #419 from srthkdb/417_json_transform
417, 418: modified transform to work with json from scancode and renamed configuration key for transform
2 parents: 295e825 + 68b82cf

8 files changed: +3166 additions, −121 deletions

REFERENCE.rst (3 additions, 3 deletions)

@@ -324,7 +324,7 @@ Options
     Show configuration file format help and exit.
     This option will print out examples of the the YAML configuration file.

-    Keys configuration are: `column_renamings`, `required_columns` and `column_filters`
+    Keys configuration are: `field_renamings`, `required_fields` and `field_filters`

     $ about transform --help-format

@@ -335,5 +335,5 @@ Options

 Special Notes
 =============
-When using the `column_filters` configuration, all the standard required columns
-(`about_resource` and `name`) and the user defined `required_columns` need to be included.
+When using the `field_filters` configuration, all the standard required columns
+(`about_resource` and `name`) and the user defined `required_fields` need to be included.
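As a usage sketch of the renamed keys, a transform run points the command at a YAML configuration file. The -c/--configuration option name and the file names below are assumptions for illustration, not part of this diff:

    $ about transform -c transform-config.yml input.csv transformed.csv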

docs/UsingAboutCodetoDocumentYourSoftwareAssets.md (6 additions, 6 deletions)

@@ -245,22 +245,22 @@ A transform configuration file is used to describe which transformations and val

 The attributes that can be set in a configuration file are:

-* column_renamings:
+* field_renamings:
 An optional map of source CSV column name to target CSV new column name that
 is used to rename CSV columns.

 For instance with this configuration the columns "Directory/Location" will be
 renamed to "about_resource" and "foo" to "bar":

-column_renamings:
+field_renamings:
     'Directory/Location' : about_resource
     foo : bar

 The renaming is always applied first before other transforms and checks. All
 other column names referenced below are these that exist AFTER the renaming
 have been applied to the existing column names.

-* required_columns:
+* required_fields:
 An optional list of required column names that must have a value, beyond the
 standard columns names. If a source CSV does not have such a column or a row is
 missing a value for a required column, an error is reported.

@@ -269,11 +269,11 @@ For instance with this configuration an error will be reported if the columns
 "name" and "version" are missing or if any row does not have a value set for
 these columns:

-required_columns:
+required_fields:
     - name
     - version

-* column_filters:
+* field_filters:
 An optional list of column names that should be kept in the transformed CSV. If
 this list is provided, all the columns from the source CSV that should be kept
 in the target CSV must be listed be even if they are standard or required

@@ -283,7 +283,7 @@ transformed target CSV.
 For instance with this configuration the target CSV will only contains the "name"
 and "version" columns and no other column:

-column_filters:
+field_filters:
     - name
     - version

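Putting the three renamed keys together, a complete configuration file would look like this sketch (assembled from the examples above; the values are illustrative). Note that per the Special Notes in REFERENCE.rst, `field_filters` must include `about_resource`, `name` and any user-defined `required_fields`:

    field_renamings:
        'Directory/Location' : about_resource
        foo : bar
    required_fields:
        - version
    field_filters:
        - about_resource
        - name
        - version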

src/attributecode/transform.py (92 additions, 85 deletions)

@@ -49,12 +49,12 @@ def transform_csv_to_csv(location, output, transformer):

     rows = read_csv_rows(location)

-    column_names, data, errors = transform_csv(rows, transformer)
+    field_names, data, errors = transform_csv(rows, transformer)

     if errors:
         return errors
     else:
-        write_csv(output, data, column_names)
+        write_csv(output, data, field_names)
         return []

 def transform_json_to_json(location, output, transformer):
@@ -82,36 +82,36 @@ def transform_csv(rows, transformer):
     Read a list of list of CSV-like data `rows` and apply transformations using the
     `transformer` Transformer.
     Return a tuple of:
-    ([column names...], [transformed ordered dict...], [Error objects..])
+    ([field names...], [transformed ordered dict...], [Error objects..])
     """

     if not transformer:
         return rows

     errors = []
     rows = iter(rows)
-    column_names = next(rows)
-    column_names = transformer.clean_columns(column_names)
+    field_names = next(rows)
+    field_names = transformer.clean_fields(field_names)

-    dupes = check_duplicate_columns(column_names)
+    dupes = check_duplicate_fields(field_names)

     if dupes:
-        msg = 'Duplicated column name: {name}'
+        msg = 'Duplicated field name: {name}'
         errors.extend(Error(CRITICAL, msg.format(name)) for name in dupes)
-        return column_names, [], errors
+        return field_names, [], errors

-    column_names = transformer.apply_renamings(column_names)
+    field_names = transformer.apply_renamings(field_names)

-    # convert to dicts using the renamed columns
-    data = [OrderedDict(zip_longest(column_names, row)) for row in rows]
+    # convert to dicts using the renamed fields
+    data = [OrderedDict(zip_longest(field_names, row)) for row in rows]

-    if transformer.column_filters:
-        data = list(transformer.filter_columns(data))
-        column_names = [c for c in column_names if c in transformer.column_filters]
+    if transformer.field_filters:
+        data = list(transformer.filter_fields(data))
+        field_names = [c for c in field_names if c in transformer.field_filters]

-    errors = transformer.check_required_columns(data)
+    errors = transformer.check_required_fields(data)

-    return column_names, data, errors
+    return field_names, data, errors


 def transform_json(data, transformer):
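To make the renamed transform_csv() flow concrete, here is a minimal usage sketch against the post-change API. The sample rows and configuration are illustrative, and it assumes About().required_fields is ['about_resource', 'name'] as stated in the REFERENCE.rst Special Notes:

    from attributecode.transform import Transformer, transform_csv

    # a header row plus one data row, as returned by read_csv_rows()
    rows = [
        ['Directory/Location', 'Name', 'Version'],
        ['this/that.c', 'bitarray', '0.8.1'],
    ]
    transformer = Transformer(
        field_renamings={'Directory/Location': 'about_resource'},
        required_fields=['version'],
        field_filters=['about_resource', 'name', 'version'],
    )
    # clean_fields() lowercases the header, apply_renamings() maps the
    # location field to about_resource, and field_filters keeps all three
    field_names, data, errors = transform_csv(rows, transformer)
    # field_names -> ['about_resource', 'name', 'version']; errors -> []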
@@ -126,7 +126,14 @@ def transform_json(data, transformer):

     errors = []
     new_data = []
-    renamings = transformer.column_renamings
+    renamings = transformer.field_renamings
+    #if json is output of scancode-toolkit
+    try:
+        if(data["headers"][0]["tool_name"] == "scancode-toolkit"):
+            #only takes data inside "files"
+            data = data["files"]
+    except:
+        pass
     if isinstance(data, list):
         for item in data:
             element, err = process_json_keys(item, renamings, transformer)
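The new block above unwraps ScanCode output so that only the per-file records are transformed. The same check, written as a standalone helper with a narrower except clause, might look like this sketch (the helper name is hypothetical; the commit inlines the logic in transform_json()):

    def strip_scancode_wrapper(data):
        """
        Return data["files"] when `data` looks like a scancode-toolkit JSON
        document, or return `data` unchanged otherwise.
        """
        try:
            if data["headers"][0]["tool_name"] == "scancode-toolkit":
                return data["files"]
        except (KeyError, IndexError, TypeError):
            # not a scancode document, e.g. a plain list or another mapping
            pass
        return data

Catching only the lookup errors keeps the same fallback intent while avoiding the committed bare except:, which also swallows unrelated failures.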
@@ -151,12 +158,12 @@ def process_json_keys(data, renamings, transformer):
             o_dict[k] = data[k]
     new_data = [o_dict]

-    if transformer.column_filters:
-        new_data = list(transformer.filter_columns(new_data))
+    if transformer.field_filters:
+        new_data = list(transformer.filter_fields(new_data))
     else:
         new_data = list(new_data)

-    errors = transformer.check_required_columns(new_data)
+    errors = transformer.check_required_fields(new_data)
     return new_data, errors


@@ -167,42 +174,42 @@ def process_json_keys(data, renamings, transformer):

 The attributes that can be set in a configuration file are:

-* column_renamings:
-An optional map of source CSV column name to target CSV new column name that
-is used to rename CSV columns.
+* field_renamings:
+An optional map of source CSV or JSON field name to target CSV/JSON new field name that
+is used to rename CSV fields.

-For instance with this configuration the columns "Directory/Location" will be
+For instance with this configuration the fields "Directory/Location" will be
 renamed to "about_resource" and "foo" to "bar":
-column_renamings:
+field_renamings:
     'Directory/Location' : about_resource
     foo : bar

 The renaming is always applied first before other transforms and checks. All
-other column names referenced below are these that exist AFTER the renamings
-have been applied to the existing column names.
+other field names referenced below are these that exist AFTER the renamings
+have been applied to the existing field names.

-* required_columns:
-An optional list of required column names that must have a value, beyond the
-standard columns names. If a source CSV does not have such a column or a row is
-missing a value for a required column, an error is reported.
+* required_fields:
+An optional list of required field names that must have a value, beyond the
+standard fields names. If a source CSV/JSON does not have such a field or a row is
+missing a value for a required field, an error is reported.

-For instance with this configuration an error will be reported if the columns
+For instance with this configuration an error will be reported if the fields
 "name" and "version" are missing or if any row does not have a value set for
-these columns:
-required_columns:
+these fields:
+required_fields:
     - name
     - version

-* column_filters:
-An optional list of column names that should be kept in the transformed CSV. If
-this list is provided, all the columns from the source CSV that should be kept
-in the target CSV must be listed be even if they are standard or required
-columns. If this list is not provided, all source CSV columns are kept in the
-transformed target CSV.
+* field_filters:
+An optional list of field names that should be kept in the transformed CSV/JSON. If
+this list is provided, all the fields from the source CSV/JSON that should be kept
+in the target CSV/JSON must be listed be even if they are standard or required
+fields. If this list is not provided, all source CSV/JSON fields are kept in the
+transformed target CSV/JSON.

-For instance with this configuration the target CSV will only contains the "name"
-and "version" columns and no other column:
-column_filters:
+For instance with this configuration the target CSV/JSON will only contains the "name"
+and "version" fields and no other field:
+field_filters:
     - name
     - version
 '''
@@ -212,32 +219,32 @@ def process_json_keys(data, renamings, transformer):
 class Transformer(object):
     __doc__ = tranformer_config_help

-    column_renamings = attr.attrib(default=attr.Factory(dict))
-    required_columns = attr.attrib(default=attr.Factory(list))
-    column_filters = attr.attrib(default=attr.Factory(list))
+    field_renamings = attr.attrib(default=attr.Factory(dict))
+    required_fields = attr.attrib(default=attr.Factory(list))
+    field_filters = attr.attrib(default=attr.Factory(list))

-    # a list of all the standard columns from AboutCode toolkit
-    standard_columns = attr.attrib(default=attr.Factory(list), init=False)
-    # a list of the subset of standard columns that are essential and MUST be
+    # a list of all the standard fields from AboutCode toolkit
+    standard_fields = attr.attrib(default=attr.Factory(list), init=False)
+    # a list of the subset of standard fields that are essential and MUST be
     # present for AboutCode toolkit to work
-    essential_columns = attr.attrib(default=attr.Factory(list), init=False)
+    essential_fields = attr.attrib(default=attr.Factory(list), init=False)

     # called by attr after the __init__()
     def __attrs_post_init__(self, *args, **kwargs):
         from attributecode.model import About
         about = About()
-        self.essential_columns = list(about.required_fields)
-        self.standard_columns = [f.name for f in about.all_fields()]
+        self.essential_fields = list(about.required_fields)
+        self.standard_fields = [f.name for f in about.all_fields()]

     @classmethod
     def default(cls):
         """
         Return a default Transformer with built-in transforms.
         """
         return cls(
-            column_renamings={},
-            required_columns=[],
-            column_filters=[],
+            field_renamings={},
+            required_fields=[],
+            field_filters=[],
         )

     @classmethod
@@ -249,18 +256,18 @@ def from_file(cls, location):
         with io.open(location, encoding='utf-8') as conf:
             data = saneyaml.load(replace_tab_with_spaces(conf.read()))
         return cls(
-            column_renamings=data.get('column_renamings', {}),
-            required_columns=data.get('required_columns', []),
-            column_filters=data.get('column_filters', []),
+            field_renamings=data.get('field_renamings', {}),
+            required_fields=data.get('required_fields', []),
+            field_filters=data.get('field_filters', []),
         )

-    def check_required_columns(self, data):
+    def check_required_fields(self, data):
         """
         Return a list of Error for a `data` list of ordered dict where a
-        dict is missing a value for a required column name.
+        dict is missing a value for a required field name.
         """
         errors = []
-        required = set(self.essential_columns + self.required_columns)
+        required = set(self.essential_fields + self.required_fields)
         if not required:
             return []

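A round-trip sketch of the renamed from_file() keys, assuming a transform.yml written with the new names (the path and values are illustrative):

    from attributecode.transform import Transformer

    # transform.yml contains:
    #     field_renamings:
    #         'Directory/Location' : about_resource
    #     required_fields:
    #         - version
    transformer = Transformer.from_file('transform.yml')
    assert transformer.field_renamings == {'Directory/Location': 'about_resource'}
    assert transformer.required_fields == ['version']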
@@ -270,54 +277,54 @@ def check_required_columns(self, data):
                 continue

             missings = ', '.join(missings)
-            msg = 'Row {rn} is missing required values for columns: {missings}'
+            msg = 'Row {rn} is missing required values for fields: {missings}'
             errors.append(Error(CRITICAL, msg.format(**locals())))
         return errors

-    def apply_renamings(self, column_names):
+    def apply_renamings(self, field_names):
         """
-        Return a tranformed list of `column_names` where columns are renamed
+        Return a tranformed list of `field_names` where fields are renamed
         based on this Transformer configuration.
         """
-        renamings = self.column_renamings
+        renamings = self.field_renamings
         if not renamings:
-            return column_names
+            return field_names
         renamings = {n.lower(): rn.lower() for n, rn in renamings.items()}

         renamed = []
-        for name in column_names:
+        for name in field_names:
             name = name.lower()
             new_name = renamings.get(name, name)
             renamed.append(new_name)
         return renamed

-    def clean_columns(self, column_names):
+    def clean_fields(self, field_names):
         """
-        Apply standard cleanups to a list of columns and return these.
+        Apply standard cleanups to a list of fields and return these.
         """
-        if not column_names:
-            return column_names
-        return [c.strip().lower() for c in column_names]
+        if not field_names:
+            return field_names
+        return [c.strip().lower() for c in field_names]

-    def filter_columns(self, data):
+    def filter_fields(self, data):
         """
         Yield transformed dicts from a `data` list of dicts keeping only
-        columns with a name in the `column_filters`of this Transformer.
-        Return the data unchanged if no `column_filters` exists.
+        fields with a name in the `field_filters`of this Transformer.
+        Return the data unchanged if no `field_filters` exists.
         """
-        column_filters = set(self.clean_columns(self.column_filters))
+        field_filters = set(self.clean_fields(self.field_filters))
         for entry in data:
-            items = ((k, v) for k, v in entry.items() if k in column_filters)
+            items = ((k, v) for k, v in entry.items() if k in field_filters)
             yield OrderedDict(items)


-def check_duplicate_columns(column_names):
+def check_duplicate_fields(field_names):
     """
-    Check that there are no duplicate in the `column_names` list of column name
-    strings, ignoring case. Return a list of unique duplicated column names.
+    Check that there are no duplicate in the `field_names` list of field name
+    strings, ignoring case. Return a list of unique duplicated field names.
     """
-    counted = Counter(c.lower() for c in column_names)
-    return [column for column, count in sorted(counted.items()) if count > 1]
+    counted = Counter(c.lower() for c in field_names)
+    return [field for field, count in sorted(counted.items()) if count > 1]


 def read_csv_rows(location):
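Note that the renamed duplicate check stays case-insensitive because names are lowercased before counting; for example:

    check_duplicate_fields(['Name', 'name', 'version'])  # -> ['name']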
@@ -339,13 +346,13 @@ def read_json(location):
     return data


-def write_csv(location, data, column_names): # NOQA
+def write_csv(location, data, field_names): # NOQA
     """
     Write a CSV file at `location` the `data` list of ordered dicts using the
-    `column_names`.
+    `field_names`.
     """
     with io.open(location, 'w', encoding='utf-8', newline='\n') as csvfile:
-        writer = csv.DictWriter(csvfile, fieldnames=column_names)
+        writer = csv.DictWriter(csvfile, fieldnames=field_names)
         writer.writeheader()
         writer.writerows(data)

