@@ -49,12 +49,12 @@ def transform_csv_to_csv(location, output, transformer):
 
     rows = read_csv_rows(location)
 
-    column_names, data, errors = transform_csv(rows, transformer)
+    field_names, data, errors = transform_csv(rows, transformer)
 
     if errors:
         return errors
     else:
-        write_csv(output, data, column_names)
+        write_csv(output, data, field_names)
         return []
 
 def transform_json_to_json(location, output, transformer):
@@ -82,36 +82,36 @@ def transform_csv(rows, transformer):
     Read a list of lists of CSV-like data `rows` and apply transformations
     using the `transformer` Transformer.
     Return a tuple of:
-    ([column names...], [transformed ordered dict...], [Error objects...])
+    ([field names...], [transformed ordered dict...], [Error objects...])
     """
 
     if not transformer:
         return rows
 
     errors = []
     rows = iter(rows)
-    column_names = next(rows)
-    column_names = transformer.clean_columns(column_names)
+    field_names = next(rows)
+    field_names = transformer.clean_fields(field_names)
 
-    dupes = check_duplicate_columns(column_names)
+    dupes = check_duplicate_fields(field_names)
 
     if dupes:
-        msg = 'Duplicated column name: {name}'
+        msg = 'Duplicated field name: {name}'
         errors.extend(Error(CRITICAL, msg.format(name=name)) for name in dupes)
-        return column_names, [], errors
+        return field_names, [], errors
 
-    column_names = transformer.apply_renamings(column_names)
+    field_names = transformer.apply_renamings(field_names)
 
-    # convert to dicts using the renamed columns
-    data = [OrderedDict(zip_longest(column_names, row)) for row in rows]
+    # convert to dicts using the renamed fields
+    data = [OrderedDict(zip_longest(field_names, row)) for row in rows]
 
-    if transformer.column_filters:
-        data = list(transformer.filter_columns(data))
-        column_names = [c for c in column_names if c in transformer.column_filters]
+    if transformer.field_filters:
+        data = list(transformer.filter_fields(data))
+        field_names = [c for c in field_names if c in transformer.field_filters]
 
-    errors = transformer.check_required_columns(data)
+    errors = transformer.check_required_fields(data)
 
-    return column_names, data, errors
+    return field_names, data, errors
 
 
 def transform_json(data, transformer):
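For reference, a minimal sketch of what the renamed transform_csv produces on a small input. The Transformer construction and all values here are illustrative, not part of the change, and the errors result also depends on the toolkit's essential fields set in __attrs_post_init__():

    rows = [
        ['Directory/Location', 'Name', 'Version'],
        ['zlib/', 'zlib', '1.2.11'],
    ]
    transformer = Transformer(
        field_renamings={'directory/location': 'about_resource'},
        field_filters=['about_resource', 'name'],
    )
    field_names, data, errors = transform_csv(rows, transformer)
    # field_names -> ['about_resource', 'name']
    # data -> [OrderedDict([('about_resource', 'zlib/'), ('name', 'zlib')])]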
@@ -126,7 +126,14 @@ def transform_json(data, transformer):
 
     errors = []
     new_data = []
-    renamings = transformer.column_renamings
+    renamings = transformer.field_renamings
+    # If the JSON is the output of scancode-toolkit, only the entries
+    # under the "files" key carry the per-file data to transform.
+    try:
+        if data['headers'][0]['tool_name'] == 'scancode-toolkit':
+            data = data['files']
+    except (KeyError, IndexError, TypeError):
+        pass
     if isinstance(data, list):
         for item in data:
             element, err = process_json_keys(item, renamings, transformer)
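The new scancode-toolkit branch above only inspects the "headers" and "files" keys; a JSON input it would unwrap looks roughly like this sketch (all other values invented for illustration):

    scancode_output = {
        'headers': [
            {'tool_name': 'scancode-toolkit'},
        ],
        'files': [
            {'path': 'zlib/zlib.h', 'name': 'zlib.h'},
        ],
    }
    # transform_json(scancode_output, transformer) then operates only on
    # the list of per-file records under 'files'.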
@@ -151,12 +158,12 @@ def process_json_keys(data, renamings, transformer):
             o_dict[k] = data[k]
         new_data = [o_dict]
 
-    if transformer.column_filters:
-        new_data = list(transformer.filter_columns(new_data))
+    if transformer.field_filters:
+        new_data = list(transformer.filter_fields(new_data))
     else:
         new_data = list(new_data)
 
-    errors = transformer.check_required_columns(new_data)
+    errors = transformer.check_required_fields(new_data)
     return new_data, errors
 
 
@@ -167,42 +174,42 @@ def process_json_keys(data, renamings, transformer):
 
 The attributes that can be set in a configuration file are:
 
-* column_renamings:
-    An optional map of source CSV column name to target CSV new column name that
-    is used to rename CSV columns.
+* field_renamings:
+    An optional map of source CSV or JSON field name to a new target field name,
+    used to rename fields.
 
-For instance with this configuration the columns "Directory/Location" will be
+For instance with this configuration the fields "Directory/Location" will be
 renamed to "about_resource" and "foo" to "bar":
-    column_renamings:
+    field_renamings:
         'Directory/Location': about_resource
         foo: bar
 
 The renaming is always applied first, before other transforms and checks. All
-other column names referenced below are those that exist AFTER the renamings
-have been applied to the existing column names.
+other field names referenced below are those that exist AFTER the renamings
+have been applied to the existing field names.
 
-* required_columns:
-    An optional list of required column names that must have a value, beyond the
-    standard column names. If a source CSV does not have such a column or a row
-    is missing a value for a required column, an error is reported.
+* required_fields:
+    An optional list of required field names that must have a value, beyond the
+    standard field names. If a source CSV/JSON does not have such a field or a
+    row is missing a value for a required field, an error is reported.
 
-For instance with this configuration an error will be reported if the columns
+For instance with this configuration an error will be reported if the fields
 "name" and "version" are missing or if any row does not have a value set for
-these columns:
-    required_columns:
+these fields:
+    required_fields:
         - name
         - version
 
-* column_filters:
-    An optional list of column names that should be kept in the transformed CSV.
-    If this list is provided, all the columns from the source CSV that should be
-    kept in the target CSV must be listed even if they are standard or required
-    columns. If this list is not provided, all source CSV columns are kept in the
-    transformed target CSV.
+* field_filters:
+    An optional list of field names that should be kept in the transformed
+    CSV/JSON. If this list is provided, all the fields from the source CSV/JSON
+    that should be kept in the target CSV/JSON must be listed even if they are
+    standard or required fields. If this list is not provided, all source
+    CSV/JSON fields are kept in the transformed target CSV/JSON.
 
-For instance with this configuration the target CSV will only contain the "name"
-and "version" columns and no other column:
-    column_filters:
+For instance with this configuration the target CSV/JSON will only contain the
+"name" and "version" fields and no other field:
+    field_filters:
         - name
         - version
 '''
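Putting the three renamed attributes together, a complete configuration file might look like this (values are illustrative, combining the examples above):

    field_renamings:
        'Directory/Location': about_resource
    required_fields:
        - name
        - version
    field_filters:
        - about_resource
        - name
        - version

Such a file would be loaded with Transformer.from_file() (defined below) and applied end to end; the paths are hypothetical:

    transformer = Transformer.from_file('transform.yml')  # hypothetical config path
    errors = transform_csv_to_csv('inventory.csv', 'out.csv', transformer)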
@@ -212,32 +219,32 @@ def process_json_keys(data, renamings, transformer):
 class Transformer(object):
     __doc__ = tranformer_config_help
 
-    column_renamings = attr.attrib(default=attr.Factory(dict))
-    required_columns = attr.attrib(default=attr.Factory(list))
-    column_filters = attr.attrib(default=attr.Factory(list))
+    field_renamings = attr.attrib(default=attr.Factory(dict))
+    required_fields = attr.attrib(default=attr.Factory(list))
+    field_filters = attr.attrib(default=attr.Factory(list))
 
-    # a list of all the standard columns from AboutCode toolkit
-    standard_columns = attr.attrib(default=attr.Factory(list), init=False)
-    # a list of the subset of standard columns that are essential and MUST be
+    # a list of all the standard fields from AboutCode toolkit
+    standard_fields = attr.attrib(default=attr.Factory(list), init=False)
+    # a list of the subset of standard fields that are essential and MUST be
     # present for AboutCode toolkit to work
-    essential_columns = attr.attrib(default=attr.Factory(list), init=False)
+    essential_fields = attr.attrib(default=attr.Factory(list), init=False)
 
     # called by attr after the __init__()
     def __attrs_post_init__(self, *args, **kwargs):
         from attributecode.model import About
         about = About()
-        self.essential_columns = list(about.required_fields)
-        self.standard_columns = [f.name for f in about.all_fields()]
+        self.essential_fields = list(about.required_fields)
+        self.standard_fields = [f.name for f in about.all_fields()]
 
     @classmethod
     def default(cls):
         """
         Return a default Transformer with built-in transforms.
         """
         return cls(
-            column_renamings={},
-            required_columns=[],
-            column_filters=[],
+            field_renamings={},
+            required_fields=[],
+            field_filters=[],
         )
 
     @classmethod
@@ -249,18 +256,18 @@ def from_file(cls, location):
         with io.open(location, encoding='utf-8') as conf:
             data = saneyaml.load(replace_tab_with_spaces(conf.read()))
         return cls(
-            column_renamings=data.get('column_renamings', {}),
-            required_columns=data.get('required_columns', []),
-            column_filters=data.get('column_filters', []),
+            field_renamings=data.get('field_renamings', {}),
+            required_fields=data.get('required_fields', []),
+            field_filters=data.get('field_filters', []),
         )
 
-    def check_required_columns(self, data):
+    def check_required_fields(self, data):
         """
         Return a list of Error objects for a `data` list of ordered dicts where
-        a dict is missing a value for a required column name.
+        a dict is missing a value for a required field name.
         """
         errors = []
-        required = set(self.essential_columns + self.required_columns)
+        required = set(self.essential_fields + self.required_fields)
         if not required:
             return []
 
@@ -270,54 +277,54 @@ def check_required_columns(self, data):
                 continue
 
             missings = ', '.join(missings)
-            msg = 'Row {rn} is missing required values for columns: {missings}'
+            msg = 'Row {rn} is missing required values for fields: {missings}'
             errors.append(Error(CRITICAL, msg.format(**locals())))
         return errors
 
-    def apply_renamings(self, column_names):
+    def apply_renamings(self, field_names):
         """
-        Return a transformed list of `column_names` where columns are renamed
+        Return a transformed list of `field_names` where fields are renamed
         based on this Transformer configuration.
         """
-        renamings = self.column_renamings
+        renamings = self.field_renamings
         if not renamings:
-            return column_names
+            return field_names
         renamings = {n.lower(): rn.lower() for n, rn in renamings.items()}
 
         renamed = []
-        for name in column_names:
+        for name in field_names:
             name = name.lower()
             new_name = renamings.get(name, name)
             renamed.append(new_name)
         return renamed
 
-    def clean_columns(self, column_names):
+    def clean_fields(self, field_names):
         """
-        Apply standard cleanups to a list of column names and return them.
+        Apply standard cleanups to a list of field names and return them.
         """
-        if not column_names:
-            return column_names
-        return [c.strip().lower() for c in column_names]
+        if not field_names:
+            return field_names
+        return [c.strip().lower() for c in field_names]
 
-    def filter_columns(self, data):
+    def filter_fields(self, data):
         """
         Yield transformed dicts from a `data` list of dicts, keeping only
-        columns with a name in the `column_filters` of this Transformer.
-        Return the data unchanged if no `column_filters` exists.
+        fields with a name in the `field_filters` of this Transformer.
+        Return the data unchanged if no `field_filters` exists.
         """
-        column_filters = set(self.clean_columns(self.column_filters))
+        field_filters = set(self.clean_fields(self.field_filters))
         for entry in data:
-            items = ((k, v) for k, v in entry.items() if k in column_filters)
+            items = ((k, v) for k, v in entry.items() if k in field_filters)
             yield OrderedDict(items)
 
 
-def check_duplicate_columns(column_names):
+def check_duplicate_fields(field_names):
     """
-    Check that there are no duplicates in the `column_names` list of column name
-    strings, ignoring case. Return a list of unique duplicated column names.
+    Check that there are no duplicates in the `field_names` list of field name
+    strings, ignoring case. Return a list of unique duplicated field names.
     """
-    counted = Counter(c.lower() for c in column_names)
-    return [column for column, count in sorted(counted.items()) if count > 1]
+    counted = Counter(c.lower() for c in field_names)
+    return [field for field, count in sorted(counted.items()) if count > 1]
 
 
 def read_csv_rows(location):
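As a quick illustration of the renamed helper, duplicate detection is case-insensitive and reports each duplicated name once, in sorted order (values invented):

    check_duplicate_fields(['Name', 'name', 'Version'])
    # -> ['name']
    check_duplicate_fields(['name', 'version'])
    # -> []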
@@ -339,13 +346,13 @@ def read_json(location):
     return data
 
 
-def write_csv(location, data, column_names):  # NOQA
+def write_csv(location, data, field_names):  # NOQA
     """
     Write a CSV file at `location` from the `data` list of ordered dicts using
-    the `column_names`.
+    the `field_names`.
     """
     with io.open(location, 'w', encoding='utf-8', newline='\n') as csvfile:
-        writer = csv.DictWriter(csvfile, fieldnames=column_names)
+        writer = csv.DictWriter(csvfile, fieldnames=field_names)
         writer.writeheader()
         writer.writerows(data)
 
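A hedged sketch of calling the renamed write_csv directly; the output path is illustrative, and each dict should only contain keys listed in field_names, since csv.DictWriter raises ValueError on extra keys by default:

    from collections import OrderedDict
    write_csv(
        'out.csv',  # illustrative output path
        [OrderedDict([('name', 'zlib'), ('version', '1.2.11')])],
        ['name', 'version'],
    )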