Skip to content

Commit 69f5dfa

Browse files
authored
Merge pull request #370 from nexB/361-csv-transform-subcommand
Add new CSV transform subcommand #361
2 parents 0d23465 + e230982 commit 69f5dfa

File tree

7 files changed

+478
-0
lines changed

7 files changed

+478
-0
lines changed

src/attributecode/cmd.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,84 @@ def check(location, verbose):
494494
sys.exit(severe_errors_count)
495495

496496

497+
######################################################################
498+
# transform subcommand
499+
######################################################################
500+
501+
def print_config_help(ctx, param, value):
    """
    Click eager-option callback: print the transform configuration file
    format help and exit the command.

    Does nothing when the flag was not provided or when click is doing
    resilient parsing (e.g. during shell completion).
    """
    if value and not ctx.resilient_parsing:
        from attributecode.transform import tranformer_config_help
        click.echo(tranformer_config_help)
        ctx.exit()
507+
508+
509+
@about.command(cls=AboutCommand,
    short_help='Transform a CSV by applying renamings, filters and checks.')
@click.argument('location',
    required=True,
    callback=partial(validate_extensions, extensions=('.csv',)),
    metavar='LOCATION',
    type=click.Path(exists=True, dir_okay=False, readable=True, resolve_path=True))
@click.argument('output',
    required=True,
    callback=partial(validate_extensions, extensions=('.csv',)),
    metavar='OUTPUT',
    type=click.Path(exists=False, dir_okay=False, writable=True, resolve_path=True))
@click.option('-c', '--configuration',
    metavar='FILE',
    type=click.Path(exists=True, dir_okay=False, readable=True, resolve_path=True),
    help='Path to an optional YAML configuration file. See --help-format for '
         'format help.')
@click.option('--help-format',
    is_flag=True, is_eager=True, expose_value=False,
    callback=print_config_help,
    help='Show configuration file format help and exit.')
@click.option('-q', '--quiet',
    is_flag=True,
    help='Do not print error or warning messages.')
@click.option('--verbose',
    is_flag=True,
    help='Show all error and warning messages.')
@click.help_option('-h', '--help')
def transform(location, output, configuration, quiet, verbose):  # NOQA
    """
    Transform the CSV file at LOCATION by applying renamings, filters and checks
    and write a new CSV to OUTPUT.

    LOCATION: Path to a CSV file.

    OUTPUT: Path to CSV inventory file to create.
    """
    # imported lazily to keep the CLI startup fast
    from attributecode.transform import transform_csv_to_csv
    from attributecode.transform import Transformer

    if not quiet:
        print_version()
        click.echo('Transforming CSV...')

    if not configuration:
        transformer = Transformer.default()
    else:
        transformer = Transformer.from_file(configuration)

    errors = transform_csv_to_csv(location, output, transformer)

    errors_count = report_errors(errors, quiet, verbose)
    # transform_csv_to_csv does NOT write OUTPUT when it returns errors:
    # only claim success when the file was actually written.
    if not quiet and not errors:
        msg = 'Transformed CSV written to {output}.'.format(**locals())
        click.echo(msg)
    sys.exit(errors_count)
573+
574+
497575
######################################################################
498576
# Error management
499577
######################################################################

src/attributecode/transform.py

Lines changed: 304 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,304 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf8 -*-
3+
# ============================================================================
4+
# Copyright (c) 2013-2018 nexB Inc. http://www.nexb.com/ - All rights reserved.
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
# ============================================================================
15+
16+
from __future__ import absolute_import
17+
from __future__ import print_function
18+
from __future__ import unicode_literals
19+
20+
from collections import Counter
21+
from collections import OrderedDict
22+
import io
23+
24+
import attr
25+
26+
from attributecode import CRITICAL
27+
from attributecode import Error
28+
from attributecode import saneyaml
29+
from attributecode.util import python2
30+
31+
if python2: # pragma: nocover
32+
from itertools import izip_longest as zip_longest # NOQA
33+
34+
import backports.csv as csv # NOQA
35+
else: # pragma: nocover
36+
from itertools import zip_longest # NOQA
37+
38+
import csv # NOQA
39+
40+
41+
def transform_csv_to_csv(location, output, transformer):
    """
    Read a CSV file at `location` and write a new CSV file at `output`. Apply
    transformations using the `transformer` Tranformer.
    Return a list of Error objects.
    """
    if not transformer:
        raise ValueError('Cannot transform without Transformer')

    source_rows = read_csv_rows(location)
    column_names, data, errors = transform_data(source_rows, transformer)

    # the output file is only written when the transform reported no error
    if errors:
        return errors

    write_csv(output, data, column_names)
    return []
59+
60+
61+
def transform_data(rows, transformer):
    """
    Read a list of list of CSV-like data `rows` and apply transformations using
    the `transformer` Tranformer.
    Return a tuple of:
    ([column names...], [transformed ordered mappings...], [Error objects..])
    """
    if not transformer:
        # NOTE(review): returns the raw rows rather than the documented
        # 3-tuple; callers are expected to always pass a transformer
        # (transform_csv_to_csv raises before calling us without one).
        return rows

    errors = []
    rows = iter(rows)
    # the first row holds the column names
    column_names = next(rows)
    column_names = transformer.clean_columns(column_names)

    dupes = check_duplicate_columns(column_names)

    if dupes:
        msg = 'Duplicated column name: {name}'
        # the placeholder is named, so format() must receive a keyword
        # argument: a positional format(name) raises KeyError: 'name'
        errors.extend(Error(CRITICAL, msg.format(name=name)) for name in dupes)
        return column_names, [], errors

    column_names = transformer.apply_renamings(column_names)

    # convert to mappings using the renamed columns
    data = [OrderedDict(zip_longest(column_names, row)) for row in rows]

    if transformer.column_filters:
        data = list(transformer.filter_columns(data))
        # clean the filter names the same way filter_columns does, so that
        # mixed-case or padded filter names still match the cleaned columns
        kept = set(transformer.clean_columns(transformer.column_filters))
        column_names = [c for c in column_names if c in kept]

    errors = transformer.check_required_columns(data)
    if errors:
        return column_names, data, errors

    if transformer.row_filters:
        data = list(transformer.filter_rows(data))

    return column_names, data, errors
101+
102+
103+
# NOTE: the name keeps its historical misspelling "tranformer" because it is
# imported under this exact name from cmd.py.
tranformer_config_help = '''
A transform configuration file is used to describe which transformations and
validations to apply to a source CSV file. This is a simple text file using YAML
format, using the same format as an .ABOUT file.

The attributes that can be set in a configuration file are:

* column_renamings:
An optional mapping of source CSV column name to target CSV new column name that
is used to rename CSV columns.

For instance with this configuration the columns "Directory/Location" will be
renamed to "about_resource" and "foo" to "bar":
    column_renamings:
        'Directory/Location' : about_resource
        foo : bar

The renaming is always applied first before other transforms and checks. All
other column names referenced below are these that exist AFTER the renamings
have been applied to the existing column names.

* required_columns:
An optional list of required column names that must have a value, beyond the
standard columns names. If a source CSV does not have such a column or a row is
missing a value for a required column, an error is reported.

For instance with this configuration an error will be reported if the columns
"name" and "version" are missing or if any row does not have a value set for
these columns:
    required_columns:
        - name
        - version

* column_filters:
An optional list of column names that should be kept in the transformed CSV. If
this list is provided, all the columns from the source CSV that should be kept
in the target CSV must be listed even if they are standard or required
columns. If this list is not provided, all source CSV columns are kept in the
transformed target CSV.

For instance with this configuration the target CSV will only contain the "name"
and "version" columns and no other column:
    column_filters:
        - name
        - version

* row_filters:
An optional list of mappings of <column name>: <value> that a source CSV row
should match to be added to the transformed target CSV. If any column value of a
row matches any such filter it is kept. Otherwise it is skipped. Filters are
applied last after all renamings, checks and transforms and can therefore only
use remaining column names.

For instance with this configuration the target CSV will only contain rows that
have a "path" equal to "/root/user/lib":
    row_filters:
        path : /root/user/lib
'''
161+
162+
163+
@attr.attributes
class Transformer(object):
    __doc__ = tranformer_config_help

    # mapping of {source column name: new column name} applied first
    column_renamings = attr.attrib(default=attr.Factory(dict))
    # column names that must be present with a non-empty value in every row
    required_columns = attr.attrib(default=attr.Factory(list))
    # column names to keep in the output; empty means keep all columns
    column_filters = attr.attrib(default=attr.Factory(list))
    # list of {column name: value} mappings; a row matching any is kept
    row_filters = attr.attrib(default=attr.Factory(list))

    # TODO: populate these!
    # a list of all the standard columns from AboutCode toolkit
    standard_columns = attr.attrib(default=attr.Factory(list), init=False)
    # a list of the subset of standard columns that are essential and MUST be
    # present for AboutCode toolkit to work
    essential_columns = attr.attrib(default=attr.Factory(list), init=False)

    @classmethod
    def default(cls):
        """
        Return a default Transformer with built-in transforms.
        """
        return cls(
            column_renamings={},
            required_columns=[],
            column_filters=[],
            row_filters=[],
        )

    @classmethod
    def from_file(cls, location):
        """
        Load and return a Transformer instance from a YAML configuration file at
        `location`.
        """
        with io.open(location, encoding='utf-8') as conf:
            data = saneyaml.load(conf.read())
        return cls(
            column_renamings=data.get('column_renamings', {}),
            required_columns=data.get('required_columns', []),
            column_filters=data.get('column_filters', []),
            row_filters=data.get('row_filters', []),
        )

    def check_required_columns(self, data):
        """
        Return a list of Error for a `data` list of ordered mappings where a
        mapping is missing a value for a required column name.
        """
        errors = []
        required = set(self.essential_columns + self.required_columns)
        if not required:
            return []

        for rn, item in enumerate(data):
            missings = [rk for rk in required if not item.get(rk)]
            if not missings:
                continue

            missings = ', '.join(missings)
            msg = 'Row {rn} is missing required values for columns: {missings}'
            errors.append(Error(CRITICAL, msg.format(**locals())))
        return errors

    def apply_renamings(self, column_names):
        """
        Return a tranformed list of `column_names` where columns are renamed
        based on this Transformer configuration. Comparison is done on
        lowercased names; unrenamed columns are returned lowercased too.
        """
        renamings = self.column_renamings
        if not renamings:
            return column_names
        renamings = {n.lower(): rn.lower() for n, rn in renamings.items()}

        renamed = []
        for name in column_names:
            name = name.lower()
            new_name = renamings.get(name, name)
            renamed.append(new_name)
        return renamed

    def clean_columns(self, column_names):
        """
        Apply standard cleanups (strip whitespace, lowercase) to a list of
        column names and return these.
        """
        if not column_names:
            return column_names
        return [c.strip().lower() for c in column_names]

    def filter_columns(self, data):
        """
        Yield transformed mappings from a `data` list of mappings keeping only
        columns with a name in the `column_filters` of this Transformer.
        NOTE: with an empty `column_filters` everything would be filtered out;
        callers must only invoke this when `column_filters` is set.
        """
        column_filters = set(self.clean_columns(self.column_filters))
        for entry in data:
            items = ((k, v) for k, v in entry.items() if k in column_filters)
            yield OrderedDict(items)

    def filter_rows(self, data):
        """
        Yield mappings from a `data` list of mappings keeping only entries
        that match any one of the `row_filters` of this Transformer.
        NOTE: with an empty `row_filters` nothing would be yielded; callers
        must only invoke this when `row_filters` is set.
        """
        filters = self.row_filters
        for entry in data:
            # yield a matching entry exactly once, even when several filters
            # (or several columns of one filter) match it; the previous
            # nested-loop version yielded duplicates in that case
            matched = any(
                entry.get(column_name) == column_value
                for filt in filters
                for column_name, column_value in filt.items()
            )
            if matched:
                yield entry
275+
276+
277+
def check_duplicate_columns(column_names):
    """
    Check that there are no duplicate in the `column_names` list of column name
    strings, ignoring case. Return a list of unique duplicated column names.
    """
    tallies = Counter(name.lower() for name in column_names)
    duplicated = (name for name, tally in tallies.items() if tally > 1)
    return sorted(duplicated)
284+
285+
286+
def read_csv_rows(location):
    """
    Yield rows (as a list of values) from a CSV file at `location`.
    """
    with io.open(location, encoding='utf-8') as csv_file:
        for row in csv.reader(csv_file):
            yield row
294+
295+
296+
def write_csv(location, data, column_names):  # NOQA
    """
    Write a CSV file at `location` the `data` list of ordered mappings using the
    `column_names`.
    """
    with io.open(location, 'w', encoding='utf-8') as out_file:
        writer = csv.DictWriter(out_file, fieldnames=column_names)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

0 commit comments

Comments
 (0)