
Commit 04f9159

Fixed #512 - Remove the need to have the same format for input/output for transform

* Code has been enhanced and re-organized
* Add more tests

Signed-off-by: Chin Yeung Li <[email protected]>

1 parent 9c1ff71 commit 04f9159

File tree

10 files changed (+100, -101 lines)


CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@ Changelog
 
 * Fixed version mismatch (https://github.com/nexB/aboutcode-toolkit/issues/510)
 * Improve `check` performance (https://github.com/nexB/aboutcode-toolkit/issues/511)
+* Relax the requirement to have the same format for input and output for `transform`
 
 
 2022-03-21

docs/source/reference.rst

Lines changed: 1 addition & 2 deletions
@@ -595,8 +595,7 @@ Purpose
 -------
 
 Transform the CSV/JSON/XLSX file at LOCATION by applying renamings,
-filters and checks and then write a new CSV/JSON/Excel to OUTPUT
-(Format for input and output need to be the same).
+filters and checks and then write a new CSV/JSON/Excel to OUTPUT.
 
 Details
 ^^^^^^^

src/attributecode/cmd.py

Lines changed: 33 additions & 14 deletions
@@ -41,9 +41,13 @@
 from attributecode.model import get_copy_list
 from attributecode.model import pre_process_and_fetch_license_dict
 from attributecode.model import write_output
-from attributecode.transform import transform_csv_to_csv
-from attributecode.transform import transform_json_to_json
-from attributecode.transform import transform_excel_to_excel
+from attributecode.transform import transform_data
+from attributecode.transform import transform_csv
+from attributecode.transform import transform_json
+from attributecode.transform import transform_excel
+from attributecode.transform import write_csv
+from attributecode.transform import write_json
+from attributecode.transform import write_excel
 from attributecode.transform import Transformer
 from attributecode.util import extract_zip
 from attributecode.util import filter_errors
@@ -771,8 +775,7 @@ def print_config_help(ctx, param, value):
 def transform(location, output, configuration, quiet, verbose):  # NOQA
     """
     Transform the CSV/JSON/XLSX file at LOCATION by applying renamings, filters and checks
-    and then write a new CSV/JSON/XLSX to OUTPUT (Format for input and output need to be
-    the same).
+    and then write a new CSV/JSON/XLSX to OUTPUT.
 
     LOCATION: Path to a CSV/JSON/XLSX file.
 
@@ -783,16 +786,32 @@ def transform(location, output, configuration, quiet, verbose):  # NOQA
     else:
         transformer = Transformer.from_file(configuration)
 
-    if location.endswith('.csv') and output.endswith('.csv'):
-        errors = transform_csv_to_csv(location, output, transformer)
-    elif location.endswith('.json') and output.endswith('.json'):
-        errors = transform_json_to_json(location, output, transformer)
-    elif location.endswith('.xlsx') and output.endswith('.xlsx'):
-        errors = transform_excel_to_excel(location, output, transformer)
-    else:
-        msg = 'Extension for the input and output need to be the same.'
+    if not transformer:
+        msg = 'Cannot transform without Transformer'
         click.echo(msg)
-        sys.exit()
+        sys.exit(1)
+
+    errors = []
+    updated_data = []
+    new_data = []
+
+    if location.endswith('.csv'):
+        new_data, errors = transform_csv(location)
+    elif location.endswith('.json'):
+        new_data, errors = transform_json(location)
+    elif location.endswith('.xlsx'):
+        new_data, errors = transform_excel(location)
+
+    if not errors:
+        updated_data, errors = transform_data(new_data, transformer)
+
+    if not errors:
+        if output.endswith('.csv'):
+            write_csv(output, updated_data)
+        elif output.endswith('.json'):
+            write_json(output, updated_data)
+        else:
+            write_excel(output, updated_data)
 
     if not quiet:
         print_version()
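
Taken together, the rewritten command body decouples reading from writing: the input extension picks one of transform_csv, transform_json or transform_excel, the shared transform_data applies the configured renamings, filters and checks, and the output extension picks the writer, so the two formats no longer need to match. A minimal sketch of that flow outside of click, using hypothetical file names and a hypothetical configuration path:

# Sketch only: paths are hypothetical and error handling is trimmed to the happy path.
from attributecode.transform import Transformer
from attributecode.transform import transform_csv, transform_data, write_json

transformer = Transformer.from_file('transform_conf.yml')  # hypothetical config file

# Read the CSV rows into a list of dicts, then apply renamings/filters/checks.
new_data, errors = transform_csv('inventory.csv')
if not errors:
    updated_data, errors = transform_data(new_data, transformer)

# Write the transformed rows as JSON: the output format is independent of the input format.
if not errors:
    write_json('inventory.json', updated_data)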

src/attributecode/transform.py

Lines changed: 23 additions & 64 deletions
@@ -13,13 +13,11 @@
 # limitations under the License.
 # ============================================================================
 
-import io
 import json
 from collections import Counter, OrderedDict
 from itertools import zip_longest
 
 import attr
-import itertools
 import openpyxl
 
 from attributecode import CRITICAL
@@ -28,19 +26,13 @@
 from attributecode.util import csv
 from attributecode.util import replace_tab_with_spaces
 
-
-def transform_csv_to_csv(location, output, transformer):
+def transform_csv(location):
     """
-    Read a CSV file at `location` and write a new CSV file at `output`. Apply
-    transformations using the `transformer` Transformer.
-    Return a list of Error objects.
+    Read a CSV file at `location` and convert the data into a list of dictionaries.
     """
-    if not transformer:
-        raise ValueError('Cannot transform without Transformer')
-
-    rows = read_csv_rows(location)
-
     errors = []
+    new_data = []
+    rows = read_csv_rows(location)
     data = iter(rows)
     names = next(rows)
     field_names = strip_trailing_fields_csv(names)
@@ -50,65 +42,39 @@ def transform_csv_to_csv(location, output, transformer):
         msg = u'Duplicated field name: %(name)s'
         for name in dupes:
             errors.append(Error(CRITICAL, msg % locals()))
-        return errors
 
-    # Convert to dicts
-    new_data = [dict(zip_longest(field_names, item)) for item in data]
+    if not errors:
+        # Convert to dicts
+        new_data = [dict(zip_longest(field_names, item)) for item in data]
 
-    field_names, updated_data, errors = transform_data(new_data, transformer)
-
-    if errors:
-        return errors
-    else:
-        write_csv(output, updated_data, field_names)
-        return []
+    return new_data, errors
 
 
-def transform_json_to_json(location, output, transformer):
+def transform_json(location):
     """
-    Read a JSON file at `location` and write a new JSON file at `output`. Apply
-    transformations using the `transformer` Transformer.
-    Return a list of Error objects.
+    Read a JSON file at `location` and convert the data into a list of dictionaries.
     """
-    if not transformer:
-        raise ValueError('Cannot transform without Transformer')
-
+    errors = []
+    new_data = []
     items = read_json(location)
     data = normalize_dict_data(items)
     new_data = strip_trailing_fields_json(data)
 
-    _field_names, updated_data, errors = transform_data(new_data, transformer)
-
-    if errors:
-        return errors
-    else:
-        write_json(output, updated_data)
-        return []
+    return new_data, errors
 
 
-def transform_excel_to_excel(location, output, transformer):
+def transform_excel(location):
     """
-    Read a XLSX file at `location` and write a new Excel file at `output`. Apply
-    transformations using the `transformer` Transformer.
-    Return a list of Error objects.
+    Read an XLSX file at `location` and convert the data into a list of dictionaries.
     """
-    if not transformer:
-        raise ValueError('Cannot transform without Transformer')
-
-    dupes, new_data = read_excel(location)
     errors = []
+    new_data = []
+    dupes, new_data = read_excel(location)
     if dupes:
         msg = u'Duplicated field name: %(name)s'
         for name in dupes:
             errors.append(Error(CRITICAL, msg % locals()))
-        return errors
-
-    _field_names, updated_data, errors = transform_data(new_data, transformer)
-    if errors:
-        return errors
-    else:
-        write_excel(output, updated_data)
-        return []
+    return new_data, errors
 
 
 def strip_trailing_fields_csv(names):
@@ -160,25 +126,18 @@ def transform_data(data, transformer):
     Return a tuple of:
     ([field names...], [transformed ordered dict...], [Error objects..])
     """
-    if not transformer:
-        return data
-
     renamed_field_data = transformer.apply_renamings(data)
 
-    field_names = renamed_field_data[0].keys()
-
     if transformer.field_filters:
         renamed_field_data = list(transformer.filter_fields(renamed_field_data))
-        field_names = [c for c in field_names if c in transformer.field_filters]
 
     if transformer.exclude_fields:
         renamed_field_data = list(transformer.filter_excluded(renamed_field_data))
-        field_names = [c for c in field_names if c not in transformer.exclude_fields]
 
     errors = transformer.check_required_fields(renamed_field_data)
     if errors:
-        return field_names, data, errors
-    return field_names, renamed_field_data, errors
+        return data, errors
+    return renamed_field_data, errors
 
 
 tranformer_config_help = '''
@@ -395,11 +354,11 @@ def read_json(location):
         return json.load(jsonfile)
 
 
-def write_csv(location, data, field_names):  # NOQA
+def write_csv(location, data):
     """
-    Write a CSV file at `location` the `data` list of ordered dicts using the
-    `field_names`.
+    Write a CSV file at `location` with the `data`, which is a list of ordered dicts.
     """
+    field_names = list(data[0].keys())
     with open(location, 'w', encoding='utf-8', newline='\n', errors='replace') as csvfile:
         writer = csv.DictWriter(csvfile, fieldnames=field_names)
         writer.writeheader()
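
With the readers now only loading data and transform_data returning a plain (data, errors) pair, the writers take over column handling: write_csv derives its header from the keys of the first row, so every row is expected to share the same keys. A small sketch of that writer contract, with hypothetical output paths and made-up sample rows:

# Sketch only: hypothetical output paths and sample rows.
from collections import OrderedDict

from attributecode.transform import write_csv, write_json

rows = [
    OrderedDict([('about_resource', '/tmp/test.c'), ('name', 'test.c'), ('version', '1')]),
    OrderedDict([('about_resource', '/tmp/tmp.h'), ('name', 'tmp.h'), ('version', None)]),
]

# The CSV header comes from rows[0].keys(); the same in-memory rows can be
# written to either format because the writer no longer needs a field_names argument.
write_csv('/tmp/rows.csv', rows)
write_json('/tmp/rows.json', rows)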

tests/test_cmd.py

Lines changed: 0 additions & 6 deletions
@@ -136,8 +136,6 @@ def test_report_errors_with_verbose_flag(capsys):
         'DEBUG: msg4',
         'NOTSET: msg4'
     ]
-    print("@@@@@@@@@@@@@@@@@@@@@@@@")
-    print(out.splitlines(False))
     assert expected_out == out.splitlines(False)
     assert '' == err
 
@@ -334,10 +332,6 @@ def check_about_stdout(options, expected_loc, regen=False):
     with open(expected_file, 'r') as ef:
         expected = ef.read()
 
-    print("!!!!!!!!!!!!!!!!!!!!")
-    print(expected.splitlines(False))
-    print("#####################")
-    print(result.output.splitlines(False))
     assert expected.splitlines(False) == result.output.splitlines(False)
 
 
tests/test_transform.py

Lines changed: 31 additions & 13 deletions
@@ -26,6 +26,7 @@
 from attributecode.transform import strip_trailing_fields_json
 from attributecode.transform import Transformer
 from attributecode.transform import read_csv_rows, read_excel, read_json
+from attributecode.transform import transform_csv, transform_excel, transform_json
 
 
 class TransformTest(unittest.TestCase):
@@ -36,16 +37,12 @@ def test_transform_data_new_col(self):
         configuration = get_test_loc('test_transform/configuration_new_cols')
         transformer = Transformer.from_file(configuration)
 
-        field_name, data, err = transform_data(data, transformer)
+        data, err = transform_data(data, transformer)
 
-        expect_name = [u'path', u'about_resource', u'name', u'version', u'notes', u'temp']
         expected_data = [dict(OrderedDict([(u'path', u'/tmp/test.c'),
                                            (u'about_resource', u'/tmp/test.c'),
                                            (u'name', u'test.c'), (u'version', u'1'),
                                            (u'notes', u'test'), (u'temp', u'foo')]))]
-        assert len(field_name) == len(expect_name)
-        for name in field_name:
-            assert name in expect_name
         assert len(data) == len(expected_data)
         for d in data:
             assert dict(d) in expected_data
@@ -57,14 +54,11 @@ def test_transform_data(self):
         configuration = get_test_loc('test_transform/configuration')
         transformer = Transformer.from_file(configuration)
 
-        field_name, data, err = transform_data(data, transformer)
+        data, err = transform_data(data, transformer)
 
         expect_name = [u'about_resource', u'name', u'version']
         expected_data = [dict(OrderedDict([(u'about_resource', u'/tmp/test.c'), (u'name', u'test.c'), (u'version', u'1')]))]
 
-        assert len(field_name) == len(expect_name)
-        for name in field_name:
-            assert name in expect_name
         assert len(data) == len(expected_data)
         for d in data:
             assert dict(d) in expected_data
@@ -75,15 +69,12 @@ def test_transform_data_mutli_rows(self):
         configuration = get_test_loc('test_transform/configuration2')
         transformer = Transformer.from_file(configuration)
 
-        field_name, data, err = transform_data(data, transformer)
+        data, err = transform_data(data, transformer)
 
         expect_name = [u'about_resource', u'name', u'version']
         expected_data = [dict(OrderedDict([(u'about_resource', u'/tmp/test.c'), (u'name', u'test.c'), (u'version', u'v0.01')])),
                          dict(OrderedDict([(u'about_resource', u'/tmp/tmp.h'), (u'name', u'tmp.h'), (u'version', None)]))]
 
-        assert len(field_name) == len(expect_name)
-        for name in field_name:
-            assert name in expect_name
         assert len(data) == len(expected_data)
         for d in data:
             assert dict(d) in expected_data
@@ -173,3 +164,30 @@ def test_read_csv_rows(self):
                     ['/test.c', 'test.c', 'mit'],
                     ['/test2.c', 'test2.c', 'mit and apache-2.0']]
         assert list(data) == expected
+
+    def test_transform_csv(self):
+        test_file = get_test_loc('test_transform/input.csv')
+        data, err = transform_csv(test_file)
+        expected = [{'Directory/Filename': '/aboutcode-toolkit/',
+                     'Component': 'AboutCode-toolkit',
+                     'Confirmed Version': '123', 'notes': ''}]
+        assert len(err) == 0
+        assert data == expected
+
+    def test_transform_excel(self):
+        test_file = get_test_loc('test_transform/input.xlsx')
+        data, err = transform_excel(test_file)
+        expected = [OrderedDict([('Directory/Filename', '/aboutcode-toolkit/'),
+                                 ('Component', 'AboutCode-toolkit'),
+                                 ('Confirmed Version', 123), ('notes', '')])]
+        assert len(err) == 0
+        assert data == expected
+
+    def test_transform_json(self):
+        test_file = get_test_loc('test_transform/input.json')
+        data, err = transform_json(test_file)
+        expected = [{'Directory/Filename': '/aboutcode-toolkit/',
+                     'Component': 'AboutCode-toolkit',
+                     'Confirmed Version': '123', 'notes': ''}]
+        assert len(err) == 0
+        assert data == expected

tests/testdata/test_cmd/help/about_transform_help.txt

Lines changed: 1 addition & 2 deletions
@@ -1,8 +1,7 @@
 Usage: about transform [OPTIONS] LOCATION OUTPUT
 
   Transform the CSV/JSON/XLSX file at LOCATION by applying renamings, filters
-  and checks and then write a new CSV/JSON/XLSX to OUTPUT (Format for input and
-  output need to be the same).
+  and checks and then write a new CSV/JSON/XLSX to OUTPUT.
 
   LOCATION: Path to a CSV/JSON/XLSX file.
 
tests/testdata/test_transform/input.csv

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+Directory/Filename,Component,Confirmed Version,notes
+/aboutcode-toolkit/,AboutCode-toolkit,123,
tests/testdata/test_transform/input.json

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+[
+  {
+    "Directory/Filename": "/aboutcode-toolkit/",
+    "Component": "AboutCode-toolkit",
+    "Confirmed Version": "123",
+    "notes": ""
+  }
+]
tests/testdata/test_transform/input.xlsx

9.76 KB
Binary file not shown.
