Skip to content

Commit 7babd3c

Browse files
committed
Fixed #531 - transform to work with nested lists
* The `transform` will now work with nested lists (especially for JSON-formatted input) Signed-off-by: Chin Yeung Li <[email protected]>
1 parent 545f062 commit 7babd3c

File tree

3 files changed

+148
-50
lines changed

3 files changed

+148
-50
lines changed

src/attributecode/transform.py

Lines changed: 36 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from attributecode.util import csv
2727
from attributecode.util import replace_tab_with_spaces
2828

29+
2930
def transform_csv(location):
3031
"""
3132
Read a CSV file at `location` and convert data into list of dictionaries.
@@ -109,7 +110,7 @@ def normalize_dict_data(data):
109110
"""
110111
try:
111112
# Check if this is a JSON output from scancode-toolkit
112-
if(data["headers"][0]["tool_name"] == "scancode-toolkit"):
113+
if (data["headers"][0]["tool_name"] == "scancode-toolkit"):
113114
# only takes data inside "files"
114115
new_data = data["files"]
115116
except:
@@ -129,10 +130,12 @@ def transform_data(data, transformer):
129130
renamed_field_data = transformer.apply_renamings(data)
130131

131132
if transformer.field_filters:
132-
renamed_field_data = list(transformer.filter_fields(renamed_field_data))
133+
renamed_field_data = list(
134+
transformer.filter_fields(renamed_field_data))
133135

134136
if transformer.exclude_fields:
135-
renamed_field_data = list(transformer.filter_excluded(renamed_field_data))
137+
renamed_field_data = list(
138+
transformer.filter_excluded(renamed_field_data))
136139

137140
errors = transformer.check_required_fields(renamed_field_data)
138141
if errors:
@@ -277,23 +280,26 @@ def apply_renamings(self, data):
277280
based on this Transformer configuration.
278281
"""
279282
renamings = self.field_renamings
283+
renamed_to_list = list(renamings.keys())
284+
renamed_from_list = list(renamings.values())
280285
if not renamings:
281286
return data
282-
renamings = {n: rn for n, rn in renamings.items()}
283-
284-
renamed_list = []
285-
for row in data:
286-
renamed = {}
287-
for key in row:
288-
matched = False
289-
for renamed_key in renamings:
290-
if key == renamings[renamed_key]:
291-
renamed[renamed_key] = row[key]
292-
matched = True
293-
if not matched:
294-
renamed[key] = row[key]
295-
renamed_list.append(renamed)
296-
return renamed_list
287+
if isinstance(data, dict):
288+
renamed_obj = {}
289+
for key, value in data.items():
290+
if key in renamed_from_list:
291+
for idx, renamed_from_key in enumerate(renamed_from_list):
292+
if key == renamed_from_key:
293+
renamed_key = renamed_to_list[idx]
294+
renamed_obj[renamed_key] = self.apply_renamings(
295+
value)
296+
else:
297+
renamed_obj[key] = self.apply_renamings(value)
298+
return renamed_obj
299+
elif isinstance(data, list):
300+
return [self.apply_renamings(item) for item in data]
301+
else:
302+
return data
297303

298304
"""
299305
def clean_fields(self, field_names):
@@ -324,8 +330,18 @@ def filter_excluded(self, data):
324330
"""
325331
# exclude_fields = set(self.clean_fields(self.exclude_fields))
326332
exclude_fields = set(self.exclude_fields)
333+
filtered_list = []
327334
for entry in data:
328-
yield {k: v for k, v in entry.items() if k not in exclude_fields}
335+
result = {}
336+
for k, v in entry.items():
337+
if type(v) == list:
338+
result[k] = self.filter_excluded(v)
339+
elif k not in exclude_fields:
340+
result[k] = v
341+
filtered_list.append(result)
342+
# yield result
343+
# yield {k: v for k, v in entry.items() if k not in exclude_fields}
344+
return filtered_list
329345

330346

331347
def check_duplicate_fields(field_names):
@@ -373,6 +389,7 @@ def write_json(location, data):
373389
with open(location, 'w') as jsonfile:
374390
json.dump(data, jsonfile, indent=3)
375391

392+
376393
def read_excel(location, worksheet=None):
377394
"""
378395
Read XLSX at `location`, return a list of ordered dictionaries, one

tests/test_transform.py

Lines changed: 97 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,8 @@ def test_transform_data(self):
5757
data, err = transform_data(data, transformer)
5858

5959
expect_name = [u'about_resource', u'name', u'version']
60-
expected_data = [dict(OrderedDict([(u'about_resource', u'/tmp/test.c'), (u'name', u'test.c'), (u'version', u'1')]))]
60+
expected_data = [dict(OrderedDict(
61+
[(u'about_resource', u'/tmp/test.c'), (u'name', u'test.c'), (u'version', u'1')]))]
6162

6263
assert len(data) == len(expected_data)
6364
for d in data:
@@ -84,22 +85,23 @@ def test_normalize_dict_data_scancode(self):
8485
json_data = read_json(test_file)
8586
data = normalize_dict_data(json_data)
8687
expected_data = [OrderedDict([(u'path', u'samples'),
87-
(u'type', u'directory'),
88-
(u'name', u'samples'),
89-
(u'base_name', u'samples'),
90-
(u'extension', u''), (u'size', 0),
91-
(u'date', None), (u'sha1', None), (u'md5', None),
92-
(u'mime_type', None), (u'file_type', None),
93-
(u'programming_language', None),
94-
(u'is_binary', False), (u'is_text', False),
95-
(u'is_archive', False), (u'is_media', False),
96-
(u'is_source', False), (u'is_script', False),
97-
(u'licenses', []), (u'license_expressions', []),
98-
(u'copyrights', []), (u'holders', []),
99-
(u'authors', []), (u'packages', []),
100-
(u'emails', []), (u'urls', []),
101-
(u'files_count', 33), (u'dirs_count', 10),
102-
(u'size_count', 1161083), (u'scan_errors', [])])]
88+
(u'type', u'directory'),
89+
(u'name', u'samples'),
90+
(u'base_name', u'samples'),
91+
(u'extension', u''), (u'size', 0),
92+
(u'date', None), (u'sha1',
93+
None), (u'md5', None),
94+
(u'mime_type', None), (u'file_type', None),
95+
(u'programming_language', None),
96+
(u'is_binary', False), (u'is_text', False),
97+
(u'is_archive', False), (u'is_media', False),
98+
(u'is_source', False), (u'is_script', False),
99+
(u'licenses', []), (u'license_expressions', []),
100+
(u'copyrights', []), (u'holders', []),
101+
(u'authors', []), (u'packages', []),
102+
(u'emails', []), (u'urls', []),
103+
(u'files_count', 33), (u'dirs_count', 10),
104+
(u'size_count', 1161083), (u'scan_errors', [])])]
103105
assert data == expected_data
104106

105107
def test_normalize_dict_data_json(self):
@@ -116,19 +118,19 @@ def test_normalize_dict_data_json(self):
116118

117119
def test_normalize_dict_data_json_array(self):
118120
json_data = [OrderedDict([(u'Directory/Filename', u'/aboutcode-toolkit/'),
119-
(u'Component', u'AboutCode-toolkit'),
120-
(u'version', u'1.0'), (u'temp', u'fpp')]),
121-
OrderedDict([(u'Directory/Filename', u'/aboutcode-toolkit1/'),
122-
(u'Component', u'AboutCode-toolkit1'),
123-
(u'version', u'1.1'), (u'temp', u'foo')])]
121+
(u'Component', u'AboutCode-toolkit'),
122+
(u'version', u'1.0'), (u'temp', u'fpp')]),
123+
OrderedDict([(u'Directory/Filename', u'/aboutcode-toolkit1/'),
124+
(u'Component', u'AboutCode-toolkit1'),
125+
(u'version', u'1.1'), (u'temp', u'foo')])]
124126
data = normalize_dict_data(json_data)
125127
expected_data = [OrderedDict([(u'Directory/Filename', u'/aboutcode-toolkit/'),
126-
(u'Component', u'AboutCode-toolkit'),
127-
(u'version', u'1.0'), (u'temp', u'fpp')]),
128-
OrderedDict([(u'Directory/Filename', u'/aboutcode-toolkit1/'),
129-
(u'Component', u'AboutCode-toolkit1'),
130-
(u'version', u'1.1'),
131-
(u'temp', u'foo')])]
128+
(u'Component', u'AboutCode-toolkit'),
129+
(u'version', u'1.0'), (u'temp', u'fpp')]),
130+
OrderedDict([(u'Directory/Filename', u'/aboutcode-toolkit1/'),
131+
(u'Component', u'AboutCode-toolkit1'),
132+
(u'version', u'1.1'),
133+
(u'temp', u'foo')])]
132134
assert data == expected_data
133135

134136
def test_check_duplicate_fields(self):
@@ -144,8 +146,10 @@ def test_strip_trailing_fields_csv(self):
144146
assert result == expected
145147

146148
def test_strip_trailing_fields_json(self):
147-
test = [OrderedDict([(u'about_resource', u'/this.c'), (u'name ', u'this.c'), (u' version ', u'0.11.0')])]
148-
expected = [OrderedDict([(u'about_resource', u'/this.c'), (u'name', u'this.c'), (u'version', u'0.11.0')])]
149+
test = [OrderedDict([(u'about_resource', u'/this.c'),
150+
(u'name ', u'this.c'), (u' version ', u'0.11.0')])]
151+
expected = [OrderedDict(
152+
[(u'about_resource', u'/this.c'), (u'name', u'this.c'), (u'version', u'0.11.0')])]
149153
result = strip_trailing_fields_json(test)
150154
assert result == expected
151155

@@ -190,4 +194,66 @@ def test_transform_json(self):
190194
'Component': 'AboutCode-toolkit',
191195
'Confirmed Version': '123', 'notes': ''}]
192196
assert len(err) == 0
193-
assert data == expected
197+
assert data == expected
198+
199+
def test_apply_renamings(self):
200+
data = [OrderedDict([(u'Directory/Filename', u'/tmp/test.c'),
201+
(u'Component', u'test.c'), (u'version', u'1'),
202+
(u'notes', u'test'), (u'temp', u'foo')])]
203+
configuration = get_test_loc('test_transform/configuration')
204+
transformer = Transformer.from_file(configuration)
205+
206+
expected = [OrderedDict([(u'about_resource', u'/tmp/test.c'), (u'name',
207+
u'test.c'), (u'version', u'1'), (u'notes', u'test'), (u'temp', u'foo')])]
208+
renamed_field_data = transformer.apply_renamings(data)
209+
assert renamed_field_data == expected
210+
211+
def test_apply_renamings_nested_list(self):
212+
data = [{'path': 'samples/JGroups-error.log', 'name': 'JGroups-error.log', 'license_detections': [{'license_expression': 'apache-1.1 AND apache-2.0', 'matches': [
213+
{'score': 90.0, 'start_line': 4, 'end_line': 4, 'license_expression': 'apache-1.1'}, {'score': 100.0, 'start_line': 5, 'end_line': 5, 'license_expression': 'apache-2.0'}]}]}]
214+
configuration = get_test_loc('test_transform/configuration3')
215+
transformer = Transformer.from_file(configuration)
216+
217+
expected = [{'about_resource': 'samples/JGroups-error.log', 'name': 'JGroups-error.log', 'license_detections': [{'license_expression': 'apache-1.1 AND apache-2.0', 'matches': [
218+
{'score_renamed': 90.0, 'start_line': 4, 'end_line': 4, 'license_expression': 'apache-1.1'}, {'score_renamed': 100.0, 'start_line': 5, 'end_line': 5, 'license_expression': 'apache-2.0'}]}]}]
219+
updated_data = transformer.apply_renamings(data)
220+
assert updated_data == expected
221+
222+
def test_filter_excluded(self):
223+
data = [OrderedDict([(u'Directory/Filename', u'/tmp/test.c'),
224+
(u'Component', u'test.c'), (u'version', u'1'),
225+
(u'notes', u'test'), (u'temp', u'foo')])]
226+
configuration = get_test_loc('test_transform/configuration')
227+
transformer = Transformer.from_file(configuration)
228+
229+
expected = [OrderedDict([(u'Directory/Filename', u'/tmp/test.c'), (u'Component',
230+
u'test.c'), (u'version', u'1'), (u'notes', u'test')])]
231+
updated_data = transformer.filter_excluded(data)
232+
assert updated_data == expected
233+
234+
def test_filter_excluded_nested_list(self):
235+
data = [{'path': 'samples/JGroups-error.log', 'type': 'file', 'name': 'JGroups-error.log', 'license_detections': [{'license_expression': 'apache-1.1 AND apache-2.0', 'matches': [
236+
{'score': 90.0, 'start_line': 4, 'end_line': 4, 'license_expression': 'apache-1.1'}, {'score': 100.0, 'start_line': 5, 'end_line': 5, 'license_expression': 'apache-2.0'}]}]}]
237+
configuration = get_test_loc('test_transform/configuration3')
238+
transformer = Transformer.from_file(configuration)
239+
240+
expected = [{'path': 'samples/JGroups-error.log', 'name': 'JGroups-error.log', 'license_detections': [{'license_expression': 'apache-1.1 AND apache-2.0', 'matches': [
241+
{'score': 90.0, 'end_line': 4, 'license_expression': 'apache-1.1'}, {'score': 100.0, 'end_line': 5, 'license_expression': 'apache-2.0'}]}]}]
242+
updated_data = transformer.filter_excluded(data)
243+
assert updated_data == expected
244+
245+
def test_filter_fields(self):
246+
data = [OrderedDict([(u'about_resource', u'/tmp/test.c'),
247+
(u'name', u'test.c'), (u'version', u'1'),
248+
(u'notes', u'test'), (u'temp', u'foo')])]
249+
configuration = get_test_loc('test_transform/configuration')
250+
transformer = Transformer.from_file(configuration)
251+
252+
updated_data = transformer.filter_fields(data)
253+
254+
expected = [OrderedDict([(u'about_resource', u'/tmp/test.c'),
255+
(u'name', u'test.c'), (u'version', u'1'),
256+
(u'temp', u'foo')])]
257+
258+
for d in updated_data:
259+
assert dict(d) in expected
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
field_renamings:
2+
about_resource : 'path'
3+
score_renamed : score
4+
size_renamed : size
5+
required_fields:
6+
- about_resource
7+
- name
8+
exclude_fields:
9+
- sha1
10+
- sha256
11+
- md5
12+
- type
13+
- start_line
14+
- matched_length
15+
- scan_errors

0 commit comments

Comments (0)