
Commit 7222de7

Cell source map, no test failures
1 parent 9a1fd16 commit 7222de7


3 files changed: +211 −32 lines changed


flattentool/__init__.py

Lines changed: 19 additions & 5 deletions
@@ -2,7 +2,7 @@
 from flattentool.json_input import JSONParser
 from flattentool.output import FORMATS as OUTPUT_FORMATS
 from flattentool.output import FORMATS_SUFFIX
-from flattentool.input import FORMATS as INPUT_FORMATS
+from flattentool.input import FORMATS as INPUT_FORMATS, WITH_CELLS
 import json
 import codecs
 from decimal import Decimal
@@ -104,7 +104,8 @@ def decimal_default(o):
 
 def unflatten(input_name, base_json=None, input_format=None, output_name='releases.json',
               main_sheet_name='releases', encoding='utf8', timezone_name='UTC',
-              root_id='ocid', schema='', convert_titles=False, **_):
+              root_id='ocid', schema='', convert_titles=False, cell_source_map=None,
+              heading_source_map=None, **_):
     """
     Unflatten a flat structure (spreadsheet - csv or xlsx) into a nested structure (JSON).
 
@@ -132,7 +133,20 @@ def unflatten(input_name, base_json=None, input_format=None, output_name='releas
             base = json.load(fp, object_pairs_hook=OrderedDict)
     else:
         base = OrderedDict()
-    base[main_sheet_name] = list(spreadsheet_input.unflatten())
-    with codecs.open(output_name, 'w', encoding='utf-8') as fp:
-        json.dump(base, fp, indent=4, default=decimal_default, ensure_ascii=False)
+    if WITH_CELLS:
+        result, cell_source_map_data, heading_source_map_data = spreadsheet_input.fancy_unflatten()
+        base[main_sheet_name] = list(result)
+        with codecs.open(output_name, 'w', encoding='utf-8') as fp:
+            json.dump(base, fp, indent=4, default=decimal_default, ensure_ascii=False)
+        if cell_source_map:
+            with codecs.open(cell_source_map, 'w', encoding='utf-8') as fp:
+                json.dump(cell_source_map_data, fp, indent=4, default=decimal_default, ensure_ascii=False)
+        if heading_source_map:
+            with codecs.open(heading_source_map, 'w', encoding='utf-8') as fp:
+                json.dump(heading_source_map_data, fp, indent=4, default=decimal_default, ensure_ascii=False)
+    else:
+        result = spreadsheet_input.unflatten()
+        base[main_sheet_name] = list(result)
+        with codecs.open(output_name, 'w', encoding='utf-8') as fp:
+            json.dump(base, fp, indent=4, default=decimal_default, ensure_ascii=False)
 
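
With this change, unflatten() in flattentool/__init__.py accepts two extra keyword arguments, cell_source_map and heading_source_map, each a path to write the corresponding JSON map to. A minimal sketch of a call using them (the file names are illustrative, and 'xlsx' is assumed to be one of the registered INPUT_FORMATS):

import flattentool

flattentool.unflatten(
    'release_spreadsheet.xlsx',                     # hypothetical input file
    input_format='xlsx',
    output_name='releases.json',
    cell_source_map='cell_source_map.json',         # new: where to write the cell source map
    heading_source_map='heading_source_map.json',   # new: where to write the heading source map
)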

flattentool/cli.py

Lines changed: 6 additions & 0 deletions
@@ -111,6 +111,12 @@ def create_parser():
     parser_unflatten.add_argument(
         "-o", "--output-name",
         help="Name of the outputted file. Will have an extension appended as appropriate. Defaults to releases")
+    parser_unflatten.add_argument(
+        "-c", "--cell-source-map",
+        help="Path to write a cell source map to. Will have an extension appended as appropriate.")
+    parser_unflatten.add_argument(
+        "-a", "--heading-source-map",
+        help="Path to write a heading source map to. Will have an extension appended as appropriate.")
     parser_unflatten.add_argument(
         "--timezone-name",
         help="Name of the timezone, defaults to UTC. Should be in tzdata format, e.g. Europe/London")

flattentool/input.py

Lines changed: 186 additions & 27 deletions
@@ -15,13 +15,24 @@
 import traceback
 import datetime
 import pytz
+from openpyxl.utils import _get_column_letter, column_index_from_string
+
+WITH_CELLS = True
+
+class Cell:
+    def __init__(self, cell_value, cell_location):
+        self.cell_value = cell_value
+        self.cell_location = cell_location
+        self.sub_cells = []
 
 # The "pylint: disable" lines exist to ignore warnings about the imports we expect not to work not working
 
 if sys.version > '3':
     from csv import DictReader
+    from csv import reader as csvreader
 else:
     from unicodecsv import DictReader # pylint: disable=F0401
+    from unicodecsv import reader as csvreader # pylint: disable=F0401
 
 try:
     from collections import UserDict # pylint: disable=E0611
@@ -73,26 +84,43 @@ def convert_type(type_string, value, timezone = pytz.timezone('UTC')):
 def merge(base, mergee, debug_info=None):
     if not debug_info:
         debug_info = {}
-    for key, value in mergee.items():
+    for key, v in mergee.items():
+        if WITH_CELLS and isinstance(v, Cell):
+            value = v.cell_value
+        else:
+            value = v
         if key in base:
             if isinstance(value, TemporaryDict):
                 for temporarydict_key, temporarydict_value in value.items():
                     if temporarydict_key in base[key]:
                         merge(base[key][temporarydict_key], temporarydict_value, debug_info)
                     else:
+                        assert temporarydict_key not in base[key], 'Overwriting cell {} by mistake'.format(temporarydict_value)
                         base[key][temporarydict_key] = temporarydict_value
                 for temporarydict_value in value.items_no_keyfield:
                     base[key].items_no_keyfield.append(temporarydict_value)
             elif isinstance(value, dict) and isinstance(base[key], dict):
                 merge(base[key], value, debug_info)
-            elif base[key] != value:
-                id_info = 'id "{}"'.format(debug_info.get('id'))
-                if debug_info.get('root_id'):
-                    id_info = '{} "{}", '.format(debug_info.get('root_id'), debug_info.get('root_id_or_none'))+id_info
-                warn('Conflict when merging field "{}" for {} in sheet {}: "{}" != "{}". If you were not expecting merging you may have a duplicate ID.'.format(
-                    key, id_info, debug_info.get('sheet_name'), base[key], value))
+            else:
+                if WITH_CELLS:
+                    base_value = base[key].cell_value
+                else:
+                    base_value = base[key]
+                if base_value != value:
+                    id_info = 'id "{}"'.format(debug_info.get('id'))
+                    if debug_info.get('root_id'):
+                        id_info = '{} "{}", '.format(debug_info.get('root_id'), debug_info.get('root_id_or_none'))+id_info
+                    warn('Conflict when merging field "{}" for {} in sheet {}: "{}" != "{}". If you were not expecting merging you may have a duplicate ID.'.format(
+                        key, id_info, debug_info.get('sheet_name'), base_value, value))
+                else:
+                    if WITH_CELLS:
+                        base[key].sub_cells.append(v)
         else:
-            base[key] = value
+            # This happens when a parent record finds the first a child record of a known type
+            if WITH_CELLS: # Either way, we still want to pass back either the cell or the value
+                base[key] = v
+            else:
+                base[key] = v
 
 class SpreadsheetInput(object):
     """
@@ -111,7 +139,7 @@ def convert_dict_titles(self, dicts, title_lookup=None):
         title_lookup = title_lookup or self.parser.title_lookup
         for d in dicts:
             if title_lookup:
-                yield { title_lookup.lookup_header(k):v for k,v in d.items() }
+                yield OrderedDict([(title_lookup.lookup_header(k), v) for k,v in d.items()])
             else:
                 yield d
 
@@ -144,7 +172,7 @@ def get_sheet_lines(self, sheet_name):
     def read_sheets(self):
         raise NotImplementedError
 
-
+    # XXX This method does not appear to get called, could it be deleted?
     def convert_types(self, in_dict):
         out_dict = OrderedDict()
         for key, value in in_dict.items():
@@ -156,35 +184,145 @@ def convert_types(self, in_dict):
         return out_dict
 
 
-    def unflatten(self):
+    def do_unflatten(self):
         main_sheet_by_ocid = OrderedDict()
         # Eventually we should get rid of the concept of a "main sheet entirely"
-        for sheet_name, lines in [(self.main_sheet_name, self.get_main_sheet_lines())] + list(self.get_sub_sheets_lines()):
-            for line in lines:
-                if all(x == '' for x in line.values()):
+        sheets = [(self.main_sheet_name, self.get_main_sheet_lines())] + list(self.get_sub_sheets_lines())
+        for i, sheet in enumerate(sheets):
+            sheet_name, lines = sheet
+            for j, line in enumerate(lines):
+                if all(x is None or x == '' for x in line.values()):
+                #if all(x == '' for x in line.values()):
                     continue
                 root_id_or_none = line[self.root_id] if self.root_id else None
-                unflattened = unflatten_main_with_parser(self.parser, line, self.timezone)
+                if WITH_CELLS:
+                    cells = OrderedDict()
+                    for k, header in enumerate(line):
+                        cells[header] = Cell(line[header], (sheet_name, _get_column_letter(k+1), j+2, header))
+                    unflattened = unflatten_main_with_parser(self.parser, cells, self.timezone)
+                else:
+                    unflattened = unflatten_main_with_parser(self.parser, line, self.timezone)
                 if root_id_or_none not in main_sheet_by_ocid:
                     main_sheet_by_ocid[root_id_or_none] = TemporaryDict('id')
-                if 'id' in unflattened and unflattened['id'] in main_sheet_by_ocid[root_id_or_none]:
+                def inthere(unflattened, id_name):
+                    if WITH_CELLS:
+                        return unflattened[id_name].cell_value
+                    else:
+                        return unflattened[id_name]
+                if 'id' in unflattened and inthere(unflattened, 'id') in main_sheet_by_ocid[root_id_or_none]:
+                    if WITH_CELLS:
+                        unflattened_id = unflattened.get('id').cell_value
+                    else:
+                        unflattened_id = unflattened.get('id')
                     merge(
-                        main_sheet_by_ocid[root_id_or_none][unflattened.get('id')],
+                        main_sheet_by_ocid[root_id_or_none][unflattened_id],
                         unflattened,
                         {
                             'sheet_name': sheet_name,
                             'root_id': self.root_id,
                             'root_id_or_none': root_id_or_none,
-                            'id': unflattened.get('id')
+                            'id': unflattened_id
                         }
                     )
                 else:
                     main_sheet_by_ocid[root_id_or_none].append(unflattened)
-
         temporarydicts_to_lists(main_sheet_by_ocid)
-
         return sum(main_sheet_by_ocid.values(), [])
 
+    def unflatten(self):
+        result = self.do_unflatten()
+        if WITH_CELLS:
+            result = extract_list_to_value(result)
+        return result
+
+    def fancy_unflatten(self):
+        if not WITH_CELLS:
+            raise Exception('Can only do a fancy_unflatten() if WITH_CELLS=True')
+        cell_tree = self.do_unflatten()
+        result = extract_list_to_value(cell_tree)
+        cell_source_map = extract_list_to_error_path([self.main_sheet_name.lower()], cell_tree)
+        ordered_cell_source_map = OrderedDict(( '/'.join(str(x) for x in path), location) for path, location in sorted(cell_source_map.items()))
+        row_source_map = OrderedDict()
+        heading_source_map = {}
+        for path in cell_source_map:
+            cells = cell_source_map[path]
+            # Prepare row_source_map key
+            key = '/'.join(str(x) for x in path[:-1])
+            if not key in row_source_map:
+                row_source_map[key] = []
+            # Prepeare header_source_map key
+            header_path_parts = []
+            for x in path:
+                try:
+                    int(x)
+                except:
+                    header_path_parts.append(x)
+            header_path = '/'.join(header_path_parts)
+            if header_path not in heading_source_map:
+                heading_source_map[header_path] = []
+            # Populate the row and header source maps
+            for cell in cells:
+                sheet, col, row, header = cell
+                if (sheet, row) not in row_source_map[key]:
+                    row_source_map[key].append((sheet, row))
+                if (sheet, header) not in heading_source_map[header_path]:
+                    heading_source_map[header_path].append((sheet, header))
+        for key in row_source_map:
+            assert key not in ordered_cell_source_map, 'Row/cell collision: {}'.format(key)
+            ordered_cell_source_map[key] = row_source_map[key]
+        return result, ordered_cell_source_map, heading_source_map
+
+def extract_list_to_error_path(path, input):
+    output = {}
+    for i, item in enumerate(input):
+        res = extract_dict_to_error_path(path + [i], item)
+        for p in res:
+            assert p not in output, 'Already have key {}'.format(p)
+            output[p] = res[p]
+    return output
+
+def extract_dict_to_error_path(path, input):
+    output = {}
+    for k in input:
+        if isinstance(input[k], list):
+            res = extract_list_to_error_path(path+[k], input[k])
+            for p in res:
+                assert p not in output, 'Already have key {}'.format(p)
+                output[p] = res[p]
+        elif isinstance(input[k], dict):
+            res = extract_dict_to_error_path(path+[k], input[k])
+            for p in res:
+                assert p not in output, 'Already have key {}'.format(p)
+                output[p] = res[p]
+        elif isinstance(input[k], Cell):
+            p = tuple(path+[k])
+            assert p not in output, 'Already have key {}'.format(p)
+            output[p] = [input[k].cell_location]
+            for sub_cell in input[k].sub_cells:
+                assert sub_cell.cell_value == input[k].cell_value, 'Two sub-cells have different values: {}, {}'.format(input[k].cell_value, sub_cell.cell_value)
+                output[p].append(sub_cell.cell_location)
+        else:
+            raise Exception('Unexpected result type in the JSON cell tree: {}'.format(input[k]))
+    return output
+
+def extract_list_to_value(input):
+    output = []
+    for item in input:
+        output.append(extract_dict_to_value(item))
+    return output
+
+def extract_dict_to_value(input):
+    output = OrderedDict()
+    for k in input:
+        if isinstance(input[k], list):
+            output[k] = extract_list_to_value(input[k])
+        elif isinstance(input[k], dict):
+            output[k] = extract_dict_to_value(input[k])
+        elif isinstance(input[k], Cell):
+            output[k] = input[k].cell_value
+        else:
+            raise Exception('Unexpected result type in the JSON cell tree: {}'.format(input[k]))
+    return output
 
 class CSVInput(SpreadsheetInput):
     encoding = 'utf-8'
@@ -265,6 +403,8 @@ class ListAsDict(dict):
 
 def list_as_dicts_to_temporary_dicts(unflattened):
     for key, value in list(unflattened.items()):
+        if WITH_CELLS and isinstance(value, Cell):
+            continue
         if hasattr(value, 'items'):
             if not value:
                 unflattened.pop(key)
@@ -279,9 +419,16 @@ def list_as_dicts_to_temporary_dicts(unflattened):
 
 def unflatten_main_with_parser(parser, line, timezone):
     unflattened = OrderedDict()
-    for path, value in line.items():
-        if value is None or value == '':
-            continue
+    for path, input in line.items():
+        # Skip blank cells
+        if WITH_CELLS:
+            cell = input
+            if cell.cell_value is None or cell.cell_value == '':
+                continue
+        else:
+            value = input
+            if value is None or value == '':
+                continue
         current_path = unflattened
         path_list = [item.rstrip('[]') for item in path.split('/')]
         for num, path_item in enumerate(path_list):
@@ -328,9 +475,16 @@ def unflatten_main_with_parser(parser, line, timezone):
                 raise ValueError("There is an object or list at '{}' but it should be an {}".format(path_till_now, current_type))
 
             ## Other Types
-            converted_value = convert_type(current_type or '', value, timezone)
-            if converted_value is not None and converted_value != '':
-                current_path[path_item] = converted_value
+            if WITH_CELLS:
+                value = cell.cell_value
+                converted_value = convert_type(current_type or '', value, timezone)
+                cell.cell_value = converted_value
+                if converted_value is not None and converted_value != '':
+                    current_path[path_item] = cell
+            else:
+                converted_value = convert_type(current_type or '', value, timezone)
+                if converted_value is not None and converted_value != '':
+                    current_path[path_item] = converted_value
 
     unflattened = list_as_dicts_to_temporary_dicts(unflattened)
     return unflattened
@@ -384,7 +538,10 @@ def __repr__(self):
 
     def append(self, item):
         if self.keyfield in item:
-            key = item[self.keyfield]
+            if WITH_CELLS and isinstance(item[self.keyfield], Cell):
+                key = item[self.keyfield].cell_value
+            else:
+                key = item[self.keyfield]
             if key not in self.data:
                 self.data[key] = item
             else:
@@ -399,6 +556,8 @@ def to_list(self):
 def temporarydicts_to_lists(nested_dict):
     """ Recrusively transforms TemporaryDicts to lists inplace. """
     for key, value in nested_dict.items():
+        if isinstance(value, Cell):
+            continue
         if hasattr(value, 'to_list'):
             temporarydicts_to_lists(value)
             if hasattr(value, 'items_no_keyfield'):
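
For reference, fancy_unflatten() returns the parsed data plus two maps keyed by '/'-joined JSON paths: the cell source map points each value (and, via the extra row-level keys, each object) back to its spreadsheet location, and the heading source map points each heading path (with list indices stripped) back to the sheets and headers it came from. An illustrative sketch of the shape, with an invented sheet name, columns and rows, not output from a real run:

# Hypothetical shape of the two maps for a single data row; the sheet name,
# columns and rows are invented for illustration.
cell_source_map_data = {
    "releases/0/ocid": [["main", "A", 2, "ocid"]],  # (sheet, column, row, header) per value
    "releases/0/id": [["main", "B", 2, "id"]],
    "releases/0": [["main", 2]],                    # row-level entry: (sheet, row)
}
heading_source_map_data = {
    "releases/ocid": [["main", "ocid"]],            # (sheet, header) per heading path
    "releases/id": [["main", "id"]],
}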
