15
15
import traceback
16
16
import datetime
17
17
import pytz
18
+ from openpyxl .utils import _get_column_letter , column_index_from_string
19
+
20
+ WITH_CELLS = True
21
+
22
class Cell(object):
    """A single spreadsheet value together with the place it was read from.

    Attributes:
        cell_value: The (possibly type-converted) value held by the cell.
        cell_location: Opaque location information supplied by the caller.
            Elsewhere in this module it is built as a
            ``(sheet_name, column_letter, row_number, header)`` tuple.
        sub_cells: Other ``Cell`` objects that were merged into this one
            because they carried the same value for the same field.
    """

    def __init__(self, cell_value, cell_location):
        self.cell_value = cell_value
        self.cell_location = cell_location
        # A fresh list per instance; never share it between cells.
        self.sub_cells = []

    def __repr__(self):
        # Makes assertion/exception messages that format a Cell readable.
        return 'Cell({!r}, {!r})'.format(self.cell_value, self.cell_location)
18
27
19
28
# The "pylint: disable" comments below suppress warnings about imports that are expected to fail on one of the two supported Python major versions
20
29
21
30
if sys .version > '3' :
22
31
from csv import DictReader
32
+ from csv import reader as csvreader
23
33
else :
24
34
from unicodecsv import DictReader # pylint: disable=F0401
35
+ from unicodecsv import reader as csvreader # pylint: disable=F0401
25
36
26
37
try :
27
38
from collections import UserDict # pylint: disable=E0611
@@ -73,26 +84,43 @@ def convert_type(type_string, value, timezone = pytz.timezone('UTC')):
73
84
def merge(base, mergee, debug_info=None):
    """Recursively merge ``mergee`` into ``base``, modifying ``base`` in place.

    Args:
        base: Mapping that receives the merged content.
        mergee: Mapping whose entries are merged into ``base``. Values may be
            plain values, ``Cell`` objects, dicts or ``TemporaryDict``s.
        debug_info: Optional dict used only to build the conflict warning.
            The keys read are ``'id'``, ``'root_id'``, ``'root_id_or_none'``
            and ``'sheet_name'``.

    A warning is emitted (via ``warn``) when both sides hold a different
    scalar value for the same key; ``base`` keeps its existing value then.
    """
    if not debug_info:
        debug_info = {}
    for key, v in mergee.items():
        # Compare on the underlying value, but keep ``v`` so the Cell (with
        # its location) can be stored or recorded as a sub-cell.
        if WITH_CELLS and isinstance(v, Cell):
            value = v.cell_value
        else:
            value = v

        if key not in base:
            # This happens when a parent record finds the first child record
            # of a known type. Store the Cell itself when there is one.
            base[key] = v
            continue

        if isinstance(value, TemporaryDict):
            for temporarydict_key, temporarydict_value in value.items():
                if temporarydict_key in base[key]:
                    merge(base[key][temporarydict_key], temporarydict_value, debug_info)
                else:
                    base[key][temporarydict_key] = temporarydict_value
            for temporarydict_value in value.items_no_keyfield:
                base[key].items_no_keyfield.append(temporarydict_value)
        elif isinstance(value, dict) and isinstance(base[key], dict):
            merge(base[key], value, debug_info)
        else:
            existing = base[key]
            # Only unwrap when the stored entry really is a Cell; a
            # structure/scalar mismatch should be reported as a conflict
            # rather than crash with AttributeError.
            if WITH_CELLS and isinstance(existing, Cell):
                base_value = existing.cell_value
            else:
                base_value = existing
            if base_value != value:
                id_info = 'id "{}"'.format(debug_info.get('id'))
                if debug_info.get('root_id'):
                    id_info = '{} "{}", '.format(debug_info.get('root_id'), debug_info.get('root_id_or_none')) + id_info
                warn('Conflict when merging field "{}" for {} in sheet {}: "{}" != "{}". If you were not expecting merging you may have a duplicate ID.'.format(
                    key, id_info, debug_info.get('sheet_name'), base_value, value))
            elif WITH_CELLS and isinstance(existing, Cell) and isinstance(v, Cell):
                # Same value seen again: remember where the duplicate came from.
                existing.sub_cells.append(v)
96
124
97
125
class SpreadsheetInput (object ):
98
126
"""
@@ -111,7 +139,7 @@ def convert_dict_titles(self, dicts, title_lookup=None):
111
139
title_lookup = title_lookup or self .parser .title_lookup
112
140
for d in dicts :
113
141
if title_lookup :
114
- yield { title_lookup .lookup_header (k ): v for k ,v in d .items () }
142
+ yield OrderedDict ([( title_lookup .lookup_header (k ), v ) for k ,v in d .items ()])
115
143
else :
116
144
yield d
117
145
@@ -144,7 +172,7 @@ def get_sheet_lines(self, sheet_name):
144
172
def read_sheets(self):
    """Placeholder that always raises ``NotImplementedError``.

    NOTE(review): presumably concrete input classes (e.g. ``CSVInput``)
    override this to load their sheets - confirm against the subclasses.
    """
    raise NotImplementedError
146
174
147
-
175
+ # XXX This method does not appear to get called, could it be deleted?
148
176
def convert_types (self , in_dict ):
149
177
out_dict = OrderedDict ()
150
178
for key , value in in_dict .items ():
@@ -156,35 +184,145 @@ def convert_types(self, in_dict):
156
184
return out_dict
157
185
158
186
159
- def unflatten (self ):
187
def do_unflatten(self):
    """Unflatten every sheet row and merge rows that share an id.

    Reads the main sheet followed by the sub sheets. Each non-blank row is
    passed through ``unflatten_main_with_parser``; when ``WITH_CELLS`` is
    true the row values are first wrapped in ``Cell`` objects carrying
    ``(sheet_name, column_letter, row_number, header)``.

    Returns:
        A single list built by concatenating the per-root-id lists. With
        ``WITH_CELLS`` enabled the leaves are ``Cell`` objects rather than
        plain values.
    """
    main_sheet_by_ocid = OrderedDict()
    # Eventually we should get rid of the concept of a "main sheet entirely"
    sheets = [(self.main_sheet_name, self.get_main_sheet_lines())] + list(self.get_sub_sheets_lines())
    for sheet_name, lines in sheets:
        for j, line in enumerate(lines):
            # Skip rows in which every value is blank.
            if all(x is None or x == '' for x in line.values()):
                continue
            root_id_or_none = line[self.root_id] if self.root_id else None
            if WITH_CELLS:
                cells = OrderedDict()
                for k, header in enumerate(line):
                    # Columns are 1-based; j + 2 is used as the row number
                    # (presumably row 1 is the heading row - TODO confirm).
                    location = (sheet_name, _get_column_letter(k + 1), j + 2, header)
                    cells[header] = Cell(line[header], location)
                unflattened = unflatten_main_with_parser(self.parser, cells, self.timezone)
            else:
                unflattened = unflatten_main_with_parser(self.parser, line, self.timezone)
            if root_id_or_none not in main_sheet_by_ocid:
                main_sheet_by_ocid[root_id_or_none] = TemporaryDict('id')
            records = main_sheet_by_ocid[root_id_or_none]

            # Resolve the plain id once, unwrapping the Cell when present.
            unflattened_id = unflattened.get('id')
            if WITH_CELLS and isinstance(unflattened_id, Cell):
                unflattened_id = unflattened_id.cell_value

            if 'id' in unflattened and unflattened_id in records:
                merge(
                    records[unflattened_id],
                    unflattened,
                    {
                        'sheet_name': sheet_name,
                        'root_id': self.root_id,
                        'root_id_or_none': root_id_or_none,
                        'id': unflattened_id
                    }
                )
            else:
                records.append(unflattened)
    temporarydicts_to_lists(main_sheet_by_ocid)
    return sum(main_sheet_by_ocid.values(), [])
187
231
232
def unflatten(self):
    """Return the unflattened data as plain values.

    Runs ``do_unflatten`` and, when ``WITH_CELLS`` is enabled, replaces the
    ``Cell`` leaves with their values via ``extract_list_to_value``.
    """
    tree = self.do_unflatten()
    return extract_list_to_value(tree) if WITH_CELLS else tree
237
+
238
def fancy_unflatten(self):
    """Unflatten and also report where each value came from.

    Returns:
        A 3-tuple ``(result, cell_source_map, heading_source_map)``:

        * ``result`` - the unflattened data with plain values.
        * ``cell_source_map`` - ``OrderedDict`` keyed by ``/``-joined path.
          Leaf paths map to lists of cell locations; parent paths map to
          lists of ``(sheet, row)`` pairs.
        * ``heading_source_map`` - dict keyed by the ``/``-joined path with
          integer-like parts removed, mapping to ``(sheet, header)`` pairs.

    Raises:
        Exception: if ``WITH_CELLS`` is false.
    """
    if not WITH_CELLS:
        raise Exception('Can only do a fancy_unflatten() if WITH_CELLS=True')
    cell_tree = self.do_unflatten()
    result = extract_list_to_value(cell_tree)
    cell_source_map = extract_list_to_error_path([self.main_sheet_name.lower()], cell_tree)
    ordered_cell_source_map = OrderedDict(
        ('/'.join(str(x) for x in path), location)
        for path, location in sorted(cell_source_map.items())
    )
    row_source_map = OrderedDict()
    heading_source_map = {}
    for path in cell_source_map:
        cells = cell_source_map[path]
        # Prepare row_source_map key: the path of the containing object.
        key = '/'.join(str(x) for x in path[:-1])
        row_locations = row_source_map.setdefault(key, [])
        # Prepare heading_source_map key: drop every part that int() accepts.
        header_path_parts = []
        for part in path:
            try:
                int(part)
            except (TypeError, ValueError):
                header_path_parts.append(part)
        header_path = '/'.join(header_path_parts)
        heading_locations = heading_source_map.setdefault(header_path, [])
        # Populate the row and heading source maps, without duplicates.
        for cell in cells:
            sheet, col, row, header = cell
            if (sheet, row) not in row_locations:
                row_locations.append((sheet, row))
            if (sheet, header) not in heading_locations:
                heading_locations.append((sheet, header))
    for key in row_source_map:
        assert key not in ordered_cell_source_map, 'Row/cell collision: {}'.format(key)
        ordered_cell_source_map[key] = row_source_map[key]
    return result, ordered_cell_source_map, heading_source_map
274
+
275
def extract_list_to_error_path(path, input):
    """Map every ``Cell`` reachable from a list of dicts to its locations.

    Args:
        path: List of path parts leading to ``input``.
        input: Iterable of dict-like items; the position of each item is
            appended to ``path`` before descending into it.

    Returns:
        A dict whose keys are the paths produced by
        ``extract_dict_to_error_path`` for each item and whose values are
        lists of cell locations.
    """
    output = {}
    for index, item in enumerate(input):
        res = extract_dict_to_error_path(path + [index], item)
        for p, locations in res.items():
            assert p not in output, 'Already have key {}'.format(p)
            output[p] = locations
    return output
283
+
284
def extract_dict_to_error_path(path, input):
    """Map every ``Cell`` reachable from a dict to its source locations.

    Args:
        path: List of path parts leading to ``input``.
        input: Mapping whose values are lists, dicts or ``Cell`` objects.

    Returns:
        A dict keyed by path tuple. Each value is a list holding the cell's
        own location followed by the locations of its ``sub_cells``.

    Raises:
        Exception: if a value is not a list, dict or ``Cell``.
    """
    output = {}
    for k in input:
        node = input[k]
        if isinstance(node, list):
            res = extract_list_to_error_path(path + [k], node)
        elif isinstance(node, dict):
            res = extract_dict_to_error_path(path + [k], node)
        elif isinstance(node, Cell):
            p = tuple(path + [k])
            assert p not in output, 'Already have key {}'.format(p)
            locations = [node.cell_location]
            for sub_cell in node.sub_cells:
                # Sub-cells are only recorded for equal values.
                assert sub_cell.cell_value == node.cell_value, 'Two sub-cells have different values: {}, {}'.format(node.cell_value, sub_cell.cell_value)
                locations.append(sub_cell.cell_location)
            output[p] = locations
            continue
        else:
            raise Exception('Unexpected result type in the JSON cell tree: {}'.format(node))
        # Fold the nested result into this level, refusing duplicates.
        for p in res:
            assert p not in output, 'Already have key {}'.format(p)
            output[p] = res[p]
    return output
307
+
308
def extract_list_to_value(input):
    """Return a new list with ``extract_dict_to_value`` applied to each item.

    Args:
        input: Iterable of dict-like items from the cell tree.
    """
    return [extract_dict_to_value(item) for item in input]
313
+
314
def extract_dict_to_value(input):
    """Return a copy of a cell tree with every ``Cell`` replaced by its value.

    Args:
        input: Mapping whose values are lists, dicts or ``Cell`` objects.

    Returns:
        An ``OrderedDict`` with the same keys, in iteration order. Nested
        lists and dicts are converted recursively.

    Raises:
        Exception: if a value is not a list, dict or ``Cell``.
    """
    output = OrderedDict()
    for k in input:
        node = input[k]
        if isinstance(node, list):
            output[k] = extract_list_to_value(node)
        elif isinstance(node, dict):
            output[k] = extract_dict_to_value(node)
        elif isinstance(node, Cell):
            output[k] = node.cell_value
        else:
            raise Exception('Unexpected result type in the JSON cell tree: {}'.format(node))
    return output
188
326
189
327
class CSVInput (SpreadsheetInput ):
190
328
encoding = 'utf-8'
@@ -265,6 +403,8 @@ class ListAsDict(dict):
265
403
266
404
def list_as_dicts_to_temporary_dicts (unflattened ):
267
405
for key , value in list (unflattened .items ()):
406
+ if WITH_CELLS and isinstance (value , Cell ):
407
+ continue
268
408
if hasattr (value , 'items' ):
269
409
if not value :
270
410
unflattened .pop (key )
@@ -279,9 +419,16 @@ def list_as_dicts_to_temporary_dicts(unflattened):
279
419
280
420
def unflatten_main_with_parser (parser , line , timezone ):
281
421
unflattened = OrderedDict ()
282
- for path , value in line .items ():
283
- if value is None or value == '' :
284
- continue
422
+ for path , input in line .items ():
423
+ # Skip blank cells
424
+ if WITH_CELLS :
425
+ cell = input
426
+ if cell .cell_value is None or cell .cell_value == '' :
427
+ continue
428
+ else :
429
+ value = input
430
+ if value is None or value == '' :
431
+ continue
285
432
current_path = unflattened
286
433
path_list = [item .rstrip ('[]' ) for item in path .split ('/' )]
287
434
for num , path_item in enumerate (path_list ):
@@ -328,9 +475,16 @@ def unflatten_main_with_parser(parser, line, timezone):
328
475
raise ValueError ("There is an object or list at '{}' but it should be an {}" .format (path_till_now , current_type ))
329
476
330
477
## Other Types
331
- converted_value = convert_type (current_type or '' , value , timezone )
332
- if converted_value is not None and converted_value != '' :
333
- current_path [path_item ] = converted_value
478
+ if WITH_CELLS :
479
+ value = cell .cell_value
480
+ converted_value = convert_type (current_type or '' , value , timezone )
481
+ cell .cell_value = converted_value
482
+ if converted_value is not None and converted_value != '' :
483
+ current_path [path_item ] = cell
484
+ else :
485
+ converted_value = convert_type (current_type or '' , value , timezone )
486
+ if converted_value is not None and converted_value != '' :
487
+ current_path [path_item ] = converted_value
334
488
335
489
unflattened = list_as_dicts_to_temporary_dicts (unflattened )
336
490
return unflattened
@@ -384,7 +538,10 @@ def __repr__(self):
384
538
385
539
def append (self , item ):
386
540
if self .keyfield in item :
387
- key = item [self .keyfield ]
541
+ if WITH_CELLS and isinstance (item [self .keyfield ], Cell ):
542
+ key = item [self .keyfield ].cell_value
543
+ else :
544
+ key = item [self .keyfield ]
388
545
if key not in self .data :
389
546
self .data [key ] = item
390
547
else :
@@ -399,6 +556,8 @@ def to_list(self):
399
556
def temporarydicts_to_lists (nested_dict ):
400
557
""" Recursively transforms TemporaryDicts to lists in place. """
401
558
for key , value in nested_dict .items ():
559
+ if isinstance (value , Cell ):
560
+ continue
402
561
if hasattr (value , 'to_list' ):
403
562
temporarydicts_to_lists (value )
404
563
if hasattr (value , 'items_no_keyfield' ):
0 commit comments