Skip to content

Commit dceac82

Browse files
committed
Merge pull request #98 from OpenDataServices/90-json-pointer-multisheet
[#90] Json pointer multisheet
2 parents 393e1c1 + bb1f6ad commit dceac82

10 files changed

+590
-514
lines changed

flattentool/input.py

Lines changed: 54 additions & 74 deletions
Original file line number | Diff line number | Diff line change
@@ -69,6 +69,31 @@ def convert_type(type_string, value, timezone = pytz.timezone('UTC')):
6969
else:
7070
raise ValueError('Unrecognised type: "{}"'.format(type_string))
7171

72+
73+
def merge(base, mergee, debug_info=None):
    """
    Recursively fold ``mergee`` into ``base``, modifying ``base`` in place.

    For each key of ``mergee``:

    * a key missing from ``base`` is simply copied across;
    * a ``TemporaryDict`` value is merged entry by entry into ``base[key]``
      (entries already present are merged recursively, new ones are added),
      and its ``items_no_keyfield`` entries are appended to those of
      ``base[key]``;
    * when both sides are dicts they are merged recursively;
    * otherwise, if the two values differ, the value in ``base`` is kept
      and a warning describing the conflict is issued.

    ``debug_info`` is an optional dict used only to build the warning
    text; the keys read are ``id``, ``root_id``, ``root_id_or_none`` and
    ``sheet_name``. Nothing is returned.
    """
    debug_info = debug_info or {}

    for key, value in mergee.items():
        # Nothing to reconcile: first time this key is seen.
        if key not in base:
            base[key] = value
            continue

        existing = base[key]

        if isinstance(value, TemporaryDict):
            # NOTE(review): assumes base[key] is also a TemporaryDict here
            # (it needs ``items_no_keyfield``) - confirm against callers.
            for item_key, item in value.items():
                if item_key in existing:
                    merge(existing[item_key], item, debug_info)
                else:
                    existing[item_key] = item
            for item in value.items_no_keyfield:
                existing.items_no_keyfield.append(item)
        elif isinstance(value, dict) and isinstance(existing, dict):
            merge(existing, value, debug_info)
        elif existing != value:
            # Conflicting scalar values: keep what is in base, but tell
            # the user which record and sheet the clash came from.
            root_id = debug_info.get('root_id')
            id_info = 'id "{}"'.format(debug_info.get('id'))
            if root_id:
                root_prefix = '{} "{}", '.format(root_id, debug_info.get('root_id_or_none'))
                id_info = root_prefix + id_info
            warn('Conflict when merging field "{}" for {} in sheet {}: "{}" != "{}". If you were not expecting merging you may have a duplicate ID.'.format(
                key, id_info, debug_info.get('sheet_name'), existing, value))
96+
7297
class SpreadsheetInput(object):
7398
"""
7499
Base class describing a spreadsheet input. Has stubs which are
@@ -133,76 +158,28 @@ def convert_types(self, in_dict):
133158

134159
def unflatten(self):
135160
main_sheet_by_ocid = OrderedDict()
136-
for line in self.get_main_sheet_lines():
137-
if all(x == '' for x in line.values()):
138-
continue
139-
root_id_or_none = line[self.root_id] if self.root_id else None
140-
if root_id_or_none not in main_sheet_by_ocid:
141-
main_sheet_by_ocid[root_id_or_none] = TemporaryDict('id')
142-
if not self.parser:
143-
main_sheet_by_ocid[root_id_or_none].append(unflatten_line(self.convert_types(line)))
144-
else:
145-
main_sheet_by_ocid[root_id_or_none].append(unflatten_main_with_parser(self.parser, line, self.timezone))
146-
147-
for sheet_name, lines in self.get_sub_sheets_lines():
148-
for i, line in enumerate(lines):
149-
line_number = i+2
150-
try:
151-
if all(x == '' for x in line.values()):
152-
continue
153-
id_fields = {k: v for k, v in line.items() if
154-
k.split(':')[0].endswith('/id') and
155-
k.startswith(self.main_sheet_name)}
156-
line_without_id_fields = OrderedDict(
157-
(k, v) for k, v in line.items()
158-
if k not in id_fields and (not k or k != self.root_id))
159-
raw_id_fields_with_values = {k.split(':')[0]: v for k, v in id_fields.items() if v}
160-
if not raw_id_fields_with_values:
161-
warn('Line {} of sheet {} has no parent id fields populated,'
162-
'skipping.'.format(line_number, sheet_name))
163-
continue
164-
sheet_context_names = {k.split(':')[0]: k.split(':')[1] if len(k.split(':')) > 1 else None
165-
for k, v in id_fields.items() if v}
166-
167-
try:
168-
id_field = find_deepest_id_field(raw_id_fields_with_values)
169-
except ConflictingIDFieldsError:
170-
warn('Multiple conflicting ID fields have been filled in on line {} of sheet {},'
171-
'skipping that line.'.format(line_number, sheet_name))
172-
continue
173-
174-
try:
175-
context = path_search(
176-
{self.main_sheet_name: main_sheet_by_ocid[line[self.root_id] if self.root_id else None]},
177-
id_field.split('/')[:-1],
178-
id_fields=raw_id_fields_with_values,
179-
top=True
180-
)
181-
except IDFieldMissing as e:
182-
warn('The parent id field "{}" was expected, but not present on line {} of sheet {}.'.format(
183-
e.args[0], line_number, sheet_name))
184-
continue
185-
186-
sheet_context_name = sheet_context_names[id_field] or sheet_name
187-
# Added the following line to support the usecase in test_nested_sub_sheet
188-
context = path_search(context, sheet_context_name.split('/')[:-1])
189-
unflattened = unflatten_line(self.convert_types(line_without_id_fields))
190-
sheet_context_base_name = sheet_context_name.split('/')[-1]
191-
if sheet_context_base_name not in context:
192-
context[sheet_context_base_name] = TemporaryDict(keyfield='id')
193-
elif context[sheet_context_base_name].top_sheet:
194-
# Overwirte any rolled up data from the main sheet
195-
print(context[sheet_context_base_name].data, unflattened)
196-
if context[sheet_context_base_name].data.get(None) != unflattened:
197-
warn('Conflict between main sheet and sub sheet {}, using values from sub sheet'.format(sheet_context_base_name))
198-
context[sheet_context_base_name] = TemporaryDict(keyfield='id')
199-
context[sheet_context_base_name].append(unflattened)
200-
except Exception as e: # pylint: disable=W0703
201-
# Deliberately catch all exceptions for a line, so that
202-
# all lines without exceptions will still be processed.
203-
print('An error occured whilst parsing line {} of sheet {}"'.format(line_number, sheet_name))
204-
traceback.print_exc()
205-
sys.exit()
161+
# Eventually we should get rid of the concept of a "main sheet" entirely
162+
for sheet_name, lines in [(self.main_sheet_name, self.get_main_sheet_lines())] + list(self.get_sub_sheets_lines()):
163+
for line in lines:
164+
if all(x == '' for x in line.values()):
165+
continue
166+
root_id_or_none = line[self.root_id] if self.root_id else None
167+
unflattened = unflatten_main_with_parser(self.parser, line, self.timezone)
168+
if root_id_or_none not in main_sheet_by_ocid:
169+
main_sheet_by_ocid[root_id_or_none] = TemporaryDict('id')
170+
if 'id' in unflattened and unflattened['id'] in main_sheet_by_ocid[root_id_or_none]:
171+
merge(
172+
main_sheet_by_ocid[root_id_or_none][unflattened.get('id')],
173+
unflattened,
174+
{
175+
'sheet_name': sheet_name,
176+
'root_id': self.root_id,
177+
'root_id_or_none': root_id_or_none,
178+
'id': unflattened.get('id')
179+
}
180+
)
181+
else:
182+
main_sheet_by_ocid[root_id_or_none].append(unflattened)
206183

207184
temporarydicts_to_lists(main_sheet_by_ocid)
208185

@@ -301,7 +278,7 @@ def list_as_dicts_to_temporary_dicts(unflattened):
301278

302279

303280
def unflatten_main_with_parser(parser, line, timezone):
304-
unflattened = {}
281+
unflattened = OrderedDict()
305282
for path, value in line.items():
306283
if value is None or value == '':
307284
continue
@@ -311,7 +288,10 @@ def unflatten_main_with_parser(parser, line, timezone):
311288
if isint(path_item):
312289
continue
313290
path_till_now = '/'.join([item for item in path_list[:num+1] if not isint(item)])
314-
current_type = parser.flattened.get(path_till_now)
291+
if parser:
292+
current_type = parser.flattened.get(path_till_now)
293+
else:
294+
current_type = None
315295
try:
316296
next_path_item = path_list[num+1]
317297
except IndexError:
@@ -331,7 +311,7 @@ def unflatten_main_with_parser(parser, line, timezone):
331311
current_path[path_item] = list_as_dict
332312
new_path = list_as_dict.get(list_index)
333313
if new_path is None:
334-
new_path = {}
314+
new_path = OrderedDict()
335315
list_as_dict[list_index] = new_path
336316
current_path = new_path
337317
continue
@@ -340,7 +320,7 @@ def unflatten_main_with_parser(parser, line, timezone):
340320
if current_type == 'object' or (not current_type and next_path_item):
341321
new_path = current_path.get(path_item)
342322
if new_path is None:
343-
new_path = {}
323+
new_path = OrderedDict()
344324
current_path[path_item] = new_path
345325
current_path = new_path
346326
continue

flattentool/json_input.py

Lines changed: 13 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -24,32 +24,9 @@ class BadlyFormedJSONError(ValueError):
2424

2525

2626
def sheet_key_field(sheet, key, id_key=None):
27-
"""
28-
Check for a key in the sheet, and return it with any suffix (following a ':') that might be present).
29-
30-
If a key does not exist, it will be created.
31-
32-
"""
33-
if id_key:
34-
if key in sheet: # If the key exists without a suffix, use that
35-
return key
36-
elif sheet.name == id_key: # also use without a suffix if the suffix matches the sheet name
37-
sheet.append(key)
38-
return key
39-
else: # else use it with the :id_key suffix
40-
if not key+':'+id_key in sheet:
41-
sheet.append(key+':'+id_key)
42-
return key+':'+id_key
43-
else:
44-
keys = [x for x in sheet if x.split(':')[0] == key]
45-
if not keys:
46-
sheet.append(key)
47-
return key
48-
elif len(keys) > 1:
49-
# This shouldn't ever happen, as the schema parser shouldn't output sheets like this...
50-
raise ValueError('Sheet contains two conflicting keys')
51-
else:
52-
return keys[0]
27+
if key not in sheet:
28+
sheet.append(key)
29+
return key
5330

5431
def sheet_key_title(sheet, key, id_key=None):
5532
"""
@@ -78,14 +55,12 @@ def __init__(self, json_filename=None, root_json_dict=None, main_sheet_name='mai
7855
self.root_id = root_id
7956
self.use_titles = use_titles
8057
if schema_parser:
81-
self.sub_sheet_mapping = {'/'.join(k.split('/')[1:]): v for k,v in schema_parser.sub_sheet_mapping.items()}
8258
self.main_sheet = schema_parser.main_sheet
8359
self.sub_sheets = schema_parser.sub_sheets
8460
# Rollup is pulled from the schema_parser, as rollup is only possible if a schema parser is specified
8561
self.rollup = schema_parser.rollup
8662
self.schema_parser = schema_parser
8763
else:
88-
self.sub_sheet_mapping = {}
8964
self.rollup = False
9065

9166
if json_filename is None and root_json_dict is None:
@@ -111,7 +86,7 @@ def parse(self):
11186
for json_dict in root_json_list:
11287
self.parse_json_dict(json_dict, sheet=self.main_sheet)
11388

114-
def parse_json_dict(self, json_dict, sheet, json_key=None, id_extra_parent_name='', parent_name='', flattened_dict=None, parent_id_fields=None):
89+
def parse_json_dict(self, json_dict, sheet, json_key=None, parent_name='', flattened_dict=None, parent_id_fields=None, top_level_of_sub_sheet=False):
11590
"""
11691
Parse a json dictionary.
11792
@@ -134,7 +109,7 @@ def parse_json_dict(self, json_dict, sheet, json_key=None, id_extra_parent_name=
134109
else:
135110
top = False
136111

137-
if parent_name == '':
112+
if top_level_of_sub_sheet:
138113
# Only add the IDs for the top level of object in an array
139114
for k, v in parent_id_fields.items():
140115
flattened_dict[sheet_key(sheet, k, id_key=json_key)] = v
@@ -143,7 +118,7 @@ def parse_json_dict(self, json_dict, sheet, json_key=None, id_extra_parent_name=
143118
parent_id_fields[self.root_id] = json_dict[self.root_id]
144119

145120
if 'id' in json_dict:
146-
parent_id_fields[self.main_sheet_name+'/'+id_extra_parent_name+parent_name+'id'] = json_dict['id']
121+
parent_id_fields[parent_name+'id'] = json_dict['id']
147122

148123

149124
for key, value in json_dict.items():
@@ -168,18 +143,18 @@ def parse_json_dict(self, json_dict, sheet, json_key=None, id_extra_parent_name=
168143
if self.rollup and parent_name == '': # Rollup only currently possible to main sheet
169144
if len(value) == 1:
170145
for k, v in value[0].items():
171-
if parent_name+key+'[]/'+k in self.schema_parser.main_sheet:
146+
if parent_name+key+'/0/'+k in self.schema_parser.main_sheet:
172147
if type(v) in BASIC_TYPES:
173-
flattened_dict[sheet_key(sheet, parent_name+key+'[]/'+k)] = v
148+
flattened_dict[sheet_key(sheet, parent_name+key+'/0/'+k)] = v
174149
else:
175150
raise ValueError('Rolled up values must be basic types')
176151
elif len(value) > 1:
177152
for k in set(sum((list(x.keys()) for x in value), [])):
178153
warn('More than one value supplied for "{}". Could not provide rollup, so adding a warning to the relevant cell(s) in the spreadsheet.'.format(parent_name+key))
179-
if parent_name+key+'[]/'+k in self.schema_parser.main_sheet:
180-
flattened_dict[sheet_key(sheet, parent_name+key+'[]/'+k)] = 'WARNING: More than one value supplied, consult the relevant sub-sheet for the data.'
154+
if parent_name+key+'/0/'+k in self.schema_parser.main_sheet:
155+
flattened_dict[sheet_key(sheet, parent_name+key+'/0/'+k)] = 'WARNING: More than one value supplied, consult the relevant sub-sheet for the data.'
181156

182-
sub_sheet_name = self.sub_sheet_mapping[key] if key in self.sub_sheet_mapping else key
157+
sub_sheet_name = key
183158
if sub_sheet_name not in self.sub_sheets:
184159
self.sub_sheets[sub_sheet_name] = Sheet(name=sub_sheet_name)
185160

@@ -190,7 +165,8 @@ def parse_json_dict(self, json_dict, sheet, json_key=None, id_extra_parent_name=
190165
sheet=self.sub_sheets[sub_sheet_name],
191166
json_key=key,
192167
parent_id_fields=parent_id_fields,
193-
id_extra_parent_name=parent_name+key+'[]/')
168+
parent_name=parent_name+key+'/0/',
169+
top_level_of_sub_sheet=True)
194170
else:
195171
raise ValueError('Unsupported type {}'.format(type(value)))
196172

0 commit comments

Comments
 (0)