Skip to content

Commit 17c1bc0

Browse files
committed
[#85] Main sheet use schema and allow json pointer
Adds the following: * A new method to unflatten the main sheet. * A `flattened` property on the schema parser that more closely reflects JSON pointer paths. * Use of the schema to convert types. The new unflatten should produce the same output as before, meaning that multiple sheets will still work. If no schema is supplied, it falls back to the old behaviour with types. Various tests had to be fixed because, if the schema says a field is a string, the value is now converted to a string. Breaking change: if a schema is supplied and the main table uses ":type" notation, the ":type" suffix will be mistaken for part of a sub-object name rather than a type.
1 parent 760afa2 commit 17c1bc0

File tree

4 files changed

+237
-138
lines changed

4 files changed

+237
-138
lines changed

flattentool/input.py

Lines changed: 126 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,46 @@
2828
except ImportError:
2929
from UserDict import UserDict # pylint: disable=F0401
3030

31+
def convert_type(type_string, value, timezone=pytz.timezone('UTC')):
    """Convert a raw cell ``value`` according to ``type_string``.

    Empty input (``''`` or ``None``) always yields ``None``. Otherwise:

    * ``'number'`` -> ``Decimal``; ``'integer'`` -> ``int``. If the
      conversion fails a warning is issued and the text form is returned.
    * ``'boolean'`` -> ``True`` for ``'true'``/``'1'`` and ``False`` for
      ``'false'``/``'0'`` (case-insensitive); anything else warns and is
      returned as text.
    * ``'array'``, ``'array_array'``, ``'string_array'`` -> the text split
      on ``';'``. If the text contains a ``','`` each segment is split
      again on ``','``, giving a list of lists.
    * ``'string'`` -> text; a ``datetime.datetime`` is localized with
      ``timezone`` and returned in ISO format.
    * ``''`` (no type given) -> as ``'string'``, except that a plain
      ``int`` is returned unchanged.

    Raises:
        ValueError: if ``type_string`` is none of the values above.
    """
    if value is None or value == '':
        return None

    if type_string == 'number':
        try:
            return Decimal(value)
        except (TypeError, ValueError, InvalidOperation):
            warn('Non-numeric value "{}" found in number column, returning as string instead.'.format(value))
            return text_type(value)

    if type_string == 'integer':
        try:
            return int(value)
        except (TypeError, ValueError):
            warn('Non-integer value "{}" found in integer column, returning as string instead.'.format(value))
            return text_type(value)

    if type_string == 'boolean':
        as_text = text_type(value)
        lowered = as_text.lower()
        if lowered in ['true', '1']:
            return True
        if lowered in ['false', '0']:
            return False
        warn('Unrecognised value for boolean: "{}", returning as string instead'.format(as_text))
        return text_type(as_text)

    if type_string in ('array', 'array_array', 'string_array'):
        as_text = text_type(value)
        segments = as_text.split(';')
        if ',' in as_text:
            return [segment.split(',') for segment in segments]
        return segments

    if type_string == 'string' or type_string == '':
        # Exact type checks: subclasses of datetime/int are stringified.
        if type(value) is datetime.datetime:
            return timezone.localize(value).isoformat()
        if type_string == '' and type(value) is int:
            return value
        return text_type(value)

    raise ValueError('Unrecognised type: "{}"'.format(type_string))
3171

3272
class SpreadsheetInput(object):
3373
"""
@@ -79,56 +119,15 @@ def get_sheet_lines(self, sheet_name):
79119
def read_sheets(self):
80120
raise NotImplementedError
81121

82-
def convert_type(self, type_string, value):
83-
if value == '' or value is None:
84-
return None
85-
if type_string == 'number':
86-
try:
87-
return Decimal(value)
88-
except (TypeError, ValueError, InvalidOperation):
89-
warn('Non-numeric value "{}" found in number column, returning as string instead.'.format(value))
90-
return text_type(value)
91-
elif type_string == 'integer':
92-
try:
93-
return int(value)
94-
except (TypeError, ValueError):
95-
warn('Non-integer value "{}" found in integer column, returning as string instead.'.format(value))
96-
return text_type(value)
97-
elif type_string == 'boolean':
98-
value = text_type(value)
99-
if value.lower() in ['true', '1']:
100-
return True
101-
elif value.lower() in ['false', '0']:
102-
return False
103-
else:
104-
warn('Unrecognised value for boolean: "{}", returning as string instead'.format(value))
105-
return text_type(value)
106-
elif type_string == 'array':
107-
value = text_type(value)
108-
if ',' in value:
109-
return [x.split(',') for x in value.split(';')]
110-
else:
111-
return value.split(';')
112-
elif type_string == 'string':
113-
if type(value) == datetime.datetime:
114-
return self.timezone.localize(value).isoformat()
115-
return text_type(value)
116-
elif type_string == '':
117-
if type(value) == datetime.datetime:
118-
return self.timezone.localize(value).isoformat()
119-
return value if type(value) in [int] else text_type(value)
120-
else:
121-
raise ValueError('Unrecognised type: "{}"'.format(type_string))
122-
123122

124123
def convert_types(self, in_dict):
125124
out_dict = OrderedDict()
126125
for key, value in in_dict.items():
127126
parts = key.split(':')
128127
if len(parts) > 1:
129-
out_dict[parts[0]] = self.convert_type(parts[1], value)
128+
out_dict[parts[0]] = convert_type(parts[1], value, self.timezone)
130129
else:
131-
out_dict[parts[0]] = self.convert_type('', value)
130+
out_dict[parts[0]] = convert_type('', value, self.timezone)
132131
return out_dict
133132

134133

@@ -140,7 +139,10 @@ def unflatten(self):
140139
root_id_or_none = line[self.root_id] if self.root_id else None
141140
if root_id_or_none not in main_sheet_by_ocid:
142141
main_sheet_by_ocid[root_id_or_none] = TemporaryDict('id')
143-
main_sheet_by_ocid[root_id_or_none].append(unflatten_line(self.convert_types(line)))
142+
if not self.parser:
143+
main_sheet_by_ocid[root_id_or_none].append(unflatten_line(self.convert_types(line)))
144+
else:
145+
main_sheet_by_ocid[root_id_or_none].append(unflatten_main_with_parser(self.parser, line, self.timezone))
144146

145147
for sheet_name, lines in self.get_sub_sheets_lines():
146148
for i, line in enumerate(lines):
@@ -274,6 +276,86 @@ def unflatten_line(line):
274276
path_search(unflattened, fields[:-1], top_sheet=True)[fields[-1]] = v
275277
return unflattened
276278

279+
def isint(string):
    """Return True if ``int(string)`` succeeds, False otherwise.

    Used to recognise list indexes among the segments of a
    slash-separated path. Values that ``int()`` rejects with a
    ``TypeError`` (for example ``None``) are reported as not-an-integer
    instead of propagating the exception.
    """
    try:
        int(string)
    except (TypeError, ValueError):
        return False
    return True
285+
286+
class ListAsDict(dict):
    """Marker ``dict`` subclass: a list held as an index -> item mapping.

    It adds no behaviour of its own; code distinguishes it from an
    ordinary dict with ``isinstance``.
    """
288+
289+
def list_as_dicts_to_temporary_dicts(unflattened):
    """Prune empty containers and turn ``ListAsDict`` values into ``TemporaryDict``.

    Walks ``unflattened`` recursively and modifies it in place:

    * any mapping value (anything with an ``items`` attribute) that is
      empty once its own children have been processed is removed;
    * every remaining ``ListAsDict`` value is replaced by a
      ``TemporaryDict("id")`` whose items are appended in ascending
      index order.

    Returns:
        The same ``unflattened`` object, for convenience.
    """
    # Iterate over a snapshot because entries are removed/replaced below.
    for key, value in list(unflattened.items()):
        if not hasattr(value, 'items'):
            continue

        # Recurse before testing for emptiness, so a container whose
        # children were all pruned is itself pruned.
        list_as_dicts_to_temporary_dicts(value)

        if not value:
            del unflattened[key]
            # Do not fall through: an empty ListAsDict must not be
            # re-inserted as an empty TemporaryDict.
            continue

        if isinstance(value, ListAsDict):
            temporary_dict = TemporaryDict("id")
            for index in sorted(value):
                temporary_dict.append(value[index])
            unflattened[key] = temporary_dict
    return unflattened
301+
302+
303+
def unflatten_main_with_parser(parser, line, timezone):
    """Unflatten one main-sheet row into a nested structure using the schema.

    Args:
        parser: object whose ``flattened`` attribute maps a
            slash-separated path (without list indexes) to a type string
            such as ``'object'``, ``'array'`` or ``'string'``.
        line: mapping of column heading (a slash-separated path, which may
            contain integer list indexes and trailing ``[]``) to the raw
            cell value.
        timezone: passed through to ``convert_type`` for datetime values.

    Returns:
        A dict of nested dicts, with lists represented as produced by
        ``list_as_dicts_to_temporary_dicts``.

    Raises:
        ValueError: if the heading has a list index where the schema gives
            a non-array type, or continues below a path the schema gives a
            non-object type; also whatever ``convert_type`` raises.
    """
    unflattened = {}
    for path, value in line.items():
        # Skip only genuinely empty cells. A plain truthiness test would
        # also discard legitimate values such as the integer 0.
        if value is None or value == '':
            continue

        current_path = unflattened
        path_list = [item.rstrip('[]') for item in path.split('/')]
        for num, path_item in enumerate(path_list):
            # List indexes are consumed together with the preceding item.
            if isint(path_item):
                continue

            # Schema lookups ignore list indexes.
            path_till_now = '/'.join(
                item for item in path_list[:num + 1] if not isint(item)
            )
            current_type = parser.flattened.get(path_till_now)
            try:
                next_path_item = path_list[num + 1]
            except IndexError:
                next_path_item = ''
            next_is_index = isint(next_path_item)

            ## Array
            # -1 is the slot used when the schema says "array" but the
            # heading supplies no explicit index.
            list_index = -1
            if next_is_index:
                if current_type and current_type != 'array':
                    raise ValueError("There is an array at '{}' when the schema says there should be a '{}'".format(path_till_now, current_type))
                list_index = int(next_path_item)

            if next_is_index or current_type == 'array':
                list_as_dict = current_path.get(path_item)
                if list_as_dict is None:
                    list_as_dict = ListAsDict()
                    current_path[path_item] = list_as_dict
                new_path = list_as_dict.get(list_index)
                if new_path is None:
                    new_path = {}
                    list_as_dict[list_index] = new_path
                current_path = new_path
                continue

            ## Object
            if current_type == 'object' or (not current_type and next_path_item):
                new_path = current_path.get(path_item)
                if new_path is None:
                    new_path = {}
                    current_path[path_item] = new_path
                current_path = new_path
                continue
            if current_type and current_type != 'object' and next_path_item:
                raise ValueError("There is an object or list at '{}' but it should be an {}".format(path_till_now, current_type))

            ## Other Types
            converted_value = convert_type(current_type or '', value, timezone)
            # Keep falsy-but-meaningful results (False, 0); drop only
            # "no value" results.
            if converted_value is not None and converted_value != '':
                current_path[path_item] = converted_value

    unflattened = list_as_dicts_to_temporary_dicts(unflattened)
    return unflattened
357+
358+
277359

278360
class IDFieldMissing(KeyError):
279361
pass

flattentool/schema.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ def lookup_header(self, title_header):
3030
def lookup_header_list(self, title_header_list):
3131
first_title = title_header_list[0]
3232
remaining_titles = title_header_list[1:]
33+
if isinstance(first_title, int):
34+
return first_title + '/' + self[first_title].lookup_header_list(remaining_titles)
3335
if first_title in self:
3436
if remaining_titles:
3537
return self[first_title].property_name + '/' + self[first_title].lookup_header_list(remaining_titles)
@@ -69,6 +71,7 @@ def __init__(self, schema_filename=None, root_schema_dict=None, main_sheet_name=
6971
self.root_id = root_id
7072
self.use_titles = use_titles
7173
self.title_lookup = TitleLookup()
74+
self.flattened = {}
7275

7376
if root_schema_dict is None and schema_filename is None:
7477
raise ValueError('One of schema_filename or root_schema_dict must be supplied')
@@ -86,7 +89,7 @@ def __init__(self, schema_filename=None, root_schema_dict=None, main_sheet_name=
8689
self.root_schema_dict = root_schema_dict
8790

8891
def parse(self):
89-
fields = self.parse_schema_dict(self.main_sheet_name, self.root_schema_dict)
92+
fields = self.parse_schema_dict(self.main_sheet_name, '', self.root_schema_dict)
9093
for field, title in fields:
9194
if self.use_titles:
9295
if not title:
@@ -96,7 +99,9 @@ def parse(self):
9699
else:
97100
self.main_sheet.append(field)
98101

99-
def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None, title_lookup=None):
102+
def parse_schema_dict(self, parent_name, parent_path, schema_dict, parent_id_fields=None, title_lookup=None):
103+
if parent_path:
104+
parent_path = parent_path + '/'
100105
parent_id_fields = parent_id_fields or []
101106
title_lookup = self.title_lookup if title_lookup is None else title_lookup
102107
if 'properties' in schema_dict:
@@ -114,8 +119,10 @@ def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None, tit
114119
title_lookup[title].property_name = property_name
115120

116121
if 'object' in property_type_set:
122+
self.flattened[parent_path+property_name] = "object"
117123
for field, child_title in self.parse_schema_dict(
118124
parent_name+'/'+property_name,
125+
parent_path+property_name,
119126
property_schema_dict,
120127
parent_id_fields=id_fields,
121128
title_lookup=title_lookup.get(title)):
@@ -126,10 +133,13 @@ def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None, tit
126133
)
127134

128135
elif 'array' in property_type_set:
136+
self.flattened[parent_path+property_name] = "array"
129137
type_set = get_property_type_set(property_schema_dict['items'])
130138
if 'string' in type_set:
139+
self.flattened[parent_path+property_name] = "string_array"
131140
yield property_name+':array', title
132141
elif 'array' in type_set:
142+
self.flattened[parent_path+property_name] = "array_array"
133143
if 'string' in get_property_type_set(property_schema_dict['items']['items']):
134144
yield property_name+':array', title
135145
else:
@@ -152,6 +162,7 @@ def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None, tit
152162
for field in id_fields:
153163
sub_sheet.add_field(field+':'+property_name, id_field=True)
154164
fields = self.parse_schema_dict(parent_name+'/'+property_name+'[]',
165+
parent_path+property_name,
155166
property_schema_dict['items'],
156167
parent_id_fields=id_fields,
157168
title_lookup=title_lookup.get(title))
@@ -178,12 +189,16 @@ def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None, tit
178189
else:
179190
raise ValueError
180191
elif 'string' in property_type_set:
192+
self.flattened[parent_path+property_name] = "string"
181193
yield property_name, title
182194
elif 'number' in property_type_set:
195+
self.flattened[parent_path+property_name] = "number"
183196
yield property_name+':number', title
184197
elif 'integer' in property_type_set:
198+
self.flattened[parent_path+property_name] = "integer"
185199
yield property_name+':integer', title
186200
elif 'boolean' in property_type_set:
201+
self.flattened[parent_path+property_name] = "boolean"
187202
yield property_name+':boolean', title
188203
else:
189204
warn('Unrecognised types {} for property "{}" with context "{}",'

0 commit comments

Comments
 (0)