Skip to content

Commit 46405b7

Browse files
committed
Merge pull request #89 from OpenDataServices/85-use-schema-json-pointer
[#85] Main sheet use schema and allow json pointer
2 parents 760afa2 + 3df403f commit 46405b7

File tree

5 files changed

+369
-142
lines changed

5 files changed

+369
-142
lines changed

flattentool/input.py

Lines changed: 126 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,46 @@
2828
except ImportError:
2929
from UserDict import UserDict # pylint: disable=F0401
3030

31+
def convert_type(type_string, value, timezone=pytz.timezone('UTC')):
    """Convert a raw cell ``value`` according to ``type_string``.

    Args:
        type_string: One of 'number', 'integer', 'boolean', 'array',
            'array_array', 'string_array', 'string' or '' (no declared type).
        value: The raw cell value.
        timezone: Object whose ``localize()`` is applied to datetime values
            before they are rendered with ``isoformat()``. The default is
            built once, when the function is defined.

    Returns:
        ``None`` when ``value`` is '' or None; otherwise the converted value.
        When a number, integer or boolean cannot be converted, a warning is
        issued and the value is returned as text instead.

    Raises:
        ValueError: If ``type_string`` is not one of the recognised types.
    """
    if value == '' or value is None:
        return None

    if type_string == 'number':
        try:
            return Decimal(value)
        except (TypeError, ValueError, InvalidOperation):
            warn('Non-numeric value "{}" found in number column, returning as string instead.'.format(value))
            return text_type(value)

    if type_string == 'integer':
        try:
            return int(value)
        except (TypeError, ValueError):
            warn('Non-integer value "{}" found in integer column, returning as string instead.'.format(value))
            return text_type(value)

    if type_string == 'boolean':
        as_text = text_type(value)
        lowered = as_text.lower()
        if lowered in ['true', '1']:
            return True
        if lowered in ['false', '0']:
            return False
        warn('Unrecognised value for boolean: "{}", returning as string instead'.format(as_text))
        return text_type(as_text)

    if type_string in ('array', 'array_array', 'string_array'):
        as_text = text_type(value)
        segments = as_text.split(';')
        # A comma anywhere in the cell switches to list-of-lists output:
        # ';' separates the outer items and ',' the inner ones.
        if ',' in as_text:
            return [segment.split(',') for segment in segments]
        return segments

    if type_string in ('string', ''):
        # Exact type match only: subclasses of datetime are not localized.
        if type(value) is datetime.datetime:
            return timezone.localize(value).isoformat()
        # Untyped columns keep exact ints as they are; everything else
        # (including bools) is rendered as text.
        if type_string == '' and type(value) is int:
            return value
        return text_type(value)

    raise ValueError('Unrecognised type: "{}"'.format(type_string))
3171

3272
class SpreadsheetInput(object):
3373
"""
@@ -79,56 +119,15 @@ def get_sheet_lines(self, sheet_name):
79119
def read_sheets(self):
80120
raise NotImplementedError
81121

82-
def convert_type(self, type_string, value):
83-
if value == '' or value is None:
84-
return None
85-
if type_string == 'number':
86-
try:
87-
return Decimal(value)
88-
except (TypeError, ValueError, InvalidOperation):
89-
warn('Non-numeric value "{}" found in number column, returning as string instead.'.format(value))
90-
return text_type(value)
91-
elif type_string == 'integer':
92-
try:
93-
return int(value)
94-
except (TypeError, ValueError):
95-
warn('Non-integer value "{}" found in integer column, returning as string instead.'.format(value))
96-
return text_type(value)
97-
elif type_string == 'boolean':
98-
value = text_type(value)
99-
if value.lower() in ['true', '1']:
100-
return True
101-
elif value.lower() in ['false', '0']:
102-
return False
103-
else:
104-
warn('Unrecognised value for boolean: "{}", returning as string instead'.format(value))
105-
return text_type(value)
106-
elif type_string == 'array':
107-
value = text_type(value)
108-
if ',' in value:
109-
return [x.split(',') for x in value.split(';')]
110-
else:
111-
return value.split(';')
112-
elif type_string == 'string':
113-
if type(value) == datetime.datetime:
114-
return self.timezone.localize(value).isoformat()
115-
return text_type(value)
116-
elif type_string == '':
117-
if type(value) == datetime.datetime:
118-
return self.timezone.localize(value).isoformat()
119-
return value if type(value) in [int] else text_type(value)
120-
else:
121-
raise ValueError('Unrecognised type: "{}"'.format(type_string))
122-
123122

124123
def convert_types(self, in_dict):
125124
out_dict = OrderedDict()
126125
for key, value in in_dict.items():
127126
parts = key.split(':')
128127
if len(parts) > 1:
129-
out_dict[parts[0]] = self.convert_type(parts[1], value)
128+
out_dict[parts[0]] = convert_type(parts[1], value, self.timezone)
130129
else:
131-
out_dict[parts[0]] = self.convert_type('', value)
130+
out_dict[parts[0]] = convert_type('', value, self.timezone)
132131
return out_dict
133132

134133

@@ -140,7 +139,10 @@ def unflatten(self):
140139
root_id_or_none = line[self.root_id] if self.root_id else None
141140
if root_id_or_none not in main_sheet_by_ocid:
142141
main_sheet_by_ocid[root_id_or_none] = TemporaryDict('id')
143-
main_sheet_by_ocid[root_id_or_none].append(unflatten_line(self.convert_types(line)))
142+
if not self.parser:
143+
main_sheet_by_ocid[root_id_or_none].append(unflatten_line(self.convert_types(line)))
144+
else:
145+
main_sheet_by_ocid[root_id_or_none].append(unflatten_main_with_parser(self.parser, line, self.timezone))
144146

145147
for sheet_name, lines in self.get_sub_sheets_lines():
146148
for i, line in enumerate(lines):
@@ -274,6 +276,86 @@ def unflatten_line(line):
274276
path_search(unflattened, fields[:-1], top_sheet=True)[fields[-1]] = v
275277
return unflattened
276278

279+
def isint(string):
    """Return True if ``int(string)`` succeeds, otherwise False.

    Args:
        string: The value to probe, normally one segment of a '/'-separated
            path.

    Returns:
        bool: True when ``int()`` accepts the value. Values ``int()`` cannot
        parse (e.g. 'abc', '') and values of an unsupported type (e.g. None)
        both give False.
    """
    try:
        int(string)
    except (TypeError, ValueError):
        # TypeError covers non-string, non-numeric input such as None, so the
        # predicate never raises for values that simply are not integers.
        return False
    return True
285+
286+
class ListAsDict(dict):
    """Marker ``dict`` subclass for a list that is being built up by index.

    It adds no behaviour of its own; code in this module checks for it with
    ``isinstance`` to tell index-keyed lists apart from ordinary dicts.
    """
288+
289+
def list_as_dicts_to_temporary_dicts(unflattened):
    """Recursively replace ``ListAsDict`` values with ``TemporaryDict`` ones.

    Walks ``unflattened`` in place. Values that expose ``items`` are descended
    into, and empty ones are removed from their parent. Each ``ListAsDict``
    value is then replaced by a ``TemporaryDict("id")`` holding its entries,
    appended in sorted key order.

    Args:
        unflattened: The mapping to process; it is modified in place.

    Returns:
        The same ``unflattened`` object.
    """
    # Iterate over a snapshot, because entries are removed and replaced
    # while walking.
    for name, child in list(unflattened.items()):
        if not hasattr(child, 'items'):
            continue

        if not child:
            unflattened.pop(name)
        list_as_dicts_to_temporary_dicts(child)

        if not isinstance(child, ListAsDict):
            continue

        as_temporary = TemporaryDict("id")
        for position in sorted(child.keys()):
            as_temporary.append(child[position])
        unflattened[name] = as_temporary

    return unflattened
301+
302+
303+
def unflatten_main_with_parser(parser, line, timezone):
    """Turn one flattened main-sheet row into a nested structure.

    Every key of ``line`` is a '/'-separated path. Segments accepted by
    ``isint`` are list indexes; trailing '[' / ']' characters are stripped
    from each segment. The type of each non-index segment is looked up in
    ``parser.flattened`` under the path with index segments removed, and
    decides whether the segment becomes a list, an object or a converted
    leaf value.

    Args:
        parser: Object exposing a ``flattened`` mapping of path -> type name.
        line: Mapping of flattened path -> raw cell value.
        timezone: Passed through to ``convert_type`` for datetime values.

    Returns:
        The nested structure, after ``list_as_dicts_to_temporary_dicts`` has
        been applied to it.

    Raises:
        ValueError: If the path shape contradicts the type recorded in
            ``parser.flattened`` (an index below a non-array, or children
            below a non-object), or if ``convert_type`` rejects the type.
    """
    unflattened = {}
    for path, value in line.items():
        # Only None and '' count as an empty cell, matching convert_type.
        # A plain truthiness test would also discard real values such as 0
        # or False.
        if value is None or value == '':
            continue

        current_path = unflattened
        path_list = [item.rstrip('[]') for item in path.split('/')]
        for num, path_item in enumerate(path_list):
            # Index segments are consumed while handling the segment before
            # them, so there is nothing to do when we land on one.
            if isint(path_item):
                continue

            path_till_now = '/'.join(
                item for item in path_list[:num + 1] if not isint(item)
            )
            current_type = parser.flattened.get(path_till_now)
            try:
                next_path_item = path_list[num + 1]
            except IndexError:
                next_path_item = ''
            next_is_index = isint(next_path_item)

            ## Array
            # -1 is the slot used when the schema says "array" but the path
            # carries no explicit index.
            list_index = -1
            if next_is_index:
                if current_type and current_type != 'array':
                    raise ValueError("There is an array at '{}' when the schema says there should be a '{}'".format(path_till_now, current_type))
                list_index = int(next_path_item)

            if next_is_index or current_type == 'array':
                list_as_dict = current_path.get(path_item)
                if list_as_dict is None:
                    list_as_dict = ListAsDict()
                    current_path[path_item] = list_as_dict
                new_path = list_as_dict.get(list_index)
                if new_path is None:
                    new_path = {}
                    list_as_dict[list_index] = new_path
                current_path = new_path
                continue

            ## Object
            if current_type == 'object' or (not current_type and next_path_item):
                new_path = current_path.get(path_item)
                if new_path is None:
                    new_path = {}
                    current_path[path_item] = new_path
                current_path = new_path
                continue
            if current_type and current_type != 'object' and next_path_item:
                raise ValueError("There is an object or list at '{}' but it should be an {}".format(path_till_now, current_type))

            ## Other Types
            converted_value = convert_type(current_type or '', value, timezone)
            # Keep falsy-but-meaningful results (0, False, Decimal(0)); only
            # an empty result is left out.
            if converted_value is not None and converted_value != '':
                current_path[path_item] = converted_value

    unflattened = list_as_dicts_to_temporary_dicts(unflattened)
    return unflattened
357+
358+
277359

278360
class IDFieldMissing(KeyError):
279361
pass

flattentool/schema.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,12 @@ def lookup_header(self, title_header):
3030
def lookup_header_list(self, title_header_list):
3131
first_title = title_header_list[0]
3232
remaining_titles = title_header_list[1:]
33+
try:
34+
int(first_title)
35+
return first_title + '/' + self.lookup_header_list(remaining_titles)
36+
except ValueError:
37+
pass
38+
3339
if first_title in self:
3440
if remaining_titles:
3541
return self[first_title].property_name + '/' + self[first_title].lookup_header_list(remaining_titles)
@@ -49,7 +55,7 @@ def __getitem__(self, key):
4955
raise KeyError
5056
else:
5157
return self.data[key.replace(' ', '').lower()]
52-
58+
5359
def __contains__(self, key):
5460
if key is None:
5561
return False
@@ -69,6 +75,7 @@ def __init__(self, schema_filename=None, root_schema_dict=None, main_sheet_name=
6975
self.root_id = root_id
7076
self.use_titles = use_titles
7177
self.title_lookup = TitleLookup()
78+
self.flattened = {}
7279

7380
if root_schema_dict is None and schema_filename is None:
7481
raise ValueError('One of schema_filename or root_schema_dict must be supplied')
@@ -86,7 +93,7 @@ def __init__(self, schema_filename=None, root_schema_dict=None, main_sheet_name=
8693
self.root_schema_dict = root_schema_dict
8794

8895
def parse(self):
89-
fields = self.parse_schema_dict(self.main_sheet_name, self.root_schema_dict)
96+
fields = self.parse_schema_dict(self.main_sheet_name, '', self.root_schema_dict)
9097
for field, title in fields:
9198
if self.use_titles:
9299
if not title:
@@ -96,7 +103,9 @@ def parse(self):
96103
else:
97104
self.main_sheet.append(field)
98105

99-
def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None, title_lookup=None):
106+
def parse_schema_dict(self, parent_name, parent_path, schema_dict, parent_id_fields=None, title_lookup=None):
107+
if parent_path:
108+
parent_path = parent_path + '/'
100109
parent_id_fields = parent_id_fields or []
101110
title_lookup = self.title_lookup if title_lookup is None else title_lookup
102111
if 'properties' in schema_dict:
@@ -114,22 +123,27 @@ def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None, tit
114123
title_lookup[title].property_name = property_name
115124

116125
if 'object' in property_type_set:
126+
self.flattened[parent_path+property_name] = "object"
117127
for field, child_title in self.parse_schema_dict(
118128
parent_name+'/'+property_name,
129+
parent_path+property_name,
119130
property_schema_dict,
120131
parent_id_fields=id_fields,
121132
title_lookup=title_lookup.get(title)):
122133
yield (
123134
property_name+'/'+field,
124135
# TODO ambiguous use of "title"
125-
(title+':'+child_title if title and child_title else None)
136+
(title+':'+child_title if title and child_title else None)
126137
)
127138

128139
elif 'array' in property_type_set:
140+
self.flattened[parent_path+property_name] = "array"
129141
type_set = get_property_type_set(property_schema_dict['items'])
130142
if 'string' in type_set:
143+
self.flattened[parent_path+property_name] = "string_array"
131144
yield property_name+':array', title
132145
elif 'array' in type_set:
146+
self.flattened[parent_path+property_name] = "array_array"
133147
if 'string' in get_property_type_set(property_schema_dict['items']['items']):
134148
yield property_name+':array', title
135149
else:
@@ -152,6 +166,7 @@ def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None, tit
152166
for field in id_fields:
153167
sub_sheet.add_field(field+':'+property_name, id_field=True)
154168
fields = self.parse_schema_dict(parent_name+'/'+property_name+'[]',
169+
parent_path+property_name,
155170
property_schema_dict['items'],
156171
parent_id_fields=id_fields,
157172
title_lookup=title_lookup.get(title))
@@ -178,12 +193,16 @@ def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None, tit
178193
else:
179194
raise ValueError
180195
elif 'string' in property_type_set:
196+
self.flattened[parent_path+property_name] = "string"
181197
yield property_name, title
182198
elif 'number' in property_type_set:
199+
self.flattened[parent_path+property_name] = "number"
183200
yield property_name+':number', title
184201
elif 'integer' in property_type_set:
202+
self.flattened[parent_path+property_name] = "integer"
185203
yield property_name+':integer', title
186204
elif 'boolean' in property_type_set:
205+
self.flattened[parent_path+property_name] = "boolean"
187206
yield property_name+':boolean', title
188207
else:
189208
warn('Unrecognised types {} for property "{}" with context "{}",'

0 commit comments

Comments
 (0)