OpenDataServices
diff --git a/‎flattentool/input.py‎
Lines changed: 126 additions & 44 deletions b/‎flattentool/input.py‎
Lines changed: 126 additions & 44 deletions
diff --git a/‎flattentool/schema.py‎
Lines changed: 17 additions & 2 deletions b/‎flattentool/schema.py‎
Lines changed: 17 additions & 2 deletions
@@ -28,6 +28,46 @@
 except ImportError:
     from UserDict import UserDict  # pylint: disable=F0401
 
+def convert_type(type_string, value, timezone = pytz.timezone('UTC')):
+    """
+    Convert a single cell value according to a column type name.
+
+    type_string -- one of 'number', 'integer', 'boolean', 'array',
+                   'array_array', 'string_array', 'string' or '' (no type
+                   given).  Any other name raises ValueError.
+    value -- the raw cell value; '' and None always give None.
+    timezone -- object whose localize() is applied to datetime.datetime
+                values in 'string' and untyped columns before they are
+                rendered with isoformat().  Defaults to pytz UTC.
+
+    Values that do not fit a 'number', 'integer' or 'boolean' column cause
+    a warning and are returned as text instead of raising.
+    """
+    if value is None or value == '':
+        return None
+
+    if type_string == 'number':
+        try:
+            return Decimal(value)
+        except (TypeError, ValueError, InvalidOperation):
+            warn('Non-numeric value "{}" found in number column, returning as string instead.'.format(value))
+            return text_type(value)
+
+    if type_string == 'integer':
+        try:
+            return int(value)
+        except (TypeError, ValueError):
+            warn('Non-integer value "{}" found in integer column, returning as string instead.'.format(value))
+            return text_type(value)
+
+    if type_string == 'boolean':
+        as_text = text_type(value)
+        lowered = as_text.lower()
+        if lowered in ['true', '1']:
+            return True
+        if lowered in ['false', '0']:
+            return False
+        warn('Unrecognised value for boolean: "{}", returning as string instead'.format(as_text))
+        return text_type(as_text)
+
+    if type_string in ('array', 'array_array', 'string_array'):
+        as_text = text_type(value)
+        # ';' separates items; if any ',' is present every item is itself
+        # split on ',' and a list of lists is returned.
+        outer_items = as_text.split(';')
+        if ',' in as_text:
+            return [inner.split(',') for inner in outer_items]
+        return outer_items
+
+    if type_string in ('string', ''):
+        if type(value) == datetime.datetime:
+            return timezone.localize(value).isoformat()
+        # Only untyped columns let a plain int through unchanged.
+        if type_string == '' and type(value) in [int]:
+            return value
+        return text_type(value)
+
+    raise ValueError('Unrecognised type: "{}"'.format(type_string))
 
 class SpreadsheetInput(object):
     """
@@ -79,56 +119,15 @@ def get_sheet_lines(self, sheet_name):
     def read_sheets(self):
         raise NotImplementedError
 
-    def convert_type(self, type_string, value):
-        if value == '' or value is None:
-            return None
-        if type_string == 'number':
-            try:
-                return Decimal(value)
-            except (TypeError, ValueError, InvalidOperation):
-                warn('Non-numeric value "{}" found in number column, returning as string instead.'.format(value))
-                return text_type(value)
-        elif type_string == 'integer':
-            try:
-                return int(value)
-            except (TypeError, ValueError):
-                warn('Non-integer value "{}" found in integer column, returning as string instead.'.format(value))
-                return text_type(value)
-        elif type_string == 'boolean':
-            value = text_type(value)
-            if value.lower() in ['true', '1']:
-                return True
-            elif value.lower() in ['false', '0']:
-                return False
-            else:
-                warn('Unrecognised value for boolean: "{}", returning as string instead'.format(value))
-                return text_type(value)
-        elif type_string == 'array':
-            value = text_type(value)
-            if ',' in value:
-                return [x.split(',') for x in value.split(';')]
-            else:
-                return value.split(';')
-        elif type_string == 'string':
-            if type(value) == datetime.datetime:
-                return self.timezone.localize(value).isoformat()
-            return text_type(value)
-        elif type_string == '':
-            if type(value) == datetime.datetime:
-                return self.timezone.localize(value).isoformat()
-            return value if type(value) in [int] else text_type(value)
-        else:
-            raise ValueError('Unrecognised type: "{}"'.format(type_string))
-
 
     def convert_types(self, in_dict):
         out_dict = OrderedDict()
         for key, value in in_dict.items():
             parts = key.split(':')
             if len(parts) > 1:
-                out_dict[parts[0]] = self.convert_type(parts[1], value)
+                out_dict[parts[0]] = convert_type(parts[1], value, self.timezone)
             else:
-                out_dict[parts[0]] = self.convert_type('', value)
+                out_dict[parts[0]] = convert_type('', value, self.timezone)
         return out_dict
 
 
@@ -140,7 +139,10 @@ def unflatten(self):
             root_id_or_none = line[self.root_id] if self.root_id else None
             if root_id_or_none not in main_sheet_by_ocid:
                 main_sheet_by_ocid[root_id_or_none] = TemporaryDict('id')
-            main_sheet_by_ocid[root_id_or_none].append(unflatten_line(self.convert_types(line)))
+            if not self.parser:
+                main_sheet_by_ocid[root_id_or_none].append(unflatten_line(self.convert_types(line)))
+            else:
+                main_sheet_by_ocid[root_id_or_none].append(unflatten_main_with_parser(self.parser, line, self.timezone))
 
         for sheet_name, lines in self.get_sub_sheets_lines():
             for i, line in enumerate(lines):
@@ -274,6 +276,86 @@ def unflatten_line(line):
         path_search(unflattened, fields[:-1], top_sheet=True)[fields[-1]] = v
     return unflattened
 
+def isint(string):
+    """
+    Return True if int() accepts the given value, False otherwise.
+
+    Values of a type int() cannot handle at all (for example None) are
+    reported as "not an integer" instead of letting TypeError escape.
+    """
+    try:
+        int(string)
+    except (TypeError, ValueError):
+        return False
+    return True
+
+class ListAsDict(dict):
+    """
+    Plain dict subclass used purely as a type marker.
+
+    unflatten_main_with_parser stores the entries of a list in one of
+    these, keyed by list index, and list_as_dicts_to_temporary_dicts
+    recognises the type and replaces each instance with a TemporaryDict.
+    """
+
+def list_as_dicts_to_temporary_dicts(unflattened):
+    """
+    Walk a nested mapping in place and return that same mapping.
+
+    For every value that has an ``items`` attribute: the entry is removed
+    if the value is empty, and the value is then processed recursively.
+    Every ListAsDict value is afterwards replaced by a TemporaryDict("id")
+    holding its entries in ascending key order.
+
+    The emptiness test runs before the recursion, so a mapping that only
+    becomes empty while being processed is kept.  An empty ListAsDict is
+    removed and then stored again as an empty TemporaryDict.
+    """
+    # Iterate over a snapshot because entries are removed/replaced below.
+    for name, child in list(unflattened.items()):
+        if hasattr(child, 'items'):
+            if not child:
+                unflattened.pop(name)
+            list_as_dicts_to_temporary_dicts(child)
+        if isinstance(child, ListAsDict):
+            replacement = TemporaryDict("id")
+            for position in sorted(child.keys()):
+                replacement.append(child[position])
+            unflattened[name] = replacement
+    return unflattened
+
+
+def unflatten_main_with_parser(parser, line, timezone):
+    """
+    Build a nested structure from one flattened row, using the schema
+    parser to decide what each path segment is.
+
+    parser -- object whose ``flattened`` mapping gives a type name for a
+              '/'-joined path (integer index segments left out).
+    line -- mapping of column path (e.g. 'a/0/b') to the raw cell value.
+    timezone -- passed through to convert_type.
+
+    Returns a dict in which list levels have been turned into
+    TemporaryDict objects by list_as_dicts_to_temporary_dicts.
+
+    Raises ValueError when the shape of a path contradicts the type the
+    schema records for it.
+
+    Cells that are None or '' are skipped.  Other falsy values such as 0
+    or False are real data and are kept.
+    """
+    unflattened = {}
+    for path, value in line.items():
+        # Only genuinely empty cells are ignored; a plain truthiness test
+        # would also throw away 0 and False.
+        if value is None or value == '':
+            continue
+        current_path = unflattened
+        # rstrip('[]') removes trailing '[' / ']' characters from a segment.
+        path_list = [item.rstrip('[]') for item in path.split('/')]
+        for num, path_item in enumerate(path_list):
+            # Index segments are consumed together with the name before them.
+            if isint(path_item):
+                continue
+            path_till_now = '/'.join([item for item in path_list[:num+1] if not isint(item)])
+            current_type = parser.flattened.get(path_till_now)
+            try:
+                next_path_item = path_list[num+1]
+            except IndexError:
+                next_path_item = ''
+            next_is_index = isint(next_path_item)
+
+            ## Array
+            # -1 is the slot used when the schema says 'array' but the
+            # column path carries no explicit index.
+            list_index = -1
+            if next_is_index:
+                if current_type and current_type != 'array':
+                    raise ValueError("There is an array at '{}' when the schema says there should be a '{}'".format(path_till_now, current_type))
+                list_index = int(next_path_item)
+
+            if next_is_index or current_type == 'array':
+                list_as_dict = current_path.get(path_item)
+                if list_as_dict is None:
+                    list_as_dict = ListAsDict()
+                    current_path[path_item] = list_as_dict
+                new_path = list_as_dict.get(list_index)
+                if new_path is None:
+                    new_path = {}
+                    list_as_dict[list_index] = new_path
+                current_path = new_path
+                continue
+
+            ## Object
+            if current_type == 'object' or (not current_type and next_path_item):
+                new_path = current_path.get(path_item)
+                if new_path is None:
+                    new_path = {}
+                    current_path[path_item] = new_path
+                current_path = new_path
+                continue
+            if current_type and current_type != 'object' and next_path_item:
+                raise ValueError("There is an object or list at '{}' but it should be an {}".format(path_till_now, current_type))
+
+            ## Other Types
+            converted_value = convert_type(current_type or '', value, timezone)
+            # Keep converted 0 / False / Decimal('0'); drop only "no value".
+            if converted_value is not None and converted_value != '':
+                current_path[path_item] = converted_value
+
+    unflattened = list_as_dicts_to_temporary_dicts(unflattened)
+    return unflattened
+
+
 
 class IDFieldMissing(KeyError):
     pass
 
@@ -30,6 +30,8 @@ def lookup_header(self, title_header):
     def lookup_header_list(self, title_header_list):
         first_title = title_header_list[0]
         remaining_titles = title_header_list[1:]
+        if isinstance(first_title, int):
+            return first_title + '/' + self[first_title].lookup_header_list(remaining_titles)
         if first_title in self:
             if remaining_titles:
                 return self[first_title].property_name + '/' + self[first_title].lookup_header_list(remaining_titles)
@@ -69,6 +71,7 @@ def __init__(self, schema_filename=None, root_schema_dict=None, main_sheet_name=
         self.root_id = root_id
         self.use_titles = use_titles
         self.title_lookup = TitleLookup()
+        self.flattened = {}
 
         if root_schema_dict is None and schema_filename is  None:
             raise ValueError('One of schema_filename or root_schema_dict must be supplied')
@@ -86,7 +89,7 @@ def __init__(self, schema_filename=None, root_schema_dict=None, main_sheet_name=
             self.root_schema_dict = root_schema_dict
 
     def parse(self):
-        fields = self.parse_schema_dict(self.main_sheet_name, self.root_schema_dict)
+        fields = self.parse_schema_dict(self.main_sheet_name, '', self.root_schema_dict)
         for field, title in fields:
             if self.use_titles:
                 if not title:
@@ -96,7 +99,9 @@ def parse(self):
             else:
                 self.main_sheet.append(field)
 
-    def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None, title_lookup=None):
+    def parse_schema_dict(self, parent_name, parent_path, schema_dict, parent_id_fields=None, title_lookup=None):
+        if parent_path:
+            parent_path = parent_path + '/'
         parent_id_fields = parent_id_fields or []
         title_lookup = self.title_lookup if title_lookup is None else title_lookup
         if 'properties' in schema_dict:
@@ -114,8 +119,10 @@ def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None, tit
                     title_lookup[title].property_name = property_name
 
                 if 'object' in property_type_set:
+                    self.flattened[parent_path+property_name] = "object"
                     for field, child_title in self.parse_schema_dict(
                             parent_name+'/'+property_name,
+                            parent_path+property_name,
                             property_schema_dict,
                             parent_id_fields=id_fields,
                             title_lookup=title_lookup.get(title)):
@@ -126,10 +133,13 @@ def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None, tit
                         )
 
                 elif 'array' in property_type_set:
+                    self.flattened[parent_path+property_name] = "array"
                     type_set = get_property_type_set(property_schema_dict['items'])
                     if 'string' in type_set:
+                        self.flattened[parent_path+property_name] = "string_array"
                         yield property_name+':array', title
                     elif 'array' in type_set:
+                        self.flattened[parent_path+property_name] = "array_array"
                         if 'string' in get_property_type_set(property_schema_dict['items']['items']):
                             yield property_name+':array', title
                         else:
@@ -152,6 +162,7 @@ def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None, tit
                         for field in id_fields:
                             sub_sheet.add_field(field+':'+property_name, id_field=True)
                         fields = self.parse_schema_dict(parent_name+'/'+property_name+'[]',
+                                parent_path+property_name,
                                 property_schema_dict['items'],
                                 parent_id_fields=id_fields,
                                 title_lookup=title_lookup.get(title))
@@ -178,12 +189,16 @@ def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None, tit
                     else:
                         raise ValueError
                 elif 'string' in property_type_set:
+                    self.flattened[parent_path+property_name] = "string"
                     yield property_name, title
                 elif 'number' in property_type_set:
+                    self.flattened[parent_path+property_name] = "number"
                     yield property_name+':number', title
                 elif 'integer' in property_type_set:
+                    self.flattened[parent_path+property_name] = "integer"
                     yield property_name+':integer', title
                 elif 'boolean' in property_type_set:
+                    self.flattened[parent_path+property_name] = "boolean"
                     yield property_name+':boolean', title
                 else:
                     warn('Unrecognised types {} for property "{}" with context "{}",'