[#50] Lookup titles individually, instead of the whole header

Bjwebb · Bjwebb · commit 802adbbedf88 · 2016-02-16T12:13:39.000Z
This commit fixes the tests in test_input_SpreadsheetInput_unflatten.py
(removing their xfail markers), but breaks tests in test_unflatten.py and
test_roundtrip.py (the TODO comments in input.py explain why).
diff --git a/flattentool/input.py b/flattentool/input.py
@@ -36,15 +36,19 @@ class SpreadsheetInput(object):
     or csv).
 
     """
-    def convert_dict_titles(self, dicts, titles):
+    def convert_dict_titles(self, dicts):
         """
-        Replace titles with field names in the given list of dictionaries (``dicts``) using the mapping in ``titles``.
+        Yield each dictionary in ``dicts`` with its title keys replaced by field
+        names from the schema parser's title lookup (unchanged if no parser).
 
         """
-        titles = titles or {}
-        titles_map = {title.replace(' ', '').lower(): title for title in titles}
-        for d in dicts:
-            yield { (titles[titles_map[k.replace(' ', '').lower()]] if k.replace(' ', '').lower() in titles_map else (k if '/' in k else k.replace(':','/'))):v for k,v in d.items() }
+        # TODO add this to TitleLookup. Breaks the tests in test_unflatten
+        # titles_map = {title.replace(' ', '').lower(): title for title in titles}
+        # This method is a generator, so the no-parser case must yield the rows
+        # unchanged: a ``return dicts`` here would produce no rows at all.
+        lookup = self.parser.title_lookup.lookup_header if self.parser else None
+        for d in dicts:
+            yield {lookup(k): v for k, v in d.items()} if lookup else d
 
     def __init__(self, input_name='', main_sheet_name='', timezone_name='UTC', root_id='ocid', convert_titles=False):
         self.input_name = input_name
@@ -53,17 +57,21 @@ def __init__(self, input_name='', main_sheet_name='', timezone_name='UTC', root_
         self.timezone = pytz.timezone(timezone_name)
         self.root_id = root_id
         self.convert_titles = convert_titles
+        self.parser = None
 
     def get_main_sheet_lines(self):
         if self.convert_titles:
-            return self.convert_dict_titles(self.get_sheet_lines(self.main_sheet_name), self.parser.main_sheet.titles)
+            return self.convert_dict_titles(self.get_sheet_lines(self.main_sheet_name))
         else:
             return self.get_sheet_lines(self.main_sheet_name)
 
     def get_sub_sheets_lines(self):
         for sub_sheet_name in self.sub_sheet_names:
             if self.convert_titles:
-                yield sub_sheet_name, self.convert_dict_titles(self.get_sheet_lines(sub_sheet_name), self.parser.sub_sheets[sub_sheet_name].titles if sub_sheet_name in self.parser.sub_sheets else None)
+                # TODO: This no longer works properly, presumably because headers
+                # are now looked up in the parser-wide title lookup rather than this
+                # sub-sheet's own titles (breaks roundtrip tests; add unit tests too!)
+                yield sub_sheet_name, self.convert_dict_titles(self.get_sheet_lines(sub_sheet_name))
             else:
                 yield sub_sheet_name, self.get_sheet_lines(sub_sheet_name)
 
diff --git a/flattentool/schema.py b/flattentool/schema.py
@@ -1,7 +1,9 @@
 """Classes for reading from a JSON schema"""
 
 from __future__ import print_function
+from __future__ import unicode_literals
 from collections import OrderedDict
+from six.moves import UserDict
 import jsonref
 from warnings import warn
 from flattentool.sheet import Sheet
@@ -15,6 +17,24 @@ def get_property_type_set(property_schema_dict):
         return set(property_type)
 
 
+class TitleLookup(UserDict):
+    """Map each title to a child TitleLookup carrying its ``property_name``."""
+    property_name = None
+
+    def lookup_header(self, title_header):
+        return self.lookup_header_list(title_header.split(':'))
+
+    def lookup_header_list(self, title_header_list):
+        """Turn a list of nested titles into a ``/``-joined field path."""
+        head, tail = title_header_list[0], title_header_list[1:]
+        if head not in self:
+            # Unknown title: keep it, and everything below it, verbatim.
+            return '/'.join(title_header_list)
+        if not tail:
+            return self[head].property_name
+        return self[head].property_name + '/' + self[head].lookup_header_list(tail)
+
+
 class SchemaParser(object):
     """Parse the fields of a JSON schema into a flattened structure."""
 
@@ -26,6 +46,7 @@ def __init__(self, schema_filename=None, root_schema_dict=None, main_sheet_name=
         self.rollup = rollup
         self.root_id = root_id
         self.use_titles = use_titles
+        self.title_lookup = TitleLookup()
 
         if root_schema_dict is None and schema_filename is  None:
             raise ValueError('One of schema_filename or root_schema_dict must be supplied')
@@ -52,11 +73,10 @@ def parse(self):
                     self.main_sheet.append(title)
             else:
                 self.main_sheet.append(field)
-            if title:
-                self.main_sheet.titles[title] = field
 
-    def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None):
+    def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None, title_lookup=None):
         parent_id_fields = parent_id_fields or []
+        title_lookup = self.title_lookup if title_lookup is None else title_lookup
         if 'properties' in schema_dict:
             if 'id' in schema_dict['properties']:
                 id_fields = parent_id_fields + [parent_name+'/id']
@@ -67,11 +87,20 @@ def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None):
                 property_type_set = get_property_type_set(property_schema_dict)
 
                 title = property_schema_dict.get('title')
+                title_lookup[title] = TitleLookup()
+                title_lookup[title].property_name = property_name
 
                 if 'object' in property_type_set:
-                    for field, child_title in self.parse_schema_dict(parent_name+'/'+property_name, property_schema_dict,
-                                                        parent_id_fields=id_fields):
-                        yield property_name+'/'+field, (title+':'+child_title if title and child_title else None) # TODO ambiguous use of "title"
+                    for field, child_title in self.parse_schema_dict(
+                            parent_name+'/'+property_name,
+                            property_schema_dict,
+                            parent_id_fields=id_fields,
+                            title_lookup=title_lookup[title]):
+                        yield (
+                            property_name+'/'+field,
+                            # TODO ambiguous use of "title"
+                            (title+':'+child_title if title and child_title else None) 
+                        )
 
                 elif 'array' in property_type_set:
                     type_set = get_property_type_set(property_schema_dict['items'])
@@ -83,6 +112,7 @@ def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None):
                         else:
                             raise ValueError
                     elif 'object' in type_set:
+                        title_lookup[title].property_name = property_name+'[]'
                         if hasattr(property_schema_dict['items'], '__reference__'):
                             sub_sheet_name = property_schema_dict['items'].__reference__['$ref'].split('/')[-1]
                         else:
@@ -98,7 +128,8 @@ def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None):
                             sub_sheet.add_field(field+':'+property_name, id_field=True)
                         fields = self.parse_schema_dict(parent_name+'/'+property_name+'[]',
                                 property_schema_dict['items'],
-                                parent_id_fields=id_fields)
+                                parent_id_fields=id_fields,
+                                title_lookup=title_lookup[title])
 
                         rolledUp = set()
 
@@ -110,8 +141,6 @@ def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None):
                                     sub_sheet.add_field(child_title)
                             else:
                                 sub_sheet.add_field(field)
-                            if child_title:
-                                self.sub_sheets[sub_sheet_name].titles[child_title] = field
                             if self.rollup and 'rollUp' in property_schema_dict and field in property_schema_dict['rollUp']:
                                 rolledUp.add(field)
                                 yield property_name+'[]/'+field, (title+':'+child_title if title and child_title else None)
diff --git a/flattentool/tests/test_input_SpreadsheetInput_unflatten.py b/flattentool/tests/test_input_SpreadsheetInput_unflatten.py
@@ -232,7 +232,7 @@ def create_schema(root_id):
         }]
     ),
     # Nested titles should be converted individually
-    pytest.mark.xfail((
+    (
         [{
             'ROOT_ID_TITLE': 1,
             'Identifier': 2,
@@ -244,7 +244,7 @@ def create_schema(root_id):
             'id': 2,
             'testB': {'testC': 3, 'Not in schema': 4}
         }]
-    )),
+    ),
     # Unicode
     (
         [{
@@ -289,7 +289,7 @@ def create_schema(root_id):
     ),
     # Properties of a single item array shouldn't need to be in rollUp list
     # for their titles to be converted
-    pytest.mark.xfail((
+    (
         [{
             'ROOT_ID_TITLE': 1,
             'Identifier': 2,
@@ -304,9 +304,9 @@ def create_schema(root_id):
                 'testC': 4
             }]
         }]
-    )),
+    ),
     # Single item array, titles should be converted individually
-    pytest.mark.xfail((
+    (
         [{
             'ROOT_ID_TITLE': 1,
             'Identifier': 2,
@@ -316,9 +316,12 @@ def create_schema(root_id):
         [{
             'ROOT_ID': 1,
             'id': 2,
-            'testR': {'testC': 3, 'Not in schema': 4}
+            'testR': [{
+                'testC': 3,
+                'Not in schema': 4
+            }]
         }]
-    )),
+    ),
     # Empty
     (
         [{
@@ -380,8 +383,7 @@ def test_unflatten(convert_titles, use_schema, root_id, root_id_kwargs, input_li
             root_schema_dict=create_schema(root_id) if use_schema else {},
             main_sheet_name='custom_main',
             root_id=root_id,
-            rollup=True,
-            use_titles=True
+            rollup=True
         )
         parser.parse()
         spreadsheet_input.parser = parser