Skip to content

Commit 760afa2

Browse files
committed
Merge pull request #86 from OpenDataServices/50-better-titles
Convert titles individually, and regardless of rollUp
2 parents 0c98e69 + 6a66251 commit 760afa2

File tree

6 files changed

+1245
-933
lines changed

6 files changed

+1245
-933
lines changed

flattentool/input.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -36,15 +36,19 @@ class SpreadsheetInput(object):
3636
or csv).
3737
3838
"""
39-
def convert_dict_titles(self, dicts, title_lookup=None):
    """
    Replace titles with field names in the given list of dictionaries
    (``dicts``) using the titles lookup in the schema parser.

    ``title_lookup`` is any object offering ``lookup_header(title)``.  When
    it is missing (or falsy) and ``self.parser`` is set, the parser's own
    ``title_lookup`` is used instead.  If no usable lookup is available the
    dictionaries are yielded unchanged.

    This is a generator: one dictionary is yielded per input dictionary.
    Converted rows are ``OrderedDict`` instances so that the column order
    of the input row is kept even where plain dicts are unordered.

    """
    # Local import: only part of this module is visible here, so the name
    # is brought into scope where it is used.
    from collections import OrderedDict

    if self.parser:
        title_lookup = title_lookup or self.parser.title_lookup
    for d in dicts:
        if title_lookup:
            yield OrderedDict(
                (title_lookup.lookup_header(k), v) for k, v in d.items()
            )
        else:
            yield d
4852

4953
def __init__(self, input_name='', main_sheet_name='', timezone_name='UTC', root_id='ocid', convert_titles=False):
5054
self.input_name = input_name
@@ -53,17 +57,19 @@ def __init__(self, input_name='', main_sheet_name='', timezone_name='UTC', root_
5357
self.timezone = pytz.timezone(timezone_name)
5458
self.root_id = root_id
5559
self.convert_titles = convert_titles
60+
self.parser = None
5661

5762
def get_main_sheet_lines(self):
    """Return the rows of the main sheet.

    The rows come from ``self.get_sheet_lines(self.main_sheet_name)``.
    When ``self.convert_titles`` is set they are passed through
    ``self.convert_dict_titles`` first; otherwise they are returned as-is.
    """
    main_lines = self.get_sheet_lines(self.main_sheet_name)
    if not self.convert_titles:
        return main_lines
    return self.convert_dict_titles(main_lines)
6267

6368
def get_sub_sheets_lines(self):
    """Yield ``(sub_sheet_name, lines)`` for every name in ``self.sub_sheet_names``.

    ``lines`` is whatever ``self.get_sheet_lines`` returns for that sheet.
    When ``self.convert_titles`` is set, the lines are passed through
    ``self.convert_dict_titles`` together with the ``title_lookup`` of the
    matching entry in ``self.parser.sub_sheets``; ``None`` is passed when
    the parser has no such sub sheet or when no parser is attached.
    """
    for sub_sheet_name in self.sub_sheet_names:
        lines = self.get_sheet_lines(sub_sheet_name)
        if self.convert_titles:
            parser = self.parser
            # ``parser`` may still be None (its initial value); fall back to
            # no explicit lookup instead of failing on ``None.sub_sheets``.
            if parser is not None and sub_sheet_name in parser.sub_sheets:
                title_lookup = parser.sub_sheets[sub_sheet_name].title_lookup
            else:
                title_lookup = None
            lines = self.convert_dict_titles(lines, title_lookup)
        yield sub_sheet_name, lines
6975

flattentool/schema.py

Lines changed: 63 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
"""Classes for reading from a JSON schema"""
22

33
from __future__ import print_function
4+
from __future__ import unicode_literals
45
from collections import OrderedDict
6+
from six.moves import UserDict
57
import jsonref
68
from warnings import warn
79
from flattentool.sheet import Sheet
@@ -15,6 +17,46 @@ def get_property_type_set(property_schema_dict):
1517
return set(property_type)
1618

1719

20+
class TitleLookup(UserDict):
    """Nested, title-keyed lookup that turns column titles into field paths.

    Keys are stored with every space removed and converted to lower case,
    so ``'Tender Title'``, ``'tender title'`` and ``'TenderTitle'`` all
    address the same entry.  Each value is expected to be another
    ``TitleLookup`` whose ``property_name`` is the field name for that
    title and whose own entries describe the titles nested beneath it.
    """

    # Field name this node stands for; assigned by the code building the tree.
    property_name = None

    @staticmethod
    def _normalise_key(key):
        """Return ``key`` in the canonical form used for storage."""
        return key.replace(' ', '').lower()

    def lookup_header(self, title_header):
        """Convert a ``:``-separated title header into a ``/``-separated path.

        Headers that already contain ``/`` are returned unchanged.
        """
        # Ignore titles with a / in, as they may contain types
        # https://github.com/OpenDataServices/flatten-tool/issues/56
        if '/' in title_header:
            return title_header
        return self.lookup_header_list(title_header.split(':'))

    def lookup_header_list(self, title_header_list):
        """Resolve a non-empty list of titles, outermost first, to a path.

        The first title is looked up in this mapping and the remaining
        titles are resolved recursively in the entry found.
        """
        first_title = title_header_list[0]
        remaining_titles = title_header_list[1:]
        if first_title not in self:
            # If we can't look up the title, treat it and any children as
            # field names directly.
            # Strip spaces off these.
            return '/'.join(x.strip(' ') for x in title_header_list)
        child = self[first_title]
        if remaining_titles:
            return child.property_name + '/' + child.lookup_header_list(remaining_titles)
        return child.property_name

    def __setitem__(self, key, value):
        self.data[self._normalise_key(key)] = value

    def __getitem__(self, key):
        # ``None`` (e.g. a property without a title) is never a stored key.
        if key is None:
            raise KeyError(key)
        return self.data[self._normalise_key(key)]

    def __delitem__(self, key):
        # Normalise like every other accessor so that a key reported by
        # ``in`` can also be deleted.
        if key is None:
            raise KeyError(key)
        del self.data[self._normalise_key(key)]

    def __contains__(self, key):
        if key is None:
            return False
        return self._normalise_key(key) in self.data
58+
59+
1860
class SchemaParser(object):
1961
"""Parse the fields of a JSON schema into a flattened structure."""
2062

@@ -26,6 +68,7 @@ def __init__(self, schema_filename=None, root_schema_dict=None, main_sheet_name=
2668
self.rollup = rollup
2769
self.root_id = root_id
2870
self.use_titles = use_titles
71+
self.title_lookup = TitleLookup()
2972

3073
if root_schema_dict is None and schema_filename is None:
3174
raise ValueError('One of schema_filename or root_schema_dict must be supplied')
@@ -52,11 +95,10 @@ def parse(self):
5295
self.main_sheet.append(title)
5396
else:
5497
self.main_sheet.append(field)
55-
if title:
56-
self.main_sheet.titles[title] = field
5798

58-
def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None):
99+
def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None, title_lookup=None):
59100
parent_id_fields = parent_id_fields or []
101+
title_lookup = self.title_lookup if title_lookup is None else title_lookup
60102
if 'properties' in schema_dict:
61103
if 'id' in schema_dict['properties']:
62104
id_fields = parent_id_fields + [parent_name+'/id']
@@ -67,11 +109,21 @@ def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None):
67109
property_type_set = get_property_type_set(property_schema_dict)
68110

69111
title = property_schema_dict.get('title')
112+
if title:
113+
title_lookup[title] = TitleLookup()
114+
title_lookup[title].property_name = property_name
70115

71116
if 'object' in property_type_set:
72-
for field, child_title in self.parse_schema_dict(parent_name+'/'+property_name, property_schema_dict,
73-
parent_id_fields=id_fields):
74-
yield property_name+'/'+field, (title+':'+child_title if title and child_title else None) # TODO ambiguous use of "title"
117+
for field, child_title in self.parse_schema_dict(
118+
parent_name+'/'+property_name,
119+
property_schema_dict,
120+
parent_id_fields=id_fields,
121+
title_lookup=title_lookup.get(title)):
122+
yield (
123+
property_name+'/'+field,
124+
# TODO ambiguous use of "title"
125+
(title+':'+child_title if title and child_title else None)
126+
)
75127

76128
elif 'array' in property_type_set:
77129
type_set = get_property_type_set(property_schema_dict['items'])
@@ -83,6 +135,8 @@ def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None):
83135
else:
84136
raise ValueError
85137
elif 'object' in type_set:
138+
if title:
139+
title_lookup[title].property_name = property_name+'[]'
86140
if hasattr(property_schema_dict['items'], '__reference__'):
87141
sub_sheet_name = property_schema_dict['items'].__reference__['$ref'].split('/')[-1]
88142
else:
@@ -93,12 +147,14 @@ def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None):
93147
if sub_sheet_name not in self.sub_sheets:
94148
self.sub_sheets[sub_sheet_name] = Sheet(root_id=self.root_id, name=sub_sheet_name)
95149
sub_sheet = self.sub_sheets[sub_sheet_name]
150+
sub_sheet.title_lookup = title_lookup.get(title)
96151

97152
for field in id_fields:
98153
sub_sheet.add_field(field+':'+property_name, id_field=True)
99154
fields = self.parse_schema_dict(parent_name+'/'+property_name+'[]',
100155
property_schema_dict['items'],
101-
parent_id_fields=id_fields)
156+
parent_id_fields=id_fields,
157+
title_lookup=title_lookup.get(title))
102158

103159
rolledUp = set()
104160

@@ -110,8 +166,6 @@ def parse_schema_dict(self, parent_name, schema_dict, parent_id_fields=None):
110166
sub_sheet.add_field(child_title)
111167
else:
112168
sub_sheet.add_field(field)
113-
if child_title:
114-
self.sub_sheets[sub_sheet_name].titles[child_title] = field
115169
if self.rollup and 'rollUp' in property_schema_dict and field in property_schema_dict['rollUp']:
116170
rolledUp.add(field)
117171
yield property_name+'[]/'+field, (title+':'+child_title if title and child_title else None)

0 commit comments

Comments
 (0)