Skip to content

Commit e379634

Browse files
authored
Merge pull request #167 from OpenDataServices/130-skip-lines
Add a configuration row, with commands ignore, skipRows and headerRows
2 parents dfdf61d + be1944a commit e379634

File tree

10 files changed

+173
-15
lines changed

10 files changed

+173
-15
lines changed

examples/help/unflatten/expected.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ usage: flatten-tool unflatten [-h] -f INPUT_FORMAT [--xml] [--id-name ID_NAME]
99
[--metatab-schema METATAB_SCHEMA]
1010
[--metatab-only]
1111
[--metatab-vertical-orientation]
12+
[--default-configuration DEFAULT_CONFIGURATION]
1213
input_name
1314

1415
positional arguments:
@@ -60,4 +61,7 @@ optional arguments:
6061
--metatab-vertical-orientation
6162
Read metatab so that headings are in the first column
6263
and data is read vertically. Only for XLSX not CSV
64+
--default-configuration DEFAULT_CONFIGURATION
65+
Comma seperated list of default parsing commands for
66+
all sheets. Only for XLSX not CSV
6367

flattentool/__init__.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from flattentool.output import FORMATS_SUFFIX
55
from flattentool.input import FORMATS as INPUT_FORMATS
66
from flattentool.xml_output import toxml
7+
from flattentool.lib import parse_sheet_configuration
78
import sys
89
import json
910
import codecs
@@ -112,6 +113,7 @@ def unflatten(input_name, base_json=None, input_format=None, output_name=None,
112113
vertical_orientation=False,
113114
metatab_name=None, metatab_only=False, metatab_schema='',
114115
metatab_vertical_orientation=False,
116+
default_configuration='',
115117
**_):
116118
"""
117119
Unflatten a flat structure (spreadsheet - csv or xlsx) into a nested structure (JSON).
@@ -131,6 +133,10 @@ def unflatten(input_name, base_json=None, input_format=None, output_name=None,
131133
base = OrderedDict()
132134

133135

136+
base_configuration = parse_sheet_configuration(
137+
[item.strip() for item in default_configuration.split(",")]
138+
)
139+
134140
cell_source_map_data = OrderedDict()
135141
heading_source_map_data = OrderedDict()
136142

@@ -144,7 +150,8 @@ def unflatten(input_name, base_json=None, input_format=None, output_name=None,
144150
convert_titles=convert_titles,
145151
vertical_orientation=metatab_vertical_orientation,
146152
id_name=id_name,
147-
xml=xml
153+
xml=xml,
154+
use_configuration=False
148155
)
149156
if metatab_schema:
150157
parser = SchemaParser(schema_filename=metatab_schema)
@@ -163,6 +170,8 @@ def unflatten(input_name, base_json=None, input_format=None, output_name=None,
163170
## strip off meta/ from start of source map as actually data is at top level
164171
heading_source_map_data[key[5:]] = value
165172

173+
base_configuration = spreadsheet_input.sheet_configuration.get(metatab_name) or base_configuration
174+
166175
if result:
167176
base.update(result[0])
168177

@@ -177,7 +186,8 @@ def unflatten(input_name, base_json=None, input_format=None, output_name=None,
177186
exclude_sheets=[metatab_name],
178187
vertical_orientation=vertical_orientation,
179188
id_name=id_name,
180-
xml=xml
189+
xml=xml,
190+
base_configuration=base_configuration
181191
)
182192
if schema:
183193
parser = SchemaParser(schema_filename=schema, rollup=True, root_id=root_id)

flattentool/cli.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,9 @@ def create_parser():
155155
"--metatab-vertical-orientation",
156156
action='store_true',
157157
help="Read metatab so that headings are in the first column and data is read vertically. Only for XLSX not CSV")
158+
parser_unflatten.add_argument(
159+
"--default-configuration",
160+
help="Comma seperated list of default parsing commands for all sheets. Only for XLSX not CSV")
158161

159162
return parser
160163

flattentool/input.py

Lines changed: 62 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import pytz
1818
from openpyxl.utils import _get_column_letter, column_index_from_string
1919
from flattentool.exceptions import DataErrorWarning
20+
from flattentool.lib import isint, parse_sheet_configuration
2021

2122

2223
class Cell:
@@ -133,6 +134,7 @@ def merge(base, mergee, debug_info=None):
133134
# This happens when a parent record finds the first child record of a known type
134135
base[key] = v
135136

137+
136138
class SpreadsheetInput(object):
137139
"""
138140
Base class describing a spreadsheet input. Has stubs which are
@@ -164,7 +166,9 @@ def __init__(self,
164166
include_sheets=[],
165167
exclude_sheets=[],
166168
id_name='id',
167-
xml=False
169+
xml=False,
170+
base_configuration={},
171+
use_configuration=True
168172
):
169173
self.input_name = input_name
170174
self.root_list_path = root_list_path
@@ -178,6 +182,9 @@ def __init__(self,
178182
self.vertical_orientation = vertical_orientation
179183
self.include_sheets = include_sheets
180184
self.exclude_sheets = exclude_sheets
185+
self.base_configuration = base_configuration or {}
186+
self.sheet_configuration = {}
187+
self.use_configuration = use_configuration
181188

182189
def get_sub_sheets_lines(self):
183190
for sub_sheet_name in self.sub_sheet_names:
@@ -187,6 +194,13 @@ def get_sub_sheets_lines(self):
187194
else:
188195
yield sub_sheet_name, self.get_sheet_lines(sub_sheet_name)
189196

197+
def configure_sheets(self):
    """
    Parse and store the configuration commands of every sub sheet.

    For each name in ``self.sub_sheet_names`` the raw commands returned by
    ``get_sheet_configuration`` are run through ``parse_sheet_configuration``
    and the result is stored in ``self.sheet_configuration`` under that name.
    """
    for name in self.sub_sheet_names:
        raw_commands = self.get_sheet_configuration(name)
        self.sheet_configuration[name] = parse_sheet_configuration(raw_commands)
200+
201+
def get_sheet_configuration(self, sheet_name):
    """
    Return the raw configuration commands for ``sheet_name``.

    This stub reports no commands: it always returns a new empty list and
    does not look at ``sheet_name``.
    """
    no_commands = []
    return no_commands
203+
190204
def get_sheet_lines(self, sheet_name):
191205
raise NotImplementedError
192206

@@ -203,6 +217,9 @@ def do_unflatten(self):
203217
sheet_name, lines = sheet
204218
try:
205219
actual_headings = self.get_sheet_headings(sheet_name)
220+
# If sheet is empty or too many lines have been skipped
221+
if not actual_headings:
222+
continue
206223
found = OrderedDict()
207224
last_col = len(actual_headings)
208225
# We want to ignore data in earlier columns, so we look
@@ -430,6 +447,7 @@ def read_sheets(self):
430447
except ValueError:
431448
pass
432449
self.sub_sheet_names = sheet_names
450+
self.configure_sheets()
433451

434452
def get_sheet_lines(self, sheet_name):
435453
if sys.version > '3': # If Python 3 or greater
@@ -460,23 +478,60 @@ def read_sheets(self):
460478

461479
sheet_names = list(sheet for sheet in self.sheet_names_map.keys())
462480
self.sub_sheet_names = sheet_names
481+
self.configure_sheets()
463482

464483
def get_sheet_headings(self, sheet_name):
    """
    Return the list of heading values for ``sheet_name``.

    The sheet's own parsed configuration is used when it is non-empty,
    otherwise ``self.base_configuration`` is used. When
    ``self.use_configuration`` is false no commands are applied, but a
    sheet that has its own configuration still has that first line
    skipped.

    An empty list is returned when the sheet is marked ``ignore`` or when
    ``skipRows`` points past the end of the sheet; callers treat an empty
    heading list as "no data in this sheet".
    """
    worksheet_name = self.sheet_names_map[sheet_name]
    worksheet = self.workbook[worksheet_name]
    sheet_configuration = self.sheet_configuration[worksheet_name]

    # The configuration line only occupies a row (or, for vertical sheets,
    # the first cell of each column) when the sheet has commands of its own.
    # NOTE(review): a '#' row without any valid command parses to {} and is
    # therefore not skipped here - confirm whether that is intended.
    configuration_line = 1 if sheet_configuration else 0
    if not sheet_configuration:
        sheet_configuration = self.base_configuration
    if not self.use_configuration:
        sheet_configuration = {}

    skip_rows = sheet_configuration.get("skipRows", 0)
    if sheet_configuration.get("ignore"):
        # Returning empty headers is a proxy for no data in the sheet.
        return []

    try:
        if self.vertical_orientation:
            heading_cells = worksheet.columns[skip_rows][configuration_line:]
        else:
            heading_cells = worksheet.rows[skip_rows + configuration_line]
    except IndexError:
        # The heading line lies beyond the data in the spreadsheet, i.e.
        # skipRows is larger than the number of rows (or columns, when the
        # sheet is read vertically).
        return []

    return [cell.value for cell in heading_cells]
506+
def get_sheet_configuration(self, sheet_name):
    """
    Return the raw configuration commands found in the sheet's first row.

    A row counts as a configuration row when its first cell holds ``'#'``;
    the values of the remaining non-empty cells in that row are returned
    in order. Any other first row yields an empty list.
    """
    first_row = self.workbook[self.sheet_names_map[sheet_name]].rows[0]
    if first_row[0].value != '#':
        return []
    # Skip the '#' marker itself and drop empty cells.
    return [cell.value for cell in first_row[1:] if cell.value]
471512

472513
def get_sheet_lines(self, sheet_name):
514+
sheet_configuration = self.sheet_configuration[self.sheet_names_map[sheet_name]]
515+
configuration_line = 1 if sheet_configuration else 0
516+
if not sheet_configuration:
517+
sheet_configuration = self.base_configuration
518+
if not self.use_configuration:
519+
sheet_configuration = {}
520+
521+
skip_rows = sheet_configuration.get("skipRows", 0)
522+
header_rows = sheet_configuration.get("headerRows", 1)
523+
524+
473525
worksheet = self.workbook[self.sheet_names_map[sheet_name]]
474526
if self.vertical_orientation:
475-
header_row = worksheet.columns[0]
476-
remaining_rows = worksheet.columns[1:]
527+
header_row = worksheet.columns[skip_rows]
528+
remaining_rows = worksheet.columns[skip_rows + header_rows:]
529+
if configuration_line:
530+
header_row = header_row[1:]
531+
remaining_rows = [row[1:] for row in remaining_rows]
477532
else:
478-
header_row = worksheet.rows[0]
479-
remaining_rows = worksheet.rows[1:]
533+
header_row = worksheet.rows[skip_rows + configuration_line]
534+
remaining_rows = worksheet.rows[skip_rows + configuration_line + header_rows:]
480535

481536
coli_to_header = ({i: x.value for i, x in enumerate(header_row) if x.value is not None})
482537
for row in remaining_rows:
@@ -489,12 +544,6 @@ def get_sheet_lines(self, sheet_name):
489544
}
490545

491546

492-
def isint(string):
493-
try:
494-
int(string)
495-
return True
496-
except ValueError:
497-
return False
498547

499548
class ListAsDict(dict):
500549
pass

flattentool/lib.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
def isint(string):
    """
    Return True if ``string`` can be converted with ``int()``, else False.

    Values that ``int()`` rejects with a ``TypeError`` (for example ``None``)
    are reported as not being integers instead of raising.
    """
    try:
        int(string)
    except (ValueError, TypeError):
        return False
    return True
7+
8+
def _parse_int(text):
    """Return ``text`` converted to an int, or None when it is not an integer."""
    try:
        return int(text)
    except (ValueError, TypeError):
        return None


def parse_sheet_configuration(configuration_list):
    """
    Turn a list of raw command strings into a configuration dictionary.

    Recognised commands (command names are case-insensitive):

    * ``skipRows N``   - stored as ``'skipRows'``, clamped to at least 0
    * ``headerRows N`` - stored as ``'headerRows'``, clamped to at least 1
    * ``ignore``       - stored as ``'ignore': True``

    Anything else - unknown commands, a non-integer ``N``, a wrong number of
    words, or items that are not text - is silently skipped. When a command
    is repeated the last occurrence wins.
    """
    configuration = {}
    for item in configuration_list:
        try:
            parts = item.split()
        except AttributeError:
            # Non-text entries (e.g. numeric spreadsheet cells) cannot be
            # commands, so skip them instead of crashing.
            continue

        if len(parts) == 1:
            if parts[0].lower() == "ignore":
                configuration['ignore'] = True
            continue
        if len(parts) != 2:
            continue

        value = _parse_int(parts[1])
        if value is None:
            continue
        command = parts[0].lower()
        if command == "skiprows":
            configuration['skipRows'] = max(value, 0)
        elif command == "headerrows":
            configuration['headerRows'] = max(value, 1)
    return configuration
4.74 KB
Binary file not shown.
5.82 KB
Binary file not shown.
4.79 KB
Binary file not shown.
7.02 KB
Binary file not shown.

flattentool/tests/test_init.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1138,3 +1138,77 @@ def test_bad_format(tmpdir):
11381138
input_format=None,
11391139
output_name=tmpdir.join('meta_unflattened.json').strpath,
11401140
)
1141+
1142+
def test_commands_single_sheet(tmpdir):
    """Unflatten the commands_in_file.xlsx fixture and check the resulting JSON."""
    output_path = tmpdir.join('command_single_unflattened.json')
    cell_map_path = tmpdir.join('command_single_source_map.json')
    heading_map_path = tmpdir.join('command_single_heading_source_map.json')

    unflatten(
        'flattentool/tests/fixtures/xlsx/commands_in_file.xlsx',
        input_format='xlsx',
        output_name=output_path.strpath,
        cell_source_map=cell_map_path.strpath,
        heading_source_map=heading_map_path.strpath,
    )

    expected = {'main': [{'actual': 'actual', 'headings': 'data', 'some': 'some'}]}
    assert json.load(output_path) == expected
1155+
1156+
def test_commands_metatab(tmpdir):
    """
    Unflatten the commands_in_metatab.xlsx fixture, reading the 'Meta' sheet
    as a vertically oriented metatab, and check the resulting JSON.
    """
    output_path = tmpdir.join('command_metatab_unflattened.json')
    cell_map_path = tmpdir.join('command_metatab_source_map.json')
    heading_map_path = tmpdir.join('command_metatab_heading_source_map.json')

    unflatten(
        'flattentool/tests/fixtures/xlsx/commands_in_metatab.xlsx',
        input_format='xlsx',
        output_name=output_path.strpath,
        cell_source_map=cell_map_path.strpath,
        heading_source_map=heading_map_path.strpath,
        metatab_name='Meta',
        metatab_vertical_orientation=True
    )

    expected = {
        'main': [
            {'actual': 'actual', 'headings': 'data', 'some': 'some'},
            {'actual': 'actual', 'headings': 'Other data', 'some': 'some'},
        ],
        'some': 'data',
    }
    assert json.load(output_path) == expected
1172+
1173+
def test_commands_single_sheet_default(tmpdir):
    """
    Unflatten the commands_defaulted.xlsx fixture twice, each time with a
    different ``default_configuration``, and check the resulting JSON.
    """
    output_path = tmpdir.join('command_single_unflattened.json')
    cell_map_path = tmpdir.join('command_single_source_map.json')
    heading_map_path = tmpdir.join('command_single_heading_source_map.json')

    # (default_configuration, expected unflattened output), run in order.
    cases = [
        (
            "SkipRows 1, headerrows 2",
            {'main': [{'actual': 'actual', 'headings': 'data', 'some': 'some'}]},
        ),
        (
            "SkipRows 1",
            {'main': [{'actual': 'other', 'headings': 'headings', 'some': 'some'},
                      {'actual': 'actual', 'headings': 'data', 'some': 'some'}]},
        ),
    ]

    for commands, expected in cases:
        unflatten(
            'flattentool/tests/fixtures/xlsx/commands_defaulted.xlsx',
            input_format='xlsx',
            output_name=output_path.strpath,
            cell_source_map=cell_map_path.strpath,
            heading_source_map=heading_map_path.strpath,
            default_configuration=commands,
        )

        assert json.load(output_path) == expected
1201+
1202+
def test_commands_ignore(tmpdir):
    """Unflatten the commands_ignore.xlsx fixture and check the resulting JSON."""
    output_path = tmpdir.join('command_single_unflattened.json')
    cell_map_path = tmpdir.join('command_single_source_map.json')
    heading_map_path = tmpdir.join('command_single_heading_source_map.json')

    unflatten(
        'flattentool/tests/fixtures/xlsx/commands_ignore.xlsx',
        input_format='xlsx',
        output_name=output_path.strpath,
        cell_source_map=cell_map_path.strpath,
        heading_source_map=heading_map_path.strpath,
    )

    expected = {'main': [{'actual': 'actual', 'headings': 'data', 'some': 'some'}]}
    assert json.load(output_path) == expected

0 commit comments

Comments
 (0)