OpenDataServices
diff --git a/examples/help/unflatten/expected.txt b/examples/help/unflatten/expected.txt
Lines changed: 4 additions & 0 deletions
diff --git a/flattentool/__init__.py b/flattentool/__init__.py
Lines changed: 12 additions & 2 deletions
diff --git a/flattentool/cli.py b/flattentool/cli.py
Lines changed: 3 additions & 0 deletions
diff --git a/flattentool/input.py b/flattentool/input.py
Lines changed: 62 additions & 13 deletions
diff --git a/flattentool/lib.py b/flattentool/lib.py
Lines changed: 18 additions & 0 deletions
diff --git a/flattentool/tests/fixtures/xlsx/commands_defaulted.xlsx b/flattentool/tests/fixtures/xlsx/commands_defaulted.xlsx
4.74 KB
diff --git a/flattentool/tests/fixtures/xlsx/commands_ignore.xlsx b/flattentool/tests/fixtures/xlsx/commands_ignore.xlsx
5.82 KB
diff --git a/flattentool/tests/fixtures/xlsx/commands_in_file.xlsx b/flattentool/tests/fixtures/xlsx/commands_in_file.xlsx
4.79 KB
diff --git a/flattentool/tests/fixtures/xlsx/commands_in_metatab.xlsx b/flattentool/tests/fixtures/xlsx/commands_in_metatab.xlsx
7.02 KB
diff --git a/flattentool/tests/test_init.py b/flattentool/tests/test_init.py
Lines changed: 74 additions & 0 deletions
@@ -9,6 +9,7 @@ usage: flatten-tool unflatten [-h] -f INPUT_FORMAT [--xml] [--id-name ID_NAME]
                               [--metatab-schema METATAB_SCHEMA]
                               [--metatab-only]
                               [--metatab-vertical-orientation]
+                              [--default-configuration DEFAULT_CONFIGURATION]
                               input_name
 
 positional arguments:
@@ -60,4 +61,7 @@ optional arguments:
   --metatab-vertical-orientation
                         Read metatab so that headings are in the first column
                         and data is read vertically. Only for XLSX not CSV
+  --default-configuration DEFAULT_CONFIGURATION
+                        Comma seperated list of default parsing commands for
+                        all sheets. Only for XLSX not CSV
 
@@ -4,6 +4,7 @@
 from flattentool.output import FORMATS_SUFFIX
 from flattentool.input import FORMATS as INPUT_FORMATS
 from flattentool.xml_output import toxml
+from flattentool.lib import parse_sheet_configuration
 import sys
 import json
 import codecs
@@ -112,6 +113,7 @@ def unflatten(input_name, base_json=None, input_format=None, output_name=None,
               vertical_orientation=False,
               metatab_name=None, metatab_only=False, metatab_schema='',
               metatab_vertical_orientation=False,
+              default_configuration='',
               **_):
     """
     Unflatten a flat structure (spreadsheet - csv or xlsx) into a nested structure (JSON).
@@ -131,6 +133,10 @@ def unflatten(input_name, base_json=None, input_format=None, output_name=None,
         base = OrderedDict()
 
 
+    base_configuration = parse_sheet_configuration(
+        [item.strip() for item in default_configuration.split(",")]
+    )
+
     cell_source_map_data = OrderedDict()
     heading_source_map_data = OrderedDict()
 
@@ -144,7 +150,8 @@ def unflatten(input_name, base_json=None, input_format=None, output_name=None,
             convert_titles=convert_titles,
             vertical_orientation=metatab_vertical_orientation,
             id_name=id_name,
-            xml=xml
+            xml=xml,
+            use_configuration=False
         )
         if metatab_schema:
             parser = SchemaParser(schema_filename=metatab_schema)
@@ -163,6 +170,8 @@ def unflatten(input_name, base_json=None, input_format=None, output_name=None,
             ## strip off meta/ from start of source map as actually data is at top level
             heading_source_map_data[key[5:]] = value
 
+        base_configuration = spreadsheet_input.sheet_configuration.get(metatab_name) or base_configuration
+
         if result:
             base.update(result[0])
 
@@ -177,7 +186,8 @@ def unflatten(input_name, base_json=None, input_format=None, output_name=None,
             exclude_sheets=[metatab_name],
             vertical_orientation=vertical_orientation,
             id_name=id_name,
-            xml=xml
+            xml=xml,
+            base_configuration=base_configuration
         )
         if schema:
             parser = SchemaParser(schema_filename=schema, rollup=True, root_id=root_id)
 
@@ -155,6 +155,9 @@ def create_parser():
         "--metatab-vertical-orientation",
         action='store_true',
         help="Read metatab so that headings are in the first column and data is read vertically. Only for XLSX not CSV")
+    parser_unflatten.add_argument(
+        "--default-configuration",
+        help="Comma seperated list of default parsing commands for all sheets. Only for XLSX not CSV")
 
     return parser
 
 
@@ -17,6 +17,7 @@
 import pytz
 from openpyxl.utils import _get_column_letter, column_index_from_string
 from flattentool.exceptions import DataErrorWarning
+from flattentool.lib import isint, parse_sheet_configuration
 
 
 class Cell:
@@ -133,6 +134,7 @@ def merge(base, mergee, debug_info=None):
             # This happens when a parent record finds the first a child record of a known type
             base[key] = v
 
+
 class SpreadsheetInput(object):
     """
     Base class describing a spreadsheet input. Has stubs which are
@@ -164,7 +166,9 @@ def __init__(self,
                  include_sheets=[],
                  exclude_sheets=[],
                  id_name='id',
-                 xml=False
+                 xml=False,
+                 base_configuration={},
+                 use_configuration=True
                 ):
         self.input_name = input_name
         self.root_list_path = root_list_path
@@ -178,6 +182,9 @@ def __init__(self,
         self.vertical_orientation = vertical_orientation
         self.include_sheets = include_sheets
         self.exclude_sheets = exclude_sheets
+        self.base_configuration = base_configuration or {}
+        self.sheet_configuration = {}
+        self.use_configuration = use_configuration
 
     def get_sub_sheets_lines(self):
         for sub_sheet_name in self.sub_sheet_names:
@@ -187,6 +194,13 @@ def get_sub_sheets_lines(self):
             else:
                 yield sub_sheet_name, self.get_sheet_lines(sub_sheet_name)
 
+    def configure_sheets(self):  # Fill self.sheet_configuration with one parsed command dict per sub-sheet.
+        for sub_sheet_name in self.sub_sheet_names:  # sub_sheet_names is assigned by read_sheets() right before it calls this method
+            self.sheet_configuration[sub_sheet_name] = parse_sheet_configuration(self.get_sheet_configuration(sub_sheet_name))  # raw command list -> dict, stored under the sub-sheet name
+
+    def get_sheet_configuration(self, sheet_name):  # Base implementation: return the raw command list for a sheet; sheet_name is unused here.
+        return []  # always empty here; a workbook-backed version that reads a '#' first row appears later in this diff
+
     def get_sheet_lines(self, sheet_name):
         raise NotImplementedError
 
@@ -203,6 +217,9 @@ def do_unflatten(self):
             sheet_name, lines = sheet
             try:
                 actual_headings = self.get_sheet_headings(sheet_name)
+                # If sheet is empty or too many lines have been skipped
+                if not actual_headings:
+                    continue
                 found = OrderedDict()
                 last_col = len(actual_headings)
                 # We want to ignore data in earlier columns, so we look
@@ -430,6 +447,7 @@ def read_sheets(self):
             except ValueError:
                 pass
         self.sub_sheet_names = sheet_names
+        self.configure_sheets()
 
     def get_sheet_lines(self, sheet_name):
         if sys.version > '3':  # If Python 3 or greater
@@ -460,23 +478,60 @@ def read_sheets(self):
 
         sheet_names = list(sheet for sheet in self.sheet_names_map.keys())
         self.sub_sheet_names = sheet_names
+        self.configure_sheets()
 
     def get_sheet_headings(self, sheet_name):
         worksheet = self.workbook[self.sheet_names_map[sheet_name]]
+        sheet_configuration = self.sheet_configuration[self.sheet_names_map[sheet_name]]  # command dict stored for this sheet by configure_sheets()
+        configuration_line = 1 if sheet_configuration else 0  # 1 = step over the sheet's own command row; NOTE(review): a '#' row with no recognised command parses to {} and is therefore not skipped
+        if not sheet_configuration:
+            sheet_configuration = self.base_configuration  # no per-sheet commands: fall back to the sheet-independent defaults
+        if not self.use_configuration:
+            sheet_configuration = {}  # commands disabled: skipRows/ignore are not applied (configuration_line keeps the value computed above)
+
+        skip_rows = sheet_configuration.get("skipRows", 0)  # 0 when the command is absent
+        if sheet_configuration.get("ignore"):
+            # returning empty headers is a proxy for no data in the sheet.
+            return []
 
         if self.vertical_orientation:
-            return [cell.value for cell in worksheet.columns[0]]
+            return [cell.value for cell in worksheet.columns[skip_rows][configuration_line:]]  # headings read down column index skip_rows, minus the first cell when a command row exists; NOTE(review): no IndexError guard here, unlike the row branch below
+
+        try:
+            return [cell.value for cell in worksheet.rows[skip_rows + configuration_line]]
+        except IndexError:
+            # The heading row index is past the last row of the sheet (e.g. skipRows is too large): report no headings.
+            return []
 
-        return [cell.value for cell in worksheet.rows[0]]
+    def get_sheet_configuration(self, sheet_name):  # Return the raw command cells of a worksheet's first row, or [] when that row does not start with '#'.
+        worksheet = self.workbook[self.sheet_names_map[sheet_name]]
+        if worksheet.rows[0][0].value == '#':  # a '#' in the very first cell marks row 0 as a command row
+            return [cell.value for num, cell in enumerate(worksheet.rows[0]) if num != 0 and cell.value]  # every non-empty cell after the marker; NOTE(review): values are returned as stored and parse_sheet_configuration calls .split() on each, so confirm they are always text
+        else:
+            return []  # no command row in this sheet
 
     def get_sheet_lines(self, sheet_name):
+        sheet_configuration = self.sheet_configuration[self.sheet_names_map[sheet_name]]
+        configuration_line = 1 if sheet_configuration else 0
+        if not sheet_configuration:
+            sheet_configuration = self.base_configuration
+        if not self.use_configuration:
+            sheet_configuration = {}
+
+        skip_rows = sheet_configuration.get("skipRows", 0)
+        header_rows = sheet_configuration.get("headerRows", 1)
+
+
         worksheet = self.workbook[self.sheet_names_map[sheet_name]]
         if self.vertical_orientation:
-            header_row = worksheet.columns[0]
-            remaining_rows = worksheet.columns[1:]
+            header_row = worksheet.columns[skip_rows]
+            remaining_rows = worksheet.columns[skip_rows + header_rows:]
+            if configuration_line:
+                header_row = header_row[1:]
+                remaining_rows = [row[1:] for row in remaining_rows]
         else:
-            header_row = worksheet.rows[0]
-            remaining_rows = worksheet.rows[1:]
+            header_row = worksheet.rows[skip_rows + configuration_line]
+            remaining_rows = worksheet.rows[skip_rows + configuration_line + header_rows:]
 
         coli_to_header = ({i: x.value for i, x in enumerate(header_row) if x.value is not None})
         for row in remaining_rows:
@@ -489,12 +544,6 @@ def get_sheet_lines(self, sheet_name):
 }
 
 
-def isint(string):
-    try:
-        int(string)
-        return True
-    except ValueError:
-        return False
 
 class ListAsDict(dict):
     pass
 
@@ -0,0 +1,18 @@
+def isint(string):  # Return True when int(string) succeeds, False when it raises ValueError.
+    try:
+        int(string)  # result discarded; only success or failure matters
+        return True
+    except ValueError:  # only ValueError is handled; any other exception from int() propagates to the caller
+        return False
+
+def parse_sheet_configuration(configuration_list):  # Turn a list of command strings into a dict with optional keys 'skipRows', 'headerRows' and 'ignore'.
+    configuration = {}
+    for item in configuration_list:
+        parts = item.split()  # whitespace split: "skipRows 1" -> ["skipRows", "1"]; NOTE(review): assumes every item is a str
+        if (len(parts) == 2 and parts[0].lower() == "skiprows" and isint(parts[1])):  # command names are matched case-insensitively
+            configuration['skipRows'] = max(int(parts[1]), 0)  # negative values are clamped to 0
+        if (len(parts) == 2 and parts[0].lower() == "headerrows" and isint(parts[1])):
+            configuration['headerRows'] = max(int(parts[1]), 1)  # values below 1 are clamped to 1
+        if (len(parts) == 1 and parts[0].lower() == "ignore"):  # bare word, no argument
+            configuration['ignore'] = True
+    return configuration  # items matching none of the three forms are dropped silently; {} means no commands were recognised
@@ -1138,3 +1138,77 @@ def test_bad_format(tmpdir):
             input_format=None,
             output_name=tmpdir.join('meta_unflattened.json').strpath,
             )
+
+def test_commands_single_sheet(tmpdir):  # Unflatten commands_in_file.xlsx with no default configuration and check the resulting JSON.
+
+    unflatten(
+        'flattentool/tests/fixtures/xlsx/commands_in_file.xlsx',  # presumably carries its commands in a '#' first row; fixture contents are not visible here
+        input_format='xlsx',
+        output_name=tmpdir.join('command_single_unflattened.json').strpath,
+        cell_source_map=tmpdir.join('command_single_source_map.json').strpath,
+        heading_source_map=tmpdir.join('command_single_heading_source_map.json').strpath,
+        )
+
+    unflattened = json.load(tmpdir.join('command_single_unflattened.json'))  # read back the file unflatten() wrote
+
+    assert unflattened == {'main': [{'actual': 'actual', 'headings': 'data', 'some': 'some'}]}  # exactly one data row is expected
+
+def test_commands_metatab(tmpdir):  # Unflatten a workbook whose 'Meta' sheet is read as a vertically oriented metatab.
+
+    unflatten(
+        'flattentool/tests/fixtures/xlsx/commands_in_metatab.xlsx',
+        input_format='xlsx',
+        output_name=tmpdir.join('command_metatab_unflattened.json').strpath,
+        cell_source_map=tmpdir.join('command_metatab_source_map.json').strpath,
+        heading_source_map=tmpdir.join('command_metatab_heading_source_map.json').strpath,
+        metatab_name='Meta',  # sheet treated as the metatab
+        metatab_vertical_orientation=True  # headings in the first column, data read vertically (per the CLI help text)
+        )
+
+    unflattened = json.load(tmpdir.join('command_metatab_unflattened.json'))  # read back the file unflatten() wrote
+
+    assert unflattened == {'main': [{'actual': 'actual', 'headings': 'data', 'some': 'some'}, {'actual': 'actual', 'headings': 'Other data', 'some': 'some'}],
+                           'some': 'data'}  # top-level key, presumably contributed by the Meta sheet; fixture contents are not visible here
+
+def test_commands_single_sheet_default(tmpdir):  # Run the same fixture twice with different default_configuration strings and compare the outputs.
+
+    unflatten(
+        'flattentool/tests/fixtures/xlsx/commands_defaulted.xlsx',
+        input_format='xlsx',
+        output_name=tmpdir.join('command_single_unflattened.json').strpath,
+        cell_source_map=tmpdir.join('command_single_source_map.json').strpath,
+        heading_source_map=tmpdir.join('command_single_heading_source_map.json').strpath,
+        default_configuration="SkipRows 1, headerrows 2",  # mixed-case names and a space after the comma: both accepted by the split/strip/lower handling in this diff
+        )
+
+    unflattened = json.load(tmpdir.join('command_single_unflattened.json'))
+
+    assert unflattened == {'main': [{'actual': 'actual', 'headings': 'data', 'some': 'some'}]}  # two header rows: a single data row is expected
+
+
+    unflatten(
+        'flattentool/tests/fixtures/xlsx/commands_defaulted.xlsx',
+        input_format='xlsx',
+        output_name=tmpdir.join('command_single_unflattened.json').strpath,  # same output path as the first run, so the file is overwritten
+        cell_source_map=tmpdir.join('command_single_source_map.json').strpath,
+        heading_source_map=tmpdir.join('command_single_heading_source_map.json').strpath,
+        default_configuration="SkipRows 1",  # headerRows omitted this time
+        )
+
+    unflattened = json.load(tmpdir.join('command_single_unflattened.json'))
+
+    assert unflattened == {'main': [{'actual': 'other', 'headings': 'headings', 'some': 'some'}, {'actual': 'actual', 'headings': 'data', 'some': 'some'}]}  # the former second header row now appears as the first data row
+
+def test_commands_ignore(tmpdir):  # Unflatten commands_ignore.xlsx and check that only the expected 'main' row is output.
+
+    unflatten(
+        'flattentool/tests/fixtures/xlsx/commands_ignore.xlsx',  # presumably contains a sheet marked with the 'ignore' command; fixture contents are not visible here
+        input_format='xlsx',
+        output_name=tmpdir.join('command_single_unflattened.json').strpath,
+        cell_source_map=tmpdir.join('command_single_source_map.json').strpath,
+        heading_source_map=tmpdir.join('command_single_heading_source_map.json').strpath,
+        )
+
+    unflattened = json.load(tmpdir.join('command_single_unflattened.json'))  # read back the file unflatten() wrote
+
+    assert unflattened == {'main': [{'actual': 'actual', 'headings': 'data', 'some': 'some'}]}  # nothing beyond the single 'main' row may appear