Merge branch 'master' into 151-xml-to-spreadsheet

Bjwebb · web-flow · commit 084608c6ef31 · 2017-10-12T17:19:07.000+01:00
diff --git a/examples/cafe/relationship-merge-multiple/expected_stderr.json b/examples/cafe/relationship-merge-multiple/expected_stderr.json
@@ -1,4 +1,4 @@
-... DataErrorWarning: Conflict when merging field "name" for id "CAFE-HEALTH" in sheet b: "Healthy Cafe" != "Incorrect value". If you were not expecting merging you may have a duplicate ID.
+... DataErrorWarning: You may have a duplicate Identifier: We couldn't merge these rows with the id "CAFE-HEALTH": field "name" in sheet "b": one cell has the value: "Healthy Cafe", the other cell has the value: "Incorrect value"
   DataErrorWarning)
-... DataErrorWarning: Conflict when merging field "number_of_tables" for id "CAFE-HEALTH" in sheet d: "3" != "4". If you were not expecting merging you may have a duplicate ID.
+... DataErrorWarning: You may have a duplicate Identifier: We couldn't merge these rows with the id "CAFE-HEALTH": field "number_of_tables" in sheet "d": one cell has the value: "3", the other cell has the value: "4"
   DataErrorWarning)
diff --git a/examples/cafe/relationship-merge-single/expected_stderr.json b/examples/cafe/relationship-merge-single/expected_stderr.json
@@ -1,4 +1,4 @@
-... DataErrorWarning: Conflict when merging field "name" for id "CAFE-HEALTH" in sheet data: "Healthy Cafe" != "Vegetarian Cafe". If you were not expecting merging you may have a duplicate ID.
+... DataErrorWarning: You may have a duplicate Identifier: We couldn't merge these rows with the id "CAFE-HEALTH": field "name" in sheet "data": one cell has the value: "Healthy Cafe", the other cell has the value: "Vegetarian Cafe"
   DataErrorWarning)
-... DataErrorWarning: Conflict when merging field "number_of_tables" for id "CAFE-HEALTH" in sheet data: "3" != "4". If you were not expecting merging you may have a duplicate ID.
+... DataErrorWarning: You may have a duplicate Identifier: We couldn't merge these rows with the id "CAFE-HEALTH": field "number_of_tables" in sheet "data": one cell has the value: "3", the other cell has the value: "4"
   DataErrorWarning)
diff --git a/flattentool/input.py b/flattentool/input.py
@@ -94,6 +94,8 @@ def convert_type(type_string, value, timezone = pytz.timezone('UTC')):
     elif type_string == '':
         if type(value) == datetime.datetime:
             return timezone.localize(value).isoformat()
+        if type(value) == float and int(value) == value:
+            return int(value)
         return value if type(value) in [int] else text_type(value)
     else:
         raise ValueError('Unrecognised type: "{}"'.format(type_string))
@@ -151,8 +153,8 @@ def merge(base, mergee, debug_info=None):
                     if debug_info.get('root_id'):
                         id_info = '{} "{}", '.format(debug_info.get('root_id'), debug_info.get('root_id_or_none'))+id_info
                     warn(
-                        'Conflict when merging field "{}" for {} in sheet {}: "{}" != "{}". If you were not expecting merging you may have a duplicate ID.'.format(
-                            key, id_info, debug_info.get('sheet_name'), base_value, value),
+                        'You may have a duplicate Identifier: We couldn\'t merge these rows with the {}: field "{}" in sheet "{}": one cell has the value: "{}", the other cell has the value: "{}"'.format(
+                            id_info, key, debug_info.get('sheet_name'), base_value, value),
                         DataErrorWarning)
                 else:
                     base[key].sub_cells.append(v)
@@ -572,7 +574,8 @@ def get_sheet_headings(self, sheet_name):
             sheet_configuration = {}
 
         skip_rows = sheet_configuration.get("skipRows", 0)
-        if sheet_configuration.get("ignore"):
+        if (sheet_configuration.get("ignore") or
+            (sheet_configuration.get("hashcomments") and sheet_name.startswith('#'))):
             # returning empty headers is a proxy for no data in the sheet.
             return []
 
@@ -615,7 +618,13 @@ def get_sheet_lines(self, sheet_name):
             header_row = worksheet.rows[skip_rows + configuration_line]
             remaining_rows = worksheet.rows[skip_rows + configuration_line + header_rows:]
 
-        coli_to_header = ({i: x.value for i, x in enumerate(header_row) if x.value is not None})
+        coli_to_header = {}
+        for i, header in enumerate(header_row):
+            if header.value is None:
+                continue
+            if sheet_configuration.get("hashcomments") and str(header.value).startswith('#'):
+                continue
+            coli_to_header[i] = header.value
         for row in remaining_rows:
             yield OrderedDict((coli_to_header[i], x.value) for i, x in enumerate(row) if i in coli_to_header)
 
diff --git a/flattentool/lib.py b/flattentool/lib.py
@@ -15,4 +15,6 @@ def parse_sheet_configuration(configuration_list):
             configuration['headerRows'] = max(int(parts[1]), 1)
         if (len(parts) == 1 and parts[0].lower() == "ignore"):
             configuration['ignore'] = True
+        if (len(parts) == 1 and parts[0].lower() in ("hashcomments", "hashcomment")):
+            configuration['hashcomments'] = True
     return configuration
diff --git a/flattentool/tests/fixtures/xlsx/commands_hashcomments.xlsx b/flattentool/tests/fixtures/xlsx/commands_hashcomments.xlsx
diff --git a/flattentool/tests/fixtures/xlsx/integer2.xlsx b/flattentool/tests/fixtures/xlsx/integer2.xlsx
diff --git a/flattentool/tests/test_init.py b/flattentool/tests/test_init.py
@@ -1240,3 +1240,20 @@ def test_commands_ignore(tmpdir):
     unflattened = json.load(tmpdir.join('command_single_unflattened.json'))
 
     assert unflattened == {'main': [{'actual': 'actual', 'headings': 'data', 'some': 'some'}]}
+
+def test_commands_hashcomments(tmpdir):
+
+    unflatten(
+        'flattentool/tests/fixtures/xlsx/commands_hashcomments.xlsx',
+        input_format='xlsx',
+        output_name=tmpdir.join('commands_hashcomments_unflattened.json').strpath,
+        cell_source_map=tmpdir.join('commands_hashcomments_source_map.json').strpath,
+        heading_source_map=tmpdir.join('commands_hashcomments_heading_source_map.json').strpath,
+        metatab_name='Meta',
+        metatab_vertical_orientation=True
+        )
+
+    unflattened = json.load(tmpdir.join('commands_hashcomments_unflattened.json'))
+
+    assert unflattened == {'main': [{'actual': 'actual', 'headings': 'data', 'some': 'some'}, {'actual': 'actual', 'headings': 'Other data', 'some': 'some'}],
+                           'some': 'data'}
diff --git a/flattentool/tests/test_input_SpreadsheetInput.py b/flattentool/tests/test_input_SpreadsheetInput.py
@@ -98,8 +98,25 @@ def test_xlsx_input_integer(self):
 
         assert list(xlsxinput.get_sheet_lines('main')) == \
             [{'colA': 1}]
+        if sys.version_info[0] == 2:
+            assert type(list(xlsxinput.get_sheet_lines('main'))[0]['colA']) == long
+        else:
+            assert type(list(xlsxinput.get_sheet_lines('main'))[0]['colA']) == int
         assert xlsxinput.sub_sheet_names == ['main']
 
+    def test_xlsx_input_integer2(self):
+        xlsxinput = XLSXInput(input_name='flattentool/tests/fixtures/xlsx/integer2.xlsx')
+
+        xlsxinput.read_sheets()
+
+        assert list(xlsxinput.get_sheet_lines('Sheet1')) == \
+            [{'activity-status/@code': 2}]
+        # This is a float, but is converted to an int in the unflatten step, see
+        # test_input_SpreadsheetInput_unflatten.py
+        # 'Basic with float'
+        assert type(list(xlsxinput.get_sheet_lines('Sheet1'))[0]['activity-status/@code']) == float
+        assert xlsxinput.sub_sheet_names == ['Sheet1']
+
     def test_xlsx_input_formula(self):
         """ When a forumla is present, we should use the value, rather than the
         formula itself. """
diff --git a/flattentool/tests/test_input_SpreadsheetInput_unflatten.py b/flattentool/tests/test_input_SpreadsheetInput_unflatten.py
@@ -61,6 +61,24 @@ def inject_root_id(root_id, d):
         [],
         True
     ),
+    (
+        'Basic with float',
+        # A float with a whole-number value (3.0) is converted to the int 3.
+        # This is needed to handle XLSX files from Google Docs properly, see
+        # https://github.com/OpenDataServices/cove/issues/838
+        [{
+            'ROOT_ID': '1',
+            'id': 2,
+            'testA': 3.0
+        }],
+        [{
+                'ROOT_ID': '1',
+                'id': 2,
+                'testA': 3
+        }],
+        [],
+        True
+    ),
     (
         'Basic with zero',
         [{