Merge branch 'master' into cove-611-merging-message

Bjwebb · web-flow · commit f8bc49df3b1b · 2017-10-04T09:31:45.000+01:00
diff --git a/flattentool/input.py b/flattentool/input.py
@@ -40,6 +40,7 @@ def __init__(self, cell_value, cell_location):
 except ImportError:
     from UserDict import UserDict  # pylint: disable=F0401
 
+
 def convert_type(type_string, value, timezone = pytz.timezone('UTC')):
     if value == '' or value is None:
         return None
@@ -93,6 +94,8 @@ def convert_type(type_string, value, timezone = pytz.timezone('UTC')):
     elif type_string == '':
         if type(value) == datetime.datetime:
             return timezone.localize(value).isoformat()
+        if type(value) == float and int(value) == value:
+            return int(value)
         return value if type(value) in [int] else text_type(value)
     else:
         raise ValueError('Unrecognised type: "{}"'.format(type_string))
@@ -446,18 +449,29 @@ class CSVInput(SpreadsheetInput):
     encoding = 'utf-8'
 
     def get_sheet_headings(self, sheet_name):
+        sheet_configuration = self.sheet_configuration[self.sheet_names_map[sheet_name]]
+        configuration_line = 1 if sheet_configuration else 0
+        if not sheet_configuration:
+            sheet_configuration = self.base_configuration
+        if not self.use_configuration:
+            sheet_configuration = {}
+        skip_rows = sheet_configuration.get("skipRows", 0)
+        if sheet_configuration.get("ignore"):
+            # returning empty headers is a proxy for no data in the sheet.
+            return []
+
         if sys.version > '3':  # If Python 3 or greater
             with open(os.path.join(self.input_name, sheet_name+'.csv'), encoding=self.encoding) as main_sheet_file:
                 r = csvreader(main_sheet_file)
-                for row in enumerate(r):
-                    # Just return the first row
-                    return row[1]
+                for num, row in enumerate(r):
+                    if num == (skip_rows + configuration_line):
+                        return row
         else:  # If Python 2
             with open(os.path.join(self.input_name, sheet_name+'.csv')) as main_sheet_file:
                 r = csvreader(main_sheet_file, encoding=self.encoding)
-                for row in enumerate(r):
-                    # Just return the first row
-                    return row[1]
+                for num, row in enumerate(r):
+                    if num == (skip_rows + configuration_line):
+                        return row
 
     def read_sheets(self):
         sheet_file_names = os.listdir(self.input_name)
@@ -472,21 +486,66 @@ def read_sheets(self):
             except ValueError:
                 pass
         self.sub_sheet_names = sheet_names
+        self.sheet_names_map = OrderedDict((sheet_name, sheet_name) for sheet_name in sheet_names)
         self.configure_sheets()
 
+    def generate_rows(self, dictreader, sheet_name):
+        sheet_configuration = self.sheet_configuration[self.sheet_names_map[sheet_name]]
+        configuration_line = 1 if sheet_configuration else 0
+        if not sheet_configuration:
+            sheet_configuration = self.base_configuration
+        if not self.use_configuration:
+            sheet_configuration = {}
+
+        skip_rows = sheet_configuration.get("skipRows", 0)
+        header_rows = sheet_configuration.get("headerRows", 1)
+        for i in range(0, configuration_line + skip_rows):
+            previous_row = next(dictreader.reader)
+        if sys.version > '3':  # If Python 3 or greater
+            fieldnames = dictreader.fieldnames
+        else:
+            # unicodecsv's DictReader always reads the heading line first,
+            # so if any rows were skipped, the last row read above
+            # (previous_row) holds the headings; use it for fieldnames.
+            if (configuration_line + skip_rows):
+                fieldnames = previous_row
+                dictreader.fieldnames = fieldnames
+                dictreader.unicode_fieldnames = fieldnames
+            else:
+                fieldnames = dictreader.unicode_fieldnames 
+        for i in range(0, header_rows - 1):
+            next(dictreader.reader)
+        for line in dictreader:
+            yield OrderedDict((fieldname, line[fieldname]) for fieldname in fieldnames)
+
+    def get_sheet_configuration(self, sheet_name):
+        if sys.version > '3':  # If Python 3 or greater
+            with open(os.path.join(self.input_name, sheet_name+'.csv'), encoding=self.encoding) as main_sheet_file:
+                r = csvreader(main_sheet_file)
+                heading_row = next(r)
+        else:  # If Python 2
+            with open(os.path.join(self.input_name, sheet_name+'.csv')) as main_sheet_file:
+                r = csvreader(main_sheet_file, encoding=self.encoding)
+                heading_row = next(r)
+        if heading_row[0] == '#':
+            return heading_row[1:]
+        return []
+
+
+
     def get_sheet_lines(self, sheet_name):
         if sys.version > '3':  # If Python 3 or greater
             # Pass the encoding to the open function
             with open(os.path.join(self.input_name, sheet_name+'.csv'), encoding=self.encoding) as main_sheet_file:
                 dictreader = DictReader(main_sheet_file)
-                for line in dictreader:
-                    yield OrderedDict((fieldname, line[fieldname]) for fieldname in dictreader.fieldnames)
+                for row in self.generate_rows(dictreader, sheet_name):
+                    yield row
         else:  # If Python 2
             # Pass the encoding to DictReader
             with open(os.path.join(self.input_name, sheet_name+'.csv')) as main_sheet_file:
                 dictreader = DictReader(main_sheet_file, encoding=self.encoding)
-                for line in dictreader:
-                    yield OrderedDict((fieldname, line[fieldname]) for fieldname in dictreader.fieldnames)
+                for row in self.generate_rows(dictreader, sheet_name):
+                    yield row
 
 
 class XLSXInput(SpreadsheetInput):
diff --git a/flattentool/tests/fixtures/csv/commands_in_file/ignore.csv b/flattentool/tests/fixtures/csv/commands_in_file/ignore.csv
@@ -0,0 +1,3 @@
+#,ignore
+bla,bla,bla
+bla,bla,bla
diff --git a/flattentool/tests/fixtures/csv/commands_in_file/sheet1.csv b/flattentool/tests/fixtures/csv/commands_in_file/sheet1.csv
@@ -0,0 +1,5 @@
+#,skipRows 1,HeaderRows 2
+,,
+some,actual,headings
+some,other,headings
+some,actual,data
diff --git a/flattentool/tests/fixtures/xlsx/integer2.xlsx b/flattentool/tests/fixtures/xlsx/integer2.xlsx
diff --git a/flattentool/tests/test_init.py b/flattentool/tests/test_init.py
@@ -1156,7 +1156,7 @@ def test_bad_format(tmpdir):
             output_name=tmpdir.join('meta_unflattened.json').strpath,
             )
 
-def test_commands_single_sheet(tmpdir):
+def test_commands_single_sheet_xlsx(tmpdir):
 
     unflatten(
         'flattentool/tests/fixtures/xlsx/commands_in_file.xlsx',
@@ -1170,6 +1170,17 @@ def test_commands_single_sheet(tmpdir):
 
     assert unflattened == {'main': [{'actual': 'actual', 'headings': 'data', 'some': 'some'}]}
 
+def test_commands_single_sheet_csv(tmpdir):
+    unflatten(
+        'flattentool/tests/fixtures/csv/commands_in_file',
+        input_format='csv',
+        output_name=tmpdir.join('command_single_unflattened.json').strpath,
+        cell_source_map=tmpdir.join('command_single_source_map.json').strpath,
+        heading_source_map=tmpdir.join('command_single_heading_source_map.json').strpath,
+        )
+    unflattened = json.load(tmpdir.join('command_single_unflattened.json'))
+    assert unflattened == {'main': [{'actual': 'actual', 'headings': 'data', 'some': 'some'}]}
+
 def test_commands_metatab(tmpdir):
 
     unflatten(
diff --git a/flattentool/tests/test_input_SpreadsheetInput.py b/flattentool/tests/test_input_SpreadsheetInput.py
@@ -98,8 +98,25 @@ def test_xlsx_input_integer(self):
 
         assert list(xlsxinput.get_sheet_lines('main')) == \
             [{'colA': 1}]
+        if sys.version_info[0] == 2:
+            assert type(list(xlsxinput.get_sheet_lines('main'))[0]['colA']) == long
+        else:
+            assert type(list(xlsxinput.get_sheet_lines('main'))[0]['colA']) == int
         assert xlsxinput.sub_sheet_names == ['main']
 
+    def test_xlsx_input_integer2(self):
+        xlsxinput = XLSXInput(input_name='flattentool/tests/fixtures/xlsx/integer2.xlsx')
+
+        xlsxinput.read_sheets()
+
+        assert list(xlsxinput.get_sheet_lines('Sheet1')) == \
+            [{'activity-status/@code': 2}]
+        # This is a float, but is converted to an int in the unflatten step, see
+        # test_input_SpreadsheetInput_unflatten.py
+        # 'Basic with float'
+        assert type(list(xlsxinput.get_sheet_lines('Sheet1'))[0]['activity-status/@code']) == float
+        assert xlsxinput.sub_sheet_names == ['Sheet1']
+
     def test_xlsx_input_formula(self):
         """ When a forumla is present, we should use the value, rather than the
         formula itself. """
diff --git a/flattentool/tests/test_input_SpreadsheetInput_unflatten.py b/flattentool/tests/test_input_SpreadsheetInput_unflatten.py
@@ -61,6 +61,24 @@ def inject_root_id(root_id, d):
         [],
         True
     ),
+    (
+        'Basic with float',
+        # The float 3.0 is converted to the integer 3.
+        # This is needed to handle Google Docs xlsx files properly:
+        # https://github.com/OpenDataServices/cove/issues/838
+        [{
+            'ROOT_ID': '1',
+            'id': 2,
+            'testA': 3.0
+        }],
+        [{
+                'ROOT_ID': '1',
+                'id': 2,
+                'testA': 3
+        }],
+        [],
+        True
+    ),
     (
         'Basic with zero',
         [{

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+#,ignore`
	`2`	`+bla,bla,bla`
	`3`	`+bla,bla,bla`