Skip to content

Commit f8bc49d

Browse files
authored
Merge branch 'master' into cove-611-merging-message
2 parents 560a3c7 + 3fb433e commit f8bc49d

File tree

7 files changed

+124
-11
lines changed

7 files changed

+124
-11
lines changed

flattentool/input.py

Lines changed: 69 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ def __init__(self, cell_value, cell_location):
4040
except ImportError:
4141
from UserDict import UserDict # pylint: disable=F0401
4242

43+
4344
def convert_type(type_string, value, timezone = pytz.timezone('UTC')):
4445
if value == '' or value is None:
4546
return None
@@ -93,6 +94,8 @@ def convert_type(type_string, value, timezone = pytz.timezone('UTC')):
9394
elif type_string == '':
9495
if type(value) == datetime.datetime:
9596
return timezone.localize(value).isoformat()
97+
if type(value) == float and int(value) == value:
98+
return int(value)
9699
return value if type(value) in [int] else text_type(value)
97100
else:
98101
raise ValueError('Unrecognised type: "{}"'.format(type_string))
@@ -446,18 +449,29 @@ class CSVInput(SpreadsheetInput):
446449
encoding = 'utf-8'
447450

448451
def get_sheet_headings(self, sheet_name):
452+
sheet_configuration = self.sheet_configuration[self.sheet_names_map[sheet_name]]
453+
configuration_line = 1 if sheet_configuration else 0
454+
if not sheet_configuration:
455+
sheet_configuration = self.base_configuration
456+
if not self.use_configuration:
457+
sheet_configuration = {}
458+
skip_rows = sheet_configuration.get("skipRows", 0)
459+
if sheet_configuration.get("ignore"):
460+
# returning empty headers is a proxy for no data in the sheet.
461+
return []
462+
449463
if sys.version > '3': # If Python 3 or greater
450464
with open(os.path.join(self.input_name, sheet_name+'.csv'), encoding=self.encoding) as main_sheet_file:
451465
r = csvreader(main_sheet_file)
452-
for row in enumerate(r):
453-
# Just return the first row
454-
return row[1]
466+
for num, row in enumerate(r):
467+
if num == (skip_rows + configuration_line):
468+
return row
455469
else: # If Python 2
456470
with open(os.path.join(self.input_name, sheet_name+'.csv')) as main_sheet_file:
457471
r = csvreader(main_sheet_file, encoding=self.encoding)
458-
for row in enumerate(r):
459-
# Just return the first row
460-
return row[1]
472+
for num, row in enumerate(r):
473+
if num == (skip_rows + configuration_line):
474+
return row
461475

462476
def read_sheets(self):
463477
sheet_file_names = os.listdir(self.input_name)
@@ -472,21 +486,66 @@ def read_sheets(self):
472486
except ValueError:
473487
pass
474488
self.sub_sheet_names = sheet_names
489+
self.sheet_names_map = OrderedDict((sheet_name, sheet_name) for sheet_name in sheet_names)
475490
self.configure_sheets()
476491

492+
def generate_rows(self, dictreader, sheet_name):
493+
sheet_configuration = self.sheet_configuration[self.sheet_names_map[sheet_name]]
494+
configuration_line = 1 if sheet_configuration else 0
495+
if not sheet_configuration:
496+
sheet_configuration = self.base_configuration
497+
if not self.use_configuration:
498+
sheet_configuration = {}
499+
500+
skip_rows = sheet_configuration.get("skipRows", 0)
501+
header_rows = sheet_configuration.get("headerRows", 1)
502+
for i in range(0, configuration_line + skip_rows):
503+
previous_row = next(dictreader.reader)
504+
if sys.version > '3': # If Python 3 or greater
505+
fieldnames = dictreader.fieldnames
506+
else:
507+
# unicodecsv DictReader always reads the heading line first
508+
# so in the case of there being any rows to skip look at
509+
# previous row and use that for fieldnames.
510+
if (configuration_line + skip_rows):
511+
fieldnames = previous_row
512+
dictreader.fieldnames = fieldnames
513+
dictreader.unicode_fieldnames = fieldnames
514+
else:
515+
fieldnames = dictreader.unicode_fieldnames
516+
for i in range(0, header_rows - 1):
517+
next(dictreader.reader)
518+
for line in dictreader:
519+
yield OrderedDict((fieldname, line[fieldname]) for fieldname in fieldnames)
520+
521+
def get_sheet_configuration(self, sheet_name):
522+
if sys.version > '3': # If Python 3 or greater
523+
with open(os.path.join(self.input_name, sheet_name+'.csv'), encoding=self.encoding) as main_sheet_file:
524+
r = csvreader(main_sheet_file)
525+
heading_row = next(r)
526+
else: # If Python 2
527+
with open(os.path.join(self.input_name, sheet_name+'.csv')) as main_sheet_file:
528+
r = csvreader(main_sheet_file, encoding=self.encoding)
529+
heading_row = next(r)
530+
if heading_row[0] == '#':
531+
return heading_row[1:]
532+
return []
533+
534+
535+
477536
def get_sheet_lines(self, sheet_name):
478537
if sys.version > '3': # If Python 3 or greater
479538
# Pass the encoding to the open function
480539
with open(os.path.join(self.input_name, sheet_name+'.csv'), encoding=self.encoding) as main_sheet_file:
481540
dictreader = DictReader(main_sheet_file)
482-
for line in dictreader:
483-
yield OrderedDict((fieldname, line[fieldname]) for fieldname in dictreader.fieldnames)
541+
for row in self.generate_rows(dictreader, sheet_name):
542+
yield row
484543
else: # If Python 2
485544
# Pass the encoding to DictReader
486545
with open(os.path.join(self.input_name, sheet_name+'.csv')) as main_sheet_file:
487546
dictreader = DictReader(main_sheet_file, encoding=self.encoding)
488-
for line in dictreader:
489-
yield OrderedDict((fieldname, line[fieldname]) for fieldname in dictreader.fieldnames)
547+
for row in self.generate_rows(dictreader, sheet_name):
548+
yield row
490549

491550

492551
class XLSXInput(SpreadsheetInput):
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#,ignore
2+
bla,bla,bla
3+
bla,bla,bla
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#,skipRows 1,HeaderRows 2
2+
,,
3+
some,actual,headings
4+
some,other,headings
5+
some,actual,data
3.58 KB
Binary file not shown.

flattentool/tests/test_init.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1156,7 +1156,7 @@ def test_bad_format(tmpdir):
11561156
output_name=tmpdir.join('meta_unflattened.json').strpath,
11571157
)
11581158

1159-
def test_commands_single_sheet(tmpdir):
1159+
def test_commands_single_sheet_xlsx(tmpdir):
11601160

11611161
unflatten(
11621162
'flattentool/tests/fixtures/xlsx/commands_in_file.xlsx',
@@ -1170,6 +1170,17 @@ def test_commands_single_sheet(tmpdir):
11701170

11711171
assert unflattened == {'main': [{'actual': 'actual', 'headings': 'data', 'some': 'some'}]}
11721172

1173+
def test_commands_single_sheet_csv(tmpdir):
1174+
unflatten(
1175+
'flattentool/tests/fixtures/csv/commands_in_file',
1176+
input_format='csv',
1177+
output_name=tmpdir.join('command_single_unflattened.json').strpath,
1178+
cell_source_map=tmpdir.join('command_single_source_map.json').strpath,
1179+
heading_source_map=tmpdir.join('command_single_heading_source_map.json').strpath,
1180+
)
1181+
unflattened = json.load(tmpdir.join('command_single_unflattened.json'))
1182+
assert unflattened == {'main': [{'actual': 'actual', 'headings': 'data', 'some': 'some'}]}
1183+
11731184
def test_commands_metatab(tmpdir):
11741185

11751186
unflatten(

flattentool/tests/test_input_SpreadsheetInput.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,25 @@ def test_xlsx_input_integer(self):
9898

9999
assert list(xlsxinput.get_sheet_lines('main')) == \
100100
[{'colA': 1}]
101+
if sys.version_info[0] == 2:
102+
assert type(list(xlsxinput.get_sheet_lines('main'))[0]['colA']) == long
103+
else:
104+
assert type(list(xlsxinput.get_sheet_lines('main'))[0]['colA']) == int
101105
assert xlsxinput.sub_sheet_names == ['main']
102106

107+
def test_xlsx_input_integer2(self):
108+
xlsxinput = XLSXInput(input_name='flattentool/tests/fixtures/xlsx/integer2.xlsx')
109+
110+
xlsxinput.read_sheets()
111+
112+
assert list(xlsxinput.get_sheet_lines('Sheet1')) == \
113+
[{'activity-status/@code': 2}]
114+
# This is a float, but is converted to an int in the unflatten step, see
115+
# test_input_SpreadsheetInput_unflatten.py
116+
# 'Basic with float'
117+
assert type(list(xlsxinput.get_sheet_lines('Sheet1'))[0]['activity-status/@code']) == float
118+
assert xlsxinput.sub_sheet_names == ['Sheet1']
119+
103120
def test_xlsx_input_formula(self):
104121
""" When a formula is present, we should use the value, rather than the
105122
formula itself. """

flattentool/tests/test_input_SpreadsheetInput_unflatten.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,24 @@ def inject_root_id(root_id, d):
6161
[],
6262
True
6363
),
64+
(
65+
'Basic with float',
66+
# 3.0 is converted to 3
67+
# This is needed to handle google docs xlsx properly
68+
# https://github.com/OpenDataServices/cove/issues/838
69+
[{
70+
'ROOT_ID': '1',
71+
'id': 2,
72+
'testA': 3.0
73+
}],
74+
[{
75+
'ROOT_ID': '1',
76+
'id': 2,
77+
'testA': 3
78+
}],
79+
[],
80+
True
81+
),
6482
(
6583
'Basic with zero',
6684
[{

0 commit comments

Comments
 (0)