Skip to content

Commit cc7c8a7

Browse files
committed
ExcelReader: Migrate to openpyxl
1 parent 50d46b2 commit cc7c8a7

File tree

5 files changed

+77
-27
lines changed

5 files changed

+77
-27
lines changed

Orange/data/io.py

Lines changed: 63 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import subprocess
77
import sys
88
import warnings
9+
from typing import List, Iterable
910

1011
from ast import literal_eval
1112
from collections import OrderedDict, Counter, defaultdict
@@ -26,6 +27,7 @@
2627

2728
import xlrd
2829
import xlsxwriter
30+
import openpyxl
2931

3032
from Orange.data import (
3133
_io, is_discrete_values, MISSING_VALUES, Table, Domain, Variable,
@@ -405,15 +407,15 @@ def __init__(self, filename):
405407
self.sheet = None
406408

407409
@property
408-
def sheets(self):
410+
def sheets(self) -> List:
409411
"""FileFormats with a notion of sheets should override this property
410412
to return a list of sheet names in the file.
411413
412414
Returns
413415
-------
414416
a list of sheet names
415417
"""
416-
return ()
418+
return []
417419

418420
def select_sheet(self, sheet):
419421
"""Select sheet to be read
@@ -1007,33 +1009,12 @@ def __init__(self, filename):
10071009
super().__init__(filename=filename)
10081010
self._workbook = None
10091011

1010-
@property
1011-
def workbook(self):
1012-
if not self._workbook:
1013-
self._workbook = xlrd.open_workbook(self.filename, on_demand=True)
1014-
return self._workbook
1015-
1016-
@property
1017-
@lru_cache(1)
1018-
def sheets(self):
1019-
if self.workbook:
1020-
return self.workbook.sheet_names()
1021-
else:
1022-
return ()
1012+
def get_cells(self) -> Iterable:
    """Return the cell rows that ``read`` turns into a data table.

    Abstract hook: this base implementation has no workbook backend and
    always raises. Concrete readers override it.

    Raises
    ------
    NotImplementedError
        Always, in the base class.
    """
    raise NotImplementedError
10231014

10241015
def read(self):
1025-
if self.sheet:
1026-
ss = self.workbook.sheet_by_name(self.sheet)
1027-
else:
1028-
ss = self.workbook.sheet_by_index(0)
10291016
try:
1030-
first_row = next(i for i in range(ss.nrows) if any(ss.row_values(i)))
1031-
first_col = next(i for i in range(ss.ncols) if ss.cell_value(first_row, i))
1032-
row_len = ss.row_len(first_row)
1033-
cells = filter(any,
1034-
[[str(ss.cell_value(row, col)) if col < ss.row_len(row) else ''
1035-
for col in range(first_col, row_len)]
1036-
for row in range(first_row, ss.nrows)])
1017+
cells = self.get_cells()
10371018
table = self.data_table(cells)
10381019
table.name = path.splitext(path.split(self.filename)[-1])[0]
10391020
if self.sheet:
@@ -1069,12 +1050,68 @@ class ExcelReader(_BaseExcelReader):
10691050
EXTENSIONS = ('.xlsx',)
10701051
DESCRIPTION = 'Microsoft Excel spreadsheet'
10711052

1053+
@property
def workbook(self) -> openpyxl.Workbook:
    """Workbook for ``self.filename``, loaded on first access.

    The loaded workbook is stored in ``self._workbook`` and reused on
    later accesses, so the file is parsed at most once per reader.

    Returns
    -------
    the openpyxl workbook object
    """
    if self._workbook is None:
        # data_only=True makes openpyxl return the value Excel stored for
        # a formula cell rather than the formula text itself; a data
        # reader wants the numbers, not strings such as "=A1+B1".
        self._workbook = openpyxl.load_workbook(self.filename,
                                                data_only=True)
    return self._workbook
1058+
1059+
@property
def sheets(self) -> List:
    """Return a list of sheet names in the file.

    No extra caching is done here: the workbook itself is already kept
    on the instance, and a method-level ``lru_cache`` would key on
    ``self``, keeping the reader (and its loaded workbook) alive inside a
    class-wide cache and requiring the reader to be hashable.

    Returns
    -------
    a new list of sheet names (empty if there is no workbook)
    """
    book = self.workbook
    # Copy so that callers cannot mutate state shared with other callers.
    return list(book.sheetnames) if book else []
1063+
1064+
def get_cells(self) -> Iterable:
    """Return the non-empty rows of the active sheet as lists of strings.

    Every cell inside the sheet's used rectangle (``min_row``..``max_row``
    by ``min_column``..``max_column``) is converted with ``str``; cells
    whose value is ``None`` become ``""``. Rows in which every cell is
    the empty string are dropped.

    Returns
    -------
    a lazy iterable of rows, each a list of str
    """
    def as_text(value):
        return "" if value is None else str(value)

    sheet = self._get_active_sheet()
    # The sheet bounds are computed properties (openpyxl derives them from
    # the stored cells), so read each of them once instead of once per
    # row inside the generator below.
    first_row, last_row = sheet.min_row, sheet.max_row
    first_col, last_col = sheet.min_column, sheet.max_column

    rows = ([as_text(sheet.cell(row, col).value)
             for col in range(first_col, last_col + 1)]
            for row in range(first_row, last_row + 1))
    return filter(any, rows)
1073+
1074+
def _get_active_sheet(self) -> openpyxl.worksheet.worksheet.Worksheet:
    """Return the worksheet that should be read.

    If a sheet name has been selected (``self.sheet`` is truthy) that
    sheet is looked up by name in the workbook; otherwise the workbook's
    active sheet is used.
    """
    selected = self.sheet
    book = self.workbook
    return book[selected] if selected else book.active
1079+
10721080

10731081
class XlsReader(_BaseExcelReader):
    """Reader for .xls files"""
    EXTENSIONS = ('.xls',)
    DESCRIPTION = 'Microsoft Excel 97-2004 spreadsheet'

    @property
    def workbook(self) -> xlrd.Book:
        """Workbook for ``self.filename``, opened with xlrd on first access.

        The opened book is stored in ``self._workbook`` and reused.
        """
        if self._workbook is None:
            self._workbook = xlrd.open_workbook(self.filename,
                                                on_demand=True)
        return self._workbook

    @property
    def sheets(self) -> List:
        """Return a list of sheet names in the file.

        Not wrapped in ``lru_cache``: a cache on a method keys on ``self``
        and would keep the reader and its open workbook alive in a
        class-wide cache; the workbook is already kept on the instance.
        """
        book = self.workbook
        return book.sheet_names() if book else []

    def get_cells(self) -> Iterable:
        """Return the non-empty rows of the active sheet as lists of str.

        The table starts at the first row containing any truthy value and
        at the first truthy column of that row; its width is the length of
        that first row. Shorter rows are padded with ``''``. Rows whose
        cells are all empty strings are dropped.

        Raises
        ------
        StopIteration
            If no row (or no column of the first row) holds a truthy
            value, because the ``next`` calls have no default.
        """
        sheet = self._get_active_sheet()
        first_row = next(i for i in range(sheet.nrows)
                         if any(sheet.row_values(i)))
        first_col = next(i for i in range(sheet.ncols)
                         if sheet.cell_value(first_row, i))
        row_len = sheet.row_len(first_row)

        def row_as_text(row):
            # Ask for the row's own length once per row, not once per cell.
            width = sheet.row_len(row)
            return [str(sheet.cell_value(row, col)) if col < width else ''
                    for col in range(first_col, row_len)]

        return filter(any, (row_as_text(row)
                            for row in range(first_row, sheet.nrows)))

    def _get_active_sheet(self) -> xlrd.sheet.Sheet:
        """Return the selected sheet, or the first sheet if none is selected."""
        if self.sheet:
            return self.workbook.sheet_by_name(self.sheet)
        return self.workbook.sheet_by_index(0)
1114+
10781115

10791116
class DotReader(FileFormat):
10801117
"""Writer for dot (graph) files"""

Orange/tests/test_tab_reader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ def test_sheets(self):
195195
file1 = io.StringIO("\n".join("xd dbac"))
196196
reader = TabReader(file1)
197197

198-
self.assertEqual(reader.sheets, ())
198+
self.assertEqual(reader.sheets, [])
199199

200200
def test_attributes_saving(self):
201201
tempdir = tempfile.mkdtemp()

Orange/tests/test_xlsx_reader.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,18 @@ def wrapper(self):
3535
return wrapper
3636

3737

38+
class TestExcelReader(unittest.TestCase):
    """Tests that apply to the .xlsx reader only."""

    def test_read_round_floats(self):
        """Check the domain read from ``round_floats.xlsx``.

        Expects three attributes and no class variable or metas; the
        first two attributes are continuous and the third has the values
        "1" and "2".
        """
        domain = read_file(get_xlsx_reader, "round_floats.xlsx").domain

        self.assertIsNone(domain.class_var)
        self.assertEqual(len(domain.metas), 0)
        self.assertEqual(len(domain.attributes), 3)

        for index in (0, 1):
            self.assertIsInstance(domain[index], ContinuousVariable)
        self.assertListEqual(domain[2].values, ["1", "2"])
48+
49+
3850
class TestExcelHeader0(unittest.TestCase):
3951
@test_xlsx_xls
4052
def test_read(self, reader: Callable[[str], io.FileFormat]):
25.3 KB
Binary file not shown.

requirements-core.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,4 @@ requests
2121
openTSNE>=0.3.11
2222
pandas
2323
pyyaml
24+
openpyxl

0 commit comments

Comments
 (0)