Skip to content

Commit 79fe1be

Browse files
authored
Merge pull request #4279 from VesnaT/xlsx_reader
[FIX] ExcelReader: Migrate to openpyxl
2 parents 8e88469 + cc7c8a7 commit 79fe1be

File tree

9 files changed

+144
-53
lines changed

9 files changed

+144
-53
lines changed

MANIFEST.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
recursive-include Orange *.pyx *.py *.c *.cpp README* LICENSE
33
recursive-include Orange/datasets *.tab *.basket *.info *.dst *.metadata
44

5-
recursive-include Orange/tests *.tab *.basket *.xlsx *.pkl *.pkl.gz
5+
recursive-include Orange/tests *.tab *.basket *.xlsx *.xls *.pkl *.pkl.gz
66

77
recursive-include Orange/canvas *ico *.png *.svg *.ico
88
recursive-include Orange/canvas/workflows *.ows

Orange/data/io.py

Lines changed: 77 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import subprocess
77
import sys
88
import warnings
9+
from typing import List, Iterable
910

1011
from ast import literal_eval
1112
from collections import OrderedDict, Counter, defaultdict
@@ -26,6 +27,7 @@
2627

2728
import xlrd
2829
import xlsxwriter
30+
import openpyxl
2931

3032
from Orange.data import (
3133
_io, is_discrete_values, MISSING_VALUES, Table, Domain, Variable,
@@ -405,15 +407,15 @@ def __init__(self, filename):
405407
self.sheet = None
406408

407409
@property
408-
def sheets(self):
410+
def sheets(self) -> List:
409411
"""FileFormats with a notion of sheets should override this property
410412
to return a list of sheet names in the file.
411413
412414
Returns
413415
-------
414416
a list of sheet names
415417
"""
416-
return ()
418+
return []
417419

418420
def select_sheet(self, sheet):
419421
"""Select sheet to be read
@@ -998,44 +1000,21 @@ def constr_vars(inds):
9981000
return table
9991001

10001002

1001-
class ExcelReader(FileFormat):
1002-
"""Reader for excel files"""
1003-
EXTENSIONS = ('.xlsx',)
1004-
DESCRIPTION = 'Microsoft Excel spreadsheet'
1003+
class _BaseExcelReader(FileFormat):
1004+
"""Base class for reading excel files"""
10051005
SUPPORT_COMPRESSED = False
10061006
SUPPORT_SPARSE_DATA = False
10071007

10081008
def __init__(self, filename):
10091009
super().__init__(filename=filename)
10101010
self._workbook = None
10111011

1012-
@property
1013-
def workbook(self):
1014-
if not self._workbook:
1015-
self._workbook = xlrd.open_workbook(self.filename, on_demand=True)
1016-
return self._workbook
1017-
1018-
@property
1019-
@lru_cache(1)
1020-
def sheets(self):
1021-
if self.workbook:
1022-
return self.workbook.sheet_names()
1023-
else:
1024-
return ()
1012+
def get_cells(self) -> Iterable:
1013+
raise NotImplementedError
10251014

10261015
def read(self):
1027-
if self.sheet:
1028-
ss = self.workbook.sheet_by_name(self.sheet)
1029-
else:
1030-
ss = self.workbook.sheet_by_index(0)
10311016
try:
1032-
first_row = next(i for i in range(ss.nrows) if any(ss.row_values(i)))
1033-
first_col = next(i for i in range(ss.ncols) if ss.cell_value(first_row, i))
1034-
row_len = ss.row_len(first_row)
1035-
cells = filter(any,
1036-
[[str(ss.cell_value(row, col)) if col < ss.row_len(row) else ''
1037-
for col in range(first_col, row_len)]
1038-
for row in range(first_row, ss.nrows)])
1017+
cells = self.get_cells()
10391018
table = self.data_table(cells)
10401019
table.name = path.splitext(path.split(self.filename)[-1])[0]
10411020
if self.sheet:
@@ -1066,6 +1045,74 @@ def write_file(cls, filename, data):
10661045
workbook.close()
10671046

10681047

1048+
class ExcelReader(_BaseExcelReader):
    """Reader for .xlsx files, backed by openpyxl."""
    EXTENSIONS = ('.xlsx',)
    DESCRIPTION = 'Microsoft Excel spreadsheet'

    @property
    def workbook(self) -> openpyxl.Workbook:
        """Workbook for ``self.filename``; loaded on first access, then reused."""
        if self._workbook is None:
            self._workbook = openpyxl.load_workbook(self.filename)
        return self._workbook

    @property
    def sheets(self) -> List:
        """Names of the sheets in the workbook, or ``[]`` if it is falsy."""
        # No ``lru_cache`` here: a cache attached to the property function is
        # shared by every instance and keeps the cached reader (together with
        # its loaded workbook) referenced. The workbook is already memoised in
        # ``self._workbook``, so reading the names again is cheap.
        workbook = self.workbook
        return workbook.sheetnames if workbook else []

    def get_cells(self) -> Iterable:
        """Return the non-empty rows of the selected sheet as lists of str.

        Every cell between ``min_row``/``max_row`` and
        ``min_column``/``max_column`` of the sheet is visited. ``None`` values
        become ``""``; all other values are converted with ``str``. Rows in
        which every converted cell is an empty string are filtered out.

        Returns
        -------
        a lazy iterable (``filter`` object) of lists of strings
        """
        def str_(x):
            return str(x) if x is not None else ""

        sheet = self._get_active_sheet()
        # Same rectangle the explicit row/column index loops would cover.
        rows = sheet.iter_rows(min_row=sheet.min_row,
                               max_row=sheet.max_row,
                               min_col=sheet.min_column,
                               max_col=sheet.max_column)
        cells = ([str_(cell.value) for cell in row] for row in rows)
        return filter(any, cells)

    def _get_active_sheet(self) -> openpyxl.worksheet.worksheet.Worksheet:
        """Return the sheet named ``self.sheet``, or the workbook's active
        sheet when no sheet has been selected."""
        if self.sheet:
            return self.workbook[self.sheet]
        return self.workbook.active
1079+
1080+
1081+
class XlsReader(_BaseExcelReader):
    """Reader for .xls files, backed by xlrd."""
    # NOTE(review): if ``write_file`` is inherited from the base class and goes
    # through xlsxwriter, saving with this class would put .xlsx content into
    # a ``.xls`` file -- confirm before offering this class as a writer.
    EXTENSIONS = ('.xls',)
    DESCRIPTION = 'Microsoft Excel 97-2004 spreadsheet'

    @property
    def workbook(self) -> xlrd.Book:
        """Workbook for ``self.filename``; opened on first access (with
        ``on_demand=True``), then reused."""
        if self._workbook is None:
            self._workbook = xlrd.open_workbook(self.filename, on_demand=True)
        return self._workbook

    @property
    def sheets(self) -> List:
        """Names of the sheets in the workbook, or ``[]`` if it is falsy."""
        # No ``lru_cache`` here: a cache attached to the property function is
        # shared by every instance and keeps the cached reader (together with
        # its open workbook) referenced. The workbook is already memoised in
        # ``self._workbook``, so reading the names again is cheap.
        workbook = self.workbook
        return workbook.sheet_names() if workbook else []

    def get_cells(self) -> Iterable:
        """Return the non-empty rows of the selected sheet as lists of str.

        The table is taken to start at the first row that contains any truthy
        value and, within that row, at the first column holding a truthy
        value. Its width is the length of that first row; shorter rows are
        padded with ``''``. Rows in which every converted cell is an empty
        string are filtered out.

        Returns
        -------
        a lazy iterable (``filter`` object) of lists of strings

        Raises
        ------
        StopIteration
            if no row of the sheet contains a truthy value
        """
        sheet = self._get_active_sheet()
        # ``next`` without a default: an entirely empty sheet raises here.
        first_row = next(i for i in range(sheet.nrows)
                         if any(sheet.row_values(i)))
        first_col = next(i for i in range(sheet.ncols)
                         if sheet.cell_value(first_row, i))
        row_len = sheet.row_len(first_row)
        rows = ([str(sheet.cell_value(row, col))
                 if col < sheet.row_len(row) else ''
                 for col in range(first_col, row_len)]
                for row in range(first_row, sheet.nrows))
        return filter(any, rows)

    def _get_active_sheet(self) -> xlrd.sheet.Sheet:
        """Return the sheet named ``self.sheet``, or the first sheet of the
        workbook when no sheet has been selected."""
        if self.sheet:
            return self.workbook.sheet_by_name(self.sheet)
        return self.workbook.sheet_by_index(0)
1114+
1115+
10691116
class DotReader(FileFormat):
10701117
"""Writer for dot (graph) files"""
10711118
EXTENSIONS = ('.dot', '.gv')

Orange/tests/test_tab_reader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ def test_sheets(self):
195195
file1 = io.StringIO("\n".join("xd dbac"))
196196
reader = TabReader(file1)
197197

198-
self.assertEqual(reader.sheets, ())
198+
self.assertEqual(reader.sheets, [])
199199

200200
def test_attributes_saving(self):
201201
tempdir = tempfile.mkdtemp()

Orange/tests/test_xlsx_reader.py

Lines changed: 60 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
import unittest
55
import os
6+
from functools import wraps
7+
from typing import Callable
68

79
import numpy as np
810

@@ -13,13 +15,42 @@ def get_dataset(name):
1315
return os.path.join(os.path.dirname(__file__), "xlsx_files", name)
1416

1517

16-
def read_file(name):
17-
return io.ExcelReader(get_dataset(name)).read()
18+
def get_xlsx_reader(name: str) -> io.ExcelReader:
    """Return an ``io.ExcelReader`` for the test file ``name`` (resolved
    with ``get_dataset``)."""
    dataset_path = get_dataset(name)
    return io.ExcelReader(dataset_path)
20+
21+
22+
def get_xls_reader(name: str) -> io.XlsReader:
    """Return an ``io.XlsReader`` for the test file ``name`` (resolved
    with ``get_dataset``)."""
    dataset_path = get_dataset(name)
    return io.XlsReader(dataset_path)
24+
25+
26+
def read_file(reader: Callable, name: str) -> Table:
    """Create a reader by calling ``reader(name)`` and return the result of
    its ``read()`` method."""
    file_reader = reader(name)
    return file_reader.read()
28+
29+
30+
def test_xlsx_xls(f):
    """Decorator that runs a test method once per Excel reader factory.

    The decorated method must accept ``(self, reader)``; it is called first
    with ``get_xlsx_reader`` and then with ``get_xls_reader``. Each call runs
    inside ``self.subTest`` so that a failure names the reader factory that
    triggered it and does not prevent the other format from being checked.
    """
    @wraps(f)
    def wrapper(self):
        for get_reader in (get_xlsx_reader, get_xls_reader):
            with self.subTest(reader=get_reader.__name__):
                f(self, get_reader)
    return wrapper
36+
37+
38+
class TestExcelReader(unittest.TestCase):
    """Tests that exercise only the ``.xlsx`` reader."""

    def test_read_round_floats(self):
        """``round_floats.xlsx`` gives three attributes and no class or metas:
        the first two continuous, the third with values "1" and "2"."""
        domain = read_file(get_xlsx_reader, "round_floats.xlsx").domain
        self.assertIsNone(domain.class_var)
        self.assertEqual(len(domain.metas), 0)
        self.assertEqual(len(domain.attributes), 3)
        for index in (0, 1):
            self.assertIsInstance(domain[index], ContinuousVariable)
        self.assertListEqual(domain[2].values, ["1", "2"])
1848

1949

2050
class TestExcelHeader0(unittest.TestCase):
21-
def test_read(self):
22-
table = read_file("header_0.xlsx")
51+
@test_xlsx_xls
52+
def test_read(self, reader: Callable[[str], io.FileFormat]):
53+
table = read_file(reader, "header_0.xlsx")
2354
domain = table.domain
2455
self.assertIsNone(domain.class_var)
2556
self.assertEqual(len(domain.metas), 0)
@@ -35,29 +66,37 @@ def test_read(self):
3566

3667

3768
class TextExcelSheets(unittest.TestCase):
38-
def setUp(self):
39-
self.reader = io.ExcelReader(get_dataset("header_0_sheet.xlsx"))
40-
41-
def test_sheets(self):
42-
self.assertSequenceEqual(self.reader.sheets,
69+
@test_xlsx_xls
70+
def test_sheets(self, reader: Callable[[str], io.FileFormat]):
71+
reader = reader("header_0_sheet.xlsx")
72+
self.assertSequenceEqual(reader.sheets,
4373
["Sheet1", "my_sheet", "Sheet3"])
4474

45-
def test_named_sheet(self):
46-
self.reader.select_sheet("my_sheet")
47-
table = self.reader.read()
75+
@test_xlsx_xls
76+
def test_named_sheet(self, reader: Callable[[str], io.FileFormat]):
77+
reader = reader("header_0_sheet.xlsx")
78+
reader.select_sheet("my_sheet")
79+
table = reader.read()
4880
self.assertEqual(len(table.domain.attributes), 4)
4981
self.assertEqual(table.name, 'header_0_sheet-my_sheet')
5082

51-
def test_named_sheet_table(self):
83+
def test_named_sheet_table_xlsx(self):
5284
table = Table.from_file(get_dataset("header_0_sheet.xlsx"),
5385
sheet="my_sheet")
5486
self.assertEqual(len(table.domain.attributes), 4)
5587
self.assertEqual(table.name, 'header_0_sheet-my_sheet')
5688

89+
def test_named_sheet_table_xls(self):
90+
table = Table.from_file(get_dataset("header_0_sheet.xls"),
91+
sheet="my_sheet")
92+
self.assertEqual(len(table.domain.attributes), 4)
93+
self.assertEqual(table.name, 'header_0_sheet-my_sheet')
94+
5795

5896
class TestExcelHeader1(unittest.TestCase):
59-
def test_no_flags(self):
60-
table = read_file("header_1_no_flags.xlsx")
97+
@test_xlsx_xls
98+
def test_no_flags(self, reader: Callable[[str], io.FileFormat]):
99+
table = read_file(reader, "header_1_no_flags.xlsx")
61100
domain = table.domain
62101
self.assertEqual(len(domain.metas), 0)
63102
self.assertEqual(len(domain.attributes), 4)
@@ -74,8 +113,9 @@ def test_no_flags(self):
74113
[0, 0, np.nan, 0]]))
75114
np.testing.assert_equal(table.Y, np.array([]).reshape(3, 0))
76115

77-
def test_flags(self):
78-
table = read_file("header_1_flags.xlsx")
116+
@test_xlsx_xls
117+
def test_flags(self, reader: Callable[[str], io.FileFormat]):
118+
table = read_file(reader, "header_1_flags.xlsx")
79119
domain = table.domain
80120

81121
self.assertEqual(len(domain.attributes), 1)
@@ -104,8 +144,9 @@ def test_flags(self):
104144

105145

106146
class TestExcelHeader3(unittest.TestCase):
107-
def test_read(self):
108-
table = read_file("header_3.xlsx")
147+
@test_xlsx_xls
148+
def test_read(self, reader: Callable[[str], io.FileFormat]):
149+
table = read_file(reader, "header_3.xlsx")
109150
domain = table.domain
110151

111152
self.assertEqual(len(domain.attributes), 2)
59.5 KB
Binary file not shown.
25.3 KB
Binary file not shown.

Orange/widgets/data/owsave.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import os.path
22

33
from Orange.data.table import Table
4-
from Orange.data.io import TabReader, CSVReader, PickleReader, ExcelReader
4+
from Orange.data.io import TabReader, CSVReader, PickleReader, ExcelReader, \
5+
XlsReader
56
from Orange.widgets import gui, widget
67
from Orange.widgets.widget import Input
78
from Orange.widgets.settings import Setting
@@ -21,7 +22,7 @@ class OWSave(OWSaveBase):
2122

2223
settings_version = 2
2324

24-
writers = [TabReader, CSVReader, PickleReader, ExcelReader]
25+
writers = [TabReader, CSVReader, PickleReader, ExcelReader, XlsReader]
2526
filters = {
2627
**{f"{w.DESCRIPTION} (*{w.EXTENSIONS[0]})": w
2728
for w in writers},

requirements-core.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,4 @@ requests
2121
openTSNE>=0.3.11
2222
pandas
2323
pyyaml
24+
openpyxl

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,7 @@ def configuration(parent_package='', top_path=None):
210210
"Orange.widgets.unsupervised": ["icons/*.svg"],
211211
"Orange.widgets.utils": ["_webview/*.js"],
212212
"Orange.tests": ["xlsx_files/*.xlsx", "datasets/*.tab",
213+
"xlsx_files/*.xls",
213214
"datasets/*.basket", "datasets/*.csv",
214215
"datasets/*.pkl", "datasets/*.pkl.gz"]
215216
}

0 commit comments

Comments
 (0)