Skip to content

Commit c82bde2

Browse files
authored
Merge pull request #5736 from markotoplak/owfile-reader
[ENH] File: explicit file format choice
2 parents 41f6906 + 9586906 commit c82bde2

File tree

3 files changed

+171
-38
lines changed

3 files changed

+171
-38
lines changed

Orange/data/io_base.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,11 @@
3131
PICKLE_PROTOCOL = 4
3232

3333

34+
class MissingReaderException(IOError):
    """Raised when no reader can be found for a file.

    It subclasses IOError for backward compatibility, so callers that
    catch IOError keep working.
    """
37+
38+
3439
class Flags:
3540
"""Parser for column flags (i.e. third header row)"""
3641
DELIMITER = ' '
@@ -551,7 +556,7 @@ def get_reader(cls, filename):
551556
if fnmatch(path.basename(filename), '*' + ext):
552557
return reader(filename)
553558

554-
raise IOError('No readers for file "{}"'.format(filename))
559+
raise MissingReaderException('No readers for file "{}"'.format(filename))
555560

556561
@classmethod
557562
def set_table_metadata(cls, filename, table):

Orange/widgets/data/owfile.py

Lines changed: 103 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,14 @@
99
QStyle, QComboBox, QMessageBox, QGridLayout, QLabel, \
1010
QLineEdit, QSizePolicy as Policy, QCompleter
1111
from AnyQt.QtCore import Qt, QTimer, QSize, QUrl
12+
from AnyQt.QtGui import QBrush
1213

14+
from orangewidget.utils.filedialogs import format_filter
1315
from orangewidget.workflow.drophandler import SingleUrlDropHandler
1416

1517
from Orange.data.table import Table, get_sample_datasets_dir
1618
from Orange.data.io import FileFormat, UrlReader, class_from_qualified_name
19+
from Orange.data.io_base import MissingReaderException
1720
from Orange.util import log_warnings
1821
from Orange.widgets import widget, gui
1922
from Orange.widgets.settings import Setting, ContextSetting, \
@@ -30,6 +33,8 @@
3033
# module's namespace so that old saved settings still work
3134
from Orange.widgets.utils.filedialogs import RecentPath
3235

36+
DEFAULT_READER_TEXT = "Automatically detect type"
37+
3338
log = logging.getLogger(__name__)
3439

3540

@@ -121,6 +126,9 @@ class Outputs:
121126

122127
domain_editor = SettingProvider(DomainEditor)
123128

129+
class Information(widget.OWWidget.Information):
130+
no_file_selected = Msg("No file selected.")
131+
124132
class Warning(widget.OWWidget.Warning):
125133
file_too_big = Msg("The file is too large to load automatically."
126134
" Press Reload to load.")
@@ -137,9 +145,6 @@ class Error(widget.OWWidget.Error):
137145
sheet_error = Msg("Error listing available sheets.")
138146
unknown = Msg("Read error:\n{}")
139147

140-
class NoFileSelected:
141-
pass
142-
143148
UserAdviceMessages = [
144149
widget.Message(
145150
"Use CSV File Import widget for advanced options "
@@ -160,6 +165,23 @@ def __init__(self):
160165
self.loaded_file = ""
161166
self.reader = None
162167

168+
readers = [f for f in FileFormat.formats
169+
if getattr(f, 'read', None)
170+
and getattr(f, "EXTENSIONS", None)]
171+
172+
def group_readers_per_addon_key(w):
    """Return the sort key ``(package, description)`` for reader class *w*.

    The package part is made of at most the first two components of the
    reader's qualified name, with the class name itself dropped, so readers
    coming from the same package end up next to each other.
    """
    # Drop the trailing class name, then keep the two leading components.
    top_level = w.qualified_name().split(".")[:-1][:2]
    if ".".join(top_level) == "Orange.data":
        # readers from Orange.data.io should go first
        top_level = ["0"]  # force "Orange" to come first
    return top_level, w.DESCRIPTION
181+
182+
self.available_readers = sorted(set(readers),
183+
key=group_readers_per_addon_key)
184+
163185
layout = QGridLayout()
164186
layout.setSpacing(4)
165187
gui.widgetBox(self.controlArea, orientation=layout, box='Source')
@@ -227,6 +249,19 @@ def __init__(self):
227249
completer.setCaseSensitivity(Qt.CaseSensitive)
228250
url_combo.setCompleter(completer)
229251

252+
layout = QGridLayout()
253+
layout.setSpacing(4)
254+
gui.widgetBox(self.controlArea, orientation=layout, box='File Type')
255+
256+
box = gui.hBox(None, addToLayout=False, margin=0)
257+
box.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
258+
self.reader_combo = QComboBox(self)
259+
self.reader_combo.setSizePolicy(Policy.MinimumExpanding, Policy.Fixed)
260+
self.reader_combo.activated[int].connect(self.select_reader)
261+
262+
box.layout().addWidget(self.reader_combo)
263+
layout.addWidget(box, 0, 1)
264+
230265
box = gui.vBox(self.controlArea, "Info")
231266
self.infolabel = gui.widgetLabel(box, 'No data loaded.')
232267

@@ -286,6 +321,23 @@ def select_sheet(self):
286321
self.recent_paths[0].sheet = self.sheet_combo.currentText()
287322
self.load_data()
288323

324+
def select_reader(self, n):
    """Store the reader chosen at combo index *n* and reload the data.

    Index 0 clears the stored file format, indices
    1..len(self.available_readers) store the qualified name of the matching
    reader, and any later index stores the text of the combo item itself.
    Nothing happens when the source is not a local file or when there are
    no recent paths.
    """
    # ignore for URL's, and when there is no path to attach the format to
    if self.source != self.LOCAL_FILE or not self.recent_paths:
        return

    path = self.recent_paths[0]
    if n == 0:  # default
        chosen_format = None
    elif n <= len(self.available_readers):
        # combo entries are shifted by one because of the default item
        chosen_format = self.available_readers[n - 1].qualified_name()
    else:  # the rest include just qualified names
        chosen_format = self.reader_combo.itemText(n)

    path.file_format = chosen_format
    self.load_data()
340+
289341
def _url_set(self):
290342
url = self.url_combo.currentText()
291343
pos = self.recent_urls.index(url)
@@ -310,10 +362,7 @@ def browse_file(self, in_demos=False):
310362
else:
311363
start_file = self.last_path() or os.path.expanduser("~/")
312364

313-
readers = [f for f in FileFormat.formats
314-
if getattr(f, 'read', None)
315-
and getattr(f, "EXTENSIONS", None)]
316-
filename, reader, _ = open_filename_dialog(start_file, None, readers)
365+
filename, reader, _ = open_filename_dialog(start_file, None, self.available_readers)
317366
if not filename:
318367
return
319368
self.add_path(filename)
@@ -342,19 +391,33 @@ def load_data(self):
342391
self.infolabel.setText("No data.")
343392

344393
def _try_load(self):
394+
self._initialize_reader_combo()
395+
345396
# pylint: disable=broad-except
346-
if self.last_path() and not os.path.exists(self.last_path()):
347-
return self.Error.file_not_found
397+
if self.source == self.LOCAL_FILE:
398+
if self.last_path() is None:
399+
return self.Information.no_file_selected
400+
elif not os.path.exists(self.last_path()):
401+
return self.Error.file_not_found
402+
else:
403+
url = self.url_combo.currentText().strip()
404+
if not url:
405+
return self.Information.no_file_selected
406+
407+
def mark_problematic_reader():
408+
self.reader_combo.setItemData(self.reader_combo.currentIndex(),
409+
QBrush(Qt.red), Qt.ForegroundRole)
348410

349411
try:
350-
self.reader = self._get_reader()
412+
self.reader = self._get_reader() # also sets current reader index
351413
assert self.reader is not None
352-
except Exception:
414+
except MissingReaderException:
415+
mark_problematic_reader()
353416
return self.Error.missing_reader
354-
355-
if self.reader is self.NoFileSelected:
356-
self.Outputs.data.send(None)
357-
return None
417+
except Exception as ex:
418+
mark_problematic_reader()
419+
log.exception(ex)
420+
return lambda x=ex: self.Error.unknown(str(x))
358421

359422
try:
360423
self._update_sheet_combo()
@@ -365,6 +428,7 @@ def _try_load(self):
365428
try:
366429
data = self.reader.read()
367430
except Exception as ex:
431+
mark_problematic_reader()
368432
log.exception(ex)
369433
return lambda x=ex: self.Error.unknown(str(x))
370434
if warnings:
@@ -382,23 +446,31 @@ def _try_load(self):
382446
def _get_reader(self) -> FileFormat:
    """Create the reader for the current source and sync the reader combo.

    For a URL source a UrlReader for the stripped combo text is returned.

    For a local file the reader combo is enabled. If the most recent path
    stores a reader's qualified name, the matching combo entry is selected
    (the name is appended as a new entry when it is not among
    self.available_readers) and that class is instantiated with the path.
    Otherwise the first combo entry is selected and FileFormat.get_reader
    picks the reader. A sheet stored with the recent path is selected on
    the reader before it is returned.

    Raises MissingReaderException when the stored qualified name cannot be
    resolved to a class.
    """
    if self.source != self.LOCAL_FILE:
        return UrlReader(self.url_combo.currentText().strip())

    path = self.last_path()
    self.reader_combo.setEnabled(True)
    if self.recent_paths and self.recent_paths[0].file_format:
        qname = self.recent_paths[0].file_format
        position = {r.qualified_name(): i
                    for i, r in enumerate(self.available_readers)}.get(qname)
        if position is None:
            # reader may be accessible, but not in self.available_readers
            # (perhaps its code was moved)
            self.reader_combo.addItem(qname)
            self.reader_combo.setCurrentIndex(len(self.reader_combo) - 1)
        else:
            # shifted by one: the combo's first entry is the default item
            self.reader_combo.setCurrentIndex(position + 1)
        try:
            reader_class = class_from_qualified_name(qname)
        except Exception as ex:
            raise MissingReaderException(f'Can not find reader "{qname}"') from ex
        reader = reader_class(path)
    else:
        self.reader_combo.setCurrentIndex(0)
        reader = FileFormat.get_reader(path)

    if self.recent_paths and self.recent_paths[0].sheet:
        reader.select_sheet(self.recent_paths[0].sheet)
    return reader
402474

403475
def _update_sheet_combo(self):
404476
if len(self.reader.sheets) < 2:
@@ -420,6 +492,14 @@ def _select_active_sheet(self):
420492
self.reader.select_sheet(None)
421493
self.sheet_combo.setCurrentIndex(0)
422494

495+
def _initialize_reader_combo(self):
    """Rebuild the reader combo and leave it disabled.

    The combo is emptied and refilled with DEFAULT_READER_TEXT followed by
    one filter string per reader in self.available_readers; the first entry
    is selected.
    """
    combo = self.reader_combo
    combo.clear()
    entries = [DEFAULT_READER_TEXT]
    entries.extend(format_filter(reader) for reader in self.available_readers)
    combo.addItems(entries)
    combo.setCurrentIndex(0)
    combo.setDisabled(True)
    # additional readers may be added in self._get_reader()
502+
423503
@staticmethod
424504
def _describe(table):
425505
def missing_prop(prop):
@@ -551,7 +631,7 @@ def dragEnterEvent(event):
551631
try:
552632
FileFormat.get_reader(urls[0].toLocalFile())
553633
event.acceptProposedAction()
554-
except IOError:
634+
except MissingReaderException:
555635
pass
556636

557637
def dropEvent(self, event):

Orange/widgets/data/tests/test_owfile.py

Lines changed: 62 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525
from Orange.data.io import TabReader
2626
from Orange.tests import named_file
27-
from Orange.widgets.data.owfile import OWFile, OWFileDropHandler
27+
from Orange.widgets.data.owfile import OWFile, OWFileDropHandler, DEFAULT_READER_TEXT
2828
from Orange.widgets.utils.filedialogs import dialog_formats, format_filter, RecentPath
2929
from Orange.widgets.tests.base import WidgetTest
3030
from Orange.widgets.utils.domaineditor import ComboDelegate, VarTypeDelegate, VarTableModel
@@ -253,12 +253,14 @@ def test_nothing_selected(self):
253253
self.create_widget(OWFile, stored_settings={"recent_paths": []})
254254

255255
widget.Outputs.data.send = Mock()
256-
widget._try_load()
256+
widget.load_data()
257+
self.assertTrue(widget.Information.no_file_selected.is_shown())
257258
widget.Outputs.data.send.assert_called_with(None)
258259

259260
widget.Outputs.data.send.reset_mock()
260261
widget.source = widget.URL
261-
widget._try_load()
262+
widget.load_data()
263+
self.assertTrue(widget.Information.no_file_selected.is_shown())
262264
widget.Outputs.data.send.assert_called_with(None)
263265

264266
def test_check_column_noname(self):
@@ -380,6 +382,7 @@ def open_iris_with_no_spec_format(_a, _b, _c, filters, _e):
380382
self.widget.browse_file()
381383

382384
self.assertIsNone(self.widget.recent_paths[0].file_format)
385+
self.assertEqual(self.widget.reader_combo.currentText(), DEFAULT_READER_TEXT)
383386

384387
def open_iris_with_tab(*_):
385388
return iris.__file__, format_filter(TabReader)
@@ -389,6 +392,7 @@ def open_iris_with_tab(*_):
389392
self.widget.browse_file()
390393

391394
self.assertEqual(self.widget.recent_paths[0].file_format, "Orange.data.io.TabReader")
395+
self.assertTrue(self.widget.reader_combo.currentText().startswith("Tab-separated"))
392396

393397
def test_no_specified_reader(self):
394398
with named_file("", suffix=".tab") as fn:
@@ -397,6 +401,52 @@ def test_no_specified_reader(self):
397401
stored_settings={"recent_paths": [no_class]})
398402
self.widget.load_data()
399403
self.assertTrue(self.widget.Error.missing_reader.is_shown())
404+
self.assertEqual(self.widget.reader_combo.currentText(), "not.a.file.reader.class")
405+
406+
def test_select_reader(self):
    """Activating reader combo entries reloads the file with that reader."""
    unknown_qname = "not.a.file.reader.class"
    filename = FileFormat.locate("iris.tab", dataset_dirs)

    # a setting which adds a new qualified name to the reader combo
    no_class = RecentPath(filename, None, None, file_format=unknown_qname)
    self.widget = self.create_widget(
        OWFile, stored_settings={"recent_paths": [no_class]})
    self.widget.load_data()
    combo = self.widget.reader_combo
    len_with_qname = len(combo)
    self.assertEqual(combo.currentText(), unknown_qname)
    self.assertEqual(self.widget.reader, None)

    # select the last option, the same reader
    combo.activated.emit(len_with_qname - 1)
    self.assertEqual(len(combo), len_with_qname)
    self.assertEqual(combo.currentText(), unknown_qname)
    self.assertEqual(self.widget.reader, None)

    # select the tab reader
    for i in range(len_with_qname):
        if combo.itemText(i).startswith("Tab-separated"):
            break
    combo.activated.emit(i)
    # the unknown qualified name is no longer listed
    self.assertEqual(len(combo), len_with_qname - 1)
    self.assertTrue(combo.currentText().startswith("Tab-separated"))
    self.assertIsInstance(self.widget.reader, TabReader)

    # select the default reader
    combo.activated.emit(0)
    self.assertEqual(len(combo), len_with_qname - 1)
    self.assertEqual(combo.currentText(), DEFAULT_READER_TEXT)
    self.assertIsInstance(self.widget.reader, TabReader)
439+
440+
def test_select_reader_errors(self):
    """An existing reader that fails on the file shows Error.unknown."""
    filename = FileFormat.locate("iris.tab", dataset_dirs)

    # an existing reader class, stored for a tab-separated file
    excel_path = RecentPath(filename, None, None,
                            file_format="Orange.data.io.ExcelReader")
    self.widget = self.create_widget(
        OWFile, stored_settings={"recent_paths": [excel_path]})
    self.widget.load_data()

    errors = self.widget.Error
    self.assertIn("Excel", self.widget.reader_combo.currentText())
    self.assertTrue(errors.unknown.is_shown())
    self.assertFalse(errors.missing_reader.is_shown())
400450

401451
def test_domain_edit_no_changes(self):
402452
self.open_dataset("iris")
@@ -408,12 +458,12 @@ def test_domain_edit_no_changes(self):
408458
def test_domain_edit_on_sparse_data(self):
409459
iris = Table("iris").to_sparse()
410460

411-
f = tempfile.NamedTemporaryFile(suffix='.pickle', delete=False)
412-
pickle.dump(iris, f)
413-
f.close()
461+
with named_file("", suffix='.pickle') as fn:
462+
with open(fn, "wb") as f:
463+
pickle.dump(iris, f)
414464

415-
self.widget.add_path(f.name)
416-
self.widget.load_data()
465+
self.widget.add_path(fn)
466+
self.widget.load_data()
417467

418468
output = self.get_output(self.widget.Outputs.data)
419469
self.assertIsInstance(output, Table)
@@ -552,9 +602,8 @@ def test_open_moved_workflow(self):
552602
(i.e. sent by email), considering data file is stored in the same
553603
directory as the workflow.
554604
"""
555-
temp_file = tempfile.NamedTemporaryFile(dir=getcwd(), delete=False)
556-
file_name = temp_file.name
557-
temp_file.close()
605+
with tempfile.NamedTemporaryFile(dir=getcwd(), delete=False) as temp_file:
606+
file_name = temp_file.name
558607
base_name = path.basename(file_name)
559608
try:
560609
recent_path = RecentPath(
@@ -575,9 +624,8 @@ def test_files_relocated(self):
575624
"""
576625
This test checks that paths are relocated correctly
577626
"""
578-
temp_file = tempfile.NamedTemporaryFile(dir=getcwd(), delete=False)
579-
file_name = temp_file.name
580-
temp_file.close()
627+
with tempfile.NamedTemporaryFile(dir=getcwd(), delete=False) as temp_file:
628+
file_name = temp_file.name
581629
base_name = path.basename(file_name)
582630
try:
583631
recent_path = RecentPath(

0 commit comments

Comments
 (0)