Skip to content

Commit 5b4a89e

Browse files
authored
Merge pull request #5438 from ales-erjavec/csvimport-indicate-encoding-errors
[ENH] textimport: Mark encoding errors in the preview
2 parents 3773093 + 49d3da6 commit 5b4a89e

File tree

2 files changed

+42
-33
lines changed

2 files changed

+42
-33
lines changed

Orange/widgets/data/owcsvimport.py

Lines changed: 20 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -579,21 +579,32 @@ def selectedFileFormat(self) -> FileFormat:
579579

580580
def default_options_for_mime_type(
581581
path: str, mime_type: str
582-
) -> Tuple[csv.Dialect, bool]:
582+
) -> Options:
583583
defaults = {
584584
"text/csv": (csv.excel(), True),
585585
"text/tab-separated-values": (csv.excel_tab(), True)
586586
}
587-
dialect, header = csv.excel(), True
587+
dialect, header, encoding = csv.excel(), True, "utf-8"
588588
delimiters = None
589+
try_encodings = ["utf-8", "utf-16", "iso8859-1"]
589590
if mime_type in defaults:
590591
dialect, header = defaults[mime_type]
591592
delimiters = [dialect.delimiter]
592-
try:
593-
dialect, header = sniff_csv_with_path(path, delimiters=delimiters)
594-
except (OSError, UnicodeDecodeError, csv.Error):
595-
pass
596-
return dialect, header
593+
594+
for encoding_ in try_encodings:
595+
try:
596+
dialect, header = sniff_csv_with_path(
597+
path, encoding=encoding_, delimiters=delimiters)
598+
encoding = encoding_
599+
except (OSError, UnicodeError, csv.Error):
600+
pass
601+
else:
602+
break
603+
if header:
604+
rowspec = [(range(0, 1), RowSpec.Header)]
605+
else:
606+
rowspec = []
607+
return Options(dialect=dialect, encoding=encoding, rowspec=rowspec)
597608

598609

599610
class OWCSVFileImport(widget.OWWidget):
@@ -910,11 +921,10 @@ def browse(self, prefixname=None, directory=None):
910921
mb = self._might_be_binary_mb(path)
911922
if mb.exec() == QMessageBox.Cancel:
912923
return
913-
# initialize dialect based on selected format
914-
dialect, header = default_options_for_mime_type(
924+
# initialize options based on selected format
925+
options = default_options_for_mime_type(
915926
path, selected_filter.mime_type,
916927
)
917-
options = None
918928
# Search for path in history.
919929
# If found use the stored params to initialize the import dialog
920930
items = self.itemsFromSettings()
@@ -923,15 +933,6 @@ def browse(self, prefixname=None, directory=None):
923933
_, options_ = items[idx]
924934
if options_ is not None:
925935
options = options_
926-
927-
if options is None:
928-
if not header:
929-
rowspec = []
930-
else:
931-
rowspec = [(range(0, 1), RowSpec.Header)]
932-
options = Options(
933-
encoding="utf-8", dialect=dialect, rowspec=rowspec)
934-
935936
dlg = CSVImportDialog(
936937
self, windowTitle="Import Options", sizeGripEnabled=True)
937938
dlg.setWindowModality(Qt.WindowModal)

Orange/widgets/utils/textimport.py

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -991,9 +991,11 @@ def __resetPreview(self):
991991
base = CachedBytesIOWrapper(self.__sample, self.__buffer)
992992

993993
wrapper = io.TextIOWrapper(
994-
base, encoding=self.encoding(), errors="replace"
994+
base, encoding=self.encoding(),
995+
# use surrogate escape to validate/detect encoding errors in
996+
# delegates
997+
errors="surrogateescape"
995998
)
996-
997999
rows = csv.reader(
9981000
wrapper, dialect=self.dialect()
9991001
)
@@ -1372,6 +1374,11 @@ def sizeHint(self):
13721374
return sh.expandedTo(QSize(8 * hsection, 20 * vsection))
13731375

13741376

1377+
def is_surrogate_escaped(text: str) -> bool:
1378+
"""Does `text` contain any surrogate escape characters."""
1379+
return any("\udc80" <= c <= "\udcff" for c in text)
1380+
1381+
13751382
class PreviewItemDelegate(QStyledItemDelegate):
13761383
def initStyleOption(self, option, index):
13771384
# type: (QStyleOptionViewItem, QModelIndex) -> None
@@ -1389,6 +1396,18 @@ def initStyleOption(self, option, index):
13891396
if coltype == ColumnType.Numeric or coltype == ColumnType.Time:
13901397
option.displayAlignment = Qt.AlignRight | Qt.AlignVCenter
13911398

1399+
if not self.validate(option.text):
1400+
option.palette.setBrush(
1401+
QPalette.All, QPalette.Text, QBrush(Qt.red, Qt.SolidPattern)
1402+
)
1403+
option.palette.setBrush(
1404+
QPalette.All, QPalette.HighlightedText,
1405+
QBrush(Qt.red, Qt.SolidPattern)
1406+
)
1407+
1408+
def validate(self, value: str) -> bool: # pylint: disable=no-self-use
1409+
return not is_surrogate_escaped(value)
1410+
13921411
def helpEvent(self, event, view, option, index):
13931412
# type: (QHelpEvent, QAbstractItemView, QStyleOptionViewItem, QModelIndex) -> bool
13941413
if event.type() == QEvent.ToolTip:
@@ -1467,17 +1486,6 @@ def __init__(self, *args, converter=None, **kwargs):
14671486
super().__init__(*args, **kwargs)
14681487
self.converter = converter or float
14691488

1470-
def initStyleOption(self, option, index):
1471-
super().initStyleOption(option, index)
1472-
if not self.validate(option.text):
1473-
option.palette.setBrush(
1474-
QPalette.All, QPalette.Text, QBrush(Qt.red, Qt.SolidPattern)
1475-
)
1476-
option.palette.setBrush(
1477-
QPalette.All, QPalette.HighlightedText,
1478-
QBrush(Qt.red, Qt.SolidPattern)
1479-
)
1480-
14811489
def validate(self, value):
14821490
if value in {"NA", "Na", "na", "n/a", "N/A", "?", "", "."}:
14831491
return True
@@ -1486,7 +1494,7 @@ def validate(self, value):
14861494
except ValueError:
14871495
return False
14881496
else:
1489-
return True
1497+
return super().validate(value)
14901498

14911499

14921500
def number_parser(groupsep, decimalsep):

0 commit comments

Comments
 (0)