diff --git a/Orange/data/io_util.py b/Orange/data/io_util.py index f3de0948a3a..ed1e1add50f 100644 --- a/Orange/data/io_util.py +++ b/Orange/data/io_util.py @@ -1,3 +1,4 @@ +import codecs import os.path import subprocess from collections import defaultdict @@ -45,6 +46,16 @@ def open_compressed(filename, *args, _open=open, **kwargs): # Else already a file, just pass it through return filename +def _is_utf8_sig(filename: str) -> bool: + """Return True if file starts with a UTF-8 BOM; False also on OSError.""" + try: + with open(filename, "rb") as f: + bom = f.read(3) + return bom == codecs.BOM_UTF8 + except OSError: # pragma: no cover + return False + + def detect_encoding(filename): """ @@ -59,6 +70,9 @@ def detect_encoding(filename): proc.wait() if proc.returncode == 0: encoding = proc.stdout.read().strip() + # the `file` utility does not report a UTF-8 BOM, so check for it here + if encoding == b'utf-8': + return "utf-8-sig" if _is_utf8_sig(filename) else "utf-8" # file only supports these encodings; for others it says # unknown-8bit or binary. 
So we give chardet a chance to do # better diff --git a/Orange/tests/test_txt_reader.py b/Orange/tests/test_txt_reader.py index d5a4f1041ec..5a6c92e2f29 100644 --- a/Orange/tests/test_txt_reader.py +++ b/Orange/tests/test_txt_reader.py @@ -7,7 +7,7 @@ from Orange.data import Table, ContinuousVariable, DiscreteVariable from Orange.data.io import CSVReader -from Orange.tests import test_filename +from Orange.tests import test_filename, named_file tab_file = """\ Feature 1\tFeature 2\tFeature 3 @@ -124,6 +124,12 @@ def test_csv_sniffer(self): self.assertEqual(len(data), 8) self.assertEqual(len(data.domain.variables) + len(data.domain.metas), 15) + def test_utf_8_sig(self): + with named_file(csv_file, encoding="utf-8-sig") as f: + reader = CSVReader(f) + data = reader.read() + self.assertEqual(data.domain[0].name, "Feature 1") # first column name must come out without a BOM prefix + if __name__ == "__main__": unittest.main() diff --git a/i18n/si/msgs.jaml b/i18n/si/msgs.jaml index 301a197c8e9..43f27310598 100644 --- a/i18n/si/msgs.jaml +++ b/i18n/si/msgs.jaml @@ -1252,11 +1252,14 @@ data/io_util.py: .gz: false .bz2: false .xz: false + def `_is_utf8_sig`: + rb: false def `detect_encoding`: file: false --brief: false --mime-encoding: false utf-8: false + utf-8-sig: false us-ascii: false iso-8859-1: false utf-7: false