Skip to content

Commit c72c463

Browse files
authored
Merge pull request #7006 from ales-erjavec/table-reader-utf-8-sig
[FIX] io_util: Detect utf-8-sig when using `file` utility
2 parents 70285e3 + 2df43bf commit c72c463

File tree

3 files changed

+24
-1
lines changed

3 files changed

+24
-1
lines changed

Orange/data/io_util.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import codecs
12
import os.path
23
import subprocess
34
from collections import defaultdict
@@ -45,6 +46,16 @@ def open_compressed(filename, *args, _open=open, **kwargs):
4546
# Else already a file, just pass it through
4647
return filename
4748

49+
def _is_utf8_sig(filename: str) -> bool:
50+
"""Return whether the file `filename` starts with a UTF-8 BOM."""
51+
try:
52+
with open(filename, "rb") as f:
53+
bom = f.read(3)
54+
return bom == codecs.BOM_UTF8
55+
except OSError: # pragma: no cover
56+
return False
57+
58+
4859

4960
def detect_encoding(filename):
5061
"""
@@ -59,6 +70,9 @@ def detect_encoding(filename):
5970
proc.wait()
6071
if proc.returncode == 0:
6172
encoding = proc.stdout.read().strip()
73+
# the `file` utility does not detect/report a UTF-8 BOM, so check for it here
74+
if encoding == b'utf-8':
75+
return "utf-8-sig" if _is_utf8_sig(filename) else "utf-8"
6276
# file only supports these encodings; for others it says
6377
# unknown-8bit or binary. So we give chardet a chance to do
6478
# better

Orange/tests/test_txt_reader.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from Orange.data import Table, ContinuousVariable, DiscreteVariable
99
from Orange.data.io import CSVReader
10-
from Orange.tests import test_filename
10+
from Orange.tests import test_filename, named_file
1111

1212
tab_file = """\
1313
Feature 1\tFeature 2\tFeature 3
@@ -124,6 +124,12 @@ def test_csv_sniffer(self):
124124
self.assertEqual(len(data), 8)
125125
self.assertEqual(len(data.domain.variables) + len(data.domain.metas), 15)
126126

127+
def test_utf_8_sig(self):
128+
with named_file(csv_file, encoding="utf-8-sig") as f:
129+
reader = CSVReader(f)
130+
data = reader.read()
131+
self.assertEqual(data.domain[0].name, "Feature 1")
132+
127133

128134
if __name__ == "__main__":
129135
unittest.main()

i18n/si/msgs.jaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1252,11 +1252,14 @@ data/io_util.py:
12521252
.gz: false
12531253
.bz2: false
12541254
.xz: false
1255+
def `_is_utf8_sig`:
1256+
rb: false
12551257
def `detect_encoding`:
12561258
file: false
12571259
--brief: false
12581260
--mime-encoding: false
12591261
utf-8: false
1262+
utf-8-sig: false
12601263
us-ascii: false
12611264
iso-8859-1: false
12621265
utf-7: false

0 commit comments

Comments
 (0)