Skip to content

Commit d5b9815

Browse files
authored
Merge branch 'master' into enhancement/refactor-to-colour-library
2 parents 4928750 + 0d089a5 commit d5b9815

File tree

4 files changed

+47
-4
lines changed

4 files changed

+47
-4
lines changed

app/api/tests/data/example.utf16.csv

106 Bytes
Binary file not shown.

app/api/tests/test_api.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -841,6 +841,12 @@ def test_can_upload_classification_csv(self):
841841
file_format='csv',
842842
expected_status=status.HTTP_201_CREATED)
843843

844+
def test_can_upload_csv_with_non_utf8_encoding(self):
    # Upload the `example.utf16.csv` fixture as a CSV file to the
    # classification project and expect the API to answer 201 Created.
    upload_kwargs = {
        'project_id': self.classification_project.id,
        'filename': 'example.utf16.csv',
        'file_format': 'csv',
        'expected_status': status.HTTP_201_CREATED,
    }
    self.upload_test_helper(**upload_kwargs)
849+
844850
def test_can_upload_seq2seq_csv(self):
845851
self.upload_test_helper(project_id=self.seq2seq_project.id,
846852
filename='example.csv',

app/api/utils.py

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from collections import defaultdict
77

88
import conllu
9+
from chardet import UniversalDetector
910
from django.db import transaction
1011
from django.conf import settings
1112
from colour import Color
@@ -246,7 +247,8 @@ class CoNLLParser(FileParser):
246247
"""
247248
def parse(self, file):
248249
data = []
249-
file = io.TextIOWrapper(file, encoding='utf-8')
250+
file = EncodedIO(file)
251+
file = io.TextIOWrapper(file, encoding=file.encoding)
250252

251253
# Add check exception
252254

@@ -301,7 +303,8 @@ class PlainTextParser(FileParser):
301303
```
302304
"""
303305
def parse(self, file):
304-
file = io.TextIOWrapper(file, encoding='utf-8')
306+
file = EncodedIO(file)
307+
file = io.TextIOWrapper(file, encoding=file.encoding)
305308
while True:
306309
batch = list(itertools.islice(file, settings.IMPORT_BATCH_SIZE))
307310
if not batch:
@@ -324,7 +327,8 @@ class CSVParser(FileParser):
324327
```
325328
"""
326329
def parse(self, file):
327-
file = io.TextIOWrapper(file, encoding='utf-8')
330+
file = EncodedIO(file)
331+
file = io.TextIOWrapper(file, encoding=file.encoding)
328332
reader = csv.reader(file)
329333
yield from ExcelParser.parse_excel_csv_reader(reader)
330334

@@ -365,7 +369,8 @@ def parse_excel_csv_reader(reader):
365369
class JSONParser(FileParser):
366370

367371
def parse(self, file):
368-
file = io.TextIOWrapper(file, encoding='utf-8')
372+
file = EncodedIO(file)
373+
file = io.TextIOWrapper(file, encoding=file.encoding)
369374
data = []
370375
for i, line in enumerate(file, start=1):
371376
if len(data) >= settings.IMPORT_BATCH_SIZE:
@@ -466,3 +471,34 @@ def readinto(self, b):
466471
return 0 # indicate EOF
467472

468473
return io.BufferedReader(IterStream(), buffer_size=buffer_size)
474+
475+
476+
class EncodedIO(io.RawIOBase):
    """Raw byte stream that sniffs the character encoding of ``fobj``.

    On construction the wrapped binary file object is read chunk by chunk
    and fed to a chardet ``UniversalDetector`` until the detector reports a
    conclusive result or the stream is exhausted. The bytes consumed while
    sniffing are kept and replayed by ``readinto`` before any further data
    is pulled from ``fobj``, so a consumer (for example
    ``io.TextIOWrapper(stream, encoding=stream.encoding)``) still sees the
    complete content.

    Attributes:
        encoding: The detected encoding name, or ``default_encoding`` when
            the detector never reached a conclusive result.
    """

    def __init__(self, fobj, buffer_size=io.DEFAULT_BUFFER_SIZE, default_encoding='utf-8'):
        """Sniff the encoding of ``fobj``.

        Args:
            fobj: Binary file-like object providing ``read(size)`` and
                ``readable()``.
            buffer_size: Number of bytes requested per read while sniffing.
            default_encoding: Encoding reported when detection is
                inconclusive.
        """
        super().__init__()

        # Collect the chunks in a list and join once at the end; repeated
        # ``bytes +=`` would copy the accumulated data on every iteration.
        chunks = []
        detector = UniversalDetector()

        while not detector.done:
            read = fobj.read(buffer_size)
            # An empty read is the end-of-stream signal. A merely short
            # read is not, so keep going until nothing more comes back.
            if not read:
                break
            detector.feed(read)
            chunks.append(read)

        # Only a conclusive detection is trusted; anything else falls back
        # to the caller-supplied default.
        if detector.done:
            self.encoding = detector.result['encoding'] or default_encoding
        else:
            self.encoding = default_encoding

        self._fobj = fobj
        # A memoryview lets ``readinto`` slice off consumed bytes without
        # copying the rest of the replay buffer each time.
        self._buffer = memoryview(b''.join(chunks))

    def readable(self):
        """Return whether the wrapped file object is readable."""
        return self._fobj.readable()

    def readinto(self, b):
        """Fill ``b`` with up to ``len(b)`` bytes and return the count.

        Bytes buffered during encoding detection are served first; once
        they are used up, data is read directly from the wrapped file
        object. A return value of 0 indicates end of stream.
        """
        size = len(b)
        chunk = self._buffer or self._fobj.read(size)
        output, self._buffer = chunk[:size], chunk[size:]
        count = len(output)
        b[:count] = output
        return count

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
apache-libcloud==2.4.0
22
applicationinsights==0.11.7
33
colour==0.1.5
4+
chardet==3.0.4
45
coverage==4.5.3
56
dj-database-url==0.5.0
67
Django==2.1.7

0 commit comments

Comments
 (0)