Skip to content

Commit 0d089a5

Browse files
authored
Merge pull request doccano#399 from CatalystCode/bugfix/import-non-utf8-files
Bugfix/Fix data import from non UTF-8 files
2 parents 55c6cc2 + e81ebb3 commit 0d089a5

File tree

4 files changed

+47
-4
lines changed

4 files changed

+47
-4
lines changed

app/api/tests/data/example.utf16.csv

106 Bytes
Binary file not shown.

app/api/tests/test_api.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -841,6 +841,12 @@ def test_can_upload_classification_csv(self):
841841
file_format='csv',
842842
expected_status=status.HTTP_201_CREATED)
843843

844+
def test_can_upload_csv_with_non_utf8_encoding(self):
    """Uploading the ``example.utf16.csv`` fixture as CSV must return 201."""
    upload_kwargs = dict(
        project_id=self.classification_project.id,
        filename='example.utf16.csv',
        file_format='csv',
        expected_status=status.HTTP_201_CREATED,
    )
    self.upload_test_helper(**upload_kwargs)
849+
844850
def test_can_upload_seq2seq_csv(self):
845851
self.upload_test_helper(project_id=self.seq2seq_project.id,
846852
filename='example.csv',

app/api/utils.py

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from random import Random
88

99
import conllu
10+
from chardet import UniversalDetector
1011
from django.db import transaction
1112
from django.conf import settings
1213
import pyexcel
@@ -245,7 +246,8 @@ class CoNLLParser(FileParser):
245246
"""
246247
def parse(self, file):
247248
data = []
248-
file = io.TextIOWrapper(file, encoding='utf-8')
249+
file = EncodedIO(file)
250+
file = io.TextIOWrapper(file, encoding=file.encoding)
249251

250252
# Add check exception
251253

@@ -300,7 +302,8 @@ class PlainTextParser(FileParser):
300302
```
301303
"""
302304
def parse(self, file):
303-
file = io.TextIOWrapper(file, encoding='utf-8')
305+
file = EncodedIO(file)
306+
file = io.TextIOWrapper(file, encoding=file.encoding)
304307
while True:
305308
batch = list(itertools.islice(file, settings.IMPORT_BATCH_SIZE))
306309
if not batch:
@@ -323,7 +326,8 @@ class CSVParser(FileParser):
323326
```
324327
"""
325328
def parse(self, file):
326-
file = io.TextIOWrapper(file, encoding='utf-8')
329+
file = EncodedIO(file)
330+
file = io.TextIOWrapper(file, encoding=file.encoding)
327331
reader = csv.reader(file)
328332
yield from ExcelParser.parse_excel_csv_reader(reader)
329333

@@ -364,7 +368,8 @@ def parse_excel_csv_reader(reader):
364368
class JSONParser(FileParser):
365369

366370
def parse(self, file):
367-
file = io.TextIOWrapper(file, encoding='utf-8')
371+
file = EncodedIO(file)
372+
file = io.TextIOWrapper(file, encoding=file.encoding)
368373
data = []
369374
for i, line in enumerate(file, start=1):
370375
if len(data) >= settings.IMPORT_BATCH_SIZE:
@@ -506,3 +511,34 @@ def readinto(self, b):
506511
return 0 # indicate EOF
507512

508513
return io.BufferedReader(IterStream(), buffer_size=buffer_size)
514+
515+
516+
class EncodedIO(io.RawIOBase):
    """Readable raw stream that detects the text encoding of a binary stream.

    On construction the wrapped stream is read in ``buffer_size`` chunks and
    each chunk is fed to chardet's ``UniversalDetector`` until the detector
    reports it is done or a short read occurs. The bytes consumed while
    sniffing are kept and replayed by ``readinto`` before any further data
    is pulled from the wrapped stream, so a reader sees the full content.

    Note: when the detector never reports ``done`` (and no short read
    happens earlier), the loop runs to the end of the stream and everything
    read so far is held in memory.

    Attributes:
        encoding: Name of the detected encoding, or ``default_encoding`` when
            the detector produced no result or reported plain ASCII.
    """

    def __init__(self, fobj, buffer_size=io.DEFAULT_BUFFER_SIZE, default_encoding='utf-8'):
        """Sniff the encoding of ``fobj`` and remember the bytes consumed.

        Args:
            fobj: Binary file-like object providing ``read(size)`` and
                ``readable()``.
            buffer_size: Number of bytes requested per read while sniffing.
            default_encoding: Encoding to report when detection yields
                nothing usable.
        """
        super().__init__()
        detector = UniversalDetector()
        # Collect chunks in a list and join once; repeated ``bytes +=`` would
        # copy the accumulated buffer on every iteration.
        chunks = []

        while True:
            read = fobj.read(buffer_size)
            detector.feed(read)
            chunks.append(read)
            if detector.done or len(read) < buffer_size:
                break

        # close() makes the detector run its final calculation; without it
        # ``result`` is only populated for encodings recognised early, and
        # every other input would silently fall back to the default.
        detector.close()
        detected = (detector.result or {}).get('encoding')
        # Plain ASCII keeps using the default encoding, as it did before the
        # detector was finalized explicitly.
        if not detected or detected.lower() == 'ascii':
            detected = default_encoding
        self.encoding = detected

        self._fobj = fobj
        self._buffer = b''.join(chunks)

    def readable(self):
        """Return whether the wrapped stream is readable."""
        return self._fobj.readable()

    def readinto(self, b):
        """Fill ``b`` with up to ``len(b)`` bytes and return the count written.

        Bytes captured during encoding detection are served first; once they
        are exhausted, data is read from the wrapped stream. Returning 0
        signals end of stream.
        """
        size = len(b)
        chunk = self._buffer or self._fobj.read(size)
        # Whatever does not fit into ``b`` is kept for the next call.
        output, self._buffer = chunk[:size], chunk[size:]
        b[:len(output)] = output
        return len(output)

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
apache-libcloud==2.4.0
22
applicationinsights==0.11.7
3+
chardet==3.0.4
34
coverage==4.5.3
45
dj-database-url==0.5.0
56
Django==2.1.7

0 commit comments

Comments
 (0)