6
6
from collections import defaultdict
7
7
8
8
import conllu
9
+ from chardet import UniversalDetector
9
10
from django .db import transaction
10
11
from django .conf import settings
11
12
from colour import Color
@@ -246,7 +247,8 @@ class CoNLLParser(FileParser):
246
247
"""
247
248
def parse (self , file ):
248
249
data = []
249
- file = io .TextIOWrapper (file , encoding = 'utf-8' )
250
+ file = EncodedIO (file )
251
+ file = io .TextIOWrapper (file , encoding = file .encoding )
250
252
251
253
# Add check exception
252
254
@@ -301,7 +303,8 @@ class PlainTextParser(FileParser):
301
303
```
302
304
"""
303
305
def parse (self , file ):
304
- file = io .TextIOWrapper (file , encoding = 'utf-8' )
306
+ file = EncodedIO (file )
307
+ file = io .TextIOWrapper (file , encoding = file .encoding )
305
308
while True :
306
309
batch = list (itertools .islice (file , settings .IMPORT_BATCH_SIZE ))
307
310
if not batch :
@@ -324,7 +327,8 @@ class CSVParser(FileParser):
324
327
```
325
328
"""
326
329
def parse(self, file):
    """Parse an uploaded CSV file.

    The binary ``file`` is wrapped in ``EncodedIO`` to determine its text
    encoding, decoded through a ``TextIOWrapper`` and read with
    ``csv.reader``. The rows are handed to
    ``ExcelParser.parse_excel_csv_reader`` and whatever that generator
    produces is yielded unchanged.
    """
    encoded = EncodedIO(file)
    # The csv module requires newline='' on the text layer: otherwise
    # universal-newline translation rewrites line breaks that appear inside
    # quoted fields before the csv reader gets to see them.
    text = io.TextIOWrapper(encoded, encoding=encoded.encoding, newline='')
    reader = csv.reader(text)
    yield from ExcelParser.parse_excel_csv_reader(reader)
330
334
@@ -365,7 +369,8 @@ def parse_excel_csv_reader(reader):
365
369
class JSONParser (FileParser ):
366
370
367
371
def parse (self , file ):
368
- file = io .TextIOWrapper (file , encoding = 'utf-8' )
372
+ file = EncodedIO (file )
373
+ file = io .TextIOWrapper (file , encoding = file .encoding )
369
374
data = []
370
375
for i , line in enumerate (file , start = 1 ):
371
376
if len (data ) >= settings .IMPORT_BATCH_SIZE :
@@ -466,3 +471,34 @@ def readinto(self, b):
466
471
return 0 # indicate EOF
467
472
468
473
return io .BufferedReader (IterStream (), buffer_size = buffer_size )
474
+
475
+
476
class EncodedIO(io.RawIOBase):
    """Read-only raw stream that sniffs the text encoding of ``fobj``.

    On construction the start of ``fobj`` is read and fed to a chardet
    ``UniversalDetector``. The bytes consumed while sniffing are kept and
    replayed by ``readinto`` before any further data is pulled from
    ``fobj``, so consumers see the complete, unmodified byte stream.

    Attributes:
        encoding: Name of the encoding to decode the stream with.
    """

    def __init__(self, fobj, buffer_size=io.DEFAULT_BUFFER_SIZE, default_encoding='utf-8'):
        """Sniff the encoding of ``fobj``.

        Args:
            fobj: Binary file-like object providing ``read(size)`` and
                ``readable()``.
            buffer_size: Number of bytes requested per ``read`` while
                sniffing.
            default_encoding: Encoding used when the input decodes cleanly
                with it, or when the detector has no guess at all.
        """
        super().__init__()
        chunks = []
        detector = UniversalDetector()

        # Only an empty read means end of input: a short read is legal for
        # streams and must not end the sniffing early (nor loop forever when
        # buffer_size <= 0, as comparing lengths would).
        while not detector.done:
            chunk = fobj.read(buffer_size)
            if not chunk:
                break
            detector.feed(chunk)
            chunks.append(chunk)

        # Join once instead of growing a bytes object chunk by chunk.
        buffer = b''.join(chunks)

        encoding = None
        if detector.done:
            # The detector became confident before the input was exhausted.
            encoding = detector.result['encoding']
        elif not self._decodes_as(buffer, default_encoding):
            # The input is not valid in the default encoding, so ask the
            # detector for its best final guess; close() is required to
            # finalise the result when `done` was never reached.
            detector.close()
            encoding = detector.result['encoding']

        # The detector may report None when it has no guess.
        self.encoding = encoding or default_encoding

        self._fobj = fobj
        # A memoryview lets readinto() slice off the consumed prefix without
        # copying the rest of the buffer on every call.
        self._buffer = memoryview(buffer) if buffer else b''

    @staticmethod
    def _decodes_as(data, encoding):
        """Return True when ``data`` decodes without error as ``encoding``."""
        try:
            data.decode(encoding)
        except UnicodeDecodeError:
            return False
        return True

    def readable(self):
        """Return whether the wrapped stream is readable."""
        return self._fobj.readable()

    def readinto(self, b):
        """Fill ``b`` with the next bytes of the stream.

        Bytes buffered during sniffing are served first; afterwards data is
        read straight from the wrapped stream.

        Returns:
            The number of bytes written into ``b`` (0 at end of stream), or
            None when the wrapped stream has no data available right now.
        """
        size = len(b)
        chunk = self._buffer or self._fobj.read(size)
        if chunk is None:
            # Non-blocking source without data: propagate per the RawIOBase
            # contract instead of failing on None[:size].
            return None

        output, rest = chunk[:size], chunk[size:]
        # Drop the reference to the sniffing buffer once it is drained.
        self._buffer = rest if len(rest) else b''
        b[:len(output)] = output
        return len(output)
0 commit comments