7
7
from random import Random
8
8
9
9
import conllu
10
+ from chardet import UniversalDetector
10
11
from django .db import transaction
11
12
from django .conf import settings
12
13
import pyexcel
@@ -245,7 +246,8 @@ class CoNLLParser(FileParser):
245
246
"""
246
247
def parse (self , file ):
247
248
data = []
248
- file = io .TextIOWrapper (file , encoding = 'utf-8' )
249
+ file = EncodedIO (file )
250
+ file = io .TextIOWrapper (file , encoding = file .encoding )
249
251
250
252
# Add check exception
251
253
@@ -300,7 +302,8 @@ class PlainTextParser(FileParser):
300
302
```
301
303
"""
302
304
def parse (self , file ):
303
- file = io .TextIOWrapper (file , encoding = 'utf-8' )
305
+ file = EncodedIO (file )
306
+ file = io .TextIOWrapper (file , encoding = file .encoding )
304
307
while True :
305
308
batch = list (itertools .islice (file , settings .IMPORT_BATCH_SIZE ))
306
309
if not batch :
@@ -323,7 +326,8 @@ class CSVParser(FileParser):
323
326
```
324
327
"""
325
328
def parse (self , file ):
326
- file = io .TextIOWrapper (file , encoding = 'utf-8' )
329
+ file = EncodedIO (file )
330
+ file = io .TextIOWrapper (file , encoding = file .encoding )
327
331
reader = csv .reader (file )
328
332
yield from ExcelParser .parse_excel_csv_reader (reader )
329
333
@@ -364,7 +368,8 @@ def parse_excel_csv_reader(reader):
364
368
class JSONParser (FileParser ):
365
369
366
370
def parse (self , file ):
367
- file = io .TextIOWrapper (file , encoding = 'utf-8' )
371
+ file = EncodedIO (file )
372
+ file = io .TextIOWrapper (file , encoding = file .encoding )
368
373
data = []
369
374
for i , line in enumerate (file , start = 1 ):
370
375
if len (data ) >= settings .IMPORT_BATCH_SIZE :
@@ -506,3 +511,34 @@ def readinto(self, b):
506
511
return 0 # indicate EOF
507
512
508
513
return io .BufferedReader (IterStream (), buffer_size = buffer_size )
514
+
515
+
516
class EncodedIO(io.RawIOBase):
    """Raw byte stream that sniffs the character encoding of ``fobj``.

    On construction the wrapped stream is read in ``buffer_size`` chunks and
    fed to a chardet ``UniversalDetector`` until the detector reports it is
    done or the stream yields a short/empty read.  The bytes consumed while
    sampling are kept and replayed first by :meth:`readinto`, so no data is
    lost and ``fobj`` does not need to be seekable.

    The outcome is exposed as ``self.encoding``; callers typically pass it to
    ``io.TextIOWrapper(encoded_io, encoding=encoded_io.encoding)``.

    Args:
        fobj: Binary file-like object providing ``read(size)`` and
            ``readable()``.
        buffer_size: Number of bytes requested per sampling read.
        default_encoding: Encoding reported when the detector did not reach
            a conclusion while sampling.
    """

    def __init__(self, fobj, buffer_size=io.DEFAULT_BUFFER_SIZE, default_encoding='utf-8'):
        super().__init__()
        sampled = []
        detector = UniversalDetector()

        while True:
            chunk = fobj.read(buffer_size)
            if not chunk:
                # b'' is end of stream and None means "no data available";
                # either way there is nothing more to sample.  Stopping here
                # also prevents an endless loop when buffer_size <= 0, where
                # the short-read test below could never become true.
                break
            detector.feed(chunk)
            sampled.append(chunk)
            if detector.done or len(chunk) < buffer_size:
                break

        # NOTE(review): chardet's documented usage calls detector.close()
        # after the last feed() to finalise a lower-confidence guess.  It is
        # not called here, so only results reached during feed() are used and
        # everything else falls back to default_encoding -- confirm that this
        # is the intended policy before changing it.
        if detector.done:
            self.encoding = detector.result['encoding']
        else:
            self.encoding = default_encoding

        self._fobj = fobj
        # Join once instead of growing a bytes object on every iteration.
        self._buffer = b''.join(sampled)

    def readable(self):
        """Return whether the wrapped stream reports itself as readable."""
        return self._fobj.readable()

    def readinto(self, b):
        """Fill ``b`` with bytes, replaying the sampled prefix first.

        Each call serves data either from the replay buffer or from the
        wrapped stream, never both, so a call may return fewer bytes than
        ``len(b)`` even when more data follows.

        Returns:
            The number of bytes written into ``b`` (0 at end of stream), or
            ``None`` when the wrapped stream returned ``None`` from ``read``.
        """
        size = len(b)
        chunk = self._buffer or self._fobj.read(size)
        if chunk is None:
            # Propagate "no data available" instead of failing on None[:size].
            return None
        output, self._buffer = chunk[:size], chunk[size:]
        b[:len(output)] = output
        return len(output)
0 commit comments