77from typing import IO , Callable , List , Optional
88
99from unstructured .documents .elements import Element , PageBreak
10+ from unstructured .file_utils .encoding import detect_file_encoding
1011from unstructured .nlp .patterns import LIST_OF_DICTS_PATTERN
1112from unstructured .partition .common import (
1213 _add_element_metadata ,
@@ -190,6 +191,7 @@ def detect_filetype(
190191 content_type : Optional [str ] = None ,
191192 file : Optional [IO ] = None ,
192193 file_filename : Optional [str ] = None ,
194+ encoding : Optional [str ] = "utf-8" ,
193195) -> Optional [FileType ]:
194196 """Use libmagic to determine a file's type. Helps determine which partition brick
195197 to use for a given file. A return value of None indicates a non-supported file type.
@@ -257,10 +259,10 @@ def detect_filetype(
257259 elif extension and extension == ".html" :
258260 return FileType .HTML
259261
260- if _is_text_file_a_json (file = file , filename = filename ):
262+ if _is_text_file_a_json (file = file , filename = filename , encoding = encoding ):
261263 return FileType .JSON
262264
263- if _is_text_file_a_csv (file = file , filename = filename ):
265+ if _is_text_file_a_csv (file = file , filename = filename , encoding = encoding ):
264266 return FileType .CSV
265267
266268 if file and not extension and _check_eml_from_buffer (file = file ) is True :
@@ -333,6 +335,7 @@ def _detect_filetype_from_octet_stream(file: IO) -> FileType:
333335def _read_file_start_for_type_check (
334336 filename : Optional [str ] = None ,
335337 file : Optional [IO ] = None ,
338+ encoding : Optional [str ] = "utf-8" ,
336339) -> str :
337340 """Reads the start of the file and returns the text content."""
338341 exactly_one (filename = filename , file = file )
@@ -345,26 +348,33 @@ def _read_file_start_for_type_check(
345348 file_text = file_content .decode (errors = "ignore" )
346349 file .seek (0 )
347350 if filename is not None :
348- with open (filename ) as f :
349- file_text = f .read (4096 )
351+ try :
352+ with open (filename , encoding = encoding ) as f :
353+ file_text = f .read (4096 )
354+ except UnicodeDecodeError :
355+ encoding , _ = detect_file_encoding (filename = filename )
356+ with open (filename , encoding = encoding ) as f :
357+ file_text = f .read (4096 )
350358 return file_text
351359
352360
def _is_text_file_a_json(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    encoding: Optional[str] = "utf-8",
):
    """Report whether a file with a text/plain MIME type looks like JSON.

    The text at the start of the file is obtained from
    ``_read_file_start_for_type_check``, which receives ``filename``,
    ``file`` and ``encoding`` unchanged, and is then matched from its
    beginning against ``LIST_OF_DICTS_PATTERN``.

    Returns:
        True when the pattern matches the start of the text, False otherwise.
    """
    head = _read_file_start_for_type_check(
        filename=filename,
        file=file,
        encoding=encoding,
    )
    matched = re.match(LIST_OF_DICTS_PATTERN, head)
    return matched is not None
360369
361370
362371def _is_text_file_a_csv (
363372 filename : Optional [str ] = None ,
364373 file : Optional [IO ] = None ,
374+ encoding : Optional [str ] = "utf-8" ,
365375):
366376 """Detects if a file that has a text/plain MIME type is a CSV file."""
367- file_text = _read_file_start_for_type_check (file = file , filename = filename )
377+ file_text = _read_file_start_for_type_check (file = file , filename = filename , encoding = encoding )
368378 lines = file_text .strip ().splitlines ()
369379 if len (lines ) < 2 :
370380 return False
0 commit comments