@@ -57,32 +57,66 @@ def defered_temp_file(
5757 return tempfile_fp
5858
5959
class FileWithEncoding:
    """File-like text wrapper that never translates line endings.

    The underlying file is always opened in binary mode; text is decoded on
    read and encoded on write with the given ``encoding``/``errors``.  Because
    no newline translation layer is involved, ``\\r\\n`` and ``\\n`` round-trip
    unchanged.

    Attributes:
        binary_file: The underlying binary file object.
        encoding: Name of the codec used for decoding/encoding.
        errors: Codec error handler ('strict' when ``None`` was passed).
        mode: The mode string exactly as supplied by the caller.
        name: Name of the underlying file.

    Note:
        Reads may buffer decoded text internally, so interleaving reads and
        writes on the same instance does not keep the file position in sync.
    """

    # Number of bytes fetched per refill while searching for a line ending.
    _CHUNK_SIZE = 8192

    def __init__(self, file, mode, encoding, errors=None):
        """Open *file* in the binary variant of *mode*.

        Args:
            file: Path (or descriptor) handed to the built-in ``open``.
            mode: Requested mode such as "r", "w", "a" or "r+"; any "t"/"b"
                flag is dropped and "b" is appended for the real open call.
            encoding: Codec name used to decode reads and encode writes.
            errors: Codec error handler; ``None`` means 'strict'.
        """
        self.encoding = encoding
        # Default to 'strict' error handling if None provided.
        self.errors = "strict" if errors is None else errors
        self.mode = mode
        self._decoder = None  # created lazily on first read
        self._pending = ""  # decoded text not yet handed to the caller
        # Honour the caller's mode (previously the file was always opened
        # read-only, so write() could never succeed).
        binary_mode = mode.replace("t", "").replace("b", "") + "b"
        self.binary_file = open(file, mode=binary_mode)
        self.name = self.binary_file.name

    def _get_decoder(self):
        """Return the incremental decoder, creating it on first use."""
        if self._decoder is None:
            import codecs

            self._decoder = codecs.getincrementaldecoder(self.encoding)(self.errors)
        return self._decoder

    def read(self, size=None):
        """Return decoded text.

        Args:
            size: Maximum number of characters to return; ``None`` or a
                negative value reads everything that is left.

        Returns:
            The decoded text, or "" at end of file.
        """
        decoder = self._get_decoder()
        if size is None or size < 0:
            text = self._pending + decoder.decode(self.binary_file.read(), final=True)
            self._pending = ""
            return text
        # An incremental decoder keeps incomplete multi-byte sequences between
        # calls, so a character split across two reads is never mis-decoded.
        while len(self._pending) < size:
            raw = self.binary_file.read(size - len(self._pending))
            if not raw:
                self._pending += decoder.decode(b"", final=True)
                break
            self._pending += decoder.decode(raw)
        text, self._pending = self._pending[:size], self._pending[size:]
        return text

    def readline(self):
        """Return the next line including its original ending, or "" at EOF.

        Lines are split on "\\n" only; a preceding "\\r" is kept as-is.
        """
        decoder = self._get_decoder()
        while True:
            newline_at = self._pending.find("\n")
            if newline_at >= 0:
                end = newline_at + 1
                break
            raw = self.binary_file.read(self._CHUNK_SIZE)
            if not raw:
                self._pending += decoder.decode(b"", final=True)
                end = len(self._pending)
                break
            self._pending += decoder.decode(raw)
        line, self._pending = self._pending[:end], self._pending[end:]
        return line

    def readlines(self):
        """Return all remaining lines as a list."""
        return list(self)

    def __iter__(self):
        return self

    def __next__(self):
        line = self.readline()
        if not line:
            raise StopIteration
        return line

    def write(self, data):
        """Write *data*, encoding it first when it is a ``str``.

        Returns:
            The number of bytes written, as reported by the binary file.
        """
        if isinstance(data, str):
            data = data.encode(self.encoding, errors=self.errors)
        return self.binary_file.write(data)

    def close(self):
        """Close the underlying binary file."""
        self.binary_file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
def open_with_chardet(file, mode="r", buffering=-1, errors=None, newline=None, closefd=True, opener=None):
    """Open *file* with an encoding detected by chardet.

    The file is first scanned in binary mode, 1024 bytes at a time, until the
    detector reports it is done or the file ends.  It is then reopened through
    ``FileWithEncoding`` using the detected encoding.

    Args:
        file: Path to the file to open.
        mode: Mode forwarded to ``FileWithEncoding`` ("r" by default).
        buffering: Accepted for signature compatibility with ``open``; not used.
        errors: Encoding error handler forwarded to ``FileWithEncoding``.
        newline: Accepted for signature compatibility with ``open``; not used.
        closefd: Accepted for signature compatibility with ``open``; not used.
        opener: Accepted for signature compatibility with ``open``; not used.

    Returns:
        A ``FileWithEncoding`` instance for *file*.  "utf-8" is used when the
        file is empty or no encoding could be detected.
    """
    detector = UniversalDetector()

    with open(file, "rb") as raw:
        # iter(callable, sentinel) stops at the first empty read (end of file).
        for chunk in iter(lambda: raw.read(1024), b""):
            detector.feed(chunk)
            if detector.done:
                break

    # close() finalises the detector before its result is read.
    detector.close()

    # Fall back to UTF-8 when the result has no usable encoding name.
    encoding = detector.result.get("encoding") or "utf-8"
    return FileWithEncoding(file, mode, encoding, errors)
86120
87121
# Module-level tiktoken encoding, looked up once by the name "cl100k_base".
_ENCODING = tiktoken.get_encoding("cl100k_base")
0 commit comments