Skip to content

Commit 8e7613d

Browse files
author
patched.codes[bot]
committed
Patched patchwork/common/utils/utils.py
1 parent e0f4770 commit 8e7613d

File tree

1 file changed

+52
-18
lines changed

1 file changed

+52
-18
lines changed

patchwork/common/utils/utils.py

Lines changed: 52 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -57,32 +57,66 @@ def defered_temp_file(
5757
return tempfile_fp
5858

5959

60+
class FileWithEncoding:
    """Text-file wrapper with an explicit encoding that never translates line endings.

    The file is opened in the binary mode matching the requested text mode and
    wrapped in an ``io.TextIOWrapper`` created with ``newline=""``, so CR, LF
    and CRLF sequences pass through reads and writes unchanged.

    Attributes:
        binary_file: The underlying binary file object.
        encoding: Codec name used to decode reads and encode writes.
        errors: Codec error handler; ``"strict"`` when ``None`` was passed.
        mode: The mode string exactly as supplied by the caller.
        name: Name reported by the underlying binary file.
    """

    def __init__(self, file, mode, encoding, errors=None):
        """Open *file* for text I/O using *encoding*.

        Args:
            file: Path (or descriptor) handed to the built-in ``open``.
            mode: Text-style mode such as ``"r"``, ``"w"``, ``"a"``, ``"x"`` or
                ``"r+"``. Any ``"t"``/``"b"`` flag is dropped; the file is
                always opened in the corresponding binary mode.
            encoding: Codec name used for all text conversion.
            errors: Codec error handler; ``None`` means ``"strict"``.

        Raises:
            OSError: If the file cannot be opened in the requested mode.
            LookupError: If *encoding* is not a known codec.
        """
        import io  # local import keeps this block independent of file-level imports

        # Honour the requested access mode; a hard-coded "rb" makes every
        # write fail and prevents "w"/"a"/"x" from creating the file.
        binary_mode = mode.replace("t", "").replace("b", "") + "b"
        self.binary_file = open(file, mode=binary_mode)
        self.encoding = encoding
        self.errors = "strict" if errors is None else errors
        self.mode = mode
        self.name = self.binary_file.name
        try:
            # newline="" disables newline translation in both directions.
            # The wrapper decodes incrementally, so a sized read never splits
            # a multi-byte character, and stateful codecs (e.g. utf-16) emit
            # their BOM only once.
            self._text_file = io.TextIOWrapper(
                self.binary_file,
                encoding=self.encoding,
                errors=self.errors,
                newline="",
            )
        except Exception:
            # Do not leak the descriptor when the codec lookup fails.
            self.binary_file.close()
            raise

    def read(self, size=None):
        """Return up to *size* characters, or the rest of the file when *size* is ``None``."""
        return self._text_file.read(size)

    def readline(self, size=-1):
        """Return the next line with its original line terminator intact."""
        return self._text_file.readline(size)

    def readlines(self, hint=-1):
        """Return the remaining lines as a list, terminators intact."""
        return self._text_file.readlines(hint)

    def write(self, data):
        """Write *data* to the file.

        ``str`` data is encoded with the configured codec and the number of
        characters written is returned. Any other (bytes-like) data is written
        to the underlying binary file as-is and the byte count is returned.
        """
        if isinstance(data, str):
            return self._text_file.write(data)
        # Push pending text out first so raw bytes land in call order.
        self._text_file.flush()
        return self.binary_file.write(data)

    def flush(self):
        """Flush buffered text through to the underlying binary file."""
        self._text_file.flush()

    def close(self):
        """Flush pending output and close the underlying file."""
        self._text_file.close()

    def __iter__(self):
        return self

    def __next__(self):
        line = self._text_file.readline()
        if not line:
            raise StopIteration
        return line

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
87+
6088
def open_with_chardet(file, mode="r", buffering=-1, errors=None, newline=None, closefd=True, opener=None):
    """Open *file* as text using an encoding detected by chardet.

    The file's bytes are fed to a ``UniversalDetector`` in 1024-byte chunks
    until the detector reports it is done or the file ends. The detected
    encoding (``"utf-8"`` when nothing is detected, e.g. for an empty file) is
    then used to construct a ``FileWithEncoding``.

    Args:
        file: Path to the file to open.
        mode: Mode to open the file in (``"r"`` by default).
        buffering: Accepted for signature compatibility with the built-in
            ``open``; not used by this function.
        errors: Encoding error handler forwarded to ``FileWithEncoding``
            (``None`` by default).
        newline: Accepted for signature compatibility; not used by this
            function.
        closefd: Accepted for signature compatibility; not used by this
            function.
        opener: Accepted for signature compatibility; not used by this
            function.

    Returns:
        A ``FileWithEncoding`` for *file* using the detected encoding.

    Raises:
        FileNotFoundError: If *file* does not exist and *mode* is a read mode
            (contains ``"r"``).
    """
    encoding = "utf-8"  # Fallback when the file is empty, new, or detection fails
    detector = UniversalDetector()

    try:
        with open(file, "rb") as f:
            # iter(callable, sentinel) stops at the first empty read (EOF).
            for chunk in iter(lambda: f.read(1024), b""):
                detector.feed(chunk)
                if detector.done:
                    break
    except FileNotFoundError:
        # A file about to be created ("w", "a", "x") has nothing to detect;
        # only read modes require it to exist already.
        if "r" in mode:
            raise
        return FileWithEncoding(file, mode, encoding, errors)

    detector.close()
    detected = detector.result.get("encoding")
    if detected:
        encoding = detected

    return FileWithEncoding(file, mode, encoding, errors)
86120

87121

88122
# tiktoken encoding looked up by its registry name "cl100k_base" and kept for reuse.
_ENCODING = tiktoken.get_encoding("cl100k_base")

0 commit comments

Comments
 (0)