Skip to content

Commit 73a293f

Browse files
committed
Be stricter about slightly-invalid text decoding
1 parent 0010944 commit 73a293f

File tree

2 files changed

+6
-8
lines changed

2 files changed

+6
-8
lines changed

CHANGELOG.md

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -4,10 +4,8 @@
44

55
Updated Clojure, Common Lisp and Zig parsers.
66

7-
File detection is now stricter with Windows-1252 (Latin 1) encoded
8-
text. Windows-1252 was added in 0.63 and some binary files
9-
(e.g. Brotli compressed files) were incorrectly treated as this
10-
encoding.
7+
Text encoding detection is now stricter, fixing more cases where
8+
binary files were treated as text.
119

1210
Added the `--override-binary` option to force files to be treated as
1311
binary rather than text.

src/files.rs

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -223,7 +223,7 @@ pub(crate) fn guess_content(
223223
let utf8_string = String::from_utf8_lossy(bytes).to_string();
224224
let num_utf8_invalid = utf8_string
225225
.chars()
226-
.take(5000)
226+
.take(50000)
227227
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
228228
.count();
229229
if num_utf8_invalid <= 2 {
@@ -239,7 +239,7 @@ pub(crate) fn guess_content(
239239
let utf16_string = String::from_utf16_lossy(&u16_values);
240240
let num_utf16_invalid = utf16_string
241241
.chars()
242-
.take(5000)
242+
.take(50000)
243243
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
244244
.count();
245245
if num_utf16_invalid <= 1 {
@@ -250,13 +250,13 @@ pub(crate) fn guess_content(
250250
return ProbableFileKind::Text(utf16_string);
251251
}
252252

253-
// If the input bytes are valid Windows-1252 (an extension of
253+
// If the input bytes are mostly valid Windows-1252 (an extension of
254254
// ISO-8859-1 aka Latin 1), treat them as such.
255255
let (latin1_str, _encoding, saw_malformed) = encoding_rs::WINDOWS_1252.decode(bytes);
256256
if !saw_malformed {
257257
let num_null = latin1_str
258258
.chars()
259-
.take(5000)
259+
.take(50000)
260260
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
261261
.count();
262262
if num_null <= 1 {

0 commit comments

Comments
 (0)