Skip to content

Commit 01dc1a6

Browse files
committed
Fix isProbablyUtf8 to handle multibyte chars over the 1024 byte line
Previously, multibyte chars crossing this line would make a valid UTF-8 string be considered as binary. That's particularly bad because if you're modifying a string containing multibyte chars it might change from utf-8 to binary when a multibyte char is pushed across that line, resulting in a very weird edit experience.
1 parent 64b2a8e commit 01dc1a6

File tree

2 files changed

+101
-2
lines changed

2 files changed

+101
-2
lines changed

src/util/buffer.ts

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,31 @@ const binaryLaxDecoder = new TextDecoder('latin1', { fatal: false });
2222
// as UTF8) but for _editing_ binary data we need a lossless encoding, not utf8.
2323
// (Monaco isn't perfectly lossless editing binary anyway, but we can try).
2424
export function isProbablyUtf8(buffer: Buffer) {
25+
let dataToCheck: Buffer;
26+
if (buffer.byteLength > 1028) {
27+
// We want to trim the data to ~1KB, to avoid checking huge bodies in full.
28+
// Unfortunately, for UTF-8 this isn't safe: if a multibyte char crosses the
29+
// line when we cut the string, our slice of valid UTF-8 will be invalid.
30+
// To handle this, we find the first non-continuation byte after 1024, and
31+
// decode everything up to that point.
32+
33+
const lastUtf8IndexBefore1024 = buffer
34+
.slice(1024, 1028) // 4 bytes should be enough - max length of UTF8 char
35+
.findIndex((byte) =>
36+
(byte & 0xC0) != 0x80 // 0x80 === 0b10... => continuation byte
37+
);
38+
39+
// If there's no non-continuation byte, it's definitely not UTF-8
40+
if (lastUtf8IndexBefore1024 === -1) return false;
41+
const cleanEndOfUtf8Data = 1024 + lastUtf8IndexBefore1024;
42+
43+
dataToCheck = buffer.slice(0, cleanEndOfUtf8Data);
44+
} else {
45+
dataToCheck = buffer;
46+
}
47+
2548
try {
26-
// Just check the first 1kb, in case it's a huge file
27-
const dataToCheck = buffer.slice(0, 1024);
49+
// Fully decode our maybe-UTF8 chunk, and see if it's valid throughout:
2850
strictDecoder.decode(dataToCheck);
2951
return true; // Decoded OK, probably safe
3052
} catch (e) {

test/unit/util/buffer.spec.ts

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
2+
import { isProbablyUtf8 } from "../../../src/util/buffer";
3+
import { expect } from "../../test-setup";
4+
5+
describe("Buffer utils", () => {
6+
describe("isProbablyUtf8", () => {
7+
it("returns true for empty string", () => {
8+
expect(
9+
isProbablyUtf8(Buffer.from(""))
10+
).to.equal(true);
11+
});
12+
13+
it("returns true for a short UTF-8 string", () => {
14+
expect(
15+
isProbablyUtf8(Buffer.from("hello world"))
16+
).to.equal(true);
17+
});
18+
19+
it("returns false for a short binary string", () => {
20+
expect(
21+
isProbablyUtf8(Buffer.from([
22+
// The header for a PNG
23+
0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A
24+
]))
25+
).to.equal(false);
26+
});
27+
28+
it("returns true for >1KB ASCII string", () => {
29+
expect(
30+
isProbablyUtf8(Buffer.alloc(4096).fill(
31+
'hello'
32+
))
33+
).to.equal(true);
34+
});
35+
36+
it("returns true for >1KB multibyte UTF-8 string", () => {
37+
expect(
38+
isProbablyUtf8(Buffer.alloc(4096).fill(
39+
'你好' // Hello in chinese (2x 3-byte chars)
40+
// N.b. 1024 is not divisible by 3, so this
41+
// tests char-split detection
42+
))
43+
).to.equal(true);
44+
});
45+
46+
it("returns true for an exactly 1026 byte multibyte UTF-8 string", () => {
47+
expect(
48+
isProbablyUtf8(Buffer.alloc(1026).fill(
49+
'你好' // Hello in chinese (2x 3-byte chars)
50+
// 1026 is divisible by 3, but this means we run out of
51+
// buffer looking for the next UTF-8 char from 1024+
52+
))
53+
).to.equal(true);
54+
});
55+
56+
it("returns false for >1KB binary string", () => {
57+
expect(
58+
isProbablyUtf8(Buffer.alloc(4096).fill(
59+
Buffer.from([
60+
// The header for a PNG
61+
0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A
62+
])
63+
))
64+
).to.equal(false);
65+
});
66+
67+
it("returns false for >1KB binary string of continuation bytes", () => {
68+
expect(
69+
isProbablyUtf8(Buffer.from(
70+
Buffer.alloc(4096).fill(Buffer.from([
71+
0xba // All continuation bytes
72+
]))
73+
))
74+
).to.equal(false);
75+
});
76+
});
77+
})

0 commit comments

Comments
 (0)