Fix isProbablyUtf8 to handle multibyte chars over the 1024 byte line

pimterry · pimterry · commit 01dc1a6e2806 · 2024-04-19T15:04:15.000+02:00
Previously, multibyte chars crossing this line would make a valid UTF-8
string be considered as binary. That's particularly bad because if
you're modifying a string containing multibyte chars it might change
from utf-8 to binary when a multibyte char is pushed across that line,
resulting in a very weird edit experience.
diff --git a/src/util/buffer.ts b/src/util/buffer.ts
@@ -22,9 +22,31 @@ const binaryLaxDecoder = new TextDecoder('latin1', { fatal: false });
 // as UTF8) but for _editing_ binary data we need a lossless encoding, not utf8.
 // (Monaco isn't perfectly lossless editing binary anyway, but we can try).
 export function isProbablyUtf8(buffer: Buffer) {
+    let dataToCheck: Buffer;
+    if (buffer.byteLength > 1028) {
+        // We want to trim the data to ~1KB, to avoid checking huge bodies in full.
+        // Unfortunately, for UTF-8 this isn't safe: if a multibyte char crosses the
+        // line when we cut the string, our slice of valid UTF-8 will be invalid.
+        // To handle this, we find the first non-continuation byte after 1024, and
+        // decode everything up to that point.
+
+        const lastUtf8IndexBefore1024 = buffer
+            .slice(1024, 1028) // 4 bytes should be enough - max length of UTF8 char
+            .findIndex((byte) =>
+                (byte & 0xC0) != 0x80 // 0x80 === 0b10... => continuation byte
+            );
+
+        // If there's no non-continuation byte, it's definitely not UTF-8
+        if (lastUtf8IndexBefore1024 === -1) return false;
+        const cleanEndOfUtf8Data = 1024 + lastUtf8IndexBefore1024;
+
+        dataToCheck = buffer.slice(0, cleanEndOfUtf8Data);
+    } else {
+        dataToCheck = buffer;
+    }
+
     try {
-        // Just check the first 1kb, in case it's a huge file
-        const dataToCheck = buffer.slice(0, 1024);
+        // Fully decode our maybe-UTF8 chunk, and see if it's valid throughout:
         strictDecoder.decode(dataToCheck);
         return true; // Decoded OK, probably safe
     } catch (e) {
diff --git a/test/unit/util/buffer.spec.ts b/test/unit/util/buffer.spec.ts
@@ -0,0 +1,77 @@
+
+import { isProbablyUtf8 } from "../../../src/util/buffer";
+import { expect } from "../../test-setup";
+
+describe("Buffer utils", () => {
+    describe("isProbablyUtf8", () => {
+        it("returns true for empty string", () => {
+            expect(
+                isProbablyUtf8(Buffer.from(""))
+            ).to.equal(true);
+        });
+
+        it("returns true for a short UTF-8 string", () => {
+            expect(
+                isProbablyUtf8(Buffer.from("hello world"))
+            ).to.equal(true);
+        });
+
+        it("returns false for a short binary string", () => {
+            expect(
+                isProbablyUtf8(Buffer.from([
+                    // The header for a PNG
+                    0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A
+                ]))
+            ).to.equal(false);
+        });
+
+        it("returns true for >1KB ASCII string", () => {
+            expect(
+                isProbablyUtf8(Buffer.alloc(4096).fill(
+                    'hello'
+                ))
+            ).to.equal(true);
+        });
+
+        it("returns true for >1KB multibyte UTF-8 string", () => {
+            expect(
+                isProbablyUtf8(Buffer.alloc(4096).fill(
+                    '你好' // Hello in chinese (2x 3-byte chars)
+                    // N.b. 1024 is not divisible by 3, so this
+                    // tests char-split detection
+                ))
+            ).to.equal(true);
+        });
+
+        it("returns true for an exactly 1026 byte multibyte UTF-8 string", () => {
+            expect(
+                isProbablyUtf8(Buffer.alloc(1026).fill(
+                    '你好' // Hello in chinese (2x 3-byte chars)
+                    // 1026 is divisible by 3, but this means we run out of
+                    // buffer looking for the next UTF-8 char from 1024+
+                ))
+            ).to.equal(true);
+        });
+
+        it("returns false for >1KB binary string", () => {
+            expect(
+                isProbablyUtf8(Buffer.alloc(4096).fill(
+                    Buffer.from([
+                        // The header for a PNG
+                        0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A
+                    ])
+                ))
+            ).to.equal(false);
+        });
+
+        it("returns false for >1KB binary string of continuation bytes", () => {
+            expect(
+                isProbablyUtf8(Buffer.from(
+                    Buffer.alloc(4096).fill(Buffer.from([
+                        0xba // All continuation bytes
+                    ]))
+                ))
+            ).to.equal(false);
+        });
+    });
+})