Optimize binary encoding by directly emitting the null byte (emscripten-core#25610)

juj · web-flow · commit 00ad5e2c80a0 · 2025-10-22T00:25:08.000+03:00
Optimize binary encoding by directly emitting the null byte inside the generated file. The null byte 00h is a valid UTF-8 character: https://datatracker.ietf.org/doc/html/rfc3629 Given that we still have the `-sSINGLE_FILE_BINARY_ENCODE=0` setting to opt out of binary encoding, I propose we try to take the encoding to its maximum potential, and see if we can get away with emitting the null byte as-is. The benefits of this are two-fold: a) assuming a uniform distribution of encoded bytes, not emitting nulls takes +0.39% more space (or, put the other way, emitting nulls makes the output -0.26% smaller). b) by not offsetting the bytes, any strings in the emitted binary data will be directly human-readable, e.g.: <img width="2464" height="1451" alt="image" src="https://github.com/user-attachments/assets/e85edc36-da52-4274-8a43-092405a45850" /> So C strings will be directly parseable/searchable in the output. That is appealing. I do not currently know of any dealbreaking reasons to avoid nulls, other than the generic FUD that "editors/toolchains might be buggy in handling the null." But those are bugs in the editors, and we do have the `-sSINGLE_FILE_BINARY_ENCODE=0` fallback to avoid this. So emitting nulls will allow us to surface whether there are insurmountable issues with null bytes in the output. We can always revert to the previous form if a difficult blocker arises (and as a plus, we will then have learned about that blocker, concretely telling us why this approach is not feasible).
diff --git a/src/binaryDecode.js b/src/binaryDecode.js
@@ -3,8 +3,9 @@
 // function, leading into incorrect results.
 /** @noinline */
 function binaryDecode(bin) {
-  for (var i = 0, l = bin.length, o = new Uint8Array(l); i < l; ++i) {
-    o[i] = bin.charCodeAt(i) - 1;
+  for (var i = 0, l = bin.length, o = new Uint8Array(l), c; i < l; ++i) {
+    c = bin.charCodeAt(i);
+    o[i] = ~c >> 8 & c; // Recover the null byte in a manner that is compatible with https://crbug.com/453961758
   }
   return o;
 }
diff --git a/test/codesize/test_codesize_hello_single_file.json b/test/codesize/test_codesize_hello_single_file.json
@@ -1,6 +1,6 @@
 {
-  "a.out.js": 5394,
-  "a.out.js.gz": 2992,
+  "a.out.js": 5404,
+  "a.out.js.gz": 2989,
   "sent": [
     "a (fd_write)"
   ]
diff --git a/test/codesize/test_minimal_runtime_code_size_hello_webgl2_wasm_singlefile.json b/test/codesize/test_minimal_runtime_code_size_hello_webgl2_wasm_singlefile.json
@@ -1,4 +1,4 @@
 {
-  "a.html": 15176,
-  "a.html.gz": 9126
+  "a.html": 15143,
+  "a.html.gz": 9087
 }
diff --git a/test/codesize/test_minimal_runtime_code_size_random_printf_wasm.json b/test/codesize/test_minimal_runtime_code_size_random_printf_wasm.json
@@ -1,4 +1,4 @@
 {
-  "a.html": 10998,
+  "a.html": 10999,
   "a.html.gz": 5758
 }
diff --git a/tools/link.py b/tools/link.py
@@ -2602,6 +2602,39 @@ def minify_html(filename):
   start_time = time.time()
   shared.check_call(shared.get_npm_cmd('html-minifier-terser') + [filename, '-o', filename] + opts, env=shared.env_with_node_in_path())
 
+  # HTML minifier will turn all null bytes into an escaped two-byte sequence "\0". Turn those back to single byte sequences.
+  def unescape_nulls(filename):
+    # Rewrites `filename` in place. The file is read as UTF-8 text and scanned
+    # left to right, treating a backslash plus the character after it as one
+    # escape sequence:
+    #  - backslash + '0' and backslash + 'x00' each become a single raw null character,
+    #  - an escaped backslash (two backslashes) is copied through as-is, so its second
+    #    backslash is never mistaken for the start of a new escape,
+    #  - any other escape sequence is copied through unchanged.
+    # The result is written back to the same path, encoded as UTF-8.
+    with open(filename, encoding="utf-8") as f:
+      data = f.read()
+
+    out = []
+    in_escape = False # True when the previous character was an unpaired backslash.
+    i = 0
+    while i < len(data):
+      ch = data[i]
+      i += 1
+      if ch == '\\':
+        if in_escape:
+          out.append('\\\\')
+        in_escape = not in_escape
+      elif in_escape:
+        in_escape = False
+        if ch == '0':
+          out.append('\x00') # Convert '\\0' (5Ch 30h) into '\0' (00h)
+        elif ch == 'x' and data.startswith('00', i):
+          # startswith() with an offset is bounds-safe, unlike indexing data[i] and
+          # data[i + 1], which would raise IndexError if the input ended right after '\\x'.
+          out.append('\x00') # Oddly html-minifier generates both "\\0" and "\\x00", so handle that too.
+          i += 2
+        else:
+          out.append('\\')
+          out.append(ch)
+      else:
+        out.append(ch)
+
+    # A lone backslash at the very end of the input has no following character;
+    # emit it rather than silently dropping it.
+    if in_escape:
+      out.append('\\')
+
+    with open(filename, "wb") as f:
+      f.write(''.join(out).encode("utf-8"))
+
+  unescape_nulls(filename)
+
   elapsed_time = time.time() - start_time
   size_after = os.path.getsize(filename)
   delta = size_after - size_before
@@ -2953,19 +2986,17 @@ def move_file(src, dst):
 
 def binary_encode(filename):
   """This function encodes the given binary byte array to a UTF-8 string, by
-  first adding +1 to all the bytes [0, 255] to form values [1, 256], and then
-  encoding each of those values as UTF-8, except for specific byte values that
+  encoding each byte values as UTF-8, except for specific byte values that
   are escaped as two bytes. This kind of encoding results in a string that will
   compress well by both gzip and brotli, unlike base64 encoding binary data
-  would do, and avoids emitting the null byte inside a string.
+  would do.
   """
 
   data = utils.read_binary(filename)
 
   out = bytearray(len(data) * 2) # Size output buffer conservatively
   i = 0
   for d in data:
-    d += 1 # Offset all bytes up by +1 to make zero (a very common value) be encoded with only one byte as 0x01. This is possible since we can encode 255 as 0x100 in UTF-8.
     if d == ord('"'):
       # Escape double quote " character with a backspace since we are writing the binary string inside double quotes.
       # Also closure optimizer will turn the string into being delimited with double quotes, even if it were single quotes to start with. (" -> 2 bytes)

Original file line number	Diff line number	Diff line change
`@@ -3,8 +3,9 @@`
`3`	`3`	`// function, leading into incorrect results.`
`4`	`4`	`/** @noinline */`
`5`	`5`	`function binaryDecode(bin) {`
`6`		`- for (var i = 0, l = bin.length, o = new Uint8Array(l); i < l; ++i) {`
`7`		`- o[i] = bin.charCodeAt(i) - 1;`
	`6`	`+ for (var i = 0, l = bin.length, o = new Uint8Array(l), c; i < l; ++i) {`
	`7`	`+ c = bin.charCodeAt(i);`
	`8`	`+ o[i] = ~c >> 8 & c; // Recover the null byte in a manner that is compatible with https://crbug.com/453961758`
`8`	`9`	`}`
`9`	`10`	`return o;`
`10`	`11`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`		`- "a.out.js": 5394,`
`3`		`- "a.out.js.gz": 2992,`
	`2`	`+ "a.out.js": 5404,`
	`3`	`+ "a.out.js.gz": 2989,`
`4`	`4`	`"sent": [`
`5`	`5`	`"a (fd_write)"`
`6`	`6`	`]`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`	`1`	`{`
`2`		`- "a.html": 15176,`
`3`		`- "a.html.gz": 9126`
	`2`	`+ "a.html": 15143,`
	`3`	`+ "a.html.gz": 9087`
`4`	`4`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`	`1`	`{`
`2`		`- "a.html": 10998,`
	`2`	`+ "a.html": 10999,`
`3`	`3`	`"a.html.gz": 5758`
`4`	`4`	`}`