Skip to content

Commit 00ad5e2

Browse files
authored
Optimize binary encoding by directly emitting the null byte (emscripten-core#25610)
Optimize binary encoding by directly emitting the null byte inside the generated file. The null byte 00h is a valid UTF-8 character: https://datatracker.ietf.org/doc/html/rfc3629 Given that we still have the opt-out setting -sSINGLE_FILE_BINARY_ENCODE=0 for binary encoding, I propose we take the encoding to its maximum potential and see if we can get away with emitting the null byte as-is. The benefits of this are two-fold: a) assuming a uniform distribution of encoded bytes, not emitting nulls takes +0.39% more space (or, with nulls, the output is -0.26% smaller). b) by not offsetting the bytes, any strings in the emitted binary data will be directly human-readable, e.g.: <img width="2464" height="1451" alt="image" src="https://github.com/user-attachments/assets/e85edc36-da52-4274-8a43-092405a45850" /> So C strings will be directly parseable/searchable in the output. That is appealing. I do not currently know of any dealbreaking reasons to avoid nulls, other than the generic FUD that "editors/toolchains might be buggy when handling the null." But those would be bugs in the editors, and we do have the `-sSINGLE_FILE_BINARY_ENCODE=0` fallback to avoid this. So emitting nulls will allow us to surface whether there are insurmountable issues with null bytes in the output. We can always revert to the previous form if a difficult blocker arises (and, as a plus, we will then have learned about that blocker, concretely telling us why that approach is not feasible).
1 parent b5bf368 commit 00ad5e2

File tree

5 files changed

+43
-11
lines changed

5 files changed

+43
-11
lines changed

src/binaryDecode.js

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
// function, leading into incorrect results.
44
/** @noinline */
55
function binaryDecode(bin) {
6-
for (var i = 0, l = bin.length, o = new Uint8Array(l); i < l; ++i) {
7-
o[i] = bin.charCodeAt(i) - 1;
6+
for (var i = 0, l = bin.length, o = new Uint8Array(l), c; i < l; ++i) {
7+
c = bin.charCodeAt(i);
8+
o[i] = ~c >> 8 & c; // Recover the null byte in a manner that is compatible with https://crbug.com/453961758
89
}
910
return o;
1011
}

test/codesize/test_codesize_hello_single_file.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
2-
"a.out.js": 5394,
3-
"a.out.js.gz": 2992,
2+
"a.out.js": 5404,
3+
"a.out.js.gz": 2989,
44
"sent": [
55
"a (fd_write)"
66
]
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
{
2-
"a.html": 15176,
3-
"a.html.gz": 9126
2+
"a.html": 15143,
3+
"a.html.gz": 9087
44
}
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
{
2-
"a.html": 10998,
2+
"a.html": 10999,
33
"a.html.gz": 5758
44
}

tools/link.py

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2602,6 +2602,39 @@ def minify_html(filename):
26022602
start_time = time.time()
26032603
shared.check_call(shared.get_npm_cmd('html-minifier-terser') + [filename, '-o', filename] + opts, env=shared.env_with_node_in_path())
26042604

2605+
# HTML minifier will turn all null bytes into an escaped two-byte sequence "\0". Turn those back to single byte sequences.
2606+
def unescape_nulls(filename):
  r"""Rewrite `filename` in place, replacing escaped nulls with raw null bytes.

  The file is read as UTF-8 text and scanned once, left to right, tracking
  whether the previous character was an unpaired backslash:

  - `\0` and `\x00` are each replaced by a single U+0000 character.
  - `\\` (an escaped backslash) is copied through unchanged, so a following
    `0` is not mistaken for part of a null escape.
  - Every other escape sequence is copied through unchanged.
  - A dangling backslash at the very end of the file is preserved.

  The result is written back to the same path as UTF-8 bytes.
  """
  with open(filename, encoding="utf-8") as f:
    data = f.read()

  out = []
  in_escape = False  # True when the previous char was an unpaired backslash
  i = 0
  n = len(data)
  while i < n:
    ch = data[i]
    i += 1
    if ch == '\\':
      if in_escape:
        # Second backslash of an escaped backslash pair: emit both verbatim.
        out.append('\\\\')
      in_escape = not in_escape
    elif in_escape:
      in_escape = False
      if ch == '0':
        out.append('\x00')  # Convert '\\0' (5Ch 30h) into '\0' (00h)
      elif ch == 'x' and data.startswith('00', i):
        # startswith() with an offset is bounds-safe, so input that ends in
        # a truncated "\x" or "\x0" falls through to the verbatim branch
        # instead of raising IndexError.
        out.append('\x00')  # Convert '\\x00' into '\0' (00h)
        i += 2
      else:
        # Any other escape sequence is kept exactly as it was.
        out.append('\\')
        out.append(ch)
    else:
      out.append(ch)

  if in_escape:
    # The input ended on an unpaired backslash; keep it rather than silently
    # dropping the final character of the file.
    out.append('\\')

  with open(filename, "wb") as f:
    f.write(''.join(out).encode("utf-8"))
2635+
2636+
unescape_nulls(filename)
2637+
26052638
elapsed_time = time.time() - start_time
26062639
size_after = os.path.getsize(filename)
26072640
delta = size_after - size_before
@@ -2953,19 +2986,17 @@ def move_file(src, dst):
29532986

29542987
def binary_encode(filename):
29552988
"""This function encodes the given binary byte array to a UTF-8 string, by
2956-
first adding +1 to all the bytes [0, 255] to form values [1, 256], and then
2957-
encoding each of those values as UTF-8, except for specific byte values that
2989+
encoding each byte values as UTF-8, except for specific byte values that
29582990
are escaped as two bytes. This kind of encoding results in a string that will
29592991
compress well by both gzip and brotli, unlike base64 encoding binary data
2960-
would do, and avoids emitting the null byte inside a string.
2992+
would do.
29612993
"""
29622994

29632995
data = utils.read_binary(filename)
29642996

29652997
out = bytearray(len(data) * 2) # Size output buffer conservatively
29662998
i = 0
29672999
for d in data:
2968-
d += 1 # Offset all bytes up by +1 to make zero (a very common value) be encoded with only one byte as 0x01. This is possible since we can encode 255 as 0x100 in UTF-8.
29693000
if d == ord('"'):
29703001
# Escape double quote " character with a backspace since we are writing the binary string inside double quotes.
29713002
# Also closure optimizer will turn the string into being delimited with double quotes, even if it were single quotes to start with. (" -> 2 bytes)

0 commit comments

Comments
 (0)