Use fromCodePoint / codePointAt APIs (#24356)

RReverser · web-flow · commit 15890bbee00d · 2025-05-19T15:10:53.000+02:00
These APIs were added to JS strings long ago now and are supported in
all engines specified in our `OLDEST_*` browser versions.

They save some code size by avoiding manual UTF-16 <-> UTF-32
conversion, although we still need to worry about correct iteration.
diff --git a/src/lib/libstrings.js b/src/lib/libstrings.js
@@ -168,18 +168,10 @@ addToLibrary({
     var startIdx = outIdx;
     var endIdx = outIdx + maxBytesToWrite - 1; // -1 for string null terminator.
     for (var i = 0; i < str.length; ++i) {
-      // Gotcha: charCodeAt returns a 16-bit word that is a UTF-16 encoded code
-      // unit, not a Unicode code point of the character! So decode
-      // UTF16->UTF32->UTF8.
-      // See http://unicode.org/faq/utf_bom.html#utf16-3
       // For UTF8 byte structure, see http://en.wikipedia.org/wiki/UTF-8#Description
       // and https://www.ietf.org/rfc/rfc2279.txt
       // and https://tools.ietf.org/html/rfc3629
-      var u = str.charCodeAt(i); // possibly a lead surrogate
-      if (u >= 0xD800 && u <= 0xDFFF) {
-        var u1 = str.charCodeAt(++i);
-        u = 0x10000 + ((u & 0x3FF) << 10) | (u1 & 0x3FF);
-      }
+      var u = str.codePointAt(i);
       if (u <= 0x7F) {
         if (outIdx >= endIdx) break;
         heap[outIdx++] = u;
@@ -201,6 +193,9 @@ addToLibrary({
         heap[outIdx++] = 0x80 | ((u >> 12) & 63);
         heap[outIdx++] = 0x80 | ((u >> 6) & 63);
         heap[outIdx++] = 0x80 | (u & 63);
+        // Gotcha: if codePoint is over 0xFFFF, it is represented as a surrogate pair in UTF-16.
+        // We need to manually skip over the second code unit for correct iteration.
+        i++;
       }
     }
     // Null-terminate the pointer to the buffer.
@@ -407,23 +402,13 @@ addToLibrary({
 #if ASSERTIONS
     assert(ptr % 4 == 0, 'Pointer passed to UTF32ToString must be aligned to four bytes!');
 #endif
-    var i = 0;
-
     var str = '';
     // If maxBytesToRead is not passed explicitly, it will be undefined, and this
     // will always evaluate to true. This saves on code size.
-    while (!(i >= maxBytesToRead / 4)) {
+    for (var i = 0; !(i >= maxBytesToRead / 4); i++) {
       var utf32 = {{{ makeGetValue('ptr', 'i*4', 'i32') }}};
-      if (utf32 == 0) break;
-      ++i;
-      // Gotcha: fromCharCode constructs a character from a UTF-16 encoded code (pair), not from a Unicode code point! So encode the code point to UTF-16 for constructing.
-      // See http://unicode.org/faq/utf_bom.html#utf16-3
-      if (utf32 >= 0x10000) {
-        var ch = utf32 - 0x10000;
-        str += String.fromCharCode(0xD800 | (ch >> 10), 0xDC00 | (ch & 0x3FF));
-      } else {
-        str += String.fromCharCode(utf32);
-      }
+      if (!utf32) break;
+      str += String.fromCodePoint(utf32);
     }
     return str;
   },
@@ -459,14 +444,13 @@ addToLibrary({
     var startPtr = outPtr;
     var endPtr = startPtr + maxBytesToWrite - 4;
     for (var i = 0; i < str.length; ++i) {
-      // Gotcha: charCodeAt returns a 16-bit word that is a UTF-16 encoded code unit, not a Unicode code point of the character! We must decode the string to UTF-32 to the heap.
-      // See http://unicode.org/faq/utf_bom.html#utf16-3
-      var codeUnit = str.charCodeAt(i); // possibly a lead surrogate
-      if (codeUnit >= 0xD800 && codeUnit <= 0xDFFF) {
-        var trailSurrogate = str.charCodeAt(++i);
-        codeUnit = 0x10000 + ((codeUnit & 0x3FF) << 10) | (trailSurrogate & 0x3FF);
+      var codePoint = str.codePointAt(i);
+      // Gotcha: if codePoint is over 0xFFFF, it is represented as a surrogate pair in UTF-16.
+      // We need to manually skip over the second code unit for correct iteration.
+      if (codePoint > 0xFFFF) {
+        i++;
       }
-      {{{ makeSetValue('outPtr', 0, 'codeUnit', 'i32') }}};
+      {{{ makeSetValue('outPtr', 0, 'codePoint', 'i32') }}};
       outPtr += 4;
       if (outPtr + 4 > endPtr) break;
     }
@@ -480,10 +464,12 @@ addToLibrary({
   $lengthBytesUTF32: (str) => {
     var len = 0;
     for (var i = 0; i < str.length; ++i) {
-      // Gotcha: charCodeAt returns a 16-bit word that is a UTF-16 encoded code unit, not a Unicode code point of the character! We must decode the string to UTF-32 to the heap.
-      // See http://unicode.org/faq/utf_bom.html#utf16-3
-      var codeUnit = str.charCodeAt(i);
-      if (codeUnit >= 0xD800 && codeUnit <= 0xDFFF) ++i; // possibly a lead surrogate, so skip over the tail surrogate.
+      var codePoint = str.codePointAt(i);
+      // Gotcha: if codePoint is over 0xFFFF, it is represented as a surrogate pair in UTF-16.
+      // We need to manually skip over the second code unit for correct iteration.
+      if (codePoint > 0xFFFF) {
+        i++;
+      }
       len += 4;
     }
 
diff --git a/test/code_size/embind_hello_wasm.json b/test/code_size/embind_hello_wasm.json
@@ -1,10 +1,10 @@
 {
   "a.html": 552,
   "a.html.gz": 380,
-  "a.js": 8831,
-  "a.js.gz": 3900,
+  "a.js": 8657,
+  "a.js.gz": 3835,
   "a.wasm": 7344,
   "a.wasm.gz": 3368,
-  "total": 16727,
-  "total_gz": 7648
+  "total": 16553,
+  "total_gz": 7583
 }
diff --git a/test/code_size/embind_val_wasm.json b/test/code_size/embind_val_wasm.json
@@ -1,10 +1,10 @@
 {
   "a.html": 552,
   "a.html.gz": 380,
-  "a.js": 6688,
-  "a.js.gz": 2893,
+  "a.js": 6514,
+  "a.js.gz": 2827,
   "a.wasm": 9137,
   "a.wasm.gz": 4700,
-  "total": 16377,
-  "total_gz": 7973
+  "total": 16203,
+  "total_gz": 7907
 }
diff --git a/test/other/codesize/test_codesize_cxx_ctors1.gzsize b/test/other/codesize/test_codesize_cxx_ctors1.gzsize
@@ -1 +1 @@
-8202
+8184
diff --git a/test/other/codesize/test_codesize_cxx_ctors1.jssize b/test/other/codesize/test_codesize_cxx_ctors1.jssize
@@ -1 +1 @@
-19869
+19848
diff --git a/test/other/codesize/test_codesize_cxx_ctors2.gzsize b/test/other/codesize/test_codesize_cxx_ctors2.gzsize
@@ -1 +1 @@
-8191
+8173
diff --git a/test/other/codesize/test_codesize_cxx_ctors2.jssize b/test/other/codesize/test_codesize_cxx_ctors2.jssize
@@ -1 +1 @@
-19847
+19826
diff --git a/test/other/codesize/test_codesize_cxx_except.gzsize b/test/other/codesize/test_codesize_cxx_except.gzsize
@@ -1 +1 @@
-9198
+9181
diff --git a/test/other/codesize/test_codesize_cxx_except.jssize b/test/other/codesize/test_codesize_cxx_except.jssize
@@ -1 +1 @@
-23606
+23585
diff --git a/test/other/codesize/test_codesize_cxx_except_wasm.gzsize b/test/other/codesize/test_codesize_cxx_except_wasm.gzsize
@@ -1 +1 @@
-8151
+8133
diff --git a/test/other/codesize/test_codesize_cxx_except_wasm.jssize b/test/other/codesize/test_codesize_cxx_except_wasm.jssize
@@ -1 +1 @@
-19762
+19741
diff --git a/test/other/codesize/test_codesize_cxx_except_wasm_legacy.gzsize b/test/other/codesize/test_codesize_cxx_except_wasm_legacy.gzsize
@@ -1 +1 @@
-8151
+8133
diff --git a/test/other/codesize/test_codesize_cxx_except_wasm_legacy.jssize b/test/other/codesize/test_codesize_cxx_except_wasm_legacy.jssize
@@ -1 +1 @@
-19762
+19741
diff --git a/test/other/codesize/test_codesize_cxx_lto.gzsize b/test/other/codesize/test_codesize_cxx_lto.gzsize
@@ -1 +1 @@
-8057
+8036
diff --git a/test/other/codesize/test_codesize_cxx_lto.jssize b/test/other/codesize/test_codesize_cxx_lto.jssize
@@ -1 +1 @@
-19581
+19560
diff --git a/test/other/codesize/test_codesize_cxx_mangle.gzsize b/test/other/codesize/test_codesize_cxx_mangle.gzsize
@@ -1 +1 @@
-9241
+9225
diff --git a/test/other/codesize/test_codesize_cxx_mangle.jssize b/test/other/codesize/test_codesize_cxx_mangle.jssize
@@ -1 +1 @@
-23721
+23700
diff --git a/test/other/codesize/test_codesize_cxx_noexcept.gzsize b/test/other/codesize/test_codesize_cxx_noexcept.gzsize
@@ -1 +1 @@
-8202
+8184
diff --git a/test/other/codesize/test_codesize_cxx_noexcept.jssize b/test/other/codesize/test_codesize_cxx_noexcept.jssize
@@ -1 +1 @@
-19869
+19848
diff --git a/test/other/codesize/test_codesize_cxx_wasmfs.gzsize b/test/other/codesize/test_codesize_cxx_wasmfs.gzsize
@@ -1 +1 @@
-3382
+3374
diff --git a/test/other/codesize/test_codesize_cxx_wasmfs.jssize b/test/other/codesize/test_codesize_cxx_wasmfs.jssize
@@ -1 +1 @@
-7269
+7248
diff --git a/test/other/codesize/test_codesize_files_js_fs.gzsize b/test/other/codesize/test_codesize_files_js_fs.gzsize
@@ -1 +1 @@
-7495
+7485
diff --git a/test/other/codesize/test_codesize_files_js_fs.jssize b/test/other/codesize/test_codesize_files_js_fs.jssize
@@ -1 +1 @@
-18450
+18429
diff --git a/test/other/codesize/test_codesize_files_wasmfs.gzsize b/test/other/codesize/test_codesize_files_wasmfs.gzsize
@@ -1 +1 @@
-2635
+2621
diff --git a/test/other/codesize/test_codesize_files_wasmfs.jssize b/test/other/codesize/test_codesize_files_wasmfs.jssize
@@ -1 +1 @@
-5652
+5629
diff --git a/test/other/codesize/test_codesize_hello_dylink.gzsize b/test/other/codesize/test_codesize_hello_dylink.gzsize
@@ -1 +1 @@
-11727
+11714
diff --git a/test/other/codesize/test_codesize_hello_dylink.jssize b/test/other/codesize/test_codesize_hello_dylink.jssize
@@ -1 +1 @@
-27722
+27701
diff --git a/test/other/codesize/test_codesize_mem_O3.gzsize b/test/other/codesize/test_codesize_mem_O3.gzsize
@@ -1 +1 @@
-2166
+2147
diff --git a/test/other/codesize/test_codesize_mem_O3.jssize b/test/other/codesize/test_codesize_mem_O3.jssize
@@ -1 +1 @@
-4518
+4495
diff --git a/test/other/codesize/test_codesize_mem_O3_grow.gzsize b/test/other/codesize/test_codesize_mem_O3_grow.gzsize
@@ -1 +1 @@
-2317
+2298
diff --git a/test/other/codesize/test_codesize_mem_O3_grow.jssize b/test/other/codesize/test_codesize_mem_O3_grow.jssize
@@ -1 +1 @@
-4803
+4780