Skip to content

Commit 15890bb

Browse files
authored
Use fromCodePoint / codePointAt APIs (#24356)
These APIs were added to JS strings long ago and are supported by every engine covered by our `OLDEST_*` browser versions. They save some code size by avoiding manual UTF-16 <-> UTF-32 conversion, although we still need to take care to iterate correctly: a code point above 0xFFFF occupies two UTF-16 code units, so the loop index must skip the second one.
1 parent d0514d0 commit 15890bb

31 files changed

+55
-69
lines changed

src/lib/libstrings.js

Lines changed: 19 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -168,18 +168,10 @@ addToLibrary({
168168
var startIdx = outIdx;
169169
var endIdx = outIdx + maxBytesToWrite - 1; // -1 for string null terminator.
170170
for (var i = 0; i < str.length; ++i) {
171-
// Gotcha: charCodeAt returns a 16-bit word that is a UTF-16 encoded code
172-
// unit, not a Unicode code point of the character! So decode
173-
// UTF16->UTF32->UTF8.
174-
// See http://unicode.org/faq/utf_bom.html#utf16-3
175171
// For UTF8 byte structure, see http://en.wikipedia.org/wiki/UTF-8#Description
176172
// and https://www.ietf.org/rfc/rfc2279.txt
177173
// and https://tools.ietf.org/html/rfc3629
178-
var u = str.charCodeAt(i); // possibly a lead surrogate
179-
if (u >= 0xD800 && u <= 0xDFFF) {
180-
var u1 = str.charCodeAt(++i);
181-
u = 0x10000 + ((u & 0x3FF) << 10) | (u1 & 0x3FF);
182-
}
174+
var u = str.codePointAt(i);
183175
if (u <= 0x7F) {
184176
if (outIdx >= endIdx) break;
185177
heap[outIdx++] = u;
@@ -201,6 +193,9 @@ addToLibrary({
201193
heap[outIdx++] = 0x80 | ((u >> 12) & 63);
202194
heap[outIdx++] = 0x80 | ((u >> 6) & 63);
203195
heap[outIdx++] = 0x80 | (u & 63);
196+
// Gotcha: if codePoint is over 0xFFFF, it is represented as a surrogate pair in UTF-16.
197+
// We need to manually skip over the second code unit for correct iteration.
198+
i++;
204199
}
205200
}
206201
// Null-terminate the pointer to the buffer.
@@ -407,23 +402,13 @@ addToLibrary({
407402
#if ASSERTIONS
408403
assert(ptr % 4 == 0, 'Pointer passed to UTF32ToString must be aligned to four bytes!');
409404
#endif
410-
var i = 0;
411-
412405
var str = '';
413406
// If maxBytesToRead is not passed explicitly, it will be undefined, and this
414407
// will always evaluate to true. This saves on code size.
415-
while (!(i >= maxBytesToRead / 4)) {
408+
for (var i = 0; !(i >= maxBytesToRead / 4); i++) {
416409
var utf32 = {{{ makeGetValue('ptr', 'i*4', 'i32') }}};
417-
if (utf32 == 0) break;
418-
++i;
419-
// Gotcha: fromCharCode constructs a character from a UTF-16 encoded code (pair), not from a Unicode code point! So encode the code point to UTF-16 for constructing.
420-
// See http://unicode.org/faq/utf_bom.html#utf16-3
421-
if (utf32 >= 0x10000) {
422-
var ch = utf32 - 0x10000;
423-
str += String.fromCharCode(0xD800 | (ch >> 10), 0xDC00 | (ch & 0x3FF));
424-
} else {
425-
str += String.fromCharCode(utf32);
426-
}
410+
if (!utf32) break;
411+
str += String.fromCodePoint(utf32);
427412
}
428413
return str;
429414
},
@@ -459,14 +444,13 @@ addToLibrary({
459444
var startPtr = outPtr;
460445
var endPtr = startPtr + maxBytesToWrite - 4;
461446
for (var i = 0; i < str.length; ++i) {
462-
// Gotcha: charCodeAt returns a 16-bit word that is a UTF-16 encoded code unit, not a Unicode code point of the character! We must decode the string to UTF-32 to the heap.
463-
// See http://unicode.org/faq/utf_bom.html#utf16-3
464-
var codeUnit = str.charCodeAt(i); // possibly a lead surrogate
465-
if (codeUnit >= 0xD800 && codeUnit <= 0xDFFF) {
466-
var trailSurrogate = str.charCodeAt(++i);
467-
codeUnit = 0x10000 + ((codeUnit & 0x3FF) << 10) | (trailSurrogate & 0x3FF);
447+
var codePoint = str.codePointAt(i);
448+
// Gotcha: if codePoint is over 0xFFFF, it is represented as a surrogate pair in UTF-16.
449+
// We need to manually skip over the second code unit for correct iteration.
450+
if (codePoint > 0xFFFF) {
451+
i++;
468452
}
469-
{{{ makeSetValue('outPtr', 0, 'codeUnit', 'i32') }}};
453+
{{{ makeSetValue('outPtr', 0, 'codePoint', 'i32') }}};
470454
outPtr += 4;
471455
if (outPtr + 4 > endPtr) break;
472456
}
@@ -480,10 +464,12 @@ addToLibrary({
480464
$lengthBytesUTF32: (str) => {
481465
var len = 0;
482466
for (var i = 0; i < str.length; ++i) {
483-
// Gotcha: charCodeAt returns a 16-bit word that is a UTF-16 encoded code unit, not a Unicode code point of the character! We must decode the string to UTF-32 to the heap.
484-
// See http://unicode.org/faq/utf_bom.html#utf16-3
485-
var codeUnit = str.charCodeAt(i);
486-
if (codeUnit >= 0xD800 && codeUnit <= 0xDFFF) ++i; // possibly a lead surrogate, so skip over the tail surrogate.
467+
var codePoint = str.codePointAt(i);
468+
// Gotcha: if codePoint is over 0xFFFF, it is represented as a surrogate pair in UTF-16.
469+
// We need to manually skip over the second code unit for correct iteration.
470+
if (codePoint > 0xFFFF) {
471+
i++;
472+
}
487473
len += 4;
488474
}
489475

test/code_size/embind_hello_wasm.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
{
22
"a.html": 552,
33
"a.html.gz": 380,
4-
"a.js": 8831,
5-
"a.js.gz": 3900,
4+
"a.js": 8657,
5+
"a.js.gz": 3835,
66
"a.wasm": 7344,
77
"a.wasm.gz": 3368,
8-
"total": 16727,
9-
"total_gz": 7648
8+
"total": 16553,
9+
"total_gz": 7583
1010
}

test/code_size/embind_val_wasm.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
{
22
"a.html": 552,
33
"a.html.gz": 380,
4-
"a.js": 6688,
5-
"a.js.gz": 2893,
4+
"a.js": 6514,
5+
"a.js.gz": 2827,
66
"a.wasm": 9137,
77
"a.wasm.gz": 4700,
8-
"total": 16377,
9-
"total_gz": 7973
8+
"total": 16203,
9+
"total_gz": 7907
1010
}
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
8202
1+
8184
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
19869
1+
19848
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
8191
1+
8173
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
19847
1+
19826
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
9198
1+
9181
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
23606
1+
23585
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
8151
1+
8133

0 commit comments

Comments
 (0)