You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
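These code-point APIs (such as `codePointAt`) were added to JS strings
long ago and are supported in all engines specified in our `OLDEST_*`
browser versions.
They save some code size by avoiding manual UTF-16 <-> UTF-32
conversion, although we still need to handle iteration correctly: a code
point above 0xFFFF is represented as a surrogate pair in UTF-16, so the
loop index must skip over the second code unit.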
Copy file name to clipboard. Expand all lines: src/lib/libstrings.js
+19 -33. Lines changed: 19 additions & 33 deletions
Original file line number
Diff line number
Diff line change
@@ -168,18 +168,10 @@ addToLibrary({
168
168
var startIdx = outIdx;
169
169
var endIdx = outIdx + maxBytesToWrite - 1; // -1 for string null terminator.
170
170
for (var i = 0; i < str.length; ++i) {
171
-
// Gotcha: charCodeAt returns a 16-bit word that is a UTF-16 encoded code
172
-
// unit, not a Unicode code point of the character! So decode
173
-
// UTF16->UTF32->UTF8.
174
-
// See http://unicode.org/faq/utf_bom.html#utf16-3
175
171
// For UTF8 byte structure, see http://en.wikipedia.org/wiki/UTF-8#Description
176
172
// and https://www.ietf.org/rfc/rfc2279.txt
177
173
// and https://tools.ietf.org/html/rfc3629
178
-
var u = str.charCodeAt(i); // possibly a lead surrogate
179
-
if (u >= 0xD800 && u <= 0xDFFF) {
180
-
var u1 = str.charCodeAt(++i);
181
-
u = 0x10000 + ((u & 0x3FF) << 10) | (u1 & 0x3FF);
182
-
}
174
+
var u = str.codePointAt(i);
183
175
if (u <= 0x7F) {
184
176
if (outIdx >= endIdx) break;
185
177
heap[outIdx++] = u;
@@ -201,6 +193,9 @@ addToLibrary({
201
193
heap[outIdx++] = 0x80 | ((u >> 12) & 63);
202
194
heap[outIdx++] = 0x80 | ((u >> 6) & 63);
203
195
heap[outIdx++] = 0x80 | (u & 63);
196
+
// Gotcha: if codePoint is over 0xFFFF, it is represented as a surrogate pair in UTF-16.
197
+
// We need to manually skip over the second code unit for correct iteration.
198
+
i++;
204
199
}
205
200
}
206
201
// Null-terminate the pointer to the buffer.
@@ -407,23 +402,13 @@ addToLibrary({
407
402
#if ASSERTIONS
408
403
assert(ptr%4==0,'Pointer passed to UTF32ToString must be aligned to four bytes!');
409
404
#endif
410
-
vari=0;
411
-
412
405
varstr='';
413
406
// If maxBytesToRead is not passed explicitly, it will be undefined, and this
414
407
// will always evaluate to true. This saves on code size.
415
-
while(!(i>=maxBytesToRead/4)){
408
+
for(vari=0;!(i>=maxBytesToRead/4);i++){
416
409
varutf32={{{makeGetValue('ptr','i*4','i32')}}};
417
-
if(utf32==0)break;
418
-
++i;
419
-
// Gotcha: fromCharCode constructs a character from a UTF-16 encoded code (pair), not from a Unicode code point! So encode the code point to UTF-16 for constructing.
420
-
// See http://unicode.org/faq/utf_bom.html#utf16-3
// Gotcha: charCodeAt returns a 16-bit word that is a UTF-16 encoded code unit, not a Unicode code point of the character! We must decode the string to UTF-32 to the heap.
463
-
// See http://unicode.org/faq/utf_bom.html#utf16-3
464
-
var codeUnit =str.charCodeAt(i);// possibly a lead surrogate
// Gotcha: if codePoint is over 0xFFFF, it is represented as a surrogate pair in UTF-16.
449
+
// We need to manually skip over the second code unit for correct iteration.
450
+
if(codePoint>0xFFFF){
451
+
i++;
468
452
}
469
-
{{{makeSetValue('outPtr',0,'codeUnit','i32')}}};
453
+
{{{makeSetValue('outPtr',0,'codePoint','i32')}}};
470
454
outPtr+=4;
471
455
if(outPtr+4>endPtr)break;
472
456
}
@@ -480,10 +464,12 @@ addToLibrary({
480
464
$lengthBytesUTF32: (str)=>{
481
465
varlen=0;
482
466
for(vari=0;i<str.length;++i){
483
-
// Gotcha: charCodeAt returns a 16-bit word that is a UTF-16 encoded code unit, not a Unicode code point of the character! We must decode the string to UTF-32 to the heap.
484
-
// See http://unicode.org/faq/utf_bom.html#utf16-3
485
-
varcodeUnit=str.charCodeAt(i);
486
-
if(codeUnit>=0xD800&&codeUnit<=0xDFFF)++i;// possibly a lead surrogate, so skip over the tail surrogate.
467
+
varcodePoint=str.codePointAt(i);
468
+
// Gotcha: if codePoint is over 0xFFFF, it is represented as a surrogate pair in UTF-16.
469
+
// We need to manually skip over the second code unit for correct iteration.
0 commit comments