Skip to content

Commit f7ea981

Browse files
committed
Runtime: rely on TextEncoder and TextDecoder
1 parent 824efea commit f7ea981

File tree

2 files changed

+40
-123
lines changed

2 files changed

+40
-123
lines changed

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
* Runtime: make Obj.dup work with floats and boxed numbers (#1871)
1717
* Runtime: delete BigStringReader, one should use UInt8ArrayReader instead
1818
* Runtime: less conversion during un-marshalling (#1889)
19+
* Runtime: use TextEncoder/TextDecoder for utf8-utf16 conversions
1920
* Runtime/wasm: implement BLAKE2b primitives for Wasm (#1873)
2021
* Runtime/wasm: support jsoo_env and keep track of backtrace status (#1881)
2122
* Runtime/wasm: support unmarshaling compressed data (#1898)

runtime/js/mlBytes.js

Lines changed: 39 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -79,115 +79,6 @@ function caml_sub_uint8_array_to_jsbytes(a, i, len) {
7979
return s;
8080
}
8181

82-
//Provides: caml_utf8_of_utf16
83-
function caml_utf8_of_utf16(s) {
84-
for (var b = "", t = b, c, d, i = 0, l = s.length; i < l; i++) {
85-
c = s.charCodeAt(i);
86-
if (c < 0x80) {
87-
for (var j = i + 1; j < l && (c = s.charCodeAt(j)) < 0x80; j++);
88-
if (j - i > 512) {
89-
t.slice(0, 1);
90-
b += t;
91-
t = "";
92-
b += s.slice(i, j);
93-
} else t += s.slice(i, j);
94-
if (j === l) break;
95-
i = j;
96-
}
97-
if (c < 0x800) {
98-
t += String.fromCharCode(0xc0 | (c >> 6));
99-
t += String.fromCharCode(0x80 | (c & 0x3f));
100-
} else if (c < 0xd800 || c > 0xdfff) {
101-
t += String.fromCharCode(
102-
0xe0 | (c >> 12),
103-
0x80 | ((c >> 6) & 0x3f),
104-
0x80 | (c & 0x3f),
105-
);
106-
} else if (
107-
c > 0xdbff ||
108-
i + 1 === l ||
109-
(d = s.charCodeAt(i + 1)) < 0xdc00 ||
110-
d > 0xdfff
111-
) {
112-
// Unmatched surrogate pair, replaced by \ufffd (replacement character)
113-
t += "\xef\xbf\xbd";
114-
} else {
115-
i++;
116-
c = (c << 10) + d - 0x35fdc00;
117-
t += String.fromCharCode(
118-
0xf0 | (c >> 18),
119-
0x80 | ((c >> 12) & 0x3f),
120-
0x80 | ((c >> 6) & 0x3f),
121-
0x80 | (c & 0x3f),
122-
);
123-
}
124-
if (t.length > 1024) {
125-
t.slice(0, 1);
126-
b += t;
127-
t = "";
128-
}
129-
}
130-
return b + t;
131-
}
132-
133-
//Provides: caml_utf16_of_utf8
134-
function caml_utf16_of_utf8(s) {
135-
for (var b = "", t = "", c, c1, c2, v, i = 0, l = s.length; i < l; i++) {
136-
c1 = s.charCodeAt(i);
137-
if (c1 < 0x80) {
138-
for (var j = i + 1; j < l && (c1 = s.charCodeAt(j)) < 0x80; j++);
139-
if (j - i > 512) {
140-
t.slice(0, 1);
141-
b += t;
142-
t = "";
143-
b += s.slice(i, j);
144-
} else t += s.slice(i, j);
145-
if (j === l) break;
146-
i = j;
147-
}
148-
v = 1;
149-
if (++i < l && ((c2 = s.charCodeAt(i)) & -64) === 128) {
150-
c = c2 + (c1 << 6);
151-
if (c1 < 0xe0) {
152-
v = c - 0x3080;
153-
if (v < 0x80) v = 1;
154-
} else {
155-
v = 2;
156-
if (++i < l && ((c2 = s.charCodeAt(i)) & -64) === 128) {
157-
c = c2 + (c << 6);
158-
if (c1 < 0xf0) {
159-
v = c - 0xe2080;
160-
if (v < 0x800 || (v > 0xd7ff && v < 0xe000)) v = 2;
161-
} else {
162-
v = 3;
163-
if (
164-
++i < l &&
165-
((c2 = s.charCodeAt(i)) & -64) === 128 &&
166-
c1 < 0xf5
167-
) {
168-
v = c2 - 0x3c82080 + (c << 6);
169-
if (v < 0x10000 || v > 0x10ffff) v = 3;
170-
}
171-
}
172-
}
173-
}
174-
}
175-
if (v < 4) {
176-
// Invalid sequence
177-
i -= v;
178-
t += "\ufffd";
179-
} else if (v > 0xffff)
180-
t += String.fromCharCode(0xd7c0 + (v >> 10), 0xdc00 + (v & 0x3ff));
181-
else t += String.fromCharCode(v);
182-
if (t.length > 1024) {
183-
t.slice(0, 1);
184-
b += t;
185-
t = "";
186-
}
187-
}
188-
return b + t;
189-
}
190-
19182
//Provides: jsoo_is_ascii
19283
function jsoo_is_ascii(s) {
19384
// The regular expression gets better at around this point for all browsers
@@ -384,17 +275,28 @@ function caml_bytes_set(s, i, c) {
384275
return caml_bytes_unsafe_set(s, i, c);
385276
}
386277

278+
// Shared encoder used for every UTF-16 -> UTF-8 conversion in the runtime.
// A single instance is created once and reused by all callers.
//Provides: jsoo_text_encoder
var jsoo_text_encoder = new TextEncoder();

// Shared decoder used for every UTF-8 -> UTF-16 conversion in the runtime.
// `ignoreBOM: true` is required for round-trip fidelity: by default
// TextDecoder silently drops a leading EF BB BF byte sequence, whereas the
// hand-written decoder this replaces turned it into U+FEFF like any other
// three-byte sequence. Malformed input is still replaced by U+FFFD because
// the decoder stays in its default non-fatal mode.
//Provides: jsoo_text_decoder
var jsoo_text_decoder = new TextDecoder("utf-8", { ignoreBOM: true });
283+
387284
// Builds an MlBytes value from a JavaScript (UTF-16) string.
//Provides: caml_bytes_of_utf16_jsstring
//Requires: MlBytes, jsoo_text_encoder
//Requires: jsoo_is_ascii
function caml_bytes_of_utf16_jsstring(s) {
  // ASCII text is identical in UTF-8 and UTF-16, so the string itself is
  // stored under tag 9 (the ASCII byte-string tag).
  if (jsoo_is_ascii(s)) return new MlBytes(9, s, s.length);

  // Otherwise encode to UTF-8 and store the resulting Uint8Array under tag 4
  // (presumably the array-backed representation; confirm against MlBytes).
  var utf8 = jsoo_text_encoder.encode(s);
  return new MlBytes(4, utf8, utf8.length);
}
395295

396296
//Provides: MlBytes
397-
//Requires: caml_convert_string_to_bytes, jsoo_is_ascii, caml_utf16_of_utf8
297+
//Requires: caml_convert_string_to_bytes, jsoo_is_ascii
298+
//Requires: caml_uint8_array_of_bytes
299+
//Requires: jsoo_text_decoder
398300
class MlBytes {
399301
constructor(tag, contents, length) {
400302
this.t = tag;
@@ -420,9 +322,9 @@ class MlBytes {
420322
}
421323

422324
toUtf16() {
423-
var r = this.toString();
424-
if (this.t === 9) return r;
425-
return caml_utf16_of_utf8(r);
325+
if (this.t === 9) return this.c;
326+
var a = caml_uint8_array_of_bytes(this);
327+
return jsoo_text_decoder.decode(a);
426328
}
427329

428330
slice() {
@@ -750,20 +652,35 @@ function caml_jsbytes_of_string(x) {
750652
return x;
751653
}
752654

655+
// Scratch storage shared by caml_jsstring_of_string so that short strings can
// be decoded without allocating a fresh byte array on every call.
//Provides: jsoo_text_decoder_buff
var jsoo_text_decoder_buff = new ArrayBuffer(1024);

// Converts a byte string (one UTF-8 byte per character code) into a
// JavaScript UTF-16 string. ASCII input is returned unchanged.
//Provides: caml_jsstring_of_string const
//Requires: jsoo_is_ascii
//Requires: jsoo_text_decoder
//Requires: jsoo_text_decoder_buff
//If: js-string
function caml_jsstring_of_string(s) {
  if (jsoo_is_ascii(s)) return s;
  var len = s.length;
  // An ArrayBuffer reports its size through `byteLength`; it has no `length`
  // property, so comparing against `.length` always failed and the scratch
  // buffer was never used. Inputs that fit are decoded through a view on the
  // shared buffer; longer ones get their own array.
  var a =
    len <= jsoo_text_decoder_buff.byteLength
      ? new Uint8Array(jsoo_text_decoder_buff, 0, len)
      : new Uint8Array(len);
  // Copy the byte values; each character code of `s` holds one byte.
  for (var i = 0; i < len; i++) {
    a[i] = s.charCodeAt(i);
  }
  // decode() is synchronous and returns a new string, so the shared buffer
  // can safely be overwritten by the next call.
  return jsoo_text_decoder.decode(a);
}
760674

761675
// Converts a JavaScript (UTF-16) string into an OCaml string.
//Provides: caml_string_of_jsstring const
//Requires: caml_string_of_array
//Requires: jsoo_text_encoder
//Requires: jsoo_is_ascii, caml_string_of_jsbytes
//If: js-string
function caml_string_of_jsstring(s) {
  // ASCII needs no re-encoding; everything else is encoded to UTF-8 bytes
  // first and the OCaml string is built from that byte array.
  return jsoo_is_ascii(s)
    ? caml_string_of_jsbytes(s)
    : caml_string_of_array(jsoo_text_encoder.encode(s));
}
768685

769686
//Provides: caml_bytes_of_jsbytes const
@@ -883,7 +800,6 @@ function caml_ml_bytes_content(s) {
883800
}
884801

885802
//Provides: caml_is_ml_string
886-
//Requires: jsoo_is_ascii
887803
//If: js-string
888804
function caml_is_ml_string(s) {
889805
// biome-ignore lint/suspicious/noControlCharactersInRegex: expected

0 commit comments

Comments
 (0)