Skip to content

Commit efddd65

Browse files
anonrigerikcorry
andauthored
Fix TextEncoderStream surrogate pair handling across chunks (#5680)
Co-authored-by: Erik Corry <[email protected]>
1 parent 8c10cc1 commit efddd65

File tree

3 files changed

+89
-34
lines changed

3 files changed

+89
-34
lines changed

src/workerd/api/BUILD.bazel

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -436,14 +436,14 @@ wd_cc_library(
436436
implementation_deps = [
437437
"//src/workerd/io:features",
438438
"//src/workerd/util:strings",
439-
"@simdutf",
440439
],
441440
visibility = ["//visibility:public"],
442441
deps = [
443442
":util",
444443
"//src/workerd/io:compatibility-date_capnp",
445444
"//src/workerd/jsg",
446445
"@capnp-cpp//src/kj",
446+
"@simdutf",
447447
],
448448
)
449449

src/workerd/api/streams/encoding.c++

Lines changed: 87 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,37 +4,105 @@
44

55
#include "encoding.h"
66

7+
#include "simdutf.h"
8+
79
#include <workerd/api/encoding.h>
810
#include <workerd/api/streams/standard.h>
911
#include <workerd/io/features.h>
1012
#include <workerd/jsg/jsg.h>
1113

14+
#include <v8.h>
15+
16+
#include <kj/common.h>
17+
#include <kj/refcount.h>
18+
1219
namespace workerd::api {
1320

21+
namespace {
22+
constexpr kj::byte REPLACEMENT_UTF8[] = {0xEF, 0xBF, 0xBD};
23+
24+
struct Holder: public kj::Refcounted {
25+
kj::Maybe<char16_t> pending = kj::none;
26+
};
27+
} // namespace
28+
29+
// TextEncoderStream encodes a stream of JavaScript strings into UTF-8 bytes.
30+
//
31+
// WHATWG Encoding spec requirement (https://encoding.spec.whatwg.org/#interface-textencoderstream):
32+
// The encoder must encode unpaired UTF-16 surrogates as replacement characters.
33+
//
34+
// simdutf handles this for us, but we have to be careful of surrogate pairs
35+
// (high surrogate, followed by low surrogate) split across chunk boundaries.
36+
//
37+
// We do this with the pending field:
38+
// holder->pending = kj::none -> No pending high surrogate from previous chunk
39+
// holder->pending = char16_t -> High surrogate waiting for a matching low surrogate
40+
//
41+
// Ref: https://github.com/web-platform-tests/wpt/blob/master/encoding/streams/encode-utf8.any.js
1442
jsg::Ref<TextEncoderStream> TextEncoderStream::constructor(jsg::Lock& js) {
15-
auto transformer = TransformStream::constructor(js,
16-
Transformer{.transform = jsg::Function<Transformer::TransformAlgorithm>(
17-
[](jsg::Lock& js, auto chunk, auto controller) {
18-
auto str = jsg::check(chunk->ToString(js.v8Context()));
19-
auto utf8Length = str->Utf8LengthV2(js.v8Isolate);
43+
auto state = kj::rc<Holder>();
2044

21-
// Don't emit empty chunks
22-
if (utf8Length == 0) {
23-
return js.resolvedPromise();
45+
auto transform = [holder = state.addRef()](jsg::Lock& js, v8::Local<v8::Value> chunk,
46+
jsg::Ref<TransformStreamDefaultController> controller) mutable {
47+
auto str = jsg::check(chunk->ToString(js.v8Context()));
48+
size_t length = str->Length();
49+
if (length == 0) return js.resolvedPromise();
50+
51+
// Allocate buffer: reserve slot 0 for pending surrogate if we have one
52+
size_t prefix = (holder->pending == kj::none) ? 0 : 1;
53+
size_t end = prefix + length;
54+
auto buf = kj::heapArray<char16_t>(end);
55+
str->WriteV2(js.v8Isolate, 0, length, reinterpret_cast<uint16_t*>(buf.begin() + prefix));
56+
57+
KJ_IF_SOME(lead, holder->pending) {
58+
buf.begin()[0] = lead;
59+
holder->pending = kj::none;
2460
}
2561

26-
v8::Local<v8::ArrayBuffer> buffer;
27-
JSG_REQUIRE(v8::ArrayBuffer::MaybeNew(js.v8Isolate, utf8Length).ToLocal(&buffer), RangeError,
28-
"Cannot allocate space for TextEncoder.encode");
29-
30-
auto bytes = jsg::asBytes(buffer).releaseAsChars();
31-
[[maybe_unused]] auto written = str->WriteUtf8V2(
32-
js.v8Isolate, bytes.begin(), bytes.size(), v8::String::WriteFlags::kReplaceInvalidUtf8);
33-
34-
KJ_DASSERT(written == buffer->ByteLength());
35-
controller->enqueue(js, v8::Uint8Array::New(buffer, 0, buffer->ByteLength()));
62+
// If chunk ends with high surrogate, save it for next chunk
63+
if (end > 0 && U_IS_LEAD(buf[end - 1])) {
64+
holder->pending = buf[--end];
65+
}
66+
if (end == 0) return js.resolvedPromise();
67+
68+
auto slice = buf.first(end);
69+
auto result = simdutf::utf8_length_from_utf16_with_replacement(slice.begin(), slice.size());
70+
// Only sanitize if there are surrogates in the buffer - UTF-16 without
71+
// surrogates is always well-formed.
72+
if (result.error == simdutf::error_code::SURROGATE) {
73+
simdutf::to_well_formed_utf16(slice.begin(), slice.size(), slice.begin());
74+
}
75+
auto utf8Length = result.count;
76+
KJ_DASSERT(utf8Length > 0 && utf8Length >= end);
77+
78+
auto backingStore = js.allocBackingStore(utf8Length, jsg::Lock::AllocOption::UNINITIALIZED);
79+
auto dest = kj::ArrayPtr<char>(static_cast<char*>(backingStore->Data()), utf8Length);
80+
[[maybe_unused]] auto written =
81+
simdutf::convert_utf16_to_utf8(slice.begin(), slice.size(), dest.begin());
82+
KJ_DASSERT(written == utf8Length, "simdutf should write exactly utf8Length bytes");
83+
84+
auto array = v8::Uint8Array::New(
85+
v8::ArrayBuffer::New(js.v8Isolate, kj::mv(backingStore)), 0, utf8Length);
86+
controller->enqueue(js, jsg::JsUint8Array(array));
3687
return js.resolvedPromise();
37-
})},
88+
};
89+
90+
auto flush = [holder = state.addRef()](
91+
jsg::Lock& js, jsg::Ref<TransformStreamDefaultController> controller) mutable {
92+
// If stream ends with orphaned high surrogate, emit replacement character
93+
if (holder->pending != kj::none) {
94+
auto backingStore = js.allocBackingStore(3, jsg::Lock::AllocOption::UNINITIALIZED);
95+
memcpy(backingStore->Data(), REPLACEMENT_UTF8, 3);
96+
auto array =
97+
v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, kj::mv(backingStore)), 0, 3);
98+
controller->enqueue(js, jsg::JsUint8Array(array));
99+
}
100+
return js.resolvedPromise();
101+
};
102+
103+
auto transformer = TransformStream::constructor(js,
104+
Transformer{.transform = jsg::Function<Transformer::TransformAlgorithm>(kj::mv(transform)),
105+
.flush = jsg::Function<Transformer::FlushAlgorithm>(kj::mv(flush))},
38106
StreamQueuingStrategy{}, StreamQueuingStrategy{});
39107

40108
return js.alloc<TextEncoderStream>(transformer->getReadable(), transformer->getWritable());

src/wpt/encoding-test.ts

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -110,20 +110,7 @@ export default {
110110
],
111111
},
112112
'streams/encode-bad-chunks.any.js': {},
113-
'streams/encode-utf8.any.js': {
114-
comment: 'Surrogate pair handling across chunks not yet implemented',
115-
expectedFailures: [
116-
'a character split between chunks should be correctly encoded',
117-
'a character following one split between chunks should be correctly encoded',
118-
'an unmatched surrogate at the end of a chunk followed by an astral character in the next chunk should be replaced with the replacement character at the start of the next output chunk',
119-
'an unmatched surrogate at the end of a chunk followed by an ascii character in the next chunk should be replaced with the replacement character at the start of the next output chunk',
120-
'a non-terminal unpaired leading surrogate should immediately be replaced',
121-
'two consecutive astral characters each split down the middle should be correctly reassembled',
122-
'two consecutive astral characters each split down the middle with an invalid surrogate in the middle should be correctly encoded',
123-
'an unmatched surrogate at the end of a chunk followed by a plane 1 character split into two chunks should result in the encoded plane 1 character appearing in the last output chunk',
124-
'a leading surrogate chunk should be carried past empty chunks',
125-
],
126-
},
113+
'streams/encode-utf8.any.js': {},
127114
'streams/invalid-realm.window.js': {
128115
comment: 'Enable when ShadowRealm is supported',
129116
expectedFailures: [

0 commit comments

Comments
 (0)