|
4 | 4 |
|
5 | 5 | #include "encoding.h" |
6 | 6 |
|
| 7 | +#include "simdutf.h" |
| 8 | + |
7 | 9 | #include <workerd/api/encoding.h> |
8 | 10 | #include <workerd/api/streams/standard.h> |
9 | 11 | #include <workerd/io/features.h> |
10 | 12 | #include <workerd/jsg/jsg.h> |
11 | 13 |
|
| 14 | +#include <v8.h> |
| 15 | + |
| 16 | +#include <kj/common.h> |
| 17 | +#include <kj/refcount.h> |
| 18 | + |
12 | 19 | namespace workerd::api { |
13 | 20 |
|
// File-local helpers used by the TextEncoderStream transform and flush callbacks.
| 21 | +namespace { |
// UTF-8 byte sequence of U+FFFD REPLACEMENT CHARACTER. The flush callback copies
// these three bytes out when the stream ends while a high surrogate is still pending.
| 22 | +constexpr kj::byte REPLACEMENT_UTF8[] = {0xEF, 0xBF, 0xBD}; |
| 23 | + |
// Refcounted state object: one instance is created per TextEncoderStream and shared
// (via addRef()) between the transform and flush lambdas.
// `pending` holds a high surrogate that ended the previous chunk and is waiting for
// a matching low surrogate; it is kj::none when nothing is carried over.
| 24 | +struct Holder: public kj::Refcounted { |
| 25 | + kj::Maybe<char16_t> pending = kj::none; |
| 26 | +}; |
| 27 | +} // namespace |
| 28 | + |
| 29 | +// TextEncoderStream encodes a stream of JavaScript strings into UTF-8 bytes. |
| 30 | +// |
| 31 | +// WHATWG Encoding spec requirement (https://encoding.spec.whatwg.org/#interface-textencoderstream): |
| 32 | +// The encoder must encode unpaired UTF-16 surrogates as replacement characters. |
| 33 | +// |
| 34 | +// simdutf performs that replacement for us, but we have to be careful of surrogate pairs |
| 35 | +// (a high surrogate followed by a low surrogate) that are split across chunk boundaries. |
| 36 | +// |
| 37 | +// We track a pair split across chunks with the pending field: |
| 38 | +// holder->pending = kj::none -> No pending high surrogate from previous chunk |
| 39 | +// holder->pending = char16_t -> High surrogate waiting for a matching low surrogate |
| 40 | +// |
| 41 | +// Ref: https://github.com/web-platform-tests/wpt/blob/master/encoding/streams/encode-utf8.any.js |
14 | 42 | jsg::Ref<TextEncoderStream> TextEncoderStream::constructor(jsg::Lock& js) { |
15 | | - auto transformer = TransformStream::constructor(js, |
16 | | - Transformer{.transform = jsg::Function<Transformer::TransformAlgorithm>( |
17 | | - [](jsg::Lock& js, auto chunk, auto controller) { |
18 | | - auto str = jsg::check(chunk->ToString(js.v8Context())); |
19 | | - auto utf8Length = str->Utf8LengthV2(js.v8Isolate); |
| 43 | + auto state = kj::rc<Holder>(); |
20 | 44 |
|
21 | | - // Don't emit empty chunks |
22 | | - if (utf8Length == 0) { |
23 | | - return js.resolvedPromise(); |
| 45 | + auto transform = [holder = state.addRef()](jsg::Lock& js, v8::Local<v8::Value> chunk, |
| 46 | + jsg::Ref<TransformStreamDefaultController> controller) mutable { |
| 47 | + auto str = jsg::check(chunk->ToString(js.v8Context())); |
| 48 | + size_t length = str->Length(); |
| 49 | + if (length == 0) return js.resolvedPromise(); |
| 50 | + |
| 51 | + // Allocate buffer: reserve slot 0 for pending surrogate if we have one |
| 52 | + size_t prefix = (holder->pending == kj::none) ? 0 : 1; |
| 53 | + size_t end = prefix + length; |
| 54 | + auto buf = kj::heapArray<char16_t>(end); |
| 55 | + str->WriteV2(js.v8Isolate, 0, length, reinterpret_cast<uint16_t*>(buf.begin() + prefix)); |
| 56 | + |
| 57 | + KJ_IF_SOME(lead, holder->pending) { |
| 58 | + buf.begin()[0] = lead; |
| 59 | + holder->pending = kj::none; |
24 | 60 | } |
25 | 61 |
|
26 | | - v8::Local<v8::ArrayBuffer> buffer; |
27 | | - JSG_REQUIRE(v8::ArrayBuffer::MaybeNew(js.v8Isolate, utf8Length).ToLocal(&buffer), RangeError, |
28 | | - "Cannot allocate space for TextEncoder.encode"); |
29 | | - |
30 | | - auto bytes = jsg::asBytes(buffer).releaseAsChars(); |
31 | | - [[maybe_unused]] auto written = str->WriteUtf8V2( |
32 | | - js.v8Isolate, bytes.begin(), bytes.size(), v8::String::WriteFlags::kReplaceInvalidUtf8); |
33 | | - |
34 | | - KJ_DASSERT(written == buffer->ByteLength()); |
35 | | - controller->enqueue(js, v8::Uint8Array::New(buffer, 0, buffer->ByteLength())); |
| 62 | + // If chunk ends with high surrogate, save it for next chunk |
| 63 | + if (end > 0 && U_IS_LEAD(buf[end - 1])) { |
| 64 | + holder->pending = buf[--end]; |
| 65 | + } |
| 66 | + if (end == 0) return js.resolvedPromise(); |
| 67 | + |
| 68 | + auto slice = buf.first(end); |
| 69 | + auto result = simdutf::utf8_length_from_utf16_with_replacement(slice.begin(), slice.size()); |
| 70 | + // Only sanitize if there are surrogates in the buffer - UTF-16 without |
| 71 | + // surrogates is always well-formed. |
| 72 | + if (result.error == simdutf::error_code::SURROGATE) { |
| 73 | + simdutf::to_well_formed_utf16(slice.begin(), slice.size(), slice.begin()); |
| 74 | + } |
| 75 | + auto utf8Length = result.count; |
| 76 | + KJ_DASSERT(utf8Length > 0 && utf8Length >= end); |
| 77 | + |
| 78 | + auto backingStore = js.allocBackingStore(utf8Length, jsg::Lock::AllocOption::UNINITIALIZED); |
| 79 | + auto dest = kj::ArrayPtr<char>(static_cast<char*>(backingStore->Data()), utf8Length); |
| 80 | + [[maybe_unused]] auto written = |
| 81 | + simdutf::convert_utf16_to_utf8(slice.begin(), slice.size(), dest.begin()); |
| 82 | + KJ_DASSERT(written == utf8Length, "simdutf should write exactly utf8Length bytes"); |
| 83 | + |
| 84 | + auto array = v8::Uint8Array::New( |
| 85 | + v8::ArrayBuffer::New(js.v8Isolate, kj::mv(backingStore)), 0, utf8Length); |
| 86 | + controller->enqueue(js, jsg::JsUint8Array(array)); |
36 | 87 | return js.resolvedPromise(); |
37 | | - })}, |
| 88 | + }; |
| 89 | + |
| 90 | + auto flush = [holder = state.addRef()]( |
| 91 | + jsg::Lock& js, jsg::Ref<TransformStreamDefaultController> controller) mutable { |
| 92 | + // If stream ends with orphaned high surrogate, emit replacement character |
| 93 | + if (holder->pending != kj::none) { |
| 94 | + auto backingStore = js.allocBackingStore(3, jsg::Lock::AllocOption::UNINITIALIZED); |
| 95 | + memcpy(backingStore->Data(), REPLACEMENT_UTF8, 3); |
| 96 | + auto array = |
| 97 | + v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, kj::mv(backingStore)), 0, 3); |
| 98 | + controller->enqueue(js, jsg::JsUint8Array(array)); |
| 99 | + } |
| 100 | + return js.resolvedPromise(); |
| 101 | + }; |
| 102 | + |
| 103 | + auto transformer = TransformStream::constructor(js, |
| 104 | + Transformer{.transform = jsg::Function<Transformer::TransformAlgorithm>(kj::mv(transform)), |
| 105 | + .flush = jsg::Function<Transformer::FlushAlgorithm>(kj::mv(flush))}, |
38 | 106 | StreamQueuingStrategy{}, StreamQueuingStrategy{}); |
39 | 107 |
|
40 | 108 | return js.alloc<TextEncoderStream>(transformer->getReadable(), transformer->getWritable()); |
|
0 commit comments