@@ -71,6 +71,88 @@ InternalFieldInfoBase* BindingData::Serialize(int index) {
7171 return info;
7272}
7373
74+ // The following code is adapted from Cloudflare workers.
75+ // Particularly from: https://github.com/cloudflare/workerd/pull/5448
76+ //
77+ // Copyright (c) 2017-2025 Cloudflare, Inc.
78+ // Licensed under the Apache 2.0 license found in the LICENSE file or at:
79+ // https://opensource.org/licenses/Apache-2.0
80+ namespace {
// Reports whether the UTF-16 code units `lead` and `trail`, taken in that
// order, form a surrogate pair: `lead` must be a high surrogate
// (0xD800..0xDBFF) and `trail` a low surrogate (0xDC00..0xDFFF).
constexpr bool isSurrogatePair(uint16_t lead, uint16_t trail) {
  // The top six bits identify which surrogate range a code unit belongs to.
  constexpr uint16_t kSurrogateMask = 0xfc00;
  const bool leadIsHigh = (lead & kSurrogateMask) == 0xd800;
  const bool trailIsLow = (trail & kSurrogateMask) == 0xdc00;
  return leadIsHigh && trailIsLow;
}
84+
// Returns the number of UTF-8 bytes needed to encode the single UTF-16 code
// unit `c` when it is treated as a stand-alone code point:
//   0x0000..0x007F -> 1 byte
//   0x0080..0x07FF -> 2 bytes
//   0x0800..0xFFFF -> 3 bytes
// Surrogate code units fall in the last range and therefore report 3; callers
// that need the cost of a surrogate pair must account for that themselves.
constexpr size_t simpleUtfEncodingLength(uint16_t c) {
  if (c < 0x80) return 1;
  // Two-byte UTF-8 sequences carry 11 payload bits, i.e. up to U+07FF.
  if (c < 0x800) return 2;
  return 3;
}
90+
91+ template <typename Char>
92+ size_t findBestFit (const Char* data, size_t length, size_t bufferSize) {
93+ size_t pos = 0 ;
94+ size_t utf8Accumulated = 0 ;
95+ constexpr size_t CHUNK = 257 ;
96+ constexpr bool UTF16 = sizeof (Char) == 2 ;
97+ constexpr size_t MAX_FACTOR = UTF16 ? 3 : 2 ;
98+
99+ double expansion = 1.15 ;
100+
101+ while (pos < length && utf8Accumulated < bufferSize) {
102+ size_t remainingInput = length - pos;
103+ size_t spaceRemaining = bufferSize - utf8Accumulated;
104+ DCHECK_GE (expansion, 1.15 );
105+
106+ size_t guaranteedToFit = spaceRemaining / MAX_FACTOR;
107+ if (guaranteedToFit >= remainingInput) {
108+ return length;
109+ }
110+ size_t likelyToFit =
111+ std::min (static_cast <size_t >(spaceRemaining / expansion), CHUNK);
112+ size_t fitEstimate =
113+ std::max (size_t {1 }, std::max (guaranteedToFit, likelyToFit));
114+ size_t chunkSize = std::min (remainingInput, fitEstimate);
115+ if (chunkSize == 1 ) break ;
116+ DCHECK_GE (chunkSize, 1 );
117+
118+ size_t chunkUtf8Len;
119+ if constexpr (UTF16) {
120+ // TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
121+ // available For now, validate and use utf8_length_from_utf16
122+ chunkUtf8Len = simdutf::utf8_length_from_utf16 (data + pos, chunkSize);
123+ } else {
124+ chunkUtf8Len = simdutf::utf8_length_from_latin1 (data + pos, chunkSize);
125+ }
126+
127+ if (utf8Accumulated + chunkUtf8Len > bufferSize) {
128+ DCHECK_GT (chunkSize, guaranteedToFit);
129+ expansion = std::max (expansion * 1.1 , (chunkUtf8Len * 1.1 ) / chunkSize);
130+ } else {
131+ expansion = std::max (1.15 , (chunkUtf8Len * 1.1 ) / chunkSize);
132+ pos += chunkSize;
133+ utf8Accumulated += chunkUtf8Len;
134+ }
135+ }
136+
137+ while (pos < length && utf8Accumulated < bufferSize) {
138+ size_t extra = simpleUtfEncodingLength (data[pos]);
139+ if (utf8Accumulated + extra > bufferSize) break ;
140+ pos++;
141+ utf8Accumulated += extra;
142+ }
143+
144+ if (UTF16 && pos != 0 && pos != length &&
145+ isSurrogatePair (data[pos - 1 ], data[pos])) {
146+ if (utf8Accumulated < bufferSize) {
147+ pos++;
148+ } else {
149+ pos--;
150+ }
151+ }
152+ return pos;
153+ }
154+ } // namespace
155+
74156void BindingData::Deserialize (Local<Context> context,
75157 Local<Object> holder,
76158 int index,
@@ -101,15 +183,67 @@ void BindingData::EncodeInto(const FunctionCallbackInfo<Value>& args) {
101183 char * write_result = static_cast <char *>(buf->Data ()) + dest->ByteOffset ();
102184 size_t dest_length = dest->ByteLength ();
103185
104- size_t nchars;
105- size_t written = source->WriteUtf8V2 (isolate,
106- write_result,
107- dest_length,
108- String::WriteFlags::kReplaceInvalidUtf8 ,
109- &nchars);
186+ size_t read = 0 ;
187+ size_t written = 0 ;
188+ v8::String::ValueView view (isolate, source);
189+ uint32_t length = view.length ();
190+
191+ if (view.is_one_byte ()) {
192+ auto data = reinterpret_cast <const char *>(view.data8 ());
193+ simdutf::result result = simdutf::validate_ascii_with_errors (
194+ data, std::min (static_cast <size_t >(length), dest_length));
195+ written = read = result.count ;
196+ auto out_addr = write_result;
197+ memcpy (out_addr, data, read);
198+ out_addr += read;
199+ data += read;
200+ length -= read;
201+ dest_length -= read;
202+ if (length != 0 && dest_length != 0 ) {
203+ size_t rest = findBestFit (data, length, dest_length);
204+ if (rest != 0 ) {
205+ DCHECK_LE (simdutf::utf8_length_from_latin1 (data, rest), dest_length);
206+ written += simdutf::convert_latin1_to_utf8 (data, rest, out_addr);
207+ read += rest;
208+ }
209+ }
210+ } else {
211+ auto data = reinterpret_cast <const char16_t *>(view.data16 ());
212+
213+ // Check if input has unpaired surrogates - if so, convert to well-formed
214+ // first
215+ simdutf::result validation_result =
216+ simdutf::validate_utf16_with_errors (data, length);
217+
218+ if (validation_result.error == simdutf::SUCCESS) {
219+ // Valid UTF-16 - use the fast path
220+ read = findBestFit (data, length, dest_length);
221+ if (read != 0 ) {
222+ DCHECK_LE (simdutf::utf8_length_from_utf16 (data, read), dest_length);
223+ written = simdutf::convert_utf16_to_utf8 (data, read, write_result);
224+ }
225+ } else {
226+ // Invalid UTF-16 with unpaired surrogates - convert to well-formed first
227+ // TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
228+ // available
229+ std::vector<char16_t > conversion_buffer (length);
230+ simdutf::to_well_formed_utf16 (data, length, conversion_buffer.data ());
231+
232+ // Now use findBestFit with the well-formed data
233+ read = findBestFit (conversion_buffer.data (), length, dest_length);
234+ if (read != 0 ) {
235+ DCHECK_LE (
236+ simdutf::utf8_length_from_utf16 (conversion_buffer.data (), read),
237+ dest_length);
238+ written = simdutf::convert_utf16_to_utf8 (
239+ conversion_buffer.data (), read, write_result);
240+ }
241+ }
242+ }
243+ DCHECK_LE (written, dest_length);
110244
111- binding_data->encode_into_results_buffer_ [0 ] = nchars ;
112- binding_data->encode_into_results_buffer_ [1 ] = written;
245+ binding_data->encode_into_results_buffer_ [0 ] = static_cast < double >(read) ;
246+ binding_data->encode_into_results_buffer_ [1 ] = static_cast < double >( written) ;
113247}
114248
115249// Encode a single string to a UTF-8 Uint8Array (not Buffer).
0 commit comments