@@ -71,6 +71,82 @@ InternalFieldInfoBase* BindingData::Serialize(int index) {
7171 return info;
7272}
7373
74+ namespace {
// Reports whether two consecutive UTF-16 code units form a surrogate pair:
// `lead` must be a high surrogate (0xD800-0xDBFF) and `trail` a low surrogate
// (0xDC00-0xDFFF).
constexpr bool isSurrogatePair(uint16_t lead, uint16_t trail) {
  const bool leadIsHigh = lead >= 0xd800 && lead <= 0xdbff;
  const bool trailIsLow = trail >= 0xdc00 && trail <= 0xdfff;
  return leadIsHigh && trailIsLow;
}
78+
// Returns the number of UTF-8 bytes needed to encode the single 16-bit code
// unit `c` when it is treated as a stand-alone code point:
//   U+0000..U+007F -> 1 byte
//   U+0080..U+07FF -> 2 bytes
//   U+0800..U+FFFF -> 3 bytes
// Surrogate code units fall in the last range and are therefore reported as
// 3 bytes each; callers that care about surrogate pairs (4 bytes for the
// whole pair) must account for that themselves.
constexpr size_t simpleUtfEncodingLength(uint16_t c) {
  if (c < 0x80) return 1;
  // The two-byte UTF-8 form covers everything up to and including U+07FF.
  if (c < 0x800) return 2;
  return 3;
}
84+
85+ template <typename Char>
86+ size_t findBestFit (const Char* data, size_t length, size_t bufferSize) {
87+ size_t pos = 0 ;
88+ size_t utf8Accumulated = 0 ;
89+ constexpr size_t CHUNK = 257 ;
90+ constexpr bool UTF16 = sizeof (Char) == 2 ;
91+ constexpr size_t MAX_FACTOR = UTF16 ? 3 : 2 ;
92+
93+ double expansion = 1.15 ;
94+
95+ while (pos < length && utf8Accumulated < bufferSize) {
96+ size_t remainingInput = length - pos;
97+ size_t spaceRemaining = bufferSize - utf8Accumulated;
98+ DCHECK_GE (expansion, 1.15 );
99+
100+ size_t guaranteedToFit = spaceRemaining / MAX_FACTOR;
101+ if (guaranteedToFit >= remainingInput) {
102+ return length;
103+ }
104+ size_t likelyToFit =
105+ std::min (static_cast <size_t >(spaceRemaining / expansion), CHUNK);
106+ size_t fitEstimate =
107+ std::max (size_t {1 }, std::max (guaranteedToFit, likelyToFit));
108+ size_t chunkSize = std::min (remainingInput, fitEstimate);
109+ if (chunkSize == 1 ) break ;
110+ DCHECK_GE (chunkSize, 1 );
111+
112+ size_t chunkUtf8Len;
113+ if constexpr (UTF16) {
114+ // TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
115+ // available For now, validate and use utf8_length_from_utf16
116+ chunkUtf8Len = simdutf::utf8_length_from_utf16 (data + pos, chunkSize);
117+ } else {
118+ chunkUtf8Len = simdutf::utf8_length_from_latin1 (data + pos, chunkSize);
119+ }
120+
121+ if (utf8Accumulated + chunkUtf8Len > bufferSize) {
122+ DCHECK_GT (chunkSize, guaranteedToFit);
123+ expansion = std::max (expansion * 1.1 , (chunkUtf8Len * 1.1 ) / chunkSize);
124+ } else {
125+ expansion = std::max (1.15 , (chunkUtf8Len * 1.1 ) / chunkSize);
126+ pos += chunkSize;
127+ utf8Accumulated += chunkUtf8Len;
128+ }
129+ }
130+
131+ while (pos < length && utf8Accumulated < bufferSize) {
132+ size_t extra = simpleUtfEncodingLength (data[pos]);
133+ if (utf8Accumulated + extra > bufferSize) break ;
134+ pos++;
135+ utf8Accumulated += extra;
136+ }
137+
138+ if (UTF16 && pos != 0 && pos != length &&
139+ isSurrogatePair (data[pos - 1 ], data[pos])) {
140+ if (utf8Accumulated < bufferSize) {
141+ pos++;
142+ } else {
143+ pos--;
144+ }
145+ }
146+ return pos;
147+ }
148+ } // namespace
149+
74150void BindingData::Deserialize (Local<Context> context,
75151 Local<Object> holder,
76152 int index,
@@ -101,15 +177,67 @@ void BindingData::EncodeInto(const FunctionCallbackInfo<Value>& args) {
101177 char * write_result = static_cast <char *>(buf->Data ()) + dest->ByteOffset ();
102178 size_t dest_length = dest->ByteLength ();
103179
104- size_t nchars;
105- size_t written = source->WriteUtf8V2 (isolate,
106- write_result,
107- dest_length,
108- String::WriteFlags::kReplaceInvalidUtf8 ,
109- &nchars);
180+ size_t read = 0 ;
181+ size_t written = 0 ;
182+ v8::String::ValueView view (isolate, source);
183+ uint32_t length = view.length ();
184+
185+ if (view.is_one_byte ()) {
186+ auto data = reinterpret_cast <const char *>(view.data8 ());
187+ simdutf::result result = simdutf::validate_ascii_with_errors (
188+ data, std::min (static_cast <size_t >(length), dest_length));
189+ written = read = result.count ;
190+ auto out_addr = write_result;
191+ memcpy (out_addr, data, read);
192+ out_addr += read;
193+ data += read;
194+ length -= read;
195+ dest_length -= read;
196+ if (length != 0 && dest_length != 0 ) {
197+ size_t rest = findBestFit (data, length, dest_length);
198+ if (rest != 0 ) {
199+ DCHECK_LE (simdutf::utf8_length_from_latin1 (data, rest), dest_length);
200+ written += simdutf::convert_latin1_to_utf8 (data, rest, out_addr);
201+ read += rest;
202+ }
203+ }
204+ } else {
205+ auto data = reinterpret_cast <const char16_t *>(view.data16 ());
206+
207+ // Check if input has unpaired surrogates - if so, convert to well-formed
208+ // first
209+ simdutf::result validation_result =
210+ simdutf::validate_utf16_with_errors (data, length);
211+
212+ if (validation_result.error == simdutf::SUCCESS) {
213+ // Valid UTF-16 - use the fast path
214+ read = findBestFit (data, length, dest_length);
215+ if (read != 0 ) {
216+ DCHECK_LE (simdutf::utf8_length_from_utf16 (data, read), dest_length);
217+ written = simdutf::convert_utf16_to_utf8 (data, read, write_result);
218+ }
219+ } else {
220+ // Invalid UTF-16 with unpaired surrogates - convert to well-formed first
221+ // TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
222+ // available
223+ std::vector<char16_t > conversion_buffer (length);
224+ simdutf::to_well_formed_utf16 (data, length, conversion_buffer.data ());
225+
226+ // Now use findBestFit with the well-formed data
227+ read = findBestFit (conversion_buffer.data (), length, dest_length);
228+ if (read != 0 ) {
229+ DCHECK_LE (
230+ simdutf::utf8_length_from_utf16 (conversion_buffer.data (), read),
231+ dest_length);
232+ written = simdutf::convert_utf16_to_utf8 (
233+ conversion_buffer.data (), read, write_result);
234+ }
235+ }
236+ }
237+ DCHECK_LE (written, dest_length);
110238
111- binding_data->encode_into_results_buffer_ [0 ] = nchars ;
112- binding_data->encode_into_results_buffer_ [1 ] = written;
239+ binding_data->encode_into_results_buffer_ [0 ] = static_cast < double >(read) ;
240+ binding_data->encode_into_results_buffer_ [1 ] = static_cast < double >( written) ;
113241}
114242
115243// Encode a single string to a UTF-8 Uint8Array (not Buffer).
0 commit comments