Skip to content

Commit 41216b7

Browse files
anonrig, erikcorry, lemire
committed
util: improve textencoder encodeInto performance
Co-authored-by: Erik Corry <[email protected]> Co-authored-by: Daniel Lemire <[email protected]>
1 parent 340e619 commit 41216b7

File tree

1 file changed

+154
-8
lines changed

1 file changed

+154
-8
lines changed

src/encoding_binding.cc

Lines changed: 154 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
#include "node_external_reference.h"
77
#include "simdutf.h"
88
#include "string_bytes.h"
9+
#include "util.h"
910
#include "v8.h"
1011

12+
#include <algorithm>
1113
#include <cstdint>
1214

1315
namespace node {
@@ -71,6 +73,90 @@ InternalFieldInfoBase* BindingData::Serialize(int index) {
7173
return info;
7274
}
7375

76+
// The following code is adapted from Cloudflare workers.
77+
// Particularly from: https://github.com/cloudflare/workerd/pull/5448
78+
//
79+
// Copyright (c) 2017-2025 Cloudflare, Inc.
80+
// Licensed under the Apache 2.0 license found in the LICENSE file or at:
81+
// https://opensource.org/licenses/Apache-2.0
82+
namespace {
83+
constexpr int MAX_SIZE_FOR_STACK_ALLOC = 4096;
84+
85+
// Reports whether two consecutive UTF-16 code units form a surrogate pair:
// `lead` must lie in 0xD800..0xDBFF and `trail` in 0xDC00..0xDFFF.
// Both ranges are 1024 units wide and aligned, so comparing the upper six
// bits of each unit is sufficient.
constexpr bool isSurrogatePair(uint16_t lead, uint16_t trail) {
  const bool leadIsHigh = (lead >> 10) == (0xd800 >> 10);
  const bool trailIsLow = (trail >> 10) == (0xdc00 >> 10);
  return leadIsHigh && trailIsLow;
}
88+
89+
// Returns the number of UTF-8 bytes needed to encode the single UTF-16 code
// unit `c` when it is treated as a code point of its own:
//   U+0000..U+007F -> 1 byte
//   U+0080..U+07FF -> 2 bytes
//   U+0800..U+FFFF -> 3 bytes
// Surrogate code units (0xD800..0xDFFF) fall in the last range and are
// therefore reported as 3 bytes each; callers that care about surrogate
// pairs must account for them separately.
constexpr size_t simpleUtfEncodingLength(uint16_t c) {
  if (c < 0x80) return 1;
  // The two-byte form covers 11 payload bits, i.e. everything below 0x800.
  if (c < 0x800) return 2;
  return 3;
}
94+
95+
template <typename Char>
96+
size_t findBestFit(const Char* data, size_t length, size_t bufferSize) {
97+
size_t pos = 0;
98+
size_t utf8Accumulated = 0;
99+
constexpr size_t CHUNK = 257;
100+
constexpr bool UTF16 = sizeof(Char) == 2;
101+
constexpr size_t MAX_FACTOR = UTF16 ? 3 : 2;
102+
103+
double expansion = 1.15;
104+
105+
while (pos < length && utf8Accumulated < bufferSize) {
106+
size_t remainingInput = length - pos;
107+
size_t spaceRemaining = bufferSize - utf8Accumulated;
108+
DCHECK_GE(expansion, 1.15);
109+
110+
size_t guaranteedToFit = spaceRemaining / MAX_FACTOR;
111+
if (guaranteedToFit >= remainingInput) {
112+
return length;
113+
}
114+
size_t likelyToFit =
115+
std::min(static_cast<size_t>(spaceRemaining / expansion), CHUNK);
116+
size_t fitEstimate =
117+
std::max(size_t{1}, std::max(guaranteedToFit, likelyToFit));
118+
size_t chunkSize = std::min(remainingInput, fitEstimate);
119+
if (chunkSize == 1) break;
120+
DCHECK_GE(chunkSize, 1);
121+
122+
size_t chunkUtf8Len;
123+
if constexpr (UTF16) {
124+
// TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
125+
// available For now, validate and use utf8_length_from_utf16
126+
chunkUtf8Len = simdutf::utf8_length_from_utf16(data + pos, chunkSize);
127+
} else {
128+
chunkUtf8Len = simdutf::utf8_length_from_latin1(data + pos, chunkSize);
129+
}
130+
131+
if (utf8Accumulated + chunkUtf8Len > bufferSize) {
132+
DCHECK_GT(chunkSize, guaranteedToFit);
133+
expansion = std::max(expansion * 1.1, (chunkUtf8Len * 1.1) / chunkSize);
134+
} else {
135+
expansion = std::max(1.15, (chunkUtf8Len * 1.1) / chunkSize);
136+
pos += chunkSize;
137+
utf8Accumulated += chunkUtf8Len;
138+
}
139+
}
140+
141+
while (pos < length && utf8Accumulated < bufferSize) {
142+
size_t extra = simpleUtfEncodingLength(data[pos]);
143+
if (utf8Accumulated + extra > bufferSize) break;
144+
pos++;
145+
utf8Accumulated += extra;
146+
}
147+
148+
if (UTF16 && pos != 0 && pos != length &&
149+
isSurrogatePair(data[pos - 1], data[pos])) {
150+
if (utf8Accumulated < bufferSize) {
151+
pos++;
152+
} else {
153+
pos--;
154+
}
155+
}
156+
return pos;
157+
}
158+
} // namespace
159+
74160
void BindingData::Deserialize(Local<Context> context,
75161
Local<Object> holder,
76162
int index,
@@ -101,15 +187,75 @@ void BindingData::EncodeInto(const FunctionCallbackInfo<Value>& args) {
101187
char* write_result = static_cast<char*>(buf->Data()) + dest->ByteOffset();
102188
size_t dest_length = dest->ByteLength();
103189

104-
size_t nchars;
105-
size_t written = source->WriteUtf8V2(isolate,
106-
write_result,
107-
dest_length,
108-
String::WriteFlags::kReplaceInvalidUtf8,
109-
&nchars);
190+
size_t read = 0;
191+
size_t written = 0;
192+
v8::String::ValueView view(isolate, source);
193+
uint32_t length = view.length();
194+
195+
if (view.is_one_byte()) {
196+
auto data = reinterpret_cast<const char*>(view.data8());
197+
simdutf::result result = simdutf::validate_ascii_with_errors(data, length);
198+
written = read = result.count;
199+
auto out_addr = write_result;
200+
memcpy(out_addr, data, read);
201+
out_addr += read;
202+
data += read;
203+
length -= read;
204+
dest_length -= read;
205+
if (length != 0 && dest_length != 0) {
206+
size_t rest = findBestFit(data, length, dest_length);
207+
if (rest != 0) {
208+
DCHECK_LE(simdutf::utf8_length_from_latin1(data, rest), dest_length);
209+
written += simdutf::convert_latin1_to_utf8(data, rest, out_addr);
210+
read += rest;
211+
}
212+
}
213+
} else {
214+
auto min_size = std::min(static_cast<size_t>(length), dest_length);
215+
auto data = reinterpret_cast<const char16_t*>(view.data16());
216+
217+
// Check if input has unpaired surrogates - if so, convert to well-formed
218+
// first
219+
simdutf::result validation_result =
220+
simdutf::validate_utf16_with_errors(data, min_size);
221+
222+
if (validation_result.error == simdutf::SUCCESS) {
223+
// Valid UTF-16 - use the fast path
224+
read = findBestFit(data, min_size, dest_length);
225+
if (read != 0) {
226+
DCHECK_LE(simdutf::utf8_length_from_utf16(data, read), dest_length);
227+
written = simdutf::convert_utf16_to_utf8(data, read, write_result);
228+
}
229+
} else {
230+
// Invalid UTF-16 with unpaired surrogates - convert to well-formed first
231+
// TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
232+
// available
233+
// Manually check if min_size would split a valid surrogate pair
234+
size_t safe_length = min_size;
235+
if (safe_length > 0 && safe_length < length &&
236+
isSurrogatePair(data[safe_length - 1], data[safe_length])) {
237+
safe_length--;
238+
}
239+
240+
MaybeStackBuffer<char16_t, MAX_SIZE_FOR_STACK_ALLOC> conversion_buffer(
241+
safe_length);
242+
simdutf::to_well_formed_utf16(data, safe_length, conversion_buffer.out());
243+
244+
// Now use findBestFit with the well-formed data
245+
read = findBestFit(conversion_buffer.out(), safe_length, dest_length);
246+
if (read != 0) {
247+
DCHECK_LE(
248+
simdutf::utf8_length_from_utf16(conversion_buffer.out(), read),
249+
dest_length);
250+
written = simdutf::convert_utf16_to_utf8(
251+
conversion_buffer.out(), read, write_result);
252+
}
253+
}
254+
}
255+
DCHECK_LE(written, dest_length);
110256

111-
binding_data->encode_into_results_buffer_[0] = nchars;
112-
binding_data->encode_into_results_buffer_[1] = written;
257+
binding_data->encode_into_results_buffer_[0] = static_cast<double>(read);
258+
binding_data->encode_into_results_buffer_[1] = static_cast<double>(written);
113259
}
114260

115261
// Encode a single string to a UTF-8 Uint8Array (not Buffer).

0 commit comments

Comments
 (0)