Skip to content

Commit b1e941e

Browse files
anonrig, erikcorry, and lemire
authored
util: improve textencoder encodeInto performance
Co-authored-by: Erik Corry <[email protected]> Co-authored-by: Daniel Lemire <[email protected]> PR-URL: #60843 Reviewed-By: Daniel Lemire <[email protected]> Reviewed-By: Rafael Gonzaga <[email protected]>
1 parent 6f7f51b commit b1e941e

File tree

1 file changed

+201
-8
lines changed

1 file changed

+201
-8
lines changed

src/encoding_binding.cc

Lines changed: 201 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
#include "node_external_reference.h"
77
#include "simdutf.h"
88
#include "string_bytes.h"
9+
#include "util.h"
910
#include "v8.h"
1011

12+
#include <algorithm>
1113
#include <cstdint>
1214

1315
namespace node {
@@ -71,6 +73,113 @@ InternalFieldInfoBase* BindingData::Serialize(int index) {
7173
return info;
7274
}
7375

76+
// The following code is adapted from Cloudflare workers.
77+
// Particularly from: https://github.com/cloudflare/workerd/pull/5448
78+
//
79+
// Copyright (c) 2017-2025 Cloudflare, Inc.
80+
// Licensed under the Apache 2.0 license found in the LICENSE file or at:
81+
// https://opensource.org/licenses/Apache-2.0
82+
namespace {
83+
// Passed as the second template argument of the MaybeStackBuffer<char16_t, ...>
// that EncodeInto uses as scratch space when it has to repair ill-formed
// UTF-16 input.
// NOTE(review): presumably this is the number of char16_t elements kept in
// inline storage before falling back to the heap — confirm against the
// MaybeStackBuffer definition.
constexpr int MAX_SIZE_FOR_STACK_ALLOC = 4096;
84+
85+
// Reports whether two consecutive UTF-16 code units form a surrogate pair,
// i.e. `lead` lies in [0xD800, 0xDBFF] and `trail` lies in [0xDC00, 0xDFFF].
constexpr bool isSurrogatePair(uint16_t lead, uint16_t trail) {
  if (lead < 0xd800 || lead > 0xdbff) return false;
  return trail >= 0xdc00 && trail <= 0xdfff;
}
88+
89+
// Returns the number of UTF-8 bytes needed to encode a single UTF-16 code
// unit considered on its own:
//   - c <  0x80  : 1 byte
//   - c <  0x800 : 2 bytes
//   - otherwise  : 3 bytes
//
// Surrogate code units fall into the 3-byte bucket. Summing this value over
// both halves of a surrogate pair therefore yields 6, which is an upper bound
// on the pair's real 4-byte encoding, never an underestimate.
constexpr size_t simpleUtfEncodingLength(uint16_t c) {
  if (c < 0x80) return 1;
  // UTF-8 covers U+0080..U+07FF with two bytes, so the cut-off is 0x800.
  if (c < 0x800) return 2;
  return 3;
}
94+
95+
// Finds the maximum number of input characters (UTF-16 or Latin1) that can be
96+
// encoded into a UTF-8 buffer of the given size.
97+
//
98+
// The challenge is that UTF-8 encoding expands characters by variable amounts:
99+
// - ASCII (< 0x80): 1 byte
100+
// - Code points < 0x800: 2 bytes
101+
// - Other BMP characters: 3 bytes
102+
// - Surrogate pairs (supplementary planes): 4 bytes total
103+
//
104+
// This function uses an adaptive chunking algorithm:
105+
// 1. Process the input in chunks, estimating how many characters will fit
106+
// 2. Calculate the actual UTF-8 length for each chunk using simdutf
107+
// 3. Adjust the expansion factor based on observed encoding ratios
108+
// 4. Fall back to character-by-character processing near the buffer boundary
109+
// 5. Handle UTF-16 surrogate pairs to avoid splitting them across boundaries
110+
//
111+
// The algorithm starts with a conservative expansion estimate (1.15x) and
112+
// dynamically adjusts based on actual character distribution, making it
113+
// efficient for common ASCII-heavy text while remaining correct for
114+
// multi-byte heavy content.
115+
template <typename Char>
116+
size_t findBestFit(const Char* data, size_t length, size_t bufferSize) {
117+
size_t pos = 0;
118+
size_t utf8Accumulated = 0;
119+
constexpr size_t CHUNK = 257;
120+
constexpr bool UTF16 = sizeof(Char) == 2;
121+
constexpr size_t MAX_FACTOR = UTF16 ? 3 : 2;
122+
123+
double expansion = 1.15;
124+
125+
while (pos < length && utf8Accumulated < bufferSize) {
126+
size_t remainingInput = length - pos;
127+
size_t spaceRemaining = bufferSize - utf8Accumulated;
128+
DCHECK_GE(expansion, 1.15);
129+
130+
size_t guaranteedToFit = spaceRemaining / MAX_FACTOR;
131+
if (guaranteedToFit >= remainingInput) {
132+
return length;
133+
}
134+
size_t likelyToFit =
135+
std::min(static_cast<size_t>(spaceRemaining / expansion), CHUNK);
136+
size_t fitEstimate =
137+
std::max(size_t{1}, std::max(guaranteedToFit, likelyToFit));
138+
size_t chunkSize = std::min(remainingInput, fitEstimate);
139+
if (chunkSize == 1) break;
140+
CHECK_GT(chunkSize, 1);
141+
142+
size_t chunkUtf8Len;
143+
if constexpr (UTF16) {
144+
// TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
145+
// available For now, validate and use utf8_length_from_utf16
146+
size_t newPos = pos + chunkSize;
147+
if (newPos < length && isSurrogatePair(data[newPos - 1], data[newPos]))
148+
chunkSize--;
149+
chunkUtf8Len = simdutf::utf8_length_from_utf16(data + pos, chunkSize);
150+
} else {
151+
chunkUtf8Len = simdutf::utf8_length_from_latin1(data + pos, chunkSize);
152+
}
153+
154+
if (utf8Accumulated + chunkUtf8Len > bufferSize) {
155+
DCHECK_GT(chunkSize, guaranteedToFit);
156+
expansion = std::max(expansion * 1.1, (chunkUtf8Len * 1.1) / chunkSize);
157+
} else {
158+
expansion = std::max(1.15, (chunkUtf8Len * 1.1) / chunkSize);
159+
pos += chunkSize;
160+
utf8Accumulated += chunkUtf8Len;
161+
}
162+
}
163+
164+
while (pos < length && utf8Accumulated < bufferSize) {
165+
size_t extra = simpleUtfEncodingLength(data[pos]);
166+
if (utf8Accumulated + extra > bufferSize) break;
167+
pos++;
168+
utf8Accumulated += extra;
169+
}
170+
171+
if (UTF16 && pos != 0 && pos != length &&
172+
isSurrogatePair(data[pos - 1], data[pos])) {
173+
if (utf8Accumulated < bufferSize) {
174+
pos++;
175+
} else {
176+
pos--;
177+
}
178+
}
179+
return pos;
180+
}
181+
} // namespace
182+
74183
void BindingData::Deserialize(Local<Context> context,
75184
Local<Object> holder,
76185
int index,
@@ -98,18 +207,102 @@ void BindingData::EncodeInto(const FunctionCallbackInfo<Value>& args) {
98207

99208
Local<Uint8Array> dest = args[1].As<Uint8Array>();
100209
Local<ArrayBuffer> buf = dest->Buffer();
210+
211+
// Handle detached buffers - return {read: 0, written: 0}
212+
if (buf->Data() == nullptr) {
213+
binding_data->encode_into_results_buffer_[0] = 0;
214+
binding_data->encode_into_results_buffer_[1] = 0;
215+
return;
216+
}
217+
101218
char* write_result = static_cast<char*>(buf->Data()) + dest->ByteOffset();
102219
size_t dest_length = dest->ByteLength();
220+
size_t read = 0;
221+
size_t written = 0;
222+
223+
// For small strings (length <= 32), use the old V8 path for better
224+
// performance
225+
static constexpr int kSmallStringThreshold = 32;
226+
if (source->Length() <= kSmallStringThreshold) {
227+
written = source->WriteUtf8V2(isolate,
228+
write_result,
229+
dest_length,
230+
String::WriteFlags::kReplaceInvalidUtf8,
231+
&read);
232+
binding_data->encode_into_results_buffer_[0] = static_cast<double>(read);
233+
binding_data->encode_into_results_buffer_[1] = static_cast<double>(written);
234+
return;
235+
}
103236

104-
size_t nchars;
105-
size_t written = source->WriteUtf8V2(isolate,
106-
write_result,
107-
dest_length,
108-
String::WriteFlags::kReplaceInvalidUtf8,
109-
&nchars);
237+
v8::String::ValueView view(isolate, source);
238+
size_t length_that_fits =
239+
std::min(static_cast<size_t>(view.length()), dest_length);
240+
241+
if (view.is_one_byte()) {
242+
auto data = reinterpret_cast<const char*>(view.data8());
243+
simdutf::result result =
244+
simdutf::validate_ascii_with_errors(data, length_that_fits);
245+
written = read = result.count;
246+
memcpy(write_result, data, read);
247+
write_result += read;
248+
data += read;
249+
length_that_fits -= read;
250+
dest_length -= read;
251+
if (length_that_fits != 0 && dest_length != 0) {
252+
if (size_t rest = findBestFit(data, length_that_fits, dest_length)) {
253+
DCHECK_LE(simdutf::utf8_length_from_latin1(data, rest), dest_length);
254+
written += simdutf::convert_latin1_to_utf8(data, rest, write_result);
255+
read += rest;
256+
}
257+
}
258+
} else {
259+
auto data = reinterpret_cast<const char16_t*>(view.data16());
260+
261+
// Limit conversion to what could fit in destination, avoiding splitting
262+
// a valid surrogate pair at the boundary, which could cause a spurious call
263+
// of simdutf::to_well_formed_utf16()
264+
if (length_that_fits > 0 && length_that_fits < view.length() &&
265+
isSurrogatePair(data[length_that_fits - 1], data[length_that_fits])) {
266+
length_that_fits--;
267+
}
268+
269+
// Check if input has unpaired surrogates - if so, convert to well-formed
270+
// first
271+
simdutf::result validation_result =
272+
simdutf::validate_utf16_with_errors(data, length_that_fits);
273+
274+
if (validation_result.error == simdutf::SUCCESS) {
275+
// Valid UTF-16 - use the fast path
276+
read = findBestFit(data, length_that_fits, dest_length);
277+
if (read != 0) {
278+
DCHECK_LE(simdutf::utf8_length_from_utf16(data, read), dest_length);
279+
written = simdutf::convert_utf16_to_utf8(data, read, write_result);
280+
}
281+
} else {
282+
// Invalid UTF-16 with unpaired surrogates - convert to well-formed first
283+
// TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
284+
// available
285+
MaybeStackBuffer<char16_t, MAX_SIZE_FOR_STACK_ALLOC> conversion_buffer(
286+
length_that_fits);
287+
simdutf::to_well_formed_utf16(
288+
data, length_that_fits, conversion_buffer.out());
289+
290+
// Now use findBestFit with the well-formed data
291+
read =
292+
findBestFit(conversion_buffer.out(), length_that_fits, dest_length);
293+
if (read != 0) {
294+
DCHECK_LE(
295+
simdutf::utf8_length_from_utf16(conversion_buffer.out(), read),
296+
dest_length);
297+
written = simdutf::convert_utf16_to_utf8(
298+
conversion_buffer.out(), read, write_result);
299+
}
300+
}
301+
}
302+
DCHECK_LE(written, dest->ByteLength());
110303

111-
binding_data->encode_into_results_buffer_[0] = nchars;
112-
binding_data->encode_into_results_buffer_[1] = written;
304+
binding_data->encode_into_results_buffer_[0] = static_cast<double>(read);
305+
binding_data->encode_into_results_buffer_[1] = static_cast<double>(written);
113306
}
114307

115308
// Encode a single string to a UTF-8 Uint8Array (not Buffer).

0 commit comments

Comments
 (0)