|
6 | 6 | #include "node_external_reference.h" |
7 | 7 | #include "simdutf.h" |
8 | 8 | #include "string_bytes.h" |
| 9 | +#include "util.h" |
9 | 10 | #include "v8.h" |
10 | 11 |
|
| 12 | +#include <algorithm> |
11 | 13 | #include <cstdint> |
12 | 14 |
|
13 | 15 | namespace node { |
@@ -71,6 +73,113 @@ InternalFieldInfoBase* BindingData::Serialize(int index) { |
71 | 73 | return info; |
72 | 74 | } |
73 | 75 |
|
| 76 | +// The following code is adapted from Cloudflare workers. |
| 77 | +// Particularly from: https://github.com/cloudflare/workerd/pull/5448 |
| 78 | +// |
| 79 | +// Copyright (c) 2017-2025 Cloudflare, Inc. |
| 80 | +// Licensed under the Apache 2.0 license found in the LICENSE file or at: |
| 81 | +// https://opensource.org/licenses/Apache-2.0 |
| 82 | +namespace { |
// Stack-storage template argument for the MaybeStackBuffer<char16_t, ...>
// that EncodeInto uses when ill-formed UTF-16 input must be copied and made
// well-formed before it can be encoded.
// NOTE(review): presumably an element count (not bytes), with larger inputs
// spilling to the heap - confirm against MaybeStackBuffer in util.h.
constexpr int MAX_SIZE_FOR_STACK_ALLOC = 4 * 1024;
| 84 | + |
// Reports whether `lead` immediately followed by `trail` forms a UTF-16
// surrogate pair, i.e. `lead` is a high surrogate (0xD800-0xDBFF) and `trail`
// is a low surrogate (0xDC00-0xDFFF).
constexpr bool isSurrogatePair(uint16_t lead, uint16_t trail) {
  const bool leadIsHigh = lead >= 0xd800 && lead <= 0xdbff;
  const bool trailIsLow = trail >= 0xdc00 && trail <= 0xdfff;
  return leadIsHigh && trailIsLow;
}
| 88 | + |
// Returns the number of UTF-8 bytes needed for a single 16-bit code unit
// taken on its own:
// - < 0x80:  1 byte (ASCII)
// - < 0x800: 2 bytes
// - else:    3 bytes (rest of the BMP)
//
// A surrogate code unit is also reported as 3 bytes; a complete surrogate
// pair actually encodes to 4 bytes in total, so callers that can encounter
// pairs have to account for them separately.
constexpr size_t simpleUtfEncodingLength(uint16_t c) {
  if (c < 0x80) return 1;
  // U+0080..U+07FF all fit in two UTF-8 bytes. Using a lower threshold here
  // would over-estimate e.g. Cyrillic/Hebrew/Arabic and make callers stop
  // one character early when exactly two bytes of space remain.
  if (c < 0x800) return 2;
  return 3;
}
| 94 | + |
| 95 | +// Finds the maximum number of input characters (UTF-16 or Latin1) that can be |
| 96 | +// encoded into a UTF-8 buffer of the given size. |
| 97 | +// |
| 98 | +// The challenge is that UTF-8 encoding expands characters by variable amounts: |
| 99 | +// - ASCII (< 0x80): 1 byte |
| 100 | +// - Code points < 0x800: 2 bytes |
| 101 | +// - Other BMP characters: 3 bytes |
| 102 | +// - Surrogate pairs (supplementary planes): 4 bytes total |
| 103 | +// |
| 104 | +// This function uses an adaptive chunking algorithm: |
| 105 | +// 1. Process the input in chunks, estimating how many characters will fit |
| 106 | +// 2. Calculate the actual UTF-8 length for each chunk using simdutf |
| 107 | +// 3. Adjust the expansion factor based on observed encoding ratios |
| 108 | +// 4. Fall back to character-by-character processing near the buffer boundary |
| 109 | +// 5. Handle UTF-16 surrogate pairs to avoid splitting them across boundaries |
| 110 | +// |
| 111 | +// The algorithm starts with a conservative expansion estimate (1.15x) and |
| 112 | +// dynamically adjusts based on actual character distribution, making it |
| 113 | +// efficient for common ASCII-heavy text while remaining correct for |
| 114 | +// multi-byte heavy content. |
| 115 | +template <typename Char> |
| 116 | +size_t findBestFit(const Char* data, size_t length, size_t bufferSize) { |
| 117 | + size_t pos = 0; |
| 118 | + size_t utf8Accumulated = 0; |
| 119 | + constexpr size_t CHUNK = 257; |
| 120 | + constexpr bool UTF16 = sizeof(Char) == 2; |
| 121 | + constexpr size_t MAX_FACTOR = UTF16 ? 3 : 2; |
| 122 | + |
| 123 | + double expansion = 1.15; |
| 124 | + |
| 125 | + while (pos < length && utf8Accumulated < bufferSize) { |
| 126 | + size_t remainingInput = length - pos; |
| 127 | + size_t spaceRemaining = bufferSize - utf8Accumulated; |
| 128 | + DCHECK_GE(expansion, 1.15); |
| 129 | + |
| 130 | + size_t guaranteedToFit = spaceRemaining / MAX_FACTOR; |
| 131 | + if (guaranteedToFit >= remainingInput) { |
| 132 | + return length; |
| 133 | + } |
| 134 | + size_t likelyToFit = |
| 135 | + std::min(static_cast<size_t>(spaceRemaining / expansion), CHUNK); |
| 136 | + size_t fitEstimate = |
| 137 | + std::max(size_t{1}, std::max(guaranteedToFit, likelyToFit)); |
| 138 | + size_t chunkSize = std::min(remainingInput, fitEstimate); |
| 139 | + if (chunkSize == 1) break; |
| 140 | + CHECK_GT(chunkSize, 1); |
| 141 | + |
| 142 | + size_t chunkUtf8Len; |
| 143 | + if constexpr (UTF16) { |
| 144 | + // TODO(anonrig): Use utf8_length_from_utf16_with_replacement when |
| 145 | + // available For now, validate and use utf8_length_from_utf16 |
| 146 | + size_t newPos = pos + chunkSize; |
| 147 | + if (newPos < length && isSurrogatePair(data[newPos - 1], data[newPos])) |
| 148 | + chunkSize--; |
| 149 | + chunkUtf8Len = simdutf::utf8_length_from_utf16(data + pos, chunkSize); |
| 150 | + } else { |
| 151 | + chunkUtf8Len = simdutf::utf8_length_from_latin1(data + pos, chunkSize); |
| 152 | + } |
| 153 | + |
| 154 | + if (utf8Accumulated + chunkUtf8Len > bufferSize) { |
| 155 | + DCHECK_GT(chunkSize, guaranteedToFit); |
| 156 | + expansion = std::max(expansion * 1.1, (chunkUtf8Len * 1.1) / chunkSize); |
| 157 | + } else { |
| 158 | + expansion = std::max(1.15, (chunkUtf8Len * 1.1) / chunkSize); |
| 159 | + pos += chunkSize; |
| 160 | + utf8Accumulated += chunkUtf8Len; |
| 161 | + } |
| 162 | + } |
| 163 | + |
| 164 | + while (pos < length && utf8Accumulated < bufferSize) { |
| 165 | + size_t extra = simpleUtfEncodingLength(data[pos]); |
| 166 | + if (utf8Accumulated + extra > bufferSize) break; |
| 167 | + pos++; |
| 168 | + utf8Accumulated += extra; |
| 169 | + } |
| 170 | + |
| 171 | + if (UTF16 && pos != 0 && pos != length && |
| 172 | + isSurrogatePair(data[pos - 1], data[pos])) { |
| 173 | + if (utf8Accumulated < bufferSize) { |
| 174 | + pos++; |
| 175 | + } else { |
| 176 | + pos--; |
| 177 | + } |
| 178 | + } |
| 179 | + return pos; |
| 180 | +} |
| 181 | +} // namespace |
| 182 | + |
74 | 183 | void BindingData::Deserialize(Local<Context> context, |
75 | 184 | Local<Object> holder, |
76 | 185 | int index, |
@@ -98,18 +207,102 @@ void BindingData::EncodeInto(const FunctionCallbackInfo<Value>& args) { |
98 | 207 |
|
99 | 208 | Local<Uint8Array> dest = args[1].As<Uint8Array>(); |
100 | 209 | Local<ArrayBuffer> buf = dest->Buffer(); |
| 210 | + |
| 211 | + // Handle detached buffers - return {read: 0, written: 0} |
| 212 | + if (buf->Data() == nullptr) { |
| 213 | + binding_data->encode_into_results_buffer_[0] = 0; |
| 214 | + binding_data->encode_into_results_buffer_[1] = 0; |
| 215 | + return; |
| 216 | + } |
| 217 | + |
101 | 218 | char* write_result = static_cast<char*>(buf->Data()) + dest->ByteOffset(); |
102 | 219 | size_t dest_length = dest->ByteLength(); |
| 220 | + size_t read = 0; |
| 221 | + size_t written = 0; |
| 222 | + |
| 223 | + // For small strings (length <= 32), use the old V8 path for better |
| 224 | + // performance |
| 225 | + static constexpr int kSmallStringThreshold = 32; |
| 226 | + if (source->Length() <= kSmallStringThreshold) { |
| 227 | + written = source->WriteUtf8V2(isolate, |
| 228 | + write_result, |
| 229 | + dest_length, |
| 230 | + String::WriteFlags::kReplaceInvalidUtf8, |
| 231 | + &read); |
| 232 | + binding_data->encode_into_results_buffer_[0] = static_cast<double>(read); |
| 233 | + binding_data->encode_into_results_buffer_[1] = static_cast<double>(written); |
| 234 | + return; |
| 235 | + } |
103 | 236 |
|
104 | | - size_t nchars; |
105 | | - size_t written = source->WriteUtf8V2(isolate, |
106 | | - write_result, |
107 | | - dest_length, |
108 | | - String::WriteFlags::kReplaceInvalidUtf8, |
109 | | - &nchars); |
| 237 | + v8::String::ValueView view(isolate, source); |
| 238 | + size_t length_that_fits = |
| 239 | + std::min(static_cast<size_t>(view.length()), dest_length); |
| 240 | + |
| 241 | + if (view.is_one_byte()) { |
| 242 | + auto data = reinterpret_cast<const char*>(view.data8()); |
| 243 | + simdutf::result result = |
| 244 | + simdutf::validate_ascii_with_errors(data, length_that_fits); |
| 245 | + written = read = result.count; |
| 246 | + memcpy(write_result, data, read); |
| 247 | + write_result += read; |
| 248 | + data += read; |
| 249 | + length_that_fits -= read; |
| 250 | + dest_length -= read; |
| 251 | + if (length_that_fits != 0 && dest_length != 0) { |
| 252 | + if (size_t rest = findBestFit(data, length_that_fits, dest_length)) { |
| 253 | + DCHECK_LE(simdutf::utf8_length_from_latin1(data, rest), dest_length); |
| 254 | + written += simdutf::convert_latin1_to_utf8(data, rest, write_result); |
| 255 | + read += rest; |
| 256 | + } |
| 257 | + } |
| 258 | + } else { |
| 259 | + auto data = reinterpret_cast<const char16_t*>(view.data16()); |
| 260 | + |
| 261 | + // Limit conversion to what could fit in destination, avoiding splitting |
| 262 | + // a valid surrogate pair at the boundary, which could cause a spurious call |
| 263 | + // of simdutf::to_well_formed_utf16() |
| 264 | + if (length_that_fits > 0 && length_that_fits < view.length() && |
| 265 | + isSurrogatePair(data[length_that_fits - 1], data[length_that_fits])) { |
| 266 | + length_that_fits--; |
| 267 | + } |
| 268 | + |
| 269 | + // Check if input has unpaired surrogates - if so, convert to well-formed |
| 270 | + // first |
| 271 | + simdutf::result validation_result = |
| 272 | + simdutf::validate_utf16_with_errors(data, length_that_fits); |
| 273 | + |
| 274 | + if (validation_result.error == simdutf::SUCCESS) { |
| 275 | + // Valid UTF-16 - use the fast path |
| 276 | + read = findBestFit(data, length_that_fits, dest_length); |
| 277 | + if (read != 0) { |
| 278 | + DCHECK_LE(simdutf::utf8_length_from_utf16(data, read), dest_length); |
| 279 | + written = simdutf::convert_utf16_to_utf8(data, read, write_result); |
| 280 | + } |
| 281 | + } else { |
| 282 | + // Invalid UTF-16 with unpaired surrogates - convert to well-formed first |
| 283 | + // TODO(anonrig): Use utf8_length_from_utf16_with_replacement when |
| 284 | + // available |
| 285 | + MaybeStackBuffer<char16_t, MAX_SIZE_FOR_STACK_ALLOC> conversion_buffer( |
| 286 | + length_that_fits); |
| 287 | + simdutf::to_well_formed_utf16( |
| 288 | + data, length_that_fits, conversion_buffer.out()); |
| 289 | + |
| 290 | + // Now use findBestFit with the well-formed data |
| 291 | + read = |
| 292 | + findBestFit(conversion_buffer.out(), length_that_fits, dest_length); |
| 293 | + if (read != 0) { |
| 294 | + DCHECK_LE( |
| 295 | + simdutf::utf8_length_from_utf16(conversion_buffer.out(), read), |
| 296 | + dest_length); |
| 297 | + written = simdutf::convert_utf16_to_utf8( |
| 298 | + conversion_buffer.out(), read, write_result); |
| 299 | + } |
| 300 | + } |
| 301 | + } |
| 302 | + DCHECK_LE(written, dest->ByteLength()); |
110 | 303 |
|
111 | | - binding_data->encode_into_results_buffer_[0] = nchars; |
112 | | - binding_data->encode_into_results_buffer_[1] = written; |
| 304 | + binding_data->encode_into_results_buffer_[0] = static_cast<double>(read); |
| 305 | + binding_data->encode_into_results_buffer_[1] = static_cast<double>(written); |
113 | 306 | } |
114 | 307 |
|
115 | 308 | // Encode a single string to a UTF-8 Uint8Array (not Buffer). |
|
0 commit comments