|
6 | 6 | #include "node_external_reference.h" |
7 | 7 | #include "simdutf.h" |
8 | 8 | #include "string_bytes.h" |
| 9 | +#include "util.h" |
9 | 10 | #include "v8.h" |
10 | 11 |
|
| 12 | +#include <algorithm> |
11 | 13 | #include <cstdint> |
12 | 14 |
|
13 | 15 | namespace node { |
@@ -71,6 +73,90 @@ InternalFieldInfoBase* BindingData::Serialize(int index) { |
71 | 73 | return info; |
72 | 74 | } |
73 | 75 |
|
| 76 | +// The following code is adapted from Cloudflare workers. |
| 77 | +// Particularly from: https://github.com/cloudflare/workerd/pull/5448 |
| 78 | +// |
| 79 | +// Copyright (c) 2017-2025 Cloudflare, Inc. |
| 80 | +// Licensed under the Apache 2.0 license found in the LICENSE file or at: |
| 81 | +// https://opensource.org/licenses/Apache-2.0 |
| 82 | +namespace { |
// Size argument (in char16_t elements) given to the MaybeStackBuffer that
// EncodeInto uses for its temporary well-formed UTF-16 copy when validation of
// the source string fails.
// NOTE(review): presumably the inline (stack) capacity before the buffer falls
// back to the heap - confirm against MaybeStackBuffer in util.h.
| 83 | +constexpr int MAX_SIZE_FOR_STACK_ALLOC = 4096; |
| 84 | + |
// Reports whether `lead` followed by `trail` forms a UTF-16 surrogate pair:
// `lead` must be a high surrogate (0xD800..0xDBFF) and `trail` a low surrogate
// (0xDC00..0xDFFF).
constexpr bool isSurrogatePair(uint16_t lead, uint16_t trail) {
  if (lead < 0xd800 || lead > 0xdbff) {
    return false;
  }
  return trail >= 0xdc00 && trail <= 0xdfff;
}
| 88 | + |
// Returns the number of UTF-8 bytes needed for the single code unit `c`:
//   U+0000..U+007F -> 1 byte
//   U+0080..U+07FF -> 2 bytes
//   everything else (including each half of a surrogate pair) -> 3 bytes
//
// The two-byte range ends at U+07FF. Using a lower cutoff over-counts
// characters such as Cyrillic (U+0400..U+04FF) and makes callers reject input
// that would actually fit in the destination.
constexpr size_t simpleUtfEncodingLength(uint16_t c) {
  if (c < 0x80) return 1;
  if (c < 0x800) return 2;
  return 3;
}
| 94 | + |
| 95 | +template <typename Char> |
| 96 | +size_t findBestFit(const Char* data, size_t length, size_t bufferSize) { |
| 97 | + size_t pos = 0; |
| 98 | + size_t utf8Accumulated = 0; |
| 99 | + constexpr size_t CHUNK = 257; |
| 100 | + constexpr bool UTF16 = sizeof(Char) == 2; |
| 101 | + constexpr size_t MAX_FACTOR = UTF16 ? 3 : 2; |
| 102 | + |
| 103 | + double expansion = 1.15; |
| 104 | + |
| 105 | + while (pos < length && utf8Accumulated < bufferSize) { |
| 106 | + size_t remainingInput = length - pos; |
| 107 | + size_t spaceRemaining = bufferSize - utf8Accumulated; |
| 108 | + DCHECK_GE(expansion, 1.15); |
| 109 | + |
| 110 | + size_t guaranteedToFit = spaceRemaining / MAX_FACTOR; |
| 111 | + if (guaranteedToFit >= remainingInput) { |
| 112 | + return length; |
| 113 | + } |
| 114 | + size_t likelyToFit = |
| 115 | + std::min(static_cast<size_t>(spaceRemaining / expansion), CHUNK); |
| 116 | + size_t fitEstimate = |
| 117 | + std::max(size_t{1}, std::max(guaranteedToFit, likelyToFit)); |
| 118 | + size_t chunkSize = std::min(remainingInput, fitEstimate); |
| 119 | + if (chunkSize == 1) break; |
| 120 | + DCHECK_GE(chunkSize, 1); |
| 121 | + |
| 122 | + size_t chunkUtf8Len; |
| 123 | + if constexpr (UTF16) { |
| 124 | + // TODO(anonrig): Use utf8_length_from_utf16_with_replacement when |
| 125 | + // available For now, validate and use utf8_length_from_utf16 |
| 126 | + chunkUtf8Len = simdutf::utf8_length_from_utf16(data + pos, chunkSize); |
| 127 | + } else { |
| 128 | + chunkUtf8Len = simdutf::utf8_length_from_latin1(data + pos, chunkSize); |
| 129 | + } |
| 130 | + |
| 131 | + if (utf8Accumulated + chunkUtf8Len > bufferSize) { |
| 132 | + DCHECK_GT(chunkSize, guaranteedToFit); |
| 133 | + expansion = std::max(expansion * 1.1, (chunkUtf8Len * 1.1) / chunkSize); |
| 134 | + } else { |
| 135 | + expansion = std::max(1.15, (chunkUtf8Len * 1.1) / chunkSize); |
| 136 | + pos += chunkSize; |
| 137 | + utf8Accumulated += chunkUtf8Len; |
| 138 | + } |
| 139 | + } |
| 140 | + |
| 141 | + while (pos < length && utf8Accumulated < bufferSize) { |
| 142 | + size_t extra = simpleUtfEncodingLength(data[pos]); |
| 143 | + if (utf8Accumulated + extra > bufferSize) break; |
| 144 | + pos++; |
| 145 | + utf8Accumulated += extra; |
| 146 | + } |
| 147 | + |
| 148 | + if (UTF16 && pos != 0 && pos != length && |
| 149 | + isSurrogatePair(data[pos - 1], data[pos])) { |
| 150 | + if (utf8Accumulated < bufferSize) { |
| 151 | + pos++; |
| 152 | + } else { |
| 153 | + pos--; |
| 154 | + } |
| 155 | + } |
| 156 | + return pos; |
| 157 | +} |
| 158 | +} // namespace |
| 159 | + |
74 | 160 | void BindingData::Deserialize(Local<Context> context, |
75 | 161 | Local<Object> holder, |
76 | 162 | int index, |
@@ -101,15 +187,75 @@ void BindingData::EncodeInto(const FunctionCallbackInfo<Value>& args) { |
101 | 187 | char* write_result = static_cast<char*>(buf->Data()) + dest->ByteOffset(); |
102 | 188 | size_t dest_length = dest->ByteLength(); |
103 | 189 |
|
104 | | - size_t nchars; |
105 | | - size_t written = source->WriteUtf8V2(isolate, |
106 | | - write_result, |
107 | | - dest_length, |
108 | | - String::WriteFlags::kReplaceInvalidUtf8, |
109 | | - &nchars); |
| 190 | + size_t read = 0; |
| 191 | + size_t written = 0; |
| 192 | + v8::String::ValueView view(isolate, source); |
| 193 | + size_t length = view.length(); |
| 194 | + |
| 195 | + if (view.is_one_byte()) { |
| 196 | + auto data = reinterpret_cast<const char*>(view.data8()); |
| 197 | + simdutf::result result = simdutf::validate_ascii_with_errors(data, length); |
| 198 | + written = read = result.count; |
| 199 | + auto out_addr = write_result; |
| 200 | + memcpy(out_addr, data, read); |
| 201 | + out_addr += read; |
| 202 | + data += read; |
| 203 | + length -= read; |
| 204 | + dest_length -= read; |
| 205 | + if (length != 0 && dest_length != 0) { |
| 206 | + size_t rest = findBestFit(data, length, dest_length); |
| 207 | + if (rest != 0) { |
| 208 | + DCHECK_LE(simdutf::utf8_length_from_latin1(data, rest), dest_length); |
| 209 | + written += simdutf::convert_latin1_to_utf8(data, rest, out_addr); |
| 210 | + read += rest; |
| 211 | + } |
| 212 | + } |
| 213 | + } else { |
| 214 | + auto data = reinterpret_cast<const char16_t*>(view.data16()); |
| 215 | + |
| 216 | + // Check if input has unpaired surrogates - if so, convert to well-formed |
| 217 | + // first |
| 218 | + simdutf::result validation_result = |
| 219 | + simdutf::validate_utf16_with_errors(data, length); |
| 220 | + |
| 221 | + if (validation_result.error == simdutf::SUCCESS) { |
| 222 | + // Valid UTF-16 - use the fast path |
| 223 | + read = findBestFit(data, length, dest_length); |
| 224 | + if (read != 0) { |
| 225 | + DCHECK_LE(simdutf::utf8_length_from_utf16(data, read), dest_length); |
| 226 | + written = simdutf::convert_utf16_to_utf8(data, read, write_result); |
| 227 | + } |
| 228 | + } else { |
| 229 | + // Invalid UTF-16 with unpaired surrogates - convert to well-formed first |
| 230 | + // TODO(anonrig): Use utf8_length_from_utf16_with_replacement when |
| 231 | + // available |
| 232 | + // Limit conversion to what could fit in destination, avoiding splitting |
| 233 | + // a valid surrogate pair at the boundary |
| 234 | + size_t safe_length = std::min(length, dest_length); |
| 235 | + if (safe_length > 0 && safe_length < view.length() && |
| 236 | + isSurrogatePair(data[safe_length - 1], data[safe_length])) { |
| 237 | + safe_length--; |
| 238 | + } |
| 239 | + |
| 240 | + MaybeStackBuffer<char16_t, MAX_SIZE_FOR_STACK_ALLOC> conversion_buffer( |
| 241 | + safe_length); |
| 242 | + simdutf::to_well_formed_utf16(data, safe_length, conversion_buffer.out()); |
| 243 | + |
| 244 | + // Now use findBestFit with the well-formed data |
| 245 | + read = findBestFit(conversion_buffer.out(), safe_length, dest_length); |
| 246 | + if (read != 0) { |
| 247 | + DCHECK_LE( |
| 248 | + simdutf::utf8_length_from_utf16(conversion_buffer.out(), read), |
| 249 | + dest_length); |
| 250 | + written = simdutf::convert_utf16_to_utf8( |
| 251 | + conversion_buffer.out(), read, write_result); |
| 252 | + } |
| 253 | + } |
| 254 | + } |
| 255 | + DCHECK_LE(written, dest_length); |
110 | 256 |
|
111 | | - binding_data->encode_into_results_buffer_[0] = nchars; |
112 | | - binding_data->encode_into_results_buffer_[1] = written; |
| 257 | + binding_data->encode_into_results_buffer_[0] = static_cast<double>(read); |
| 258 | + binding_data->encode_into_results_buffer_[1] = static_cast<double>(written); |
113 | 259 | } |
114 | 260 |
|
115 | 261 | // Encode a single string to a UTF-8 Uint8Array (not Buffer). |
|
0 commit comments