Skip to content

Commit d116bd1

Browse files
authored
[workerd-cxx] clarifying kj::String<->rust::String utf8 conversion (#62)
1 parent b246933 commit d116bd1

File tree

4 files changed

+234
-52
lines changed

4 files changed

+234
-52
lines changed

kj-rs/convert.h

Lines changed: 141 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -12,46 +12,87 @@
1212
// Converting C++ kj arrays, strings, etc to Rust:
1313
// - kjObject.as<Rust>() - creates zero-copy read-only Rust view
1414
// - kjObject.as<RustMutable>() - creates zero-copy mutable Rust view
15-
// - kjObject.as<RustCopy>() - creates owned Rust copy
15+
// - kjObject.as<RustCopy>() - creates owned Rust copy (safe byte arrays)
16+
// - kjObject.as<RustUncheckedUtf8>() - creates Rust string without validation (assumes valid UTF-8)
17+
// - kjObject.as<RustCopyUncheckedUtf8>() - creates owned Rust string (assumes valid UTF-8)
1618
//
1719
// Converting Rust to C++ kj objects:
1820
// - from<Rust>(rustObject) - creates zero-copy C++ view
1921
// - from<RustCopy>(rustObject) - creates owned C++ copy
20-
// - kj::str(rustString) - automatic conversion (via KJ_STRINGIFY)
21-
// - kj::hashCode(rustString) - automatic hash computation (via KJ_HASHCODE)
22+
// - kj::str(rustString/rustSlice/rustVec) - automatic conversion (via KJ_STRINGIFY)
23+
// - kj::hashCode(rustString/rustSlice/rustVec) - automatic hash computation (via KJ_HASHCODE)
2224
//
2325
// ============================================================================
24-
// CONVERSION FUNCTIONS
26+
// ARRAY/COLLECTION CONVERSIONS
2527
// ============================================================================
2628
//
2729
// Zero-copy conversions from Rust to C++:
2830
// - from<Rust>(rust::Vec<T>) -> kj::ArrayPtr<const T>
2931
// - from<Rust>(rust::Slice<T>) -> kj::ArrayPtr<T>
30-
// - from<Rust>(rust::String) -> kj::ArrayPtr<const char>
31-
// - from<Rust>(rust::str) -> kj::ArrayPtr<const char>
32-
//
33-
// Owned conversions from Rust to C++:
34-
// - from<RustCopy>(rust::Slice<rust::str>) -> kj::Array<kj::String>
35-
// - from<RustCopy>(rust::Vec<rust::String>) -> kj::Array<kj::String>
3632
//
3733
// Zero-copy conversions from C++ to Rust (read-only):
3834
// - kjArray.as<Rust>() -> rust::Slice<const T>
39-
// - kjString.as<Rust>() -> rust::String
40-
// - kjStringPtr.as<Rust>() -> rust::str
41-
// - kjConstString.as<Rust>() -> rust::str
4235
//
4336
// Zero-copy conversions from C++ to Rust (mutable):
4437
// - kjArray.as<RustMutable>() -> rust::Slice<T>
4538
// - kjArrayPtr.as<RustMutable>() -> rust::Slice<T>
4639
//
4740
// Owned conversions from C++ to Rust (copying):
48-
// - kjStringPtr.as<RustCopy>() -> rust::String
49-
// - kjConstString.as<RustCopy>() -> rust::String
5041
// - kjArrayPtr.as<RustCopy>() -> rust::Vec<T>
5142
//
52-
// Automatic conversions (via ADL):
43+
// ============================================================================
44+
// STRING CONVERSIONS
45+
// ============================================================================
46+
//
47+
// IMPORTANT: Rust strings require valid UTF-8, but KJ strings don't!
48+
// This library provides both SAFE and UNSAFE string conversion options.
49+
//
50+
// --- RUST TO C++ STRING CONVERSIONS ---
51+
//
52+
// Zero-copy (always safe):
53+
// - from<Rust>(rust::String) -> kj::ArrayPtr<const char>
54+
// - from<Rust>(rust::str) -> kj::ArrayPtr<const char>
55+
//
56+
// Owned copies (always safe):
57+
// - from<RustCopy>(rust::Slice<rust::str>) -> kj::Array<kj::String>
58+
// - from<RustCopy>(rust::Vec<rust::String>) -> kj::Array<kj::String>
59+
// - kj::str(rust::str) -> kj::String
60+
// - kj::str(rust::String) -> kj::String
61+
//
62+
// --- C++ TO RUST STRING CONVERSIONS (SAFE) ---
63+
//
64+
// Returns raw bytes - use std::str::from_utf8() or from_utf8_lossy() on Rust side:
65+
// - kjString.as<Rust>() -> rust::Slice<const char> // Safe for non-UTF-8 data
66+
// - kjStringPtr.as<Rust>() -> rust::Slice<const char> // Safe for non-UTF-8 data
67+
// - kjConstString.as<Rust>() -> rust::Slice<const char> // Safe for non-UTF-8 data
68+
//
69+
// Returns owned bytes - use std::str::from_utf8() or from_utf8_lossy() on Rust side:
70+
// - kjStringPtr.as<RustCopy>() -> rust::Vec<char> // Safe byte array
71+
// - kjConstString.as<RustCopy>() -> rust::Vec<char> // Safe byte array
72+
//
73+
// --- C++ TO RUST STRING CONVERSIONS (UNSAFE) ---
74+
//
75+
// ⚠️ WARNING: These assume valid UTF-8; Rust code might panic or exhibit undefined behavior
76+
// if the KJ string contains invalid UTF-8 bytes!
77+
//
78+
// Zero-copy (UNSAFE - assumes valid UTF-8):
79+
// - kjString.as<RustUncheckedUtf8>() -> rust::String  // NOTE: rust::String owns its bytes, so this one copies
80+
// - kjStringPtr.as<RustUncheckedUtf8>() -> rust::Str
81+
// - kjConstString.as<RustUncheckedUtf8>() -> rust::Str
82+
//
83+
// Owned copies (UNSAFE - assumes valid UTF-8):
84+
// - kjStringPtr.as<RustCopyUncheckedUtf8>() -> rust::String
85+
// - kjConstString.as<RustCopyUncheckedUtf8>() -> rust::String
86+
//
87+
// --- AUTOMATIC STRING CONVERSIONS ---
88+
//
89+
// These work with kj::str() and kj::hashCode() automatically:
5390
// - kj::str(rust::String) - uses KJ_STRINGIFY for seamless string conversion
91+
// - kj::str(rust::Slice<const char>) - uses KJ_STRINGIFY for slice conversion
92+
// - kj::str(rust::Vec<char>) - uses KJ_STRINGIFY for vector conversion
5493
// - kj::hashCode(rust::String) - uses KJ_HASHCODE for hash computation
94+
// - kj::hashCode(rust::Slice<const char>) - uses KJ_HASHCODE for slice hashing
95+
// - kj::hashCode(rust::Vec<char>) - uses KJ_HASHCODE for vector hashing
5596
//
5697
// ============================================================================
5798
// EXAMPLES
@@ -62,17 +103,22 @@
62103
// // Convert Rust to C++:
63104
// kj::ArrayPtr<const int> cppView = from<Rust>(rustVec);
64105
//
65-
// // Convert C++ to Rust (read-only):
66-
// rust::Slice<const int> rustView = cppArray.as<Rust>();
106+
// // Convert C++ to Rust (read-only, safe):
107+
// rust::Slice<const char> rustBytes = kjString.as<Rust>();
108+
// // Then in Rust: std::str::from_utf8(&rustBytes) or from_utf8_lossy(&rustBytes)
67109
//
68110
// // Convert C++ to Rust (mutable):
69111
// rust::Slice<int> rustMutableView = cppArray.as<RustMutable>();
70112
//
71-
// // Convert C++ to Rust (copying):
72-
// rust::String rustOwnedStr = cppStr.as<RustCopy>();
113+
// // Convert C++ to Rust (copying, safe):
114+
// rust::Vec<char> rustOwnedBytes = kjStr.as<RustCopy>();
115+
//
116+
// // Convert C++ to Rust (unsafe, assumes valid UTF-8):
117+
// rust::String rustStr = kjStr.as<RustUncheckedUtf8>(); // UNSAFE!
73118
//
74119
// // Automatic string conversion:
75120
// kj::String cppStr = kj::str(rustStr); // via KJ_STRINGIFY
121+
// kj::String cppStr2 = kj::str(rustSlice); // also works with slices/vecs
76122
//
77123

78124
#include <rust/cxx.h>
@@ -101,6 +147,16 @@ inline auto KJ_STRINGIFY(const ::rust::str& str) {
101147
return kj::ArrayPtr<const char>(str.data(), str.size());
102148
}
103149

150+
/// Converts rust::Slice<const char> to kj::ArrayPtr - called by kj::str(rustSlice)
151+
inline auto KJ_STRINGIFY(const ::rust::Slice<const char>& str) {
152+
return kj::ArrayPtr<const char>(str.data(), str.size());
153+
}
154+
155+
/// Converts rust::Vec<char> to kj::ArrayPtr - called by kj::str(rustVec)
156+
inline auto KJ_STRINGIFY(const ::rust::Vec<char>& str) {
157+
return kj::ArrayPtr<const char>(str.data(), str.size());
158+
}
159+
104160
/// Hash code for rust::String - called by kj::hashCode(rustString)
105161
inline auto KJ_HASHCODE(const ::rust::String& str) {
106162
return kj::hashCode(kj::toCharSequence(str));
@@ -111,6 +167,16 @@ inline auto KJ_HASHCODE(const ::rust::str& str) {
111167
return kj::hashCode(kj::toCharSequence(str));
112168
}
113169

170+
/// Hash code for rust::Slice<const char> - called by kj::hashCode(rustSlice)
171+
inline auto KJ_HASHCODE(const ::rust::Slice<const char>& str) {
172+
return kj::hashCode(kj::toCharSequence(str));
173+
}
174+
175+
/// Hash code for rust::Vec<char> - called by kj::hashCode(rustVec)
176+
inline auto KJ_HASHCODE(const ::rust::Vec<char>& str) {
177+
return kj::hashCode(kj::toCharSequence(str));
178+
}
179+
114180
} // namespace rust
115181

116182
namespace kj_rs {
@@ -138,18 +204,18 @@ struct Rust {
138204
}
139205

140206
/// kjString.as<Rust>() - via Rust::from(&kjString)
141-
static ::rust::String from(const kj::String* str) {
142-
return ::rust::String(str->begin(), str->size());
207+
static ::rust::Slice<const char> from(const kj::String* str) {
208+
return ::rust::Slice(str->begin(), str->size());
143209
}
144210

145211
/// kjStringPtr.as<Rust>() - via Rust::from(&kjStringPtr)
146-
static ::rust::Str from(const kj::StringPtr* str) {
147-
return ::rust::Str(str->begin(), str->size());
212+
static ::rust::Slice<const char> from(const kj::StringPtr* str) {
213+
return ::rust::Slice(str->begin(), str->size());
148214
}
149215

150216
/// kjConstString.as<Rust>() - via Rust::from(&kjConstString)
151-
static ::rust::Str from(const kj::ConstString* str) {
152-
return ::rust::Str(str->begin(), str->size());
217+
static ::rust::Slice<const char> from(const kj::ConstString* str) {
218+
return ::rust::Slice(str->begin(), str->size());
153219
}
154220

155221
// into() methods for from<Rust>(rustObject) - converting Rust to KJ
@@ -179,16 +245,6 @@ struct Rust {
179245

180246
/// Owned Rust copies: kjObject.as<RustCopy>() and from<RustCopy>(kjObject)
181247
struct RustCopy {
182-
/// kjStringPtr.as<RustCopy>() - via RustCopy::from(&kjStringPtr)
183-
static ::rust::String from(const kj::StringPtr* str) {
184-
return ::rust::String(str->begin(), str->size());
185-
}
186-
187-
/// kjConstString.as<RustCopy>() - via RustCopy::from(&kjConstString)
188-
static ::rust::String from(const kj::ConstString* str) {
189-
return ::rust::String(str->begin(), str->size());
190-
}
191-
192248
/// kjArrayPtr.as<RustCopy>() - via RustCopy::from(&kjArrayPtr)
193249
template <typename T>
194250
static ::rust::Vec<T> from(kj::ArrayPtr<const T>* arr) {
@@ -200,6 +256,19 @@ struct RustCopy {
200256
return result;
201257
}
202258

259+
/// kjStringPtr.as<RustCopy>() - via RustCopy::from(&kjStringPtr)
260+
static ::rust::Vec<char> from(const kj::StringPtr* str) {
261+
auto ptr = str->asArray();
262+
return from(&ptr);
263+
}
264+
265+
/// kjConstString.as<RustCopy>() - via RustCopy::from(&kjConstString)
266+
static ::rust::Vec<char> from(const kj::ConstString* str) {
267+
auto ptr = str->asArray();
268+
return from(&ptr);
269+
}
270+
271+
203272
/// from<RustCopy>(rustSliceOfStrs) - Copy slice of strs to null-terminated KJ strings
204273
static kj::Array<kj::String> into(::rust::Slice<::rust::str> slice) {
205274
auto res = kj::heapArrayBuilder<kj::String>(slice.size());
@@ -234,4 +303,39 @@ struct RustMutable {
234303
}
235304
};
236305

237-
} // namespace kj_rs
306+
// Rust strings require valid utf8 content, which is not enforced by `kj::String`.
307+
// Passing invalid utf8 to `rust::String` could result in panics and other unexpected behaviour.
308+
// Use this struct to convert `kj::String` to `rust::String` without checking for valid utf8
309+
// when you are confident about the content of the string or do not care about the consequences.
310+
// It is safer to convert strings to slices and use `from_utf8_lossy` or friends on the Rust side.
311+
struct RustUncheckedUtf8 {
312+
/// kjString.as<RustUncheckedUtf8>() - via RustUncheckedUtf8::from(&kjString)
313+
static ::rust::String from(const kj::String* str) {
314+
return ::rust::String(str->begin(), str->size());
315+
}
316+
317+
/// kjStringPtr.as<RustUncheckedUtf8>() - via RustUncheckedUtf8::from(&kjStringPtr)
318+
static ::rust::Str from(const kj::StringPtr* str) {
319+
return ::rust::Str(str->begin(), str->size());
320+
}
321+
322+
/// kjConstString.as<RustUncheckedUtf8>() - via RustUncheckedUtf8::from(&kjConstString)
323+
static ::rust::Str from(const kj::ConstString* str) {
324+
return ::rust::Str(str->begin(), str->size());
325+
}
326+
};
327+
328+
// Copying conversion for string types. See comment for `RustUncheckedUtf8` for details.
329+
struct RustCopyUncheckedUtf8 {
330+
/// kjStringPtr.as<RustCopyUncheckedUtf8>() - via RustCopyUncheckedUtf8::from(&kjStringPtr)
331+
static ::rust::String from(const kj::StringPtr* str) {
332+
return ::rust::String(str->begin(), str->size());
333+
}
334+
335+
/// kjConstString.as<RustCopyUncheckedUtf8>() - via RustCopyUncheckedUtf8::from(&kjConstString)
336+
static ::rust::String from(const kj::ConstString* str) {
337+
return ::rust::String(str->begin(), str->size());
338+
}
339+
};
340+
341+
} // namespace kj_rs

kj-rs/tests/BUILD.bazel

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,10 +110,10 @@ cc_test(
110110
)
111111

112112
cc_test(
113-
name = "kj-test",
113+
name = "convert-test",
114114
size = "small",
115115
srcs = [
116-
"convert.c++",
116+
"convert-test.c++",
117117
],
118118
linkstatic = select({
119119
"@platforms//os:windows": True,

0 commit comments

Comments
 (0)