@@ -110,20 +110,84 @@ struct GenerateImpl {
110110 return OK (writer.Double (val));
111111 }
112112
113- Status GenerateAscii (const DataType&) {
114- auto size = std::poisson_distribution<>{4 }(e);
115- std::uniform_int_distribution<uint16_t > gen_char (32 , 126 ); // FIXME generate UTF8
116- std::string s (size, ' \0 ' );
117- for (char & ch : s) ch = static_cast <char >(gen_char (e));
113+ Status GenerateUtf8 (const DataType&) {
114+ // Generate random UTF-8 encoded strings from valid Unicode scalar values.
115+ auto num_codepoints = std::poisson_distribution<>{4 }(e);
116+ std::string s;
117+ s.reserve (num_codepoints * 3 );
118+
119+ for (int i = 0 ; i < num_codepoints; ++i) {
120+ uint32_t codepoint;
121+ std::uniform_int_distribution<uint32_t > plane_dist (0 , 3 );
122+ uint32_t plane = plane_dist (e);
123+
124+ if (plane == 0 ) {
125+ // Basic Multilingual Plane (BMP): U+0000 to U+FFFF
126+ // Exclude surrogate code points (U+D800 to U+DFFF)
127+ // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.8, D71)
128+ // Exclude control chars below U+0020 for readability
129+ // Generate from two ranges with equal probability (overrepresents the smaller upper range):
130+ // - Lower: U+0020 to U+D7FF (55,776 values, 50% selection probability)
131+ // - Upper: U+E000 to U+FFFD (8,190 values, 50% selection probability)
132+ if (std::bernoulli_distribution (0.5 )(e)) {
133+ // Lower range: U+0020 to U+D7FF (before surrogate range)
134+ codepoint = std::uniform_int_distribution<uint32_t >(0x0020 , 0xD7FF )(e);
135+ } else {
136+ // Upper range: U+E000 to U+FFFD (after surrogate range)
137+ // Note: Stops at U+FFFD to exclude noncharacters U+FFFE and U+FFFF
138+ // Other noncharacters (U+FDD0-U+FDEF, plane-ending pairs) are included
139+ // as they are valid Unicode scalar values per the Unicode Standard
140+ codepoint = std::uniform_int_distribution<uint32_t >(0xE000 , 0xFFFD )(e);
141+ }
142+ } else if (plane == 1 ) {
143+ // Supplementary Multilingual Plane (SMP): U+10000 to U+1FFFF
144+ // https://www.unicode.org/roadmaps/smp/
145+ codepoint = std::uniform_int_distribution<uint32_t >(0x10000 , 0x1FFFF )(e);
146+ } else if (plane == 2 ) {
147+ // Supplementary Ideographic Plane (SIP): U+20000 to U+2FFFF
148+ // https://www.unicode.org/roadmaps/sip/
149+ codepoint = std::uniform_int_distribution<uint32_t >(0x20000 , 0x2FFFF )(e);
150+ } else {
151+ // Planes 3–16: U+30000–U+10FFFF
152+ // Includes TIP, SSP, PUA-A, PUA-B, and unassigned planes: U+30000 to U+10FFFF
153+ // Max valid Unicode codepoint is U+10FFFF per the Standard
154+ // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.4, D9)
155+ codepoint = std::uniform_int_distribution<uint32_t >(0x30000 , 0x10FFFF )(e);
156+ }
157+
158+ // Encode as UTF-8 per RFC 3629 (Section 3: UTF-8 definition)
159+ // https://www.rfc-editor.org/rfc/rfc3629.html#section-3
160+ if (codepoint <= 0x7F ) {
161+ // 1-byte sequence: 0xxxxxxx
162+ s.push_back (static_cast <char >(codepoint));
163+ } else if (codepoint <= 0x7FF ) {
164+ // 2-byte sequence: 110xxxxx 10xxxxxx
165+ s.push_back (static_cast <char >(0xC0 | (codepoint >> 6 )));
166+ s.push_back (static_cast <char >(0x80 | (codepoint & 0x3F )));
167+ } else if (codepoint <= 0xFFFF ) {
168+ // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
169+ s.push_back (static_cast <char >(0xE0 | (codepoint >> 12 )));
170+ s.push_back (static_cast <char >(0x80 | ((codepoint >> 6 ) & 0x3F )));
171+ s.push_back (static_cast <char >(0x80 | (codepoint & 0x3F )));
172+ } else {
173+ // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
174+ s.push_back (static_cast <char >(0xF0 | (codepoint >> 18 )));
175+ s.push_back (static_cast <char >(0x80 | ((codepoint >> 12 ) & 0x3F )));
176+ s.push_back (static_cast <char >(0x80 | ((codepoint >> 6 ) & 0x3F )));
177+ s.push_back (static_cast <char >(0x80 | (codepoint & 0x3F )));
178+ }
179+ }
180+ // Using c_str() is safe here because generation excludes U+0000 (no embedded nulls).
181+ // U+0000 can only exist in plane 0 (BMP), and BMP generation starts at U+0020.
118182 return OK (writer.String (s.c_str ()));
119183 }
120184
121185 template <typename T>
122186 enable_if_base_binary<T, Status> Visit (const T& t) {
123- return GenerateAscii (t);
187+ return GenerateUtf8 (t);
124188 }
125189
126- Status Visit (const BinaryViewType& t) { return GenerateAscii (t); }
190+ Status Visit (const BinaryViewType& t) { return GenerateUtf8 (t); }
127191
128192 template <typename T>
129193 enable_if_list_like<T, Status> Visit (const T& t) {
0 commit comments