Skip to content

Commit 28452f1

Browse files
committed
[C++] Generate proper UTF-8 strings in JSON test utilities
1 parent 8e13dbc commit 28452f1

File tree

1 file changed

+71
-7
lines changed

1 file changed

+71
-7
lines changed

cpp/src/arrow/json/test_common.h

Lines changed: 71 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -110,20 +110,84 @@ struct GenerateImpl {
110110
return OK(writer.Double(val));
111111
}
112112

113-
Status GenerateAscii(const DataType&) {
114-
auto size = std::poisson_distribution<>{4}(e);
115-
std::uniform_int_distribution<uint16_t> gen_char(32, 126); // FIXME generate UTF8
116-
std::string s(size, '\0');
117-
for (char& ch : s) ch = static_cast<char>(gen_char(e));
113+
Status GenerateUtf8(const DataType&) {
114+
// Generate random UTF-8 encoded strings from valid Unicode scalar values.
115+
auto num_codepoints = std::poisson_distribution<>{4}(e);
116+
std::string s;
117+
s.reserve(num_codepoints * 3);
118+
119+
for (int i = 0; i < num_codepoints; ++i) {
120+
uint32_t codepoint;
121+
std::uniform_int_distribution<uint32_t> plane_dist(0, 3);
122+
uint32_t plane = plane_dist(e);
123+
124+
if (plane == 0) {
125+
// Basic Multilingual Plane (BMP): U+0000 to U+FFFF
126+
// Exclude surrogate code points (U+D800 to U+DFFF)
127+
// https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.8, D71)
128+
// Exclude control chars below U+0020 for readability
129+
// Generate from two ranges with equal probability (overrepresents the smaller upper range):
130+
// - Lower: U+0020 to U+D7FF (55,264 values, 50% selection probability)
131+
// - Upper: U+E000 to U+FFFD (8,190 values, 50% selection probability)
132+
if (std::bernoulli_distribution(0.5)(e)) {
133+
// Lower range: U+0020 to U+D7FF (before surrogate range)
134+
codepoint = std::uniform_int_distribution<uint32_t>(0x0020, 0xD7FF)(e);
135+
} else {
136+
// Upper range: U+E000 to U+FFFD (after surrogate range)
137+
// Note: Stops at U+FFFD to exclude noncharacters U+FFFE and U+FFFF
138+
// Other noncharacters (U+FDD0-U+FDEF here, and the U+nFFFE/U+nFFFF pairs in planes 1-16) are included
139+
// as they are valid Unicode scalar values per the Unicode Standard
140+
codepoint = std::uniform_int_distribution<uint32_t>(0xE000, 0xFFFD)(e);
141+
}
142+
} else if (plane == 1) {
143+
// Supplementary Multilingual Plane (SMP): U+10000 to U+1FFFF
144+
// https://www.unicode.org/roadmaps/smp/
145+
codepoint = std::uniform_int_distribution<uint32_t>(0x10000, 0x1FFFF)(e);
146+
} else if (plane == 2) {
147+
// Supplementary Ideographic Plane (SIP): U+20000 to U+2FFFF
148+
// https://www.unicode.org/roadmaps/sip/
149+
codepoint = std::uniform_int_distribution<uint32_t>(0x20000, 0x2FFFF)(e);
150+
} else {
151+
// Planes 3–16: U+30000–U+10FFFF
152+
// Includes TIP (plane 3), unassigned planes 4-13, SSP (plane 14), and PUA-A/PUA-B (planes 15-16)
153+
// Max valid Unicode codepoint is U+10FFFF per the Standard
154+
// https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.4, D9)
155+
codepoint = std::uniform_int_distribution<uint32_t>(0x30000, 0x10FFFF)(e);
156+
}
157+
158+
// Encode as UTF-8 per RFC 3629 (Section 3: UTF-8 definition)
159+
// https://www.rfc-editor.org/rfc/rfc3629.html#section-3
160+
if (codepoint <= 0x7F) {
161+
// 1-byte sequence: 0xxxxxxx
162+
s.push_back(static_cast<char>(codepoint));
163+
} else if (codepoint <= 0x7FF) {
164+
// 2-byte sequence: 110xxxxx 10xxxxxx
165+
s.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
166+
s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
167+
} else if (codepoint <= 0xFFFF) {
168+
// 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
169+
s.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
170+
s.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
171+
s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
172+
} else {
173+
// 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
174+
s.push_back(static_cast<char>(0xF0 | (codepoint >> 18)));
175+
s.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
176+
s.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
177+
s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
178+
}
179+
}
180+
// Using c_str() is safe here because generation excludes U+0000 (no embedded nulls).
181+
// U+0000 can only exist in plane 0 (BMP), and BMP generation starts at U+0020.
118182
return OK(writer.String(s.c_str()));
119183
}
120184

121185
template <typename T>
122186
enable_if_base_binary<T, Status> Visit(const T& t) {
123-
return GenerateAscii(t);
187+
return GenerateUtf8(t);
124188
}
125189

126-
Status Visit(const BinaryViewType& t) { return GenerateAscii(t); }
190+
Status Visit(const BinaryViewType& t) { return GenerateUtf8(t); }
127191

128192
template <typename T>
129193
enable_if_list_like<T, Status> Visit(const T& t) {

0 commit comments

Comments
 (0)