[C++] Generate proper UTF-8 strings in JSON test utilities

HyukjinKwon · HyukjinKwon · commit 28452f1bfc84 · 2026-01-22T18:21:59.000+09:00
diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h
@@ -110,20 +110,84 @@ struct GenerateImpl {
     return OK(writer.Double(val));
   }
 
-  Status GenerateAscii(const DataType&) {
-    auto size = std::poisson_distribution<>{4}(e);
-    std::uniform_int_distribution<uint16_t> gen_char(32, 126);  // FIXME generate UTF8
-    std::string s(size, '\0');
-    for (char& ch : s) ch = static_cast<char>(gen_char(e));
+  Status GenerateUtf8(const DataType&) {
+    // Write a random UTF-8 string built from valid Unicode scalar values.
+    // The code point count is Poisson-distributed with mean 4.
+    auto num_codepoints = std::poisson_distribution<>{4}(e);
+    std::uniform_int_distribution<uint32_t> plane_dist(0, 3);
+    std::string s;
+    // Reserve the worst case: a scalar value takes at most 4 bytes in UTF-8.
+    s.reserve(num_codepoints * 4);
+
+    for (int i = 0; i < num_codepoints; ++i) {
+      uint32_t codepoint;
+      uint32_t plane = plane_dist(e);
+
+      if (plane == 0) {
+        // Basic Multilingual Plane (BMP): U+0000 to U+FFFF
+        // Exclude surrogate code points (U+D800 to U+DFFF)
+        // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.8, D71)
+        // Exclude control chars below U+0020 for readability
+        // Generate from two ranges with equal probability (this overrepresents
+        // the smaller upper range):
+        // - Lower: U+0020 to U+D7FF (55,264 values, 50% selection probability)
+        // - Upper: U+E000 to U+FFFD (8,190 values, 50% selection probability)
+        if (std::bernoulli_distribution(0.5)(e)) {
+          codepoint = std::uniform_int_distribution<uint32_t>(0x0020, 0xD7FF)(e);
+        } else {
+          // Stops at U+FFFD to exclude noncharacters U+FFFE and U+FFFF.
+          // Other noncharacters (U+FDD0-U+FDEF, plane-ending pairs) are included
+          // as they are valid Unicode scalar values per the Unicode Standard
+          codepoint = std::uniform_int_distribution<uint32_t>(0xE000, 0xFFFD)(e);
+        }
+      } else if (plane == 1) {
+        // Supplementary Multilingual Plane (SMP): U+10000 to U+1FFFF
+        // https://www.unicode.org/roadmaps/smp/
+        codepoint = std::uniform_int_distribution<uint32_t>(0x10000, 0x1FFFF)(e);
+      } else if (plane == 2) {
+        // Supplementary Ideographic Plane (SIP): U+20000 to U+2FFFF
+        // https://www.unicode.org/roadmaps/sip/
+        codepoint = std::uniform_int_distribution<uint32_t>(0x20000, 0x2FFFF)(e);
+      } else {
+        // Planes 3-16 (TIP, SSP, PUA-A, PUA-B, unassigned): U+30000 to U+10FFFF
+        // Max valid Unicode codepoint is U+10FFFF per the Standard
+        // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.4, D9)
+        codepoint = std::uniform_int_distribution<uint32_t>(0x30000, 0x10FFFF)(e);
+      }
+
+      // Encode as UTF-8 per RFC 3629 (Section 3: UTF-8 definition)
+      // https://www.rfc-editor.org/rfc/rfc3629.html#section-3
+      if (codepoint <= 0x7F) {
+        // 1-byte sequence: 0xxxxxxx
+        s.push_back(static_cast<char>(codepoint));
+      } else if (codepoint <= 0x7FF) {
+        // 2-byte sequence: 110xxxxx 10xxxxxx
+        s.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
+        s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+      } else if (codepoint <= 0xFFFF) {
+        // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
+        s.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
+        s.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
+        s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+      } else {
+        // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        s.push_back(static_cast<char>(0xF0 | (codepoint >> 18)));
+        s.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
+        s.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
+        s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+      }
+    }
+    // Using c_str() is safe here because generation excludes U+0000 (no embedded nulls).
+    // U+0000 can only exist in plane 0 (BMP), and BMP generation starts at U+0020.
     return OK(writer.String(s.c_str()));
   }
 
   template <typename T>
   enable_if_base_binary<T, Status> Visit(const T& t) {
-    return GenerateAscii(t);
+    return GenerateUtf8(t);  // emitted as a random UTF-8 JSON string
   }
 
-  Status Visit(const BinaryViewType& t) { return GenerateAscii(t); }
+  Status Visit(const BinaryViewType& t) { return GenerateUtf8(t); }  // random UTF-8
 
   template <typename T>
   enable_if_list_like<T, Status> Visit(const T& t) {