Skip to content

Commit 19f3b85

Browse files
authored
GH-47189: [C++][Parquet] Fix ParquetFilePrinter to produce valid JSON statistics (#47190)
### Rationale for this change ParquetFilePrinter uses FormatStatValue to print min/max stats. If stats contain non-UTF8 data, produced JSON is invalid. ### What changes are included in this PR? Make FormatStatValue aware of logical type and print hex values for binary data. ### Are these changes tested? Added test case to validate json output. ### Are there any user-facing changes? Yes, users may see hex values for binary values instead of char values in the past. * GitHub Issue: #47189 Authored-by: Gang Wu <[email protected]> Signed-off-by: Gang Wu <[email protected]>
1 parent 2cbf122 commit 19f3b85

File tree

5 files changed

+224
-44
lines changed

5 files changed

+224
-44
lines changed

cpp/src/parquet/printer.cc

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -168,8 +168,10 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
168168
std::string min = stats->min(), max = stats->max();
169169
stream << ", Null Values: " << stats->null_count
170170
<< ", Distinct Values: " << stats->distinct_count << std::endl
171-
<< " Max: " << FormatStatValue(descr->physical_type(), max)
172-
<< ", Min: " << FormatStatValue(descr->physical_type(), min);
171+
<< " Max: "
172+
<< FormatStatValue(descr->physical_type(), max, descr->logical_type())
173+
<< ", Min: "
174+
<< FormatStatValue(descr->physical_type(), min, descr->logical_type());
173175
} else {
174176
stream << " Statistics Not Set";
175177
}
@@ -334,9 +336,12 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected
334336
if (stats->HasMinMax()) {
335337
std::string min = stats->EncodeMin(), max = stats->EncodeMax();
336338
stream << ", "
337-
<< R"("Max": ")" << FormatStatValue(descr->physical_type(), max)
339+
<< R"("Max": ")"
340+
<< FormatStatValue(descr->physical_type(), max, descr->logical_type())
338341
<< "\", "
339-
<< R"("Min": ")" << FormatStatValue(descr->physical_type(), min) << "\"";
342+
<< R"("Min": ")"
343+
<< FormatStatValue(descr->physical_type(), min, descr->logical_type())
344+
<< "\"";
340345
}
341346
stream << " },";
342347
} else {

cpp/src/parquet/reader_test.cc

Lines changed: 68 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1196,19 +1196,24 @@ TEST_F(TestJSONWithLocalFile, JSONOutputSortColumns) {
11961196
EXPECT_THAT(json_content, testing::HasSubstr(json_contains));
11971197
}
11981198

1199+
namespace {
1200+
1201+
::arrow::Status CheckJsonValid(std::string_view json_string) {
1202+
rj::Document json_doc;
1203+
constexpr auto kParseFlags = rj::kParseFullPrecisionFlag | rj::kParseNanAndInfFlag;
1204+
json_doc.Parse<kParseFlags>(json_string.data(), json_string.length());
1205+
if (json_doc.HasParseError()) {
1206+
return ::arrow::Status::Invalid("JSON parse error at offset ",
1207+
json_doc.GetErrorOffset(), ": ",
1208+
rj::GetParseError_En(json_doc.GetParseError()));
1209+
}
1210+
return ::arrow::Status::OK();
1211+
}
1212+
1213+
} // namespace
1214+
11991215
// GH-44101: Test that JSON output is valid JSON
12001216
TEST_F(TestJSONWithLocalFile, ValidJsonOutput) {
1201-
auto check_json_valid = [](std::string_view json_string) -> ::arrow::Status {
1202-
rj::Document json_doc;
1203-
constexpr auto kParseFlags = rj::kParseFullPrecisionFlag | rj::kParseNanAndInfFlag;
1204-
json_doc.Parse<kParseFlags>(json_string.data(), json_string.length());
1205-
if (json_doc.HasParseError()) {
1206-
return ::arrow::Status::Invalid("JSON parse error at offset ",
1207-
json_doc.GetErrorOffset(), ": ",
1208-
rj::GetParseError_En(json_doc.GetParseError()));
1209-
}
1210-
return ::arrow::Status::OK();
1211-
};
12121217
std::vector<std::string_view> check_file_lists = {
12131218
"data_index_bloom_encoding_with_length.parquet",
12141219
"data_index_bloom_encoding_stats.parquet",
@@ -1218,11 +1223,62 @@ TEST_F(TestJSONWithLocalFile, ValidJsonOutput) {
12181223
"sort_columns.parquet"};
12191224
for (const auto& file : check_file_lists) {
12201225
std::string json_content = ReadFromLocalFile(file);
1221-
ASSERT_OK(check_json_valid(json_content))
1226+
ASSERT_OK(CheckJsonValid(json_content))
12221227
<< "Invalid JSON output for file: " << file << ", content:" << json_content;
12231228
}
12241229
}
12251230

1231+
TEST(TestJSONWithMemoryFile, ValidJsonOutput) {
1232+
using ::arrow::internal::checked_cast;
1233+
auto schema = std::static_pointer_cast<GroupNode>(GroupNode::Make(
1234+
"schema", Repetition::REQUIRED,
1235+
schema::NodeVector{PrimitiveNode::Make("string_field", Repetition::REQUIRED,
1236+
LogicalType::String(), Type::BYTE_ARRAY),
1237+
PrimitiveNode::Make("binary_field", Repetition::REQUIRED,
1238+
LogicalType::None(), Type::BYTE_ARRAY)}));
1239+
1240+
ASSERT_OK_AND_ASSIGN(auto out_file, ::arrow::io::BufferOutputStream::Create());
1241+
auto file_writer = ParquetFileWriter::Open(out_file, schema);
1242+
auto row_group_writer = file_writer->AppendRowGroup();
1243+
1244+
// Write string column with valid UTF8 data
1245+
auto string_writer = checked_cast<ByteArrayWriter*>(row_group_writer->NextColumn());
1246+
std::vector<std::string> utf8_strings = {"Hello", "World", "UTF8 测试", "🌟"};
1247+
std::vector<ByteArray> string_values;
1248+
for (const auto& str : utf8_strings) {
1249+
string_values.emplace_back(std::string_view(str));
1250+
}
1251+
string_writer->WriteBatch(string_values.size(), nullptr, nullptr, string_values.data());
1252+
1253+
// Write binary column with non-UTF8 data
1254+
auto binary_writer = checked_cast<ByteArrayWriter*>(row_group_writer->NextColumn());
1255+
std::vector<std::vector<uint8_t>> binary_data = {{0x00, 0x01, 0x02, 0x03},
1256+
{0xFF, 0xFE, 0xFD, 0xFC},
1257+
{0x80, 0x81, 0x82, 0x83},
1258+
{0xC0, 0xC1, 0xF5, 0xF6}};
1259+
std::vector<ByteArray> binary_values;
1260+
for (const auto& data : binary_data) {
1261+
binary_values.emplace_back(
1262+
std::string_view(reinterpret_cast<const char*>(data.data()), data.size()));
1263+
}
1264+
binary_writer->WriteBatch(binary_values.size(), nullptr, nullptr, binary_values.data());
1265+
1266+
row_group_writer->Close();
1267+
file_writer->Close();
1268+
1269+
// Read the file back and print as JSON
1270+
ASSERT_OK_AND_ASSIGN(auto file_buf, out_file->Finish());
1271+
auto reader =
1272+
ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(file_buf));
1273+
ParquetFilePrinter printer(reader.get());
1274+
1275+
// Verify the output is valid JSON
1276+
std::stringstream json_output;
1277+
printer.JSONPrint(json_output, {});
1278+
std::string json_content = json_output.str();
1279+
ASSERT_OK(CheckJsonValid(json_content)) << "Invalid JSON output: " << json_content;
1280+
}
1281+
12261282
TEST(TestFileReader, BufferedReadsWithDictionary) {
12271283
const int num_rows = 1000;
12281284

cpp/src/parquet/types.cc

Lines changed: 106 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,16 @@
1818
#include <array>
1919
#include <cmath>
2020
#include <cstdint>
21+
#include <iomanip>
2122
#include <memory>
2223
#include <sstream>
2324
#include <string>
2425

2526
#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep
2627
#include "arrow/util/checked_cast.h"
2728
#include "arrow/util/compression.h"
29+
#include "arrow/util/decimal.h"
30+
#include "arrow/util/float16.h"
2831
#include "arrow/util/logging_internal.h"
2932

3033
#include <rapidjson/document.h>
@@ -96,9 +99,87 @@ bool PageCanUseChecksum(PageType::type pageType) {
9699
}
97100
}
98101

99-
std::string FormatStatValue(Type::type parquet_type, ::std::string_view val) {
102+
namespace {
103+
104+
template <typename T>
105+
std::enable_if_t<std::is_arithmetic_v<T>, std::string> FormatNumericValue(
106+
::std::string_view val) {
107+
std::stringstream result;
108+
T value{};
109+
std::memcpy(&value, val.data(), sizeof(T));
110+
result << value;
111+
return result.str();
112+
}
113+
114+
std::string FormatDecimalValue(Type::type parquet_type, ::std::string_view val,
115+
const std::shared_ptr<const LogicalType>& logical_type) {
116+
ARROW_DCHECK(logical_type != nullptr && logical_type->is_decimal());
117+
118+
const auto& decimal_type =
119+
::arrow::internal::checked_cast<const DecimalLogicalType&>(*logical_type);
120+
const int32_t scale = decimal_type.scale();
121+
122+
std::stringstream result;
123+
switch (parquet_type) {
124+
case Type::INT32: {
125+
int32_t int_value{};
126+
std::memcpy(&int_value, val.data(), sizeof(int32_t));
127+
::arrow::Decimal128 decimal_value(int_value);
128+
result << decimal_value.ToString(scale);
129+
break;
130+
}
131+
case Type::INT64: {
132+
int64_t long_value{};
133+
std::memcpy(&long_value, val.data(), sizeof(int64_t));
134+
::arrow::Decimal128 decimal_value(long_value);
135+
result << decimal_value.ToString(scale);
136+
break;
137+
}
138+
case Type::FIXED_LEN_BYTE_ARRAY:
139+
case Type::BYTE_ARRAY: {
140+
auto decimal_result = ::arrow::Decimal128::FromBigEndian(
141+
reinterpret_cast<const uint8_t*>(val.data()), static_cast<int32_t>(val.size()));
142+
if (!decimal_result.ok()) {
143+
throw ParquetException("Failed to parse decimal value: ",
144+
decimal_result.status().message());
145+
}
146+
result << decimal_result.ValueUnsafe().ToString(scale);
147+
break;
148+
}
149+
default:
150+
throw ParquetException("Unsupported decimal type: ", TypeToString(parquet_type));
151+
}
152+
153+
return result.str();
154+
}
155+
156+
std::string FormatNonUTF8Value(::std::string_view val) {
157+
if (val.empty()) {
158+
return "";
159+
}
160+
100161
std::stringstream result;
162+
result << "0x" << std::hex;
163+
for (const auto& c : val) {
164+
result << std::setw(2) << std::setfill('0')
165+
<< static_cast<int>(static_cast<unsigned char>(c));
166+
}
167+
return result.str();
168+
}
101169

170+
std::string FormatFloat16Value(::std::string_view val) {
171+
std::stringstream result;
172+
auto float16 = ::arrow::util::Float16::FromLittleEndian(
173+
reinterpret_cast<const uint8_t*>(val.data()));
174+
result << float16.ToFloat();
175+
return result.str();
176+
}
177+
178+
} // namespace
179+
180+
std::string FormatStatValue(Type::type parquet_type, ::std::string_view val,
181+
const std::shared_ptr<const LogicalType>& logical_type) {
182+
std::stringstream result;
102183
const char* bytes = val.data();
103184
switch (parquet_type) {
104185
case Type::BOOLEAN: {
@@ -108,28 +189,22 @@ std::string FormatStatValue(Type::type parquet_type, ::std::string_view val) {
108189
break;
109190
}
110191
case Type::INT32: {
111-
int32_t value{};
112-
std::memcpy(&value, bytes, sizeof(int32_t));
113-
result << value;
114-
break;
192+
if (logical_type != nullptr && logical_type->is_decimal()) {
193+
return FormatDecimalValue(parquet_type, val, logical_type);
194+
}
195+
return FormatNumericValue<int32_t>(val);
115196
}
116197
case Type::INT64: {
117-
int64_t value{};
118-
std::memcpy(&value, bytes, sizeof(int64_t));
119-
result << value;
120-
break;
198+
if (logical_type != nullptr && logical_type->is_decimal()) {
199+
return FormatDecimalValue(parquet_type, val, logical_type);
200+
}
201+
return FormatNumericValue<int64_t>(val);
121202
}
122203
case Type::DOUBLE: {
123-
double value{};
124-
std::memcpy(&value, bytes, sizeof(double));
125-
result << value;
126-
break;
204+
return FormatNumericValue<double>(val);
127205
}
128206
case Type::FLOAT: {
129-
float value{};
130-
std::memcpy(&value, bytes, sizeof(float));
131-
result << value;
132-
break;
207+
return FormatNumericValue<float>(val);
133208
}
134209
case Type::INT96: {
135210
std::array<int32_t, 3> values{};
@@ -139,8 +214,18 @@ std::string FormatStatValue(Type::type parquet_type, ::std::string_view val) {
139214
}
140215
case Type::BYTE_ARRAY:
141216
case Type::FIXED_LEN_BYTE_ARRAY: {
142-
result << val;
143-
break;
217+
if (logical_type != nullptr) {
218+
if (logical_type->is_decimal()) {
219+
return FormatDecimalValue(parquet_type, val, logical_type);
220+
}
221+
if (logical_type->is_string()) {
222+
return std::string(val);
223+
}
224+
if (logical_type->is_float16()) {
225+
return FormatFloat16Value(val);
226+
}
227+
}
228+
return FormatNonUTF8Value(val);
144229
}
145230
case Type::UNDEFINED:
146231
default:
@@ -1730,7 +1815,7 @@ format::LogicalType LogicalType::Impl::Geometry::ToThrift() const {
17301815
format::LogicalType type;
17311816
format::GeometryType geometry_type;
17321817

1733-
// Canonially export crs of "" as an unset CRS
1818+
// Canonically export crs of "" as an unset CRS
17341819
if (!crs_.empty()) {
17351820
geometry_type.__set_crs(crs_);
17361821
}
@@ -1825,7 +1910,7 @@ format::LogicalType LogicalType::Impl::Geography::ToThrift() const {
18251910
format::LogicalType type;
18261911
format::GeographyType geography_type;
18271912

1828-
// Canonially export crs of "" as an unset CRS
1913+
// Canonically export crs of "" as an unset CRS
18291914
if (!crs_.empty()) {
18301915
geography_type.__set_crs(crs_);
18311916
}

cpp/src/parquet/types.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -853,8 +853,9 @@ PARQUET_EXPORT std::string TypeToString(Type::type t);
853853

854854
PARQUET_EXPORT std::string TypeToString(Type::type t, int type_length);
855855

856-
PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type,
857-
::std::string_view val);
856+
PARQUET_EXPORT std::string FormatStatValue(
857+
Type::type parquet_type, ::std::string_view val,
858+
const std::shared_ptr<const LogicalType>& logical_type = NULLPTR);
858859

859860
PARQUET_EXPORT int GetTypeByteSize(Type::type t);
860861

cpp/src/parquet/types_test.cc

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -117,17 +117,50 @@ TEST(TypePrinter, StatisticsTypes) {
117117

118118
smin = std::string("abcdef");
119119
smax = std::string("ijklmnop");
120-
ASSERT_STREQ("abcdef", FormatStatValue(Type::BYTE_ARRAY, smin).c_str());
121-
ASSERT_STREQ("ijklmnop", FormatStatValue(Type::BYTE_ARRAY, smax).c_str());
120+
ASSERT_EQ(smin, FormatStatValue(Type::BYTE_ARRAY, smin, LogicalType::String()));
121+
ASSERT_EQ(smax, FormatStatValue(Type::BYTE_ARRAY, smax, LogicalType::String()));
122+
ASSERT_EQ("0x616263646566", FormatStatValue(Type::BYTE_ARRAY, smin));
123+
ASSERT_EQ("0x696a6b6c6d6e6f70", FormatStatValue(Type::BYTE_ARRAY, smax));
122124

123125
// PARQUET-1357: FormatStatValue truncates binary statistics on zero character
124126
smax.push_back('\0');
125-
ASSERT_EQ(smax, FormatStatValue(Type::BYTE_ARRAY, smax));
127+
ASSERT_EQ(smax, FormatStatValue(Type::BYTE_ARRAY, smax, LogicalType::String()));
128+
ASSERT_EQ("0x696a6b6c6d6e6f7000", FormatStatValue(Type::BYTE_ARRAY, smax));
126129

130+
// String
127131
smin = std::string("abcdefgh");
128132
smax = std::string("ijklmnop");
129-
ASSERT_STREQ("abcdefgh", FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smin).c_str());
130-
ASSERT_STREQ("ijklmnop", FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smax).c_str());
133+
ASSERT_EQ(smin,
134+
FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smin, LogicalType::String()));
135+
ASSERT_EQ(smax,
136+
FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smax, LogicalType::String()));
137+
ASSERT_EQ("0x6162636465666768", FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smin));
138+
ASSERT_EQ("0x696a6b6c6d6e6f70", FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smax));
139+
140+
// Decimal
141+
int32_t int32_decimal = 1024;
142+
smin = std::string(reinterpret_cast<char*>(&int32_decimal), sizeof(int32_t));
143+
ASSERT_EQ("10.24", FormatStatValue(Type::INT32, smin, LogicalType::Decimal(6, 2)));
144+
145+
int64_t int64_decimal = 102'400'000'000;
146+
smin = std::string(reinterpret_cast<char*>(&int64_decimal), sizeof(int64_t));
147+
ASSERT_EQ("10240000.0000",
148+
FormatStatValue(Type::INT64, smin, LogicalType::Decimal(18, 4)));
149+
150+
std::vector<char> bytes = {0x11, 0x22, 0x33, 0x44};
151+
smin = std::string(bytes.begin(), bytes.end());
152+
ASSERT_EQ("28745.4020",
153+
FormatStatValue(Type::BYTE_ARRAY, smin, LogicalType::Decimal(10, 4)));
154+
ASSERT_EQ("28745.4020", FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smin,
155+
LogicalType::Decimal(10, 4)));
156+
ASSERT_EQ("0x11223344", FormatStatValue(Type::BYTE_ARRAY, smin));
157+
ASSERT_EQ("0x11223344", FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smin));
158+
159+
// Float16
160+
bytes = {0x1c, 0x50};
161+
smin = std::string(bytes.begin(), bytes.end());
162+
ASSERT_EQ("32.875",
163+
FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smin, LogicalType::Float16()));
131164
}
132165

133166
TEST(TestInt96Timestamp, Decoding) {

0 commit comments

Comments
 (0)