Skip to content

Commit 1175741

Browse files
committed
Add test for Printer Json format with statistics
1 parent 8045030 commit 1175741

File tree

3 files changed

+43
-7
lines changed

3 files changed

+43
-7
lines changed

cpp/src/parquet/metadata.cc

Lines changed: 10 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -92,6 +92,14 @@ std::string ParquetVersionToString(ParquetVersion::type ver) {
9292
template <typename DType>
9393
static std::shared_ptr<Statistics> MakeTypedColumnStats(
9494
const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) {
95+
std::optional<bool> min_exact =
96+
metadata.statistics.__isset.is_min_value_exact
97+
? std::optional<bool>(metadata.statistics.is_min_value_exact)
98+
: std::nullopt;
99+
std::optional<bool> max_exact =
100+
metadata.statistics.__isset.is_max_value_exact
101+
? std::optional<bool>(metadata.statistics.is_max_value_exact)
102+
: std::nullopt;
95103
// If ColumnOrder is defined, return max_value and min_value
96104
if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) {
97105
return MakeStatistics<DType>(
@@ -100,9 +108,7 @@ static std::shared_ptr<Statistics> MakeTypedColumnStats(
100108
metadata.statistics.null_count, metadata.statistics.distinct_count,
101109
metadata.statistics.__isset.max_value && metadata.statistics.__isset.min_value,
102110
metadata.statistics.__isset.null_count,
103-
metadata.statistics.__isset.distinct_count,
104-
metadata.statistics.__isset.is_min_value_exact,
105-
metadata.statistics.__isset.is_max_value_exact);
111+
metadata.statistics.__isset.distinct_count, min_exact, max_exact);
106112
}
107113
// Default behavior
108114
return MakeStatistics<DType>(
@@ -111,8 +117,7 @@ static std::shared_ptr<Statistics> MakeTypedColumnStats(
111117
metadata.statistics.null_count, metadata.statistics.distinct_count,
112118
metadata.statistics.__isset.max && metadata.statistics.__isset.min,
113119
metadata.statistics.__isset.null_count, metadata.statistics.__isset.distinct_count,
114-
metadata.statistics.__isset.is_min_value_exact,
115-
metadata.statistics.__isset.is_max_value_exact);
120+
min_exact, max_exact);
116121
}
117122

118123
namespace {

cpp/src/parquet/printer.cc

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -353,15 +353,15 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected
353353
if (stats->is_max_value_exact().has_value()) {
354354
stream << ", "
355355
<< R"("IsMaxValueExact": ")"
356-
<< (stats->is_max_value_exact().value() ? "true" : "false") << "\"";
356+
<< (stats->is_max_value_exact().value() ? "True" : "False") << "\"";
357357
} else {
358358
stream << ", "
359359
<< R"("IsMaxValueExact": "unknown")";
360360
}
361361
if (stats->is_min_value_exact().has_value()) {
362362
stream << ", "
363363
<< R"("IsMinValueExact": ")"
364-
<< (stats->is_min_value_exact().value() ? "true" : "false") << "\"";
364+
<< (stats->is_min_value_exact().value() ? "True" : "False") << "\"";
365365
} else {
366366
stream << ", "
367367
<< R"("IsMinValueExact": "unknown")";

cpp/src/parquet/reader_test.cc

Lines changed: 31 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1108,6 +1108,37 @@ class TestJSONWithLocalFile : public ::testing::Test {
11081108
}
11091109
};
11101110

1111+
TEST_F(TestJSONWithLocalFile, JSONOutputWithStatistics) {
1112+
std::string json_output = R"###({
1113+
"FileName": "nested_lists.snappy.parquet",
1114+
"Version": "1.0",
1115+
"CreatedBy": "parquet-mr version 1.8.2 (build c6522788629e590a53eb79874b95f6c3ff11f16c)",
1116+
"TotalRows": "3",
1117+
"NumberOfRowGroups": "1",
1118+
"NumberOfRealColumns": "2",
1119+
"NumberOfColumns": "2",
1120+
"Columns": [
1121+
{ "Id": "0", "Name": "a.list.element.list.element.list.element", "PhysicalType": "BYTE_ARRAY", "ConvertedType": "UTF8", "LogicalType": {"Type": "String"} },
1122+
{ "Id": "1", "Name": "b", "PhysicalType": "INT32", "ConvertedType": "NONE", "LogicalType": {"Type": "None"} }
1123+
],
1124+
"RowGroups": [
1125+
{
1126+
"Id": "0", "TotalBytes": "155", "TotalCompressedBytes": "0", "Rows": "3",
1127+
"ColumnChunks": [
1128+
{"Id": "0", "Values": "18", "StatsSet": "False",
1129+
"Compression": "SNAPPY", "Encodings": "PLAIN_DICTIONARY(DICT_PAGE) PLAIN_DICTIONARY", "UncompressedSize": "103", "CompressedSize": "104" },
1130+
{"Id": "1", "Values": "3", "StatsSet": "True", "Stats": {"NumNulls": "0", "Max": "1", "Min": "1", "IsMaxValueExact": "unknown", "IsMinValueExact": "unknown" },
1131+
"Compression": "SNAPPY", "Encodings": "PLAIN_DICTIONARY(DICT_PAGE) PLAIN_DICTIONARY", "UncompressedSize": "52", "CompressedSize": "56" }
1132+
]
1133+
}
1134+
]
1135+
}
1136+
)###";
1137+
1138+
std::string json_content = ReadFromLocalFile("nested_lists.snappy.parquet");
1139+
ASSERT_EQ(json_output, json_content);
1140+
}
1141+
11111142
TEST_F(TestJSONWithLocalFile, JSONOutput) {
11121143
std::string json_output = R"###({
11131144
"FileName": "alltypes_plain.parquet",

0 commit comments

Comments
 (0)