Skip to content

Commit 7189472

Browse files
authored
GH-46905: [C++][Parquet] Expose Statistics.is_{min/max}_value_exact and default set to true if min/max are set (#46992)
### Rationale for this change

The `is_{min/max}_value_exact` fields exist in the Thrift definition, and some implementations already use them and truncate min and max values. This PR exposes those values and defaults them to true when writing files from C++, as no truncation is happening at the moment. If min/max statistics are generated, `is_{min/max}_value_exact` can be set to true. Truncation of string and binary min/max values is out of scope for this PR; it can be done in a follow-up PR.

### What changes are included in this PR?

- The fields have been added to `EncodedStatistics` and `Statistics`, along with the Thrift integration.
- Tests and validation with a new parquet-testing file generated with these fields present (apache/parquet-testing#88).
- Tests with existing files without the fields.
- Update existing tests to validate the new fields.
- Add the new fields to `ParquetFilePrinter`.

### Are these changes tested?

Yes, on CI.

### Are there any user-facing changes?

Yes, the new fields will be available to users through the API when reading Parquet files.

* GitHub Issue: #46905

Authored-by: Raúl Cumplido <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
1 parent 2059243 commit 7189472

File tree

8 files changed

+322
-20
lines changed

8 files changed

+322
-20
lines changed

cpp/src/parquet/metadata.cc

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,14 @@ std::string ParquetVersionToString(ParquetVersion::type ver) {
9292
template <typename DType>
9393
static std::shared_ptr<Statistics> MakeTypedColumnStats(
9494
const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) {
95+
std::optional<bool> min_exact =
96+
metadata.statistics.__isset.is_min_value_exact
97+
? std::optional<bool>(metadata.statistics.is_min_value_exact)
98+
: std::nullopt;
99+
std::optional<bool> max_exact =
100+
metadata.statistics.__isset.is_max_value_exact
101+
? std::optional<bool>(metadata.statistics.is_max_value_exact)
102+
: std::nullopt;
95103
// If ColumnOrder is defined, return max_value and min_value
96104
if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) {
97105
return MakeStatistics<DType>(
@@ -100,15 +108,16 @@ static std::shared_ptr<Statistics> MakeTypedColumnStats(
100108
metadata.statistics.null_count, metadata.statistics.distinct_count,
101109
metadata.statistics.__isset.max_value && metadata.statistics.__isset.min_value,
102110
metadata.statistics.__isset.null_count,
103-
metadata.statistics.__isset.distinct_count);
111+
metadata.statistics.__isset.distinct_count, min_exact, max_exact);
104112
}
105113
// Default behavior
106114
return MakeStatistics<DType>(
107115
descr, metadata.statistics.min, metadata.statistics.max,
108116
metadata.num_values - metadata.statistics.null_count,
109117
metadata.statistics.null_count, metadata.statistics.distinct_count,
110118
metadata.statistics.__isset.max && metadata.statistics.__isset.min,
111-
metadata.statistics.__isset.null_count, metadata.statistics.__isset.distinct_count);
119+
metadata.statistics.__isset.null_count, metadata.statistics.__isset.distinct_count,
120+
min_exact, max_exact);
112121
}
113122

114123
namespace {

cpp/src/parquet/printer.cc

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -166,11 +166,19 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
166166
stream << " Values: " << column_chunk->num_values();
167167
if (column_chunk->is_stats_set()) {
168168
std::string min = stats->min(), max = stats->max();
169+
std::string max_exact =
170+
stats->is_max_value_exact.has_value()
171+
? (stats->is_max_value_exact.value() ? "true" : "false")
172+
: "unknown";
173+
std::string min_exact =
174+
stats->is_min_value_exact.has_value()
175+
? (stats->is_min_value_exact.value() ? "true" : "false")
176+
: "unknown";
169177
stream << ", Null Values: " << stats->null_count
170178
<< ", Distinct Values: " << stats->distinct_count << std::endl
171-
<< " Max: "
179+
<< " Max (exact: " << max_exact << "): "
172180
<< FormatStatValue(descr->physical_type(), max, descr->logical_type())
173-
<< ", Min: "
181+
<< ", Min (exact: " << min_exact << "): "
174182
<< FormatStatValue(descr->physical_type(), min, descr->logical_type());
175183
} else {
176184
stream << " Statistics Not Set";
@@ -342,6 +350,22 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected
342350
<< R"("Min": ")"
343351
<< FormatStatValue(descr->physical_type(), min, descr->logical_type())
344352
<< "\"";
353+
if (stats->is_max_value_exact().has_value()) {
354+
stream << ", "
355+
<< R"("IsMaxValueExact": ")"
356+
<< (stats->is_max_value_exact().value() ? "True" : "False") << "\"";
357+
} else {
358+
stream << ", "
359+
<< R"("IsMaxValueExact": "unknown")";
360+
}
361+
if (stats->is_min_value_exact().has_value()) {
362+
stream << ", "
363+
<< R"("IsMinValueExact": ")"
364+
<< (stats->is_min_value_exact().value() ? "True" : "False") << "\"";
365+
} else {
366+
stream << ", "
367+
<< R"("IsMinValueExact": "unknown")";
368+
}
345369
}
346370
stream << " },";
347371
} else {

cpp/src/parquet/reader_test.cc

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1023,7 +1023,7 @@ Column 0
10231023
Uncompressed Size: 103, Compressed Size: 104
10241024
Column 1
10251025
Values: 3, Null Values: 0, Distinct Values: 0
1026-
Max: 1, Min: 1
1026+
Max (exact: unknown): 1, Min (exact: unknown): 1
10271027
Compression: SNAPPY, Encodings: PLAIN_DICTIONARY(DICT_PAGE) PLAIN_DICTIONARY
10281028
Uncompressed Size: 52, Compressed Size: 56
10291029
)###";
@@ -1108,6 +1108,37 @@ class TestJSONWithLocalFile : public ::testing::Test {
11081108
}
11091109
};
11101110

1111+
TEST_F(TestJSONWithLocalFile, JSONOutputWithStatistics) {
1112+
std::string json_output = R"###({
1113+
"FileName": "nested_lists.snappy.parquet",
1114+
"Version": "1.0",
1115+
"CreatedBy": "parquet-mr version 1.8.2 (build c6522788629e590a53eb79874b95f6c3ff11f16c)",
1116+
"TotalRows": "3",
1117+
"NumberOfRowGroups": "1",
1118+
"NumberOfRealColumns": "2",
1119+
"NumberOfColumns": "2",
1120+
"Columns": [
1121+
{ "Id": "0", "Name": "a.list.element.list.element.list.element", "PhysicalType": "BYTE_ARRAY", "ConvertedType": "UTF8", "LogicalType": {"Type": "String"} },
1122+
{ "Id": "1", "Name": "b", "PhysicalType": "INT32", "ConvertedType": "NONE", "LogicalType": {"Type": "None"} }
1123+
],
1124+
"RowGroups": [
1125+
{
1126+
"Id": "0", "TotalBytes": "155", "TotalCompressedBytes": "0", "Rows": "3",
1127+
"ColumnChunks": [
1128+
{"Id": "0", "Values": "18", "StatsSet": "False",
1129+
"Compression": "SNAPPY", "Encodings": "PLAIN_DICTIONARY(DICT_PAGE) PLAIN_DICTIONARY", "UncompressedSize": "103", "CompressedSize": "104" },
1130+
{"Id": "1", "Values": "3", "StatsSet": "True", "Stats": {"NumNulls": "0", "Max": "1", "Min": "1", "IsMaxValueExact": "unknown", "IsMinValueExact": "unknown" },
1131+
"Compression": "SNAPPY", "Encodings": "PLAIN_DICTIONARY(DICT_PAGE) PLAIN_DICTIONARY", "UncompressedSize": "52", "CompressedSize": "56" }
1132+
]
1133+
}
1134+
]
1135+
}
1136+
)###";
1137+
1138+
std::string json_content = ReadFromLocalFile("nested_lists.snappy.parquet");
1139+
ASSERT_EQ(json_output, json_content);
1140+
}
1141+
11111142
TEST_F(TestJSONWithLocalFile, JSONOutput) {
11121143
std::string json_output = R"###({
11131144
"FileName": "alltypes_plain.parquet",

cpp/src/parquet/statistics.cc

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -590,13 +590,27 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
590590
Copy(min, &min_, min_buffer_.get());
591591
Copy(max, &max_, max_buffer_.get());
592592
has_min_max_ = true;
593+
statistics_.is_min_value_exact = true;
594+
statistics_.is_max_value_exact = true;
593595
}
594596

595597
// Create stats from a thrift Statistics object.
596598
TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& encoded_min,
597599
const std::string& encoded_max, int64_t num_values,
598600
int64_t null_count, int64_t distinct_count, bool has_min_max,
599601
bool has_null_count, bool has_distinct_count, MemoryPool* pool)
602+
: TypedStatisticsImpl(descr, encoded_min, encoded_max, num_values, null_count,
603+
distinct_count, has_min_max, has_null_count,
604+
has_distinct_count,
605+
/*is_min_value_exact=*/std::nullopt,
606+
/*is_max_value_exact=*/std::nullopt, pool) {}
607+
608+
TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& encoded_min,
609+
const std::string& encoded_max, int64_t num_values,
610+
int64_t null_count, int64_t distinct_count, bool has_min_max,
611+
bool has_null_count, bool has_distinct_count,
612+
std::optional<bool> is_min_value_exact,
613+
std::optional<bool> is_max_value_exact, MemoryPool* pool)
600614
: TypedStatisticsImpl(descr, pool) {
601615
TypedStatisticsImpl::IncrementNumValues(num_values);
602616
if (has_null_count) {
@@ -613,6 +627,8 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
613627
if (has_min_max) {
614628
PlainDecode(encoded_min, &min_);
615629
PlainDecode(encoded_max, &max_);
630+
statistics_.is_min_value_exact = is_min_value_exact;
631+
statistics_.is_max_value_exact = is_max_value_exact;
616632
}
617633

618634
has_min_max_ = has_min_max;
@@ -659,7 +675,9 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
659675

660676
return null_count() == other.null_count() &&
661677
distinct_count() == other.distinct_count() &&
662-
num_values() == other.num_values();
678+
num_values() == other.num_values() &&
679+
is_min_value_exact() == other.is_min_value_exact() &&
680+
is_max_value_exact() == other.is_max_value_exact();
663681
}
664682

665683
bool MinMaxEqual(const TypedStatisticsImpl& other) const;
@@ -742,6 +760,8 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
742760
if (HasMinMax()) {
743761
s.set_min(this->EncodeMin());
744762
s.set_max(this->EncodeMax());
763+
s.is_min_value_exact = this->is_min_value_exact();
764+
s.is_max_value_exact = this->is_max_value_exact();
745765
}
746766
if (HasNullCount()) {
747767
s.set_null_count(this->null_count());
@@ -757,6 +777,12 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
757777
int64_t null_count() const override { return statistics_.null_count; }
758778
int64_t distinct_count() const override { return statistics_.distinct_count; }
759779
int64_t num_values() const override { return num_values_; }
780+
std::optional<bool> is_min_value_exact() const override {
781+
return statistics_.is_min_value_exact;
782+
}
783+
std::optional<bool> is_max_value_exact() const override {
784+
return statistics_.is_max_value_exact;
785+
}
760786

761787
private:
762788
const ColumnDescriptor* descr_;
@@ -821,6 +847,8 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
821847
Copy(comparator_->Compare(min_, min) ? min_ : min, &min_, min_buffer_.get());
822848
Copy(comparator_->Compare(max_, max) ? max : max_, &max_, max_buffer_.get());
823849
}
850+
statistics_.is_min_value_exact = true;
851+
statistics_.is_max_value_exact = true;
824852
}
825853
};
826854

@@ -1042,7 +1070,8 @@ std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
10421070
return Make(descr, encoded_stats->min(), encoded_stats->max(), num_values,
10431071
encoded_stats->null_count, encoded_stats->distinct_count,
10441072
encoded_stats->has_min && encoded_stats->has_max,
1045-
encoded_stats->has_null_count, encoded_stats->has_distinct_count, pool);
1073+
encoded_stats->has_null_count, encoded_stats->has_distinct_count,
1074+
encoded_stats->is_min_value_exact, encoded_stats->is_max_value_exact, pool);
10461075
}
10471076

10481077
std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
@@ -1052,11 +1081,24 @@ std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
10521081
int64_t distinct_count, bool has_min_max,
10531082
bool has_null_count, bool has_distinct_count,
10541083
::arrow::MemoryPool* pool) {
1084+
return Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count,
1085+
distinct_count, has_min_max, has_null_count, has_distinct_count,
1086+
/*is_min_value_exact=*/std::nullopt,
1087+
/*is_max_value_exact=*/std::nullopt, pool);
1088+
}
1089+
1090+
std::shared_ptr<Statistics> Statistics::Make(
1091+
const ColumnDescriptor* descr, const std::string& encoded_min,
1092+
const std::string& encoded_max, int64_t num_values, int64_t null_count,
1093+
int64_t distinct_count, bool has_min_max, bool has_null_count,
1094+
bool has_distinct_count, std::optional<bool> is_min_value_exact,
1095+
std::optional<bool> is_max_value_exact, ::arrow::MemoryPool* pool) {
10551096
#define MAKE_STATS(CAP_TYPE, KLASS) \
10561097
case Type::CAP_TYPE: \
10571098
return std::make_shared<TypedStatisticsImpl<KLASS>>( \
10581099
descr, encoded_min, encoded_max, num_values, null_count, distinct_count, \
1059-
has_min_max, has_null_count, has_distinct_count, pool)
1100+
has_min_max, has_null_count, has_distinct_count, is_min_value_exact, \
1101+
is_max_value_exact, pool)
10601102

10611103
switch (descr->physical_type()) {
10621104
MAKE_STATS(BOOLEAN, BooleanType);

cpp/src/parquet/statistics.h

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,9 @@ class PARQUET_EXPORT EncodedStatistics {
128128
const std::string& max() const { return max_; }
129129
const std::string& min() const { return min_; }
130130

131+
std::optional<bool> is_max_value_exact;
132+
std::optional<bool> is_min_value_exact;
133+
131134
int64_t null_count = 0;
132135
int64_t distinct_count = 0;
133136

@@ -151,10 +154,12 @@ class PARQUET_EXPORT EncodedStatistics {
151154
if (max_.length() > length) {
152155
has_max = false;
153156
max_.clear();
157+
is_max_value_exact = std::nullopt;
154158
}
155159
if (min_.length() > length) {
156160
has_min = false;
157161
min_.clear();
162+
is_min_value_exact = std::nullopt;
158163
}
159164
}
160165

@@ -223,6 +228,28 @@ class PARQUET_EXPORT Statistics {
223228
bool has_distinct_count,
224229
::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
225230

231+
/// \brief Create a new statistics instance given a column schema
232+
/// definition and preexisting state
233+
/// \param[in] descr the column schema
234+
/// \param[in] encoded_min the encoded minimum value
235+
/// \param[in] encoded_max the encoded maximum value
236+
/// \param[in] num_values total number of values
237+
/// \param[in] null_count number of null values
238+
/// \param[in] distinct_count number of distinct values
239+
/// \param[in] has_min_max whether the min/max statistics are set
240+
/// \param[in] has_null_count whether the null_count statistics are set
241+
/// \param[in] has_distinct_count whether the distinct_count statistics are set
242+
/// \param[in] is_min_value_exact whether the min value is exact
243+
/// \param[in] is_max_value_exact whether the max value is exact
244+
/// \param[in] pool a memory pool to use for any memory allocations, optional
245+
static std::shared_ptr<Statistics> Make(
246+
const ColumnDescriptor* descr, const std::string& encoded_min,
247+
const std::string& encoded_max, int64_t num_values, int64_t null_count,
248+
int64_t distinct_count, bool has_min_max, bool has_null_count,
249+
bool has_distinct_count, std::optional<bool> is_min_value_exact,
250+
std::optional<bool> is_max_value_exact,
251+
::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
252+
226253
// Helper function to convert EncodedStatistics to Statistics.
227254
// EncodedStatistics does not contain number of non-null values, and it can be
228255
// passed using the num_values parameter.
@@ -259,6 +286,14 @@ class PARQUET_EXPORT Statistics {
259286
/// \brief Plain-encoded maximum value
260287
virtual std::string EncodeMax() const = 0;
261288

289+
/// \brief Return the minimum value exact flag if set.
290+
/// It will be true if there was no truncation.
291+
virtual std::optional<bool> is_min_value_exact() const = 0;
292+
293+
/// \brief Return the maximum value exact flag if set.
294+
/// It will be true if there was no truncation.
295+
virtual std::optional<bool> is_max_value_exact() const = 0;
296+
262297
/// \brief The finalized encoded form of the statistics for transport
263298
virtual EncodedStatistics Encode() = 0;
264299

@@ -376,7 +411,23 @@ std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
376411
bool has_distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
377412
return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
378413
descr, encoded_min, encoded_max, num_values, null_count, distinct_count,
379-
has_min_max, has_null_count, has_distinct_count, pool));
414+
has_min_max, has_null_count, has_distinct_count,
415+
/*is_min_value_exact=*/std::nullopt, /*is_max_value_exact=*/std::nullopt, pool));
416+
}
417+
418+
/// \brief Typed version of Statistics::Make
419+
template <typename DType>
420+
std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
421+
const ColumnDescriptor* descr, const std::string& encoded_min,
422+
const std::string& encoded_max, int64_t num_values, int64_t null_count,
423+
int64_t distinct_count, bool has_min_max, bool has_null_count,
424+
bool has_distinct_count, std::optional<bool> is_min_value_exact,
425+
std::optional<bool> is_max_value_exact,
426+
::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
427+
return std::static_pointer_cast<TypedStatistics<DType>>(
428+
Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count,
429+
distinct_count, has_min_max, has_null_count, has_distinct_count,
430+
is_min_value_exact, is_max_value_exact, pool));
380431
}
381432

382433
} // namespace parquet

0 commit comments

Comments
 (0)