Skip to content

Commit cbd36b8

Browse files
raulcdwgtmac
andauthored
GH-47449: [C++][Parquet] Do not drop all Statistics if SortOrder is UNKNOWN (#47466)
### Rationale for this change Currently we drop all statistics if `SortOrder` is `UNKNOWN`. This seems too broad and there are some statistics, like `null_count` that could be maintained. https://github.com/apache/arrow/blob/6f6138b7eedece0841b04f4e235e3bedf5a3ee29/cpp/src/parquet/metadata.cc#L330-L335 Clearing `min/max` but allowing to keep `null_count` when `SortOrder` is `UNKNOWN` would allow users to use them. ### What changes are included in this PR? Maintain Statistics when reading them if `SortOrder::UNKNOWK` but clear min/max ### Are these changes tested? Yes, there is a file on parquet-testing which allows us to validate this exact scenario. ### Are there any user-facing changes? No changes to APIs, users will be able to read statistics on this case. * GitHub Issue: #47449 Lead-authored-by: Raúl Cumplido <[email protected]> Co-authored-by: Gang Wu <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
1 parent ef1af63 commit cbd36b8

File tree

4 files changed

+57
-13
lines changed

4 files changed

+57
-13
lines changed

cpp/src/parquet/metadata.cc

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -327,17 +327,18 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
327327
// 2) Statistics must not be corrupted
328328
inline bool is_stats_set() const {
329329
DCHECK(writer_version_ != nullptr);
330-
// If the column statistics don't exist or column sort order is unknown
331-
// we cannot use the column stats
332-
if (!column_metadata_->__isset.statistics ||
333-
descr_->sort_order() == SortOrder::UNKNOWN) {
330+
if (!column_metadata_->__isset.statistics) {
334331
return false;
335332
}
336333
{
337334
const std::lock_guard<std::mutex> guard(stats_mutex_);
338335
if (possible_encoded_stats_ == nullptr) {
339336
possible_encoded_stats_ =
340337
std::make_shared<EncodedStatistics>(FromThrift(column_metadata_->statistics));
338+
if (descr_->sort_order() == SortOrder::UNKNOWN) {
339+
// If the column SortOrder is Unknown we can't trust max/min.
340+
possible_encoded_stats_->ClearMinMax();
341+
}
341342
}
342343
}
343344
return writer_version_->HasCorrectStatistics(type(), *possible_encoded_stats_,
@@ -1588,11 +1589,6 @@ bool ApplicationVersion::HasCorrectStatistics(Type::type col_type,
15881589
return true;
15891590
}
15901591

1591-
// Unknown sort order has incorrect stats
1592-
if (SortOrder::UNKNOWN == sort_order) {
1593-
return false;
1594-
}
1595-
15961592
// PARQUET-251
15971593
if (VersionLt(PARQUET_251_FIXED_VERSION())) {
15981594
return false;

cpp/src/parquet/statistics.cc

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,11 @@ CleanStatistic(std::pair<T, T> min_max, LogicalType::Type::type) {
336336
return min_max;
337337
}
338338

339+
std::optional<std::pair<Int96, Int96>> CleanStatistic(std::pair<Int96, Int96> min_max,
340+
LogicalType::Type::type) {
341+
return min_max;
342+
}
343+
339344
// In case of floating point types, the following rules are applied (as per
340345
// upstream parquet-mr):
341346
// - If any of min/max is NaN, return nothing.
@@ -573,7 +578,9 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
573578
min_buffer_(AllocateBuffer(pool_, 0)),
574579
max_buffer_(AllocateBuffer(pool_, 0)),
575580
logical_type_(LogicalTypeId(descr_)) {
576-
comparator_ = MakeComparator<DType>(descr);
581+
if (descr->sort_order() != SortOrder::UNKNOWN) {
582+
comparator_ = MakeComparator<DType>(descr);
583+
}
577584
TypedStatisticsImpl::Reset();
578585
}
579586

@@ -732,6 +739,7 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
732739
return;
733740
}
734741

742+
if (comparator_ == nullptr) return;
735743
SetMinMaxPair(comparator_->GetMinMax(values));
736744
}
737745

@@ -832,6 +840,7 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
832840
}
833841

834842
void SetMinMaxPair(std::pair<T, T> min_max) {
843+
if (comparator_ == nullptr) return;
835844
// CleanStatistic can return a nullopt in case of erroneous values, e.g. NaN
836845
auto maybe_min_max = CleanStatistic(min_max, logical_type_);
837846
if (!maybe_min_max) return;
@@ -894,7 +903,7 @@ void TypedStatisticsImpl<DType>::Update(const T* values, int64_t num_values,
894903
IncrementNullCount(null_count);
895904
IncrementNumValues(num_values);
896905

897-
if (num_values == 0) return;
906+
if (num_values == 0 || comparator_ == nullptr) return;
898907
SetMinMaxPair(comparator_->GetMinMax(values, num_values));
899908
}
900909

@@ -909,7 +918,7 @@ void TypedStatisticsImpl<DType>::UpdateSpaced(const T* values, const uint8_t* va
909918
IncrementNullCount(null_count);
910919
IncrementNumValues(num_values);
911920

912-
if (num_values == 0) return;
921+
if (num_values == 0 || comparator_ == nullptr) return;
913922
SetMinMaxPair(comparator_->GetMinMaxSpaced(values, num_spaced_values, valid_bits,
914923
valid_bits_offset));
915924
}
@@ -1104,6 +1113,7 @@ std::shared_ptr<Statistics> Statistics::Make(
11041113
MAKE_STATS(BOOLEAN, BooleanType);
11051114
MAKE_STATS(INT32, Int32Type);
11061115
MAKE_STATS(INT64, Int64Type);
1116+
MAKE_STATS(INT96, Int96Type);
11071117
MAKE_STATS(FLOAT, FloatType);
11081118
MAKE_STATS(DOUBLE, DoubleType);
11091119
MAKE_STATS(BYTE_ARRAY, ByteArrayType);

cpp/src/parquet/statistics.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,14 @@ class PARQUET_EXPORT EncodedStatistics {
163163
}
164164
}
165165

166+
// Clear Min Max.
167+
void ClearMinMax() {
168+
has_max = false;
169+
max_.clear();
170+
has_min = false;
171+
min_.clear();
172+
}
173+
166174
bool is_set() const {
167175
return has_min || has_max || has_null_count || has_distinct_count;
168176
}

cpp/src/parquet/statistics_test.cc

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -916,7 +916,7 @@ TEST(CorrectStatistics, Basics) {
916916
AssertStatsSet(version, props, schema.Column(1), true);
917917
AssertStatsSet(version, props, schema.Column(2), true);
918918
AssertStatsSet(version, props, schema.Column(3), true);
919-
AssertStatsSet(version, props, schema.Column(4), false);
919+
AssertStatsSet(version, props, schema.Column(4), true);
920920
AssertStatsSet(version, props, schema.Column(5), true);
921921
}
922922

@@ -1632,6 +1632,36 @@ TEST(TestStatisticsSortOrderFloatNaN, NaNAndNullsInfiniteLoop) {
16321632
AssertUnsetMinMax(stats, nans_but_last, &all_but_last_valid);
16331633
}
16341634

1635+
// Test read statistics for column with UNKNOWN sort order
1636+
TEST(TestStatisticsSortOrder, UNKNOWN) {
1637+
std::string dir_string(test::get_data_dir());
1638+
std::stringstream ss;
1639+
ss << dir_string << "/int96_from_spark.parquet";
1640+
auto path = ss.str();
1641+
1642+
// The file contains a single column of INT96 type (deprecated)
1643+
// with SortOrder UNKNOWN.
1644+
// It contains 6 values with a single null value.
1645+
// The null_count statistics value is preserved.
1646+
auto file_reader = ParquetFileReader::OpenFile(path);
1647+
auto rg_reader = file_reader->RowGroup(0);
1648+
auto metadata = rg_reader->metadata();
1649+
auto column_schema = metadata->schema()->Column(0);
1650+
ASSERT_EQ(SortOrder::UNKNOWN, column_schema->sort_order());
1651+
1652+
auto column_chunk = metadata->ColumnChunk(0);
1653+
ASSERT_TRUE(column_chunk->is_stats_set());
1654+
1655+
std::shared_ptr<EncodedStatistics> enc_stats = column_chunk->encoded_statistics();
1656+
ASSERT_TRUE(enc_stats->has_null_count);
1657+
ASSERT_FALSE(enc_stats->has_distinct_count);
1658+
ASSERT_FALSE(enc_stats->has_min);
1659+
ASSERT_FALSE(enc_stats->has_max);
1660+
ASSERT_EQ(1, enc_stats->null_count);
1661+
ASSERT_FALSE(enc_stats->is_max_value_exact.has_value());
1662+
ASSERT_FALSE(enc_stats->is_min_value_exact.has_value());
1663+
}
1664+
16351665
// Test statistics for binary column with UNSIGNED sort order
16361666
TEST(TestStatisticsSortOrderMinMax, Unsigned) {
16371667
std::string dir_string(test::get_data_dir());

0 commit comments

Comments
 (0)