Skip to content

Commit 0d71897

Browse files
raulcdpitrou
andauthored
GH-46462: [C++][Parquet] Expose currently thrown EncodedStatistics when checking is_stats_set (#46463)
### Rationale for this change We are currently throwing away `EncodedStatistics` that are built when calling `is_stats_set`. If we require `EncodedStatistics` we are doing some spurious conversions that could potentially be optimized. For example generating the `TypedStatistics` only when necessary. ### What changes are included in this PR? - cache the generated `encoded_statistics` on `is_stats_set` - provide accessor to `encoded_statistics` - avoid generating the typed statistics if all we care is about `encoded_statistics` ### Are these changes tested? Existing tests are successful and adapted tests to validate new accessor and `encoded_statistics` values. ### Are there any user-facing changes? No * GitHub Issue: #46462 Lead-authored-by: Raúl Cumplido <[email protected]> Co-authored-by: Antoine Pitrou <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
1 parent a2941dd commit 0d71897

File tree

7 files changed

+81
-39
lines changed

7 files changed

+81
-39
lines changed

cpp/src/parquet/column_reader.cc

Lines changed: 2 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -197,34 +197,8 @@ namespace {
197197
template <typename H>
198198
EncodedStatistics ExtractStatsFromHeader(const H& header) {
199199
EncodedStatistics page_statistics;
200-
if (!header.__isset.statistics) {
201-
return page_statistics;
202-
}
203-
const format::Statistics& stats = header.statistics;
204-
// Use the new V2 min-max statistics over the former one if it is filled
205-
if (stats.__isset.max_value || stats.__isset.min_value) {
206-
// TODO: check if the column_order is TYPE_DEFINED_ORDER.
207-
if (stats.__isset.max_value) {
208-
page_statistics.set_max(stats.max_value);
209-
}
210-
if (stats.__isset.min_value) {
211-
page_statistics.set_min(stats.min_value);
212-
}
213-
} else if (stats.__isset.max || stats.__isset.min) {
214-
// TODO: check created_by to see if it is corrupted for some types.
215-
// TODO: check if the sort_order is SIGNED.
216-
if (stats.__isset.max) {
217-
page_statistics.set_max(stats.max);
218-
}
219-
if (stats.__isset.min) {
220-
page_statistics.set_min(stats.min);
221-
}
222-
}
223-
if (stats.__isset.null_count) {
224-
page_statistics.set_null_count(stats.null_count);
225-
}
226-
if (stats.__isset.distinct_count) {
227-
page_statistics.set_distinct_count(stats.distinct_count);
200+
if (header.__isset.statistics) {
201+
page_statistics = FromThrift(header.statistics);
228202
}
229203
return page_statistics;
230204
}

cpp/src/parquet/metadata.cc

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
280280
size_statistics_->Validate(descr_);
281281
}
282282
possible_stats_ = nullptr;
283+
possible_encoded_stats_ = nullptr;
283284
possible_geo_stats_ = nullptr;
284285
InitKeyValueMetadata();
285286
}
@@ -312,19 +313,17 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
312313
return false;
313314
}
314315
{
315-
// Because we are modifying possible_stats_ in a const method
316316
const std::lock_guard<std::mutex> guard(stats_mutex_);
317-
if (possible_stats_ == nullptr) {
318-
possible_stats_ = MakeColumnStats(*column_metadata_, descr_);
317+
if (possible_encoded_stats_ == nullptr) {
318+
possible_encoded_stats_ =
319+
std::make_shared<EncodedStatistics>(FromThrift(column_metadata_->statistics));
319320
}
320321
}
321-
EncodedStatistics encodedStatistics = possible_stats_->Encode();
322-
return writer_version_->HasCorrectStatistics(type(), encodedStatistics,
322+
return writer_version_->HasCorrectStatistics(type(), *possible_encoded_stats_,
323323
descr_->sort_order());
324324
}
325325

326326
inline bool is_geo_stats_set() const {
327-
// Because we are modifying possible_geo_stats_ in a const method
328327
const std::lock_guard<std::mutex> guard(stats_mutex_);
329328
if (possible_geo_stats_ == nullptr &&
330329
column_metadata_->__isset.geospatial_statistics) {
@@ -334,8 +333,19 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
334333
return possible_geo_stats_ != nullptr && possible_geo_stats_->is_valid();
335334
}
336335

336+
inline std::shared_ptr<EncodedStatistics> encoded_statistics() const {
337+
return is_stats_set() ? possible_encoded_stats_ : nullptr;
338+
}
339+
337340
inline std::shared_ptr<Statistics> statistics() const {
338-
return is_stats_set() ? possible_stats_ : nullptr;
341+
if (is_stats_set()) {
342+
const std::lock_guard<std::mutex> guard(stats_mutex_);
343+
if (possible_stats_ == nullptr) {
344+
possible_stats_ = MakeColumnStats(*column_metadata_, descr_);
345+
}
346+
return possible_stats_;
347+
}
348+
return nullptr;
339349
}
340350

341351
inline std::shared_ptr<SizeStatistics> size_statistics() const {
@@ -425,6 +435,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
425435
}
426436

427437
mutable std::mutex stats_mutex_;
438+
mutable std::shared_ptr<EncodedStatistics> possible_encoded_stats_;
428439
mutable std::shared_ptr<Statistics> possible_stats_;
429440
mutable std::shared_ptr<geospatial::GeoStatistics> possible_geo_stats_;
430441
std::vector<Encoding::type> encodings_;
@@ -474,6 +485,10 @@ std::shared_ptr<schema::ColumnPath> ColumnChunkMetaData::path_in_schema() const
474485
return impl_->path_in_schema();
475486
}
476487

488+
std::shared_ptr<EncodedStatistics> ColumnChunkMetaData::encoded_statistics() const {
489+
return impl_->encoded_statistics();
490+
}
491+
477492
std::shared_ptr<Statistics> ColumnChunkMetaData::statistics() const {
478493
return impl_->statistics();
479494
}

cpp/src/parquet/metadata.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ class PARQUET_EXPORT ColumnChunkMetaData {
143143
bool is_stats_set() const;
144144
bool is_geo_stats_set() const;
145145
std::shared_ptr<Statistics> statistics() const;
146+
std::shared_ptr<EncodedStatistics> encoded_statistics() const;
146147
std::shared_ptr<SizeStatistics> size_statistics() const;
147148
std::shared_ptr<geospatial::GeoStatistics> geo_statistics() const;
148149

cpp/src/parquet/metadata_test.cc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,10 @@ TEST(Metadata, TestBuildAccess) {
158158
ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax());
159159
ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin());
160160
ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax());
161+
ASSERT_EQ(stats_float.min(), rg1_column2->encoded_statistics()->min());
162+
ASSERT_EQ(stats_float.max(), rg1_column2->encoded_statistics()->max());
163+
ASSERT_EQ(stats_int.min(), rg1_column1->encoded_statistics()->min());
164+
ASSERT_EQ(stats_int.max(), rg1_column1->encoded_statistics()->max());
161165
ASSERT_EQ(0, rg1_column1->statistics()->null_count());
162166
ASSERT_EQ(0, rg1_column2->statistics()->null_count());
163167
ASSERT_EQ(nrows, rg1_column1->statistics()->distinct_count());
@@ -205,6 +209,10 @@ TEST(Metadata, TestBuildAccess) {
205209
ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax());
206210
ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin());
207211
ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax());
212+
ASSERT_EQ(stats_float.min(), rg2_column2->encoded_statistics()->min());
213+
ASSERT_EQ(stats_float.max(), rg2_column2->encoded_statistics()->max());
214+
ASSERT_EQ(stats_int.min(), rg1_column1->encoded_statistics()->min());
215+
ASSERT_EQ(stats_int.max(), rg1_column1->encoded_statistics()->max());
208216
ASSERT_EQ(0, rg2_column1->statistics()->null_count());
209217
ASSERT_EQ(0, rg2_column2->statistics()->null_count());
210218
ASSERT_EQ(nrows, rg2_column1->statistics()->distinct_count());

cpp/src/parquet/printer.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
156156
// Print column metadata
157157
for (auto i : selected_columns) {
158158
auto column_chunk = group_metadata->ColumnChunk(i);
159-
std::shared_ptr<Statistics> stats = column_chunk->statistics();
159+
std::shared_ptr<EncodedStatistics> stats = column_chunk->encoded_statistics();
160160

161161
const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
162162
stream << "Column " << i << std::endl;
@@ -165,9 +165,9 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
165165
}
166166
stream << " Values: " << column_chunk->num_values();
167167
if (column_chunk->is_stats_set()) {
168-
std::string min = stats->EncodeMin(), max = stats->EncodeMax();
169-
stream << ", Null Values: " << stats->null_count()
170-
<< ", Distinct Values: " << stats->distinct_count() << std::endl
168+
std::string min = stats->min(), max = stats->max();
169+
stream << ", Null Values: " << stats->null_count
170+
<< ", Distinct Values: " << stats->distinct_count << std::endl
171171
<< " Max: " << FormatStatValue(descr->physical_type(), max)
172172
<< ", Min: " << FormatStatValue(descr->physical_type(), min);
173173
} else {

cpp/src/parquet/statistics_test.cc

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -452,6 +452,13 @@ class TestStatistics : public PrimitiveTypedTest<TestType> {
452452
EXPECT_TRUE(expected_stats->HasMinMax());
453453
EXPECT_EQ(expected_stats->EncodeMin(), stats->EncodeMin());
454454
EXPECT_EQ(expected_stats->EncodeMax(), stats->EncodeMax());
455+
456+
std::shared_ptr<EncodedStatistics> enc_stats = column_chunk->encoded_statistics();
457+
EXPECT_EQ(null_count, enc_stats->null_count);
458+
EXPECT_TRUE(enc_stats->has_min);
459+
EXPECT_TRUE(enc_stats->has_max);
460+
EXPECT_EQ(expected_stats->EncodeMin(), enc_stats->min());
461+
EXPECT_EQ(expected_stats->EncodeMax(), enc_stats->max());
455462
}
456463
};
457464

@@ -800,6 +807,11 @@ void AssertStatsSet(const ApplicationVersion& version,
800807
stats.set_is_signed(false);
801808
metadata_builder->SetStatistics(stats);
802809
ASSERT_EQ(column_chunk->is_stats_set(), expected_is_set);
810+
if (expected_is_set) {
811+
ASSERT_TRUE(column_chunk->encoded_statistics() != nullptr);
812+
} else {
813+
ASSERT_TRUE(column_chunk->encoded_statistics() == nullptr);
814+
}
803815
}
804816

805817
// Statistics are restricted for few types in older parquet version

cpp/src/parquet/thrift_internal.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,38 @@ static inline AadMetadata FromThrift(format::AesGcmCtrV1 aesGcmCtrV1) {
262262
aesGcmCtrV1.supply_aad_prefix};
263263
}
264264

265+
static inline EncodedStatistics FromThrift(const format::Statistics& stats) {
266+
EncodedStatistics out;
267+
268+
// Use the new V2 min-max statistics over the former one if it is filled
269+
if (stats.__isset.max_value || stats.__isset.min_value) {
270+
// TODO: check if the column_order is TYPE_DEFINED_ORDER.
271+
if (stats.__isset.max_value) {
272+
out.set_max(stats.max_value);
273+
}
274+
if (stats.__isset.min_value) {
275+
out.set_min(stats.min_value);
276+
}
277+
} else if (stats.__isset.max || stats.__isset.min) {
278+
// TODO: check created_by to see if it is corrupted for some types.
279+
// TODO: check if the sort_order is SIGNED.
280+
if (stats.__isset.max) {
281+
out.set_max(stats.max);
282+
}
283+
if (stats.__isset.min) {
284+
out.set_min(stats.min);
285+
}
286+
}
287+
if (stats.__isset.null_count) {
288+
out.set_null_count(stats.null_count);
289+
}
290+
if (stats.__isset.distinct_count) {
291+
out.set_distinct_count(stats.distinct_count);
292+
}
293+
294+
return out;
295+
}
296+
265297
static inline geospatial::EncodedGeoStatistics FromThrift(
266298
const format::GeospatialStatistics& geo_stats) {
267299
geospatial::EncodedGeoStatistics out;

0 commit comments

Comments
 (0)