Skip to content

Commit a1d2895

Browse files
committed
GH-48089: [C++][Parquet] Read statistics and other metadata when fuzzing
1 parent c10847c commit a1d2895

File tree

2 files changed

+51 -4 lines changed

cpp/src/parquet/arrow/reader.cc

Lines changed: 47 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
#include <algorithm>
2121
#include <cstring>
2222
#include <memory>
23+
#include <random>
2324
#include <unordered_set>
2425
#include <utility>
2526
#include <vector>
@@ -40,7 +41,10 @@
4041
#include "arrow/util/parallel.h"
4142
#include "arrow/util/range.h"
4243
#include "arrow/util/tracing_internal.h"
44+
4345
#include "parquet/arrow/reader_internal.h"
46+
#include "parquet/bloom_filter.h"
47+
#include "parquet/bloom_filter_reader.h"
4448
#include "parquet/column_reader.h"
4549
#include "parquet/exception.h"
4650
#include "parquet/file_reader.h"
@@ -1419,11 +1423,51 @@ Status FuzzReader(const uint8_t* data, int64_t size) {
14191423
auto buffer = std::make_shared<::arrow::Buffer>(data, size);
14201424
auto file = std::make_shared<::arrow::io::BufferReader>(buffer);
14211425
auto pool = ::arrow::default_memory_pool();
1426+
auto reader_properties = default_reader_properties();
1427+
std::default_random_engine rng(/*seed*/ 42);
14221428

14231429
// Read Parquet file metadata only once, which will reduce iteration time slightly
14241430
std::shared_ptr<FileMetaData> pq_md;
1425-
BEGIN_PARQUET_CATCH_EXCEPTIONS
1426-
pq_md = ParquetFileReader::Open(file)->metadata();
1431+
int num_row_groups, num_columns;
1432+
BEGIN_PARQUET_CATCH_EXCEPTIONS {
1433+
// Read some additional metadata (often lazy-decoded, such as statistics)
1434+
pq_md = ParquetFileReader::Open(file)->metadata();
1435+
num_row_groups = pq_md->num_row_groups();
1436+
num_columns = pq_md->num_columns();
1437+
for (int i = 0; i < num_row_groups; ++i) {
1438+
auto rg = pq_md->RowGroup(i);
1439+
rg->sorting_columns();
1440+
for (int j = 0; j < num_columns; ++j) {
1441+
auto col = rg->ColumnChunk(j);
1442+
col->encoded_statistics();
1443+
col->statistics();
1444+
col->geo_statistics();
1445+
col->size_statistics();
1446+
col->key_value_metadata();
1447+
col->encodings();
1448+
col->encoding_stats();
1449+
// TODO read offset index
1450+
// TODO read column index
1451+
}
1452+
}
1453+
}
1454+
{
1455+
// Read and decode bloom filters
1456+
auto bloom_reader = BloomFilterReader::Make(file, pq_md, reader_properties);
1457+
std::uniform_int_distribution<uint64_t> hash_dist;
1458+
for (int i = 0; i < num_row_groups; ++i) {
1459+
auto bloom_rg = bloom_reader->RowGroup(i);
1460+
for (int j = 0; j < num_columns; ++j) {
1461+
auto bloom = bloom_rg->GetColumnBloomFilter(j);
1462+
// If the column has a bloom filter, find a bunch of random hashes
1463+
if (bloom != nullptr) {
1464+
for (int k = 0; k < 100; ++k) {
1465+
bloom->FindHash(hash_dist(rng));
1466+
}
1467+
}
1468+
}
1469+
}
1470+
}
14271471
END_PARQUET_CATCH_EXCEPTIONS
14281472

14291473
// Note that very small batch sizes probably make fuzzing slower
@@ -1435,7 +1479,7 @@ Status FuzzReader(const uint8_t* data, int64_t size) {
14351479

14361480
std::unique_ptr<ParquetFileReader> pq_file_reader;
14371481
BEGIN_PARQUET_CATCH_EXCEPTIONS
1438-
pq_file_reader = ParquetFileReader::Open(file, default_reader_properties(), pq_md);
1482+
pq_file_reader = ParquetFileReader::Open(file, reader_properties, pq_md);
14391483
END_PARQUET_CATCH_EXCEPTIONS
14401484

14411485
std::unique_ptr<FileReader> reader;

cpp/src/parquet/bloom_filter_reader.cc

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -60,6 +60,9 @@ std::unique_ptr<BloomFilter> RowGroupBloomFilterReaderImpl::GetColumnBloomFilter
6060
return nullptr;
6161
}
6262
PARQUET_ASSIGN_OR_THROW(auto file_size, input_->GetSize());
63+
if (*bloom_filter_offset < 0) {
64+
throw ParquetException("bloom_filter_offset less than 0");
65+
}
6366
if (file_size <= *bloom_filter_offset) {
6467
throw ParquetException("file size less or equal than bloom offset");
6568
}
@@ -68,7 +71,7 @@ std::unique_ptr<BloomFilter> RowGroupBloomFilterReaderImpl::GetColumnBloomFilter
6871
if (*bloom_filter_length < 0) {
6972
throw ParquetException("bloom_filter_length less than 0");
7073
}
71-
if (*bloom_filter_length + *bloom_filter_offset > file_size) {
74+
if (*bloom_filter_length > file_size - *bloom_filter_offset) {
7275
throw ParquetException(
7376
"bloom filter length + bloom filter offset greater than file size");
7477
}

0 commit comments

Comments
 (0)