@@ -20,6 +20,7 @@
 #include <algorithm>
 #include <cstring>
 #include <memory>
+#include <random>
 #include <unordered_set>
 #include <utility>
 #include <vector>
@@ -40,7 +41,10 @@
 #include "arrow/util/parallel.h"
 #include "arrow/util/range.h"
 #include "arrow/util/tracing_internal.h"
+
 #include "parquet/arrow/reader_internal.h"
+#include "parquet/bloom_filter.h"
+#include "parquet/bloom_filter_reader.h"
 #include "parquet/column_reader.h"
 #include "parquet/exception.h"
 #include "parquet/file_reader.h"
@@ -1419,11 +1423,51 @@ Status FuzzReader(const uint8_t* data, int64_t size) {
   auto buffer = std::make_shared<::arrow::Buffer>(data, size);
   auto file = std::make_shared<::arrow::io::BufferReader>(buffer);
   auto pool = ::arrow::default_memory_pool();
+  auto reader_properties = default_reader_properties();
+  std::default_random_engine rng(/*seed*/ 42);
 
   // Read Parquet file metadata only once, which will reduce iteration time slightly
   std::shared_ptr<FileMetaData> pq_md;
-  BEGIN_PARQUET_CATCH_EXCEPTIONS
-  pq_md = ParquetFileReader::Open(file)->metadata();
+  int num_row_groups, num_columns;
+  BEGIN_PARQUET_CATCH_EXCEPTIONS {
+    // Read some additional metadata (often lazy-decoded, such as statistics)
+    pq_md = ParquetFileReader::Open(file)->metadata();
+    num_row_groups = pq_md->num_row_groups();
+    num_columns = pq_md->num_columns();
+    for (int i = 0; i < num_row_groups; ++i) {
+      auto rg = pq_md->RowGroup(i);
+      rg->sorting_columns();
+      for (int j = 0; j < num_columns; ++j) {
+        auto col = rg->ColumnChunk(j);
+        col->encoded_statistics();
+        col->statistics();
+        col->geo_statistics();
+        col->size_statistics();
+        col->key_value_metadata();
+        col->encodings();
+        col->encoding_stats();
+        // TODO read offset index
+        // TODO read column index
+      }
+    }
+  }
+  {
+    // Read and decode bloom filters
+    auto bloom_reader = BloomFilterReader::Make(file, pq_md, reader_properties);
+    std::uniform_int_distribution<uint64_t> hash_dist;
+    for (int i = 0; i < num_row_groups; ++i) {
+      auto bloom_rg = bloom_reader->RowGroup(i);
+      for (int j = 0; j < num_columns; ++j) {
+        auto bloom = bloom_rg->GetColumnBloomFilter(j);
+        // If the column has a bloom filter, find a bunch of random hashes
+        if (bloom != nullptr) {
+          for (int k = 0; k < 100; ++k) {
+            bloom->FindHash(hash_dist(rng));
+          }
+        }
+      }
+    }
+  }
   END_PARQUET_CATCH_EXCEPTIONS
 
   // Note that very small batch sizes probably make fuzzing slower
@@ -1435,7 +1479,7 @@ Status FuzzReader(const uint8_t* data, int64_t size) {
 
   std::unique_ptr<ParquetFileReader> pq_file_reader;
   BEGIN_PARQUET_CATCH_EXCEPTIONS
-  pq_file_reader = ParquetFileReader::Open(file, default_reader_properties(), pq_md);
+  pq_file_reader = ParquetFileReader::Open(file, reader_properties, pq_md);
   END_PARQUET_CATCH_EXCEPTIONS
 
   std::unique_ptr<FileReader> reader;