Skip to content

Commit 2f70708

Browse files
committed
GH-48335: [C++][Parquet] Fuzz encrypted files
1 parent b2e8f25 commit 2f70708

File tree

11 files changed

+606
-308
lines changed

11 files changed

+606
-308
lines changed

cpp/src/arrow/util/fuzz_internal.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ void LogFuzzStatus(const Status& st, const uint8_t* data, int64_t size) {
4949
return value;
5050
}();
5151

52-
if (kVerbosity >= 1) {
52+
if (!st.ok() && kVerbosity >= 1) {
5353
ARROW_LOG(WARNING) << "Fuzzing input with size=" << size
5454
<< " failed: " << st.ToString();
5555
} else if (st.IsOutOfMemory()) {

cpp/src/parquet/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ endif()
151151
# Library config
152152

153153
set(PARQUET_SRCS
154+
arrow/fuzz_internal.cc
154155
arrow/path_internal.cc
155156
arrow/reader.cc
156157
arrow/reader_internal.cc

cpp/src/parquet/arrow/arrow_reader_writer_test.cc

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@
6565
#include "parquet/api/reader.h"
6666
#include "parquet/api/writer.h"
6767

68+
#include "parquet/arrow/fuzz_internal.h"
6869
#include "parquet/arrow/reader.h"
6970
#include "parquet/arrow/reader_internal.h"
7071
#include "parquet/arrow/schema.h"
@@ -5830,29 +5831,45 @@ TEST(TestArrowReadWrite, MultithreadedWrite) {
58305831
}
58315832

58325833
TEST(TestArrowReadWrite, FuzzReader) {
5834+
using ::parquet::fuzzing::internal::FuzzReader;
5835+
58335836
constexpr size_t kMaxFileSize = 1024 * 1024 * 1;
5837+
58345838
auto check_bad_file = [&](const std::string& file_name) {
58355839
SCOPED_TRACE(file_name);
58365840
auto path = test::get_data_file(file_name, /*is_good=*/false);
58375841
PARQUET_ASSIGN_OR_THROW(auto source, ::arrow::io::MemoryMappedFile::Open(
58385842
path, ::arrow::io::FileMode::READ));
58395843
PARQUET_ASSIGN_OR_THROW(auto buffer, source->Read(kMaxFileSize));
5840-
auto s = internal::FuzzReader(buffer->data(), buffer->size());
5844+
auto s = FuzzReader(buffer->data(), buffer->size());
58415845
ASSERT_NOT_OK(s);
58425846
};
5847+
5848+
auto check_good_file = [&](const std::string& file_name) {
5849+
SCOPED_TRACE(file_name);
5850+
auto path = test::get_data_file(file_name, /*is_good=*/true);
5851+
PARQUET_ASSIGN_OR_THROW(auto source, ::arrow::io::MemoryMappedFile::Open(
5852+
path, ::arrow::io::FileMode::READ));
5853+
PARQUET_ASSIGN_OR_THROW(auto buffer, source->Read(kMaxFileSize));
5854+
auto s = FuzzReader(buffer->data(), buffer->size());
5855+
ASSERT_OK(s);
5856+
};
5857+
58435858
check_bad_file("PARQUET-1481.parquet");
58445859
check_bad_file("ARROW-GH-41317.parquet");
58455860
check_bad_file("ARROW-GH-41321.parquet");
58465861
check_bad_file("ARROW-RS-GH-6229-LEVELS.parquet");
58475862
check_bad_file("ARROW-RS-GH-6229-DICTHEADER.parquet");
5848-
{
5849-
auto path = test::get_data_file("alltypes_plain.parquet", /*is_good=*/true);
5850-
PARQUET_ASSIGN_OR_THROW(auto source, ::arrow::io::MemoryMappedFile::Open(
5851-
path, ::arrow::io::FileMode::READ));
5852-
PARQUET_ASSIGN_OR_THROW(auto buffer, source->Read(kMaxFileSize));
5853-
auto s = internal::FuzzReader(buffer->data(), buffer->size());
5854-
ASSERT_OK(s);
5855-
}
5863+
5864+
check_good_file("alltypes_plain.parquet");
5865+
check_good_file("data_index_bloom_encoding_stats.parquet");
5866+
check_good_file("data_index_bloom_encoding_with_length.parquet");
5867+
// Encrypted files in the testing repo should be ok, except those
5868+
// that require external key material or an explicitly-supplied AAD.
5869+
check_good_file("uniform_encryption.parquet.encrypted");
5870+
check_good_file("encrypt_columns_and_footer_aad.parquet.encrypted");
5871+
check_good_file("encrypt_columns_and_footer.parquet.encrypted");
5872+
check_good_file("encrypt_columns_plaintext_footer.parquet.encrypted");
58565873
}
58575874

58585875
// Test writing table with a closed writer, should not segfault (GH-37969).

cpp/src/parquet/arrow/fuzz.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,10 @@
1717

1818
#include "arrow/status.h"
1919
#include "arrow/util/fuzz_internal.h"
20-
#include "parquet/arrow/reader.h"
20+
#include "parquet/arrow/fuzz_internal.h"
2121

2222
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
23-
auto status = parquet::arrow::internal::FuzzReader(data, static_cast<int64_t>(size));
23+
auto status = parquet::fuzzing::internal::FuzzReader(data, static_cast<int64_t>(size));
2424
arrow::internal::LogFuzzStatus(status, data, static_cast<int64_t>(size));
2525
return 0;
2626
}
Lines changed: 289 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,289 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include "parquet/arrow/fuzz_internal.h"
19+
20+
#include <cstdint>
21+
#include <random>
22+
#include <string_view>
23+
#include <unordered_map>
24+
25+
#include "arrow/io/memory.h"
26+
#include "arrow/table.h"
27+
#include "arrow/util/base64.h"
28+
#include "arrow/util/fuzz_internal.h"
29+
#include "arrow/util/string.h"
30+
#include "parquet/arrow/reader.h"
31+
#include "parquet/bloom_filter.h"
32+
#include "parquet/bloom_filter_reader.h"
33+
#include "parquet/page_index.h"
34+
#include "parquet/properties.h"
35+
36+
namespace parquet::fuzzing::internal {
37+
38+
using ::arrow::MemoryPool;
39+
using ::arrow::Status;
40+
using ::arrow::Table;
41+
using ::arrow::util::SecureString;
42+
using ::parquet::arrow::FileReader;
43+
44+
namespace {
45+
46+
constexpr std::string_view kInlineKeyPrefix = "inline:";
47+
48+
const std::unordered_map<std::string, SecureString> kTestingKeys = {
49+
{"kf", SecureString("0123456789012345")},
50+
{"kc1", SecureString("1234567890123450")},
51+
{"kc2", SecureString("1234567890123451")},
52+
};
53+
54+
} // namespace
55+
56+
/// Generate a pseudo-random encryption key of `key_len` bytes.
///
/// The returned key metadata is the "inline:" prefix followed by the base64
/// encoding of the key bytes; FuzzDecryptionKeyRetriever::GetKey decodes such
/// an id back into the key.
EncryptionKey MakeEncryptionKey(int key_len) {
  // The engine is function-static (fixed seed) so that successive calls keep
  // consuming the same random stream and therefore produce different keys.
  static std::default_random_engine engine(/*seed=*/42);

  std::uniform_int_distribution<unsigned int> byte_dist(0, 255);
  std::string key_bytes(key_len, '\x00');
  for (std::string::size_type pos = 0; pos < key_bytes.size(); ++pos) {
    key_bytes[pos] = static_cast<uint8_t>(byte_dist(engine));
  }

  // The metadata must be derived before the key bytes are moved out below.
  std::string metadata(kInlineKeyPrefix);
  metadata += ::arrow::util::base64_encode(key_bytes);

  return {SecureString(std::move(key_bytes)), std::move(metadata)};
}
71+
72+
class FuzzDecryptionKeyRetriever : public DecryptionKeyRetriever {
73+
public:
74+
SecureString GetKey(const std::string& key_id) override {
75+
// Is it one of the keys used in parquet-testing?
76+
auto it = kTestingKeys.find(key_id);
77+
if (it != kTestingKeys.end()) {
78+
return it->second;
79+
}
80+
// Is it a key generated by MakeEncryptionKey?
81+
if (::arrow::internal::StartsWith(key_id, kInlineKeyPrefix)) {
82+
return SecureString(
83+
::arrow::util::base64_decode(key_id.substr(kInlineKeyPrefix.length())));
84+
}
85+
throw ParquetException("Unknown fuzz encryption key_id");
86+
}
87+
};
88+
89+
std::shared_ptr<DecryptionKeyRetriever> MakeKeyRetriever() {
90+
return std::make_shared<FuzzDecryptionKeyRetriever>();
91+
}
92+
93+
namespace {
94+
95+
// Read each row group of the file into a Table and fully validate it.
//
// A failing row group does not stop the iteration; the per-row-group statuses
// are combined with `&=` into the returned status.
Status FuzzReadData(std::unique_ptr<FileReader> reader) {
  Status overall;
  int row_group = 0;
  while (row_group < reader->num_row_groups()) {
    std::shared_ptr<Table> table;
    Status current = reader->ReadRowGroup(row_group, &table);
    if (current.ok()) {
      // Only a successfully read table can be validated.
      current &= table->ValidateFull();
    }
    overall &= current;
    ++row_group;
  }
  return overall;
}
107+
108+
template <typename DType>
109+
Status FuzzReadTypedColumnIndex(const TypedColumnIndex<DType>* index) {
110+
index->min_values();
111+
index->max_values();
112+
return Status::OK();
113+
}
114+
115+
// Exercise all accessors of a column index: first the type-agnostic ones,
// then the typed min/max values selected by the column's physical type.
//
// Parquet exceptions raised along the way are converted to a Status by the
// PARQUET_CATCH_EXCEPTIONS macros.
Status FuzzReadColumnIndex(const ColumnIndex* index, const ColumnDescriptor* descr) {
  Status st;
  // Run the typed accessors on an already-downcast index and fold the result
  // into `st`.
  const auto read_typed = [&st](const auto* typed_index) {
    st &= FuzzReadTypedColumnIndex(typed_index);
  };

  BEGIN_PARQUET_CATCH_EXCEPTIONS
  index->definition_level_histograms();
  index->repetition_level_histograms();
  index->null_pages();
  index->null_counts();
  index->non_null_page_indices();
  index->encoded_min_values();
  index->encoded_max_values();

  const auto physical_type = descr->physical_type();
  if (physical_type == Type::BOOLEAN) {
    read_typed(dynamic_cast<const BoolColumnIndex*>(index));
  } else if (physical_type == Type::INT32) {
    read_typed(dynamic_cast<const Int32ColumnIndex*>(index));
  } else if (physical_type == Type::INT64) {
    read_typed(dynamic_cast<const Int64ColumnIndex*>(index));
  } else if (physical_type == Type::INT96) {
    read_typed(dynamic_cast<const TypedColumnIndex<Int96Type>*>(index));
  } else if (physical_type == Type::FLOAT) {
    read_typed(dynamic_cast<const FloatColumnIndex*>(index));
  } else if (physical_type == Type::DOUBLE) {
    read_typed(dynamic_cast<const DoubleColumnIndex*>(index));
  } else if (physical_type == Type::FIXED_LEN_BYTE_ARRAY) {
    read_typed(dynamic_cast<const FLBAColumnIndex*>(index));
  } else if (physical_type == Type::BYTE_ARRAY) {
    read_typed(dynamic_cast<const ByteArrayColumnIndex*>(index));
  }
  // Type::UNDEFINED (or any other value): no typed accessors to exercise.
  END_PARQUET_CATCH_EXCEPTIONS
  return st;
}
157+
158+
// Read the offset index and the column index of one column of a row group,
// exercising their accessors when present.
//
// Parquet exceptions are converted to a Status by the
// PARQUET_CATCH_EXCEPTIONS macros.
Status FuzzReadPageIndex(RowGroupPageIndexReader* reader, const SchemaDescriptor* schema,
                         int column) {
  Status result;
  BEGIN_PARQUET_CATCH_EXCEPTIONS
  const auto offsets = reader->GetOffsetIndex(column);
  if (offsets != nullptr) {
    offsets->page_locations();
    offsets->unencoded_byte_array_data_bytes();
  }
  const auto column_index = reader->GetColumnIndex(column);
  if (column_index != nullptr) {
    result &= FuzzReadColumnIndex(column_index.get(), schema->Column(column));
  }
  END_PARQUET_CATCH_EXCEPTIONS
  return result;
}
174+
175+
// Build the ReaderProperties used for fuzzing: allocations go through `pool`,
// and decryption is configured with the fuzzing key retriever while still
// allowing plaintext files.
ReaderProperties MakeFuzzReaderProperties(MemoryPool* pool) {
  FileDecryptionProperties::Builder decryption_builder;
  decryption_builder.key_retriever(MakeKeyRetriever());
  decryption_builder.plaintext_files_allowed();
  // XXX An AAD prefix cannot be set here, as that would fail on files
  // that store their own AAD prefix.

  ReaderProperties reader_props(pool);
  reader_props.file_decryption_properties(decryption_builder.build());
  return reader_props;
}
187+
188+
} // namespace
189+
190+
Status FuzzReader(const uint8_t* data, int64_t size) {
191+
Status st;
192+
193+
auto buffer = std::make_shared<::arrow::Buffer>(data, size);
194+
auto file = std::make_shared<::arrow::io::BufferReader>(buffer);
195+
auto pool = ::arrow::internal::fuzzing_memory_pool();
196+
auto reader_properties = MakeFuzzReaderProperties(pool);
197+
198+
std::default_random_engine rng(/*seed*/ 42);
199+
200+
// Read Parquet file metadata only once, which will reduce iteration time slightly
201+
std::shared_ptr<FileMetaData> pq_md;
202+
BEGIN_PARQUET_CATCH_EXCEPTIONS {
203+
int num_row_groups, num_columns;
204+
auto pq_file_reader = ParquetFileReader::Open(file, reader_properties);
205+
{
206+
// Read some additional metadata (often lazy-decoded, such as statistics)
207+
pq_md = pq_file_reader->metadata();
208+
num_row_groups = pq_md->num_row_groups();
209+
num_columns = pq_md->num_columns();
210+
for (int i = 0; i < num_row_groups; ++i) {
211+
auto rg = pq_md->RowGroup(i);
212+
rg->sorting_columns();
213+
for (int j = 0; j < num_columns; ++j) {
214+
auto col = rg->ColumnChunk(j);
215+
col->encoded_statistics();
216+
col->statistics();
217+
col->geo_statistics();
218+
col->size_statistics();
219+
col->key_value_metadata();
220+
col->encodings();
221+
col->encoding_stats();
222+
}
223+
}
224+
}
225+
{
226+
// Read and decode bloom filters
227+
try {
228+
auto& bloom_reader = pq_file_reader->GetBloomFilterReader();
229+
std::uniform_int_distribution<uint64_t> hash_dist;
230+
for (int i = 0; i < num_row_groups; ++i) {
231+
auto bloom_rg = bloom_reader.RowGroup(i);
232+
for (int j = 0; j < num_columns; ++j) {
233+
std::unique_ptr<BloomFilter> bloom;
234+
bloom = bloom_rg->GetColumnBloomFilter(j);
235+
// If the column has a bloom filter, find a bunch of random hashes
236+
if (bloom != nullptr) {
237+
for (int k = 0; k < 100; ++k) {
238+
bloom->FindHash(hash_dist(rng));
239+
}
240+
}
241+
}
242+
}
243+
} catch (const ParquetException& exc) {
244+
// XXX we just want to ignore encrypted bloom filters and validate the
245+
// rest of the file; there is no better way of doing this until GH-46597
246+
// is done.
247+
// (also see GH-48334 for reading encrypted bloom filters)
248+
if (std::string_view(exc.what())
249+
.find("BloomFilter decryption is not yet supported") ==
250+
std::string_view::npos) {
251+
throw;
252+
}
253+
}
254+
}
255+
{
256+
// Read and decode page indexes
257+
auto index_reader = pq_file_reader->GetPageIndexReader();
258+
for (int i = 0; i < num_row_groups; ++i) {
259+
auto index_rg = index_reader->RowGroup(i);
260+
if (index_rg) {
261+
for (int j = 0; j < num_columns; ++j) {
262+
st &= FuzzReadPageIndex(index_rg.get(), pq_md->schema(), j);
263+
}
264+
}
265+
}
266+
}
267+
}
268+
END_PARQUET_CATCH_EXCEPTIONS
269+
270+
// Note that very small batch sizes probably make fuzzing slower
271+
for (auto batch_size : std::vector<std::optional<int>>{std::nullopt, 13, 300}) {
272+
ArrowReaderProperties properties;
273+
if (batch_size) {
274+
properties.set_batch_size(batch_size.value());
275+
}
276+
277+
std::unique_ptr<ParquetFileReader> pq_file_reader;
278+
BEGIN_PARQUET_CATCH_EXCEPTIONS
279+
pq_file_reader = ParquetFileReader::Open(file, reader_properties, pq_md);
280+
END_PARQUET_CATCH_EXCEPTIONS
281+
282+
std::unique_ptr<FileReader> reader;
283+
RETURN_NOT_OK(FileReader::Make(pool, std::move(pq_file_reader), properties, &reader));
284+
st &= FuzzReadData(std::move(reader));
285+
}
286+
return st;
287+
}
288+
289+
} // namespace parquet::fuzzing::internal

0 commit comments

Comments
 (0)