Skip to content

Commit fe50a06

Browse files
committed
GH-48335: [C++][Parquet] Fuzz encrypted files
1 parent b2e8f25 commit fe50a06

File tree

11 files changed

+609
-308
lines changed

11 files changed

+609
-308
lines changed

cpp/src/arrow/util/fuzz_internal.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ void LogFuzzStatus(const Status& st, const uint8_t* data, int64_t size) {
4949
return value;
5050
}();
5151

52-
if (kVerbosity >= 1) {
52+
if (!st.ok() && kVerbosity >= 1) {
5353
ARROW_LOG(WARNING) << "Fuzzing input with size=" << size
5454
<< " failed: " << st.ToString();
5555
} else if (st.IsOutOfMemory()) {

cpp/src/parquet/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ endif()
151151
# Library config
152152

153153
set(PARQUET_SRCS
154+
arrow/fuzz_internal.cc
154155
arrow/path_internal.cc
155156
arrow/reader.cc
156157
arrow/reader_internal.cc

cpp/src/parquet/arrow/arrow_reader_writer_test.cc

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@
6565
#include "parquet/api/reader.h"
6666
#include "parquet/api/writer.h"
6767

68+
#include "parquet/arrow/fuzz_internal.h"
6869
#include "parquet/arrow/reader.h"
6970
#include "parquet/arrow/reader_internal.h"
7071
#include "parquet/arrow/schema.h"
@@ -5830,29 +5831,45 @@ TEST(TestArrowReadWrite, MultithreadedWrite) {
58305831
}
58315832

58325833
TEST(TestArrowReadWrite, FuzzReader) {
5834+
using ::parquet::fuzzing::internal::FuzzReader;
5835+
58335836
constexpr size_t kMaxFileSize = 1024 * 1024 * 1;
5837+
58345838
auto check_bad_file = [&](const std::string& file_name) {
58355839
SCOPED_TRACE(file_name);
58365840
auto path = test::get_data_file(file_name, /*is_good=*/false);
58375841
PARQUET_ASSIGN_OR_THROW(auto source, ::arrow::io::MemoryMappedFile::Open(
58385842
path, ::arrow::io::FileMode::READ));
58395843
PARQUET_ASSIGN_OR_THROW(auto buffer, source->Read(kMaxFileSize));
5840-
auto s = internal::FuzzReader(buffer->data(), buffer->size());
5844+
auto s = FuzzReader(buffer->data(), buffer->size());
58415845
ASSERT_NOT_OK(s);
58425846
};
5847+
5848+
auto check_good_file = [&](const std::string& file_name) {
5849+
SCOPED_TRACE(file_name);
5850+
auto path = test::get_data_file(file_name, /*is_good=*/true);
5851+
PARQUET_ASSIGN_OR_THROW(auto source, ::arrow::io::MemoryMappedFile::Open(
5852+
path, ::arrow::io::FileMode::READ));
5853+
PARQUET_ASSIGN_OR_THROW(auto buffer, source->Read(kMaxFileSize));
5854+
auto s = FuzzReader(buffer->data(), buffer->size());
5855+
ASSERT_OK(s);
5856+
};
5857+
58435858
check_bad_file("PARQUET-1481.parquet");
58445859
check_bad_file("ARROW-GH-41317.parquet");
58455860
check_bad_file("ARROW-GH-41321.parquet");
58465861
check_bad_file("ARROW-RS-GH-6229-LEVELS.parquet");
58475862
check_bad_file("ARROW-RS-GH-6229-DICTHEADER.parquet");
5848-
{
5849-
auto path = test::get_data_file("alltypes_plain.parquet", /*is_good=*/true);
5850-
PARQUET_ASSIGN_OR_THROW(auto source, ::arrow::io::MemoryMappedFile::Open(
5851-
path, ::arrow::io::FileMode::READ));
5852-
PARQUET_ASSIGN_OR_THROW(auto buffer, source->Read(kMaxFileSize));
5853-
auto s = internal::FuzzReader(buffer->data(), buffer->size());
5854-
ASSERT_OK(s);
5855-
}
5863+
5864+
check_good_file("alltypes_plain.parquet");
5865+
check_good_file("data_index_bloom_encoding_stats.parquet");
5866+
check_good_file("data_index_bloom_encoding_with_length.parquet");
5867+
// Encrypted files in the testing repo should be ok, except those
5868+
// that require external key material or an explicitly-supplied AAD.
5869+
check_good_file("uniform_encryption.parquet.encrypted");
5870+
check_good_file("encrypt_columns_and_footer_aad.parquet.encrypted");
5871+
check_good_file("encrypt_columns_and_footer.parquet.encrypted");
5872+
check_good_file("encrypt_columns_plaintext_footer.parquet.encrypted");
58565873
}
58575874

58585875
// Test writing table with a closed writer, should not segfault (GH-37969).

cpp/src/parquet/arrow/fuzz.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,10 @@
1717

1818
#include "arrow/status.h"
1919
#include "arrow/util/fuzz_internal.h"
20-
#include "parquet/arrow/reader.h"
20+
#include "parquet/arrow/fuzz_internal.h"
2121

2222
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
23-
auto status = parquet::arrow::internal::FuzzReader(data, static_cast<int64_t>(size));
23+
auto status = parquet::fuzzing::internal::FuzzReader(data, static_cast<int64_t>(size));
2424
arrow::internal::LogFuzzStatus(status, data, static_cast<int64_t>(size));
2525
return 0;
2626
}
Lines changed: 290 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,290 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include "parquet/arrow/fuzz_internal.h"
19+
20+
#include <cstdint>
21+
#include <random>
22+
#include <string_view>
23+
#include <unordered_map>
24+
25+
#include "arrow/io/memory.h"
26+
#include "arrow/table.h"
27+
#include "arrow/util/base64.h"
28+
#include "arrow/util/fuzz_internal.h"
29+
#include "arrow/util/string.h"
30+
#include "parquet/arrow/reader.h"
31+
#include "parquet/bloom_filter.h"
32+
#include "parquet/bloom_filter_reader.h"
33+
#include "parquet/page_index.h"
34+
#include "parquet/properties.h"
35+
36+
namespace parquet::fuzzing::internal {
37+
38+
using ::arrow::MemoryPool;
39+
using ::arrow::Status;
40+
using ::arrow::Table;
41+
using ::arrow::util::SecureString;
42+
using ::parquet::arrow::FileReader;
43+
44+
namespace {
45+
46+
constexpr std::string_view kInlineKeyPrefix = "inline:";
47+
48+
// See https://github.com/apache/parquet-testing/blob/master/data/README.md#encrypted-files
49+
const std::unordered_map<std::string, SecureString> kTestingKeys = {
50+
{"kf", SecureString("0123456789012345")},
51+
{"kc1", SecureString("1234567890123450")},
52+
{"kc2", SecureString("1234567890123451")},
53+
};
54+
55+
} // namespace
56+
57+
// Build a pseudo-random encryption key of `key_len` bytes.
//
// The accompanying key metadata is kInlineKeyPrefix followed by the base64
// encoding of the key bytes, so that the key can be recovered from the
// metadata alone.
EncryptionKey MakeEncryptionKey(int key_len) {
  // The engine is function-static and seeded with a fixed value: every call
  // continues the same random sequence, hence yields a different key.
  static std::default_random_engine engine(/*seed=*/42);

  std::uniform_int_distribution<unsigned int> byte_dist(0, 255);
  std::string raw_key(key_len, '\x00');
  for (size_t pos = 0; pos < raw_key.size(); ++pos) {
    raw_key[pos] = static_cast<uint8_t>(byte_dist(engine));
  }

  // Encode the key before it is moved into the result below
  std::string metadata(kInlineKeyPrefix);
  metadata.append(::arrow::util::base64_encode(raw_key));

  return {SecureString(std::move(raw_key)), std::move(metadata)};
}
72+
73+
class FuzzDecryptionKeyRetriever : public DecryptionKeyRetriever {
74+
public:
75+
SecureString GetKey(const std::string& key_id) override {
76+
// Is it one of the keys used in parquet-testing?
77+
auto it = kTestingKeys.find(key_id);
78+
if (it != kTestingKeys.end()) {
79+
return it->second;
80+
}
81+
// Is it a key generated by MakeEncryptionKey?
82+
if (::arrow::internal::StartsWith(key_id, kInlineKeyPrefix)) {
83+
return SecureString(
84+
::arrow::util::base64_decode(key_id.substr(kInlineKeyPrefix.length())));
85+
}
86+
throw ParquetException("Unknown fuzz encryption key_id");
87+
}
88+
};
89+
90+
std::shared_ptr<DecryptionKeyRetriever> MakeKeyRetriever() {
91+
return std::make_shared<FuzzDecryptionKeyRetriever>();
92+
}
93+
94+
namespace {
95+
96+
// Read every row group of the file as a Table and fully validate it.
//
// A failure in one row group does not stop the iteration: the remaining row
// groups are still read, and the statuses are combined with `&=`.
Status FuzzReadData(std::unique_ptr<FileReader> reader) {
  Status overall;
  for (int row_group = 0; row_group < reader->num_row_groups(); ++row_group) {
    std::shared_ptr<Table> table;
    Status current = reader->ReadRowGroup(row_group, &table);
    if (current.ok()) {
      // Only validate a table that was actually produced
      current = table->ValidateFull();
    }
    overall &= current;
  }
  return overall;
}
108+
109+
template <typename DType>
110+
Status FuzzReadTypedColumnIndex(const TypedColumnIndex<DType>* index) {
111+
index->min_values();
112+
index->max_values();
113+
return Status::OK();
114+
}
115+
116+
// Call all accessors of a column index: first the type-agnostic ones, then
// the typed min/max values selected by the physical type of `descr`.
//
// Parquet exceptions raised along the way are handled by the
// *_PARQUET_CATCH_EXCEPTIONS macros surrounding the body.
Status FuzzReadColumnIndex(const ColumnIndex* index, const ColumnDescriptor* descr) {
  Status result;
  // Shared tail of every typed case below
  const auto read_typed = [&result](const auto* typed_index) {
    result &= FuzzReadTypedColumnIndex(typed_index);
  };

  BEGIN_PARQUET_CATCH_EXCEPTIONS
  index->definition_level_histograms();
  index->repetition_level_histograms();
  index->null_pages();
  index->null_counts();
  index->non_null_page_indices();
  index->encoded_min_values();
  index->encoded_max_values();
  switch (descr->physical_type()) {
    case Type::BOOLEAN:
      read_typed(dynamic_cast<const BoolColumnIndex*>(index));
      break;
    case Type::INT32:
      read_typed(dynamic_cast<const Int32ColumnIndex*>(index));
      break;
    case Type::INT64:
      read_typed(dynamic_cast<const Int64ColumnIndex*>(index));
      break;
    case Type::INT96:
      read_typed(dynamic_cast<const TypedColumnIndex<Int96Type>*>(index));
      break;
    case Type::FLOAT:
      read_typed(dynamic_cast<const FloatColumnIndex*>(index));
      break;
    case Type::DOUBLE:
      read_typed(dynamic_cast<const DoubleColumnIndex*>(index));
      break;
    case Type::FIXED_LEN_BYTE_ARRAY:
      read_typed(dynamic_cast<const FLBAColumnIndex*>(index));
      break;
    case Type::BYTE_ARRAY:
      read_typed(dynamic_cast<const ByteArrayColumnIndex*>(index));
      break;
    case Type::UNDEFINED:
      // No typed index to read
      break;
  }
  END_PARQUET_CATCH_EXCEPTIONS
  return result;
}
158+
159+
// Read the offset index and the column index of one column of a row group.
//
// Either index may be absent (null), in which case it is skipped.
// Parquet exceptions raised along the way are handled by the
// *_PARQUET_CATCH_EXCEPTIONS macros surrounding the body.
Status FuzzReadPageIndex(RowGroupPageIndexReader* reader, const SchemaDescriptor* schema,
                         int column) {
  Status result;
  BEGIN_PARQUET_CATCH_EXCEPTIONS
  const auto offsets = reader->GetOffsetIndex(column);
  if (offsets != nullptr) {
    offsets->page_locations();
    offsets->unencoded_byte_array_data_bytes();
  }
  const auto column_index = reader->GetColumnIndex(column);
  if (column_index != nullptr) {
    result &= FuzzReadColumnIndex(column_index.get(), schema->Column(column));
  }
  END_PARQUET_CATCH_EXCEPTIONS
  return result;
}
175+
176+
// Build the ReaderProperties used for fuzzing: they allocate from `pool` and
// carry decryption properties that use the fuzzing key retriever while still
// allowing plaintext files.
ReaderProperties MakeFuzzReaderProperties(MemoryPool* pool) {
  FileDecryptionProperties::Builder decryption_builder;
  decryption_builder.key_retriever(MakeKeyRetriever());
  decryption_builder.plaintext_files_allowed();
  // XXX An AAD prefix cannot be set here, as that would fail on files
  // that store their own AAD prefix.
  auto decryption = decryption_builder.build();

  ReaderProperties reader_properties(pool);
  reader_properties.file_decryption_properties(decryption);
  return reader_properties;
}
188+
189+
} // namespace
190+
191+
// Fuzzing entry point: interpret `data` (of `size` bytes) as a Parquet file
// and exercise as much of the reader as possible.
//
// The file is first opened once to decode its metadata, bloom filters and
// page indexes. Then its data is read through the Arrow FileReader with
// several batch sizes. Statuses are accumulated with `&=`; Parquet exceptions
// are handled by the *_PARQUET_CATCH_EXCEPTIONS macros, and a failure of
// FileReader::Make returns immediately.
Status FuzzReader(const uint8_t* data, int64_t size) {
  Status result;

  auto buffer = std::make_shared<::arrow::Buffer>(data, size);
  auto file = std::make_shared<::arrow::io::BufferReader>(buffer);
  auto pool = ::arrow::internal::fuzzing_memory_pool();
  auto reader_properties = MakeFuzzReaderProperties(pool);

  // Fixed seed, so that the hashes probed below are the same on every run
  std::default_random_engine rng(/*seed*/ 42);

  // Read Parquet file metadata only once, which will reduce iteration time slightly
  std::shared_ptr<FileMetaData> pq_md;
  BEGIN_PARQUET_CATCH_EXCEPTIONS {
    auto metadata_reader = ParquetFileReader::Open(file, reader_properties);
    pq_md = metadata_reader->metadata();
    const int num_row_groups = pq_md->num_row_groups();
    const int num_columns = pq_md->num_columns();

    // Read some additional metadata (often lazy-decoded, such as statistics)
    for (int rg = 0; rg < num_row_groups; ++rg) {
      auto rg_md = pq_md->RowGroup(rg);
      rg_md->sorting_columns();
      for (int col = 0; col < num_columns; ++col) {
        auto col_md = rg_md->ColumnChunk(col);
        col_md->encoded_statistics();
        col_md->statistics();
        col_md->geo_statistics();
        col_md->size_statistics();
        col_md->key_value_metadata();
        col_md->encodings();
        col_md->encoding_stats();
      }
    }

    // Read and decode bloom filters
    try {
      auto& bloom_reader = metadata_reader->GetBloomFilterReader();
      std::uniform_int_distribution<uint64_t> hash_dist;
      for (int rg = 0; rg < num_row_groups; ++rg) {
        auto rg_bloom = bloom_reader.RowGroup(rg);
        for (int col = 0; col < num_columns; ++col) {
          std::unique_ptr<BloomFilter> bloom = rg_bloom->GetColumnBloomFilter(col);
          if (bloom == nullptr) {
            continue;
          }
          // The column has a bloom filter: find a bunch of random hashes
          for (int k = 0; k < 100; ++k) {
            bloom->FindHash(hash_dist(rng));
          }
        }
      }
    } catch (const ParquetException& exc) {
      // XXX we just want to ignore encrypted bloom filters and validate the
      // rest of the file; there is no better way of doing this until GH-46597
      // is done.
      // (also see GH-48334 for reading encrypted bloom filters)
      const std::string_view message(exc.what());
      const bool is_unsupported_decryption =
          message.find("BloomFilter decryption is not yet supported") !=
          std::string_view::npos;
      if (!is_unsupported_decryption) {
        throw;
      }
    }

    // Read and decode page indexes
    auto index_reader = metadata_reader->GetPageIndexReader();
    for (int rg = 0; rg < num_row_groups; ++rg) {
      auto rg_index = index_reader->RowGroup(rg);
      if (!rg_index) {
        continue;
      }
      for (int col = 0; col < num_columns; ++col) {
        result &= FuzzReadPageIndex(rg_index.get(), pq_md->schema(), col);
      }
    }
  }
  END_PARQUET_CATCH_EXCEPTIONS

  // Note that very small batch sizes probably make fuzzing slower
  for (auto batch_size : std::vector<std::optional<int>>{std::nullopt, 13, 300}) {
    // A null batch size keeps the default of ArrowReaderProperties
    ArrowReaderProperties arrow_properties;
    if (batch_size.has_value()) {
      arrow_properties.set_batch_size(*batch_size);
    }

    // Reuse the metadata decoded above instead of parsing it again
    std::unique_ptr<ParquetFileReader> pq_file_reader;
    BEGIN_PARQUET_CATCH_EXCEPTIONS
    pq_file_reader = ParquetFileReader::Open(file, reader_properties, pq_md);
    END_PARQUET_CATCH_EXCEPTIONS

    std::unique_ptr<FileReader> arrow_reader;
    RETURN_NOT_OK(FileReader::Make(pool, std::move(pq_file_reader), arrow_properties,
                                   &arrow_reader));
    result &= FuzzReadData(std::move(arrow_reader));
  }
  return result;
}
289+
290+
} // namespace parquet::fuzzing::internal

0 commit comments

Comments
 (0)