Skip to content

Commit eb4c4c8

Browse files
committed
GH-48335: [C++][Parquet] Fuzz encrypted files
1 parent b2e8f25 commit eb4c4c8

File tree

11 files changed

+610
-308
lines changed

11 files changed

+610
-308
lines changed

cpp/src/arrow/util/fuzz_internal.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ void LogFuzzStatus(const Status& st, const uint8_t* data, int64_t size) {
4949
return value;
5050
}();
5151

52-
if (kVerbosity >= 1) {
52+
if (!st.ok() && kVerbosity >= 1) {
5353
ARROW_LOG(WARNING) << "Fuzzing input with size=" << size
5454
<< " failed: " << st.ToString();
5555
} else if (st.IsOutOfMemory()) {

cpp/src/parquet/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ endif()
151151
# Library config
152152

153153
set(PARQUET_SRCS
154+
arrow/fuzz_internal.cc
154155
arrow/path_internal.cc
155156
arrow/reader.cc
156157
arrow/reader_internal.cc

cpp/src/parquet/arrow/arrow_reader_writer_test.cc

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@
6565
#include "parquet/api/reader.h"
6666
#include "parquet/api/writer.h"
6767

68+
#include "parquet/arrow/fuzz_internal.h"
6869
#include "parquet/arrow/reader.h"
6970
#include "parquet/arrow/reader_internal.h"
7071
#include "parquet/arrow/schema.h"
@@ -5830,29 +5831,45 @@ TEST(TestArrowReadWrite, MultithreadedWrite) {
58305831
}
58315832

58325833
TEST(TestArrowReadWrite, FuzzReader) {
5834+
using ::parquet::fuzzing::internal::FuzzReader;
5835+
58335836
constexpr size_t kMaxFileSize = 1024 * 1024 * 1;
5837+
58345838
auto check_bad_file = [&](const std::string& file_name) {
58355839
SCOPED_TRACE(file_name);
58365840
auto path = test::get_data_file(file_name, /*is_good=*/false);
58375841
PARQUET_ASSIGN_OR_THROW(auto source, ::arrow::io::MemoryMappedFile::Open(
58385842
path, ::arrow::io::FileMode::READ));
58395843
PARQUET_ASSIGN_OR_THROW(auto buffer, source->Read(kMaxFileSize));
5840-
auto s = internal::FuzzReader(buffer->data(), buffer->size());
5844+
auto s = FuzzReader(buffer->data(), buffer->size());
58415845
ASSERT_NOT_OK(s);
58425846
};
5847+
5848+
auto check_good_file = [&](const std::string& file_name) {
5849+
SCOPED_TRACE(file_name);
5850+
auto path = test::get_data_file(file_name, /*is_good=*/true);
5851+
PARQUET_ASSIGN_OR_THROW(auto source, ::arrow::io::MemoryMappedFile::Open(
5852+
path, ::arrow::io::FileMode::READ));
5853+
PARQUET_ASSIGN_OR_THROW(auto buffer, source->Read(kMaxFileSize));
5854+
auto s = FuzzReader(buffer->data(), buffer->size());
5855+
ASSERT_OK(s);
5856+
};
5857+
58435858
check_bad_file("PARQUET-1481.parquet");
58445859
check_bad_file("ARROW-GH-41317.parquet");
58455860
check_bad_file("ARROW-GH-41321.parquet");
58465861
check_bad_file("ARROW-RS-GH-6229-LEVELS.parquet");
58475862
check_bad_file("ARROW-RS-GH-6229-DICTHEADER.parquet");
5848-
{
5849-
auto path = test::get_data_file("alltypes_plain.parquet", /*is_good=*/true);
5850-
PARQUET_ASSIGN_OR_THROW(auto source, ::arrow::io::MemoryMappedFile::Open(
5851-
path, ::arrow::io::FileMode::READ));
5852-
PARQUET_ASSIGN_OR_THROW(auto buffer, source->Read(kMaxFileSize));
5853-
auto s = internal::FuzzReader(buffer->data(), buffer->size());
5854-
ASSERT_OK(s);
5855-
}
5863+
5864+
check_good_file("alltypes_plain.parquet");
5865+
check_good_file("data_index_bloom_encoding_stats.parquet");
5866+
check_good_file("data_index_bloom_encoding_with_length.parquet");
5867+
// Encrypted files in the testing repo should be ok, except those
5868+
// that require external key material or an explicitly-supplied AAD.
5869+
check_good_file("uniform_encryption.parquet.encrypted");
5870+
check_good_file("encrypt_columns_and_footer_aad.parquet.encrypted");
5871+
check_good_file("encrypt_columns_and_footer.parquet.encrypted");
5872+
check_good_file("encrypt_columns_plaintext_footer.parquet.encrypted");
58565873
}
58575874

58585875
// Test writing table with a closed writer, should not segfault (GH-37969).

cpp/src/parquet/arrow/fuzz.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,10 @@
1717

1818
#include "arrow/status.h"
1919
#include "arrow/util/fuzz_internal.h"
20-
#include "parquet/arrow/reader.h"
20+
#include "parquet/arrow/fuzz_internal.h"
2121

2222
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
23-
auto status = parquet::arrow::internal::FuzzReader(data, static_cast<int64_t>(size));
23+
auto status = parquet::fuzzing::internal::FuzzReader(data, static_cast<int64_t>(size));
2424
arrow::internal::LogFuzzStatus(status, data, static_cast<int64_t>(size));
2525
return 0;
2626
}
Lines changed: 291 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,291 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include "parquet/arrow/fuzz_internal.h"
19+
20+
#include <cstdint>
21+
#include <random>
22+
#include <string_view>
23+
#include <unordered_map>
24+
25+
#include "arrow/io/memory.h"
26+
#include "arrow/table.h"
27+
#include "arrow/util/base64.h"
28+
#include "arrow/util/fuzz_internal.h"
29+
#include "arrow/util/string.h"
30+
#include "parquet/arrow/reader.h"
31+
#include "parquet/bloom_filter.h"
32+
#include "parquet/bloom_filter_reader.h"
33+
#include "parquet/page_index.h"
34+
#include "parquet/properties.h"
35+
36+
namespace parquet::fuzzing::internal {
37+
38+
using ::arrow::MemoryPool;
39+
using ::arrow::Status;
40+
using ::arrow::Table;
41+
using ::arrow::util::SecureString;
42+
using ::parquet::arrow::FileReader;
43+
44+
namespace {
45+
46+
constexpr std::string_view kInlineKeyPrefix = "inline:";
47+
48+
// See
49+
// https://github.com/apache/parquet-testing/blob/master/data/README.md#encrypted-files
50+
const std::unordered_map<std::string, SecureString> kTestingKeys = {
51+
{"kf", SecureString("0123456789012345")},
52+
{"kc1", SecureString("1234567890123450")},
53+
{"kc2", SecureString("1234567890123451")},
54+
};
55+
56+
} // namespace
57+
58+
// Builds a pseudo-random key of `key_len` bytes.
//
// The returned pair holds the key itself and a key metadata string made of
// kInlineKeyPrefix followed by the base64 encoding of the key, which is the
// form FuzzDecryptionKeyRetriever::GetKey() decodes.
EncryptionKey MakeEncryptionKey(int key_len) {
  // The engine is function-static so that its state carries over between
  // calls: every call yields a different key, while the fixed seed keeps the
  // overall sequence reproducible.
  static std::default_random_engine engine(/*seed=*/42);

  std::uniform_int_distribution<unsigned int> byte_dist(0, 255);
  std::string raw_key(key_len, '\x00');
  for (char& byte : raw_key) {
    byte = static_cast<uint8_t>(byte_dist(engine));
  }

  std::string metadata(kInlineKeyPrefix);
  metadata += ::arrow::util::base64_encode(raw_key);

  return {SecureString(std::move(raw_key)), std::move(metadata)};
}
73+
74+
class FuzzDecryptionKeyRetriever : public DecryptionKeyRetriever {
75+
public:
76+
SecureString GetKey(const std::string& key_id) override {
77+
// Is it one of the keys used in parquet-testing?
78+
auto it = kTestingKeys.find(key_id);
79+
if (it != kTestingKeys.end()) {
80+
return it->second;
81+
}
82+
// Is it a key generated by MakeEncryptionKey?
83+
if (::arrow::internal::StartsWith(key_id, kInlineKeyPrefix)) {
84+
return SecureString(
85+
::arrow::util::base64_decode(key_id.substr(kInlineKeyPrefix.length())));
86+
}
87+
throw ParquetException("Unknown fuzz encryption key_id");
88+
}
89+
};
90+
91+
std::shared_ptr<DecryptionKeyRetriever> MakeKeyRetriever() {
92+
return std::make_shared<FuzzDecryptionKeyRetriever>();
93+
}
94+
95+
namespace {
96+
97+
// Reads every row group of `reader` into a Table and fully validates each
// table that was read successfully.
//
// A failing row group does not stop the loop: the remaining row groups are
// still read, and the statuses are folded together with `&=`.
Status FuzzReadData(std::unique_ptr<FileReader> reader) {
  Status overall = Status::OK();
  for (int rg = 0; rg < reader->num_row_groups(); ++rg) {
    std::shared_ptr<Table> table;
    Status rg_status = reader->ReadRowGroup(rg, &table);
    if (rg_status.ok()) {
      // Only validate when the read itself succeeded.
      rg_status = table->ValidateFull();
    }
    overall &= rg_status;
  }
  return overall;
}
109+
110+
template <typename DType>
111+
Status FuzzReadTypedColumnIndex(const TypedColumnIndex<DType>* index) {
112+
index->min_values();
113+
index->max_values();
114+
return Status::OK();
115+
}
116+
117+
// Exercises the accessors of a column index: first the type-independent ones,
// then the typed min/max values selected by the column's physical type.
//
// Parquet exceptions raised along the way are turned into a returned Status by
// the PARQUET_CATCH_EXCEPTIONS macros.
Status FuzzReadColumnIndex(const ColumnIndex* index, const ColumnDescriptor* descr) {
  Status status;
  // Shared tail of every switch case below: read the typed index and fold the
  // result into `status`.
  const auto read_typed = [&status](const auto* typed_index) {
    status &= FuzzReadTypedColumnIndex(typed_index);
  };

  BEGIN_PARQUET_CATCH_EXCEPTIONS
  index->definition_level_histograms();
  index->repetition_level_histograms();
  index->null_pages();
  index->null_counts();
  index->non_null_page_indices();
  index->encoded_min_values();
  index->encoded_max_values();
  switch (descr->physical_type()) {
    case Type::BOOLEAN:
      read_typed(dynamic_cast<const BoolColumnIndex*>(index));
      break;
    case Type::INT32:
      read_typed(dynamic_cast<const Int32ColumnIndex*>(index));
      break;
    case Type::INT64:
      read_typed(dynamic_cast<const Int64ColumnIndex*>(index));
      break;
    case Type::INT96:
      read_typed(dynamic_cast<const TypedColumnIndex<Int96Type>*>(index));
      break;
    case Type::FLOAT:
      read_typed(dynamic_cast<const FloatColumnIndex*>(index));
      break;
    case Type::DOUBLE:
      read_typed(dynamic_cast<const DoubleColumnIndex*>(index));
      break;
    case Type::FIXED_LEN_BYTE_ARRAY:
      read_typed(dynamic_cast<const FLBAColumnIndex*>(index));
      break;
    case Type::BYTE_ARRAY:
      read_typed(dynamic_cast<const ByteArrayColumnIndex*>(index));
      break;
    case Type::UNDEFINED:
      // No typed accessors to read for an undefined physical type.
      break;
  }
  END_PARQUET_CATCH_EXCEPTIONS
  return status;
}
159+
160+
// Reads the offset index and the column index of one column of a row group.
//
// Either index may be absent (null), in which case it is skipped. Parquet
// exceptions are turned into a returned Status by the PARQUET_CATCH_EXCEPTIONS
// macros.
Status FuzzReadPageIndex(RowGroupPageIndexReader* reader, const SchemaDescriptor* schema,
                         int column) {
  Status status;
  BEGIN_PARQUET_CATCH_EXCEPTIONS
  const auto offsets = reader->GetOffsetIndex(column);
  if (offsets != nullptr) {
    offsets->page_locations();
    offsets->unencoded_byte_array_data_bytes();
  }
  const auto column_index = reader->GetColumnIndex(column);
  if (column_index != nullptr) {
    status &= FuzzReadColumnIndex(column_index.get(), schema->Column(column));
  }
  END_PARQUET_CATCH_EXCEPTIONS
  return status;
}
176+
177+
// Creates the ReaderProperties used by the fuzz target: allocations go through
// `pool`, keys are resolved by MakeKeyRetriever(), and plaintext (unencrypted)
// files are accepted as well.
ReaderProperties MakeFuzzReaderProperties(MemoryPool* pool) {
  FileDecryptionProperties::Builder decryption_builder;
  // XXX An AAD prefix cannot be set here, as that would fail on files
  // that store their own AAD prefix.
  auto decryption_props = decryption_builder.key_retriever(MakeKeyRetriever())
                              ->plaintext_files_allowed()
                              ->build();

  ReaderProperties reader_props(pool);
  reader_props.file_decryption_properties(decryption_props);
  return reader_props;
}
189+
190+
} // namespace
191+
192+
Status FuzzReader(const uint8_t* data, int64_t size) {
193+
Status st;
194+
195+
auto buffer = std::make_shared<::arrow::Buffer>(data, size);
196+
auto file = std::make_shared<::arrow::io::BufferReader>(buffer);
197+
auto pool = ::arrow::internal::fuzzing_memory_pool();
198+
auto reader_properties = MakeFuzzReaderProperties(pool);
199+
200+
std::default_random_engine rng(/*seed*/ 42);
201+
202+
// Read Parquet file metadata only once, which will reduce iteration time slightly
203+
std::shared_ptr<FileMetaData> pq_md;
204+
BEGIN_PARQUET_CATCH_EXCEPTIONS {
205+
int num_row_groups, num_columns;
206+
auto pq_file_reader = ParquetFileReader::Open(file, reader_properties);
207+
{
208+
// Read some additional metadata (often lazy-decoded, such as statistics)
209+
pq_md = pq_file_reader->metadata();
210+
num_row_groups = pq_md->num_row_groups();
211+
num_columns = pq_md->num_columns();
212+
for (int i = 0; i < num_row_groups; ++i) {
213+
auto rg = pq_md->RowGroup(i);
214+
rg->sorting_columns();
215+
for (int j = 0; j < num_columns; ++j) {
216+
auto col = rg->ColumnChunk(j);
217+
col->encoded_statistics();
218+
col->statistics();
219+
col->geo_statistics();
220+
col->size_statistics();
221+
col->key_value_metadata();
222+
col->encodings();
223+
col->encoding_stats();
224+
}
225+
}
226+
}
227+
{
228+
// Read and decode bloom filters
229+
try {
230+
auto& bloom_reader = pq_file_reader->GetBloomFilterReader();
231+
std::uniform_int_distribution<uint64_t> hash_dist;
232+
for (int i = 0; i < num_row_groups; ++i) {
233+
auto bloom_rg = bloom_reader.RowGroup(i);
234+
for (int j = 0; j < num_columns; ++j) {
235+
std::unique_ptr<BloomFilter> bloom;
236+
bloom = bloom_rg->GetColumnBloomFilter(j);
237+
// If the column has a bloom filter, find a bunch of random hashes
238+
if (bloom != nullptr) {
239+
for (int k = 0; k < 100; ++k) {
240+
bloom->FindHash(hash_dist(rng));
241+
}
242+
}
243+
}
244+
}
245+
} catch (const ParquetException& exc) {
246+
// XXX we just want to ignore encrypted bloom filters and validate the
247+
// rest of the file; there is no better way of doing this until GH-46597
248+
// is done.
249+
// (also see GH-48334 for reading encrypted bloom filters)
250+
if (std::string_view(exc.what())
251+
.find("BloomFilter decryption is not yet supported") ==
252+
std::string_view::npos) {
253+
throw;
254+
}
255+
}
256+
}
257+
{
258+
// Read and decode page indexes
259+
auto index_reader = pq_file_reader->GetPageIndexReader();
260+
for (int i = 0; i < num_row_groups; ++i) {
261+
auto index_rg = index_reader->RowGroup(i);
262+
if (index_rg) {
263+
for (int j = 0; j < num_columns; ++j) {
264+
st &= FuzzReadPageIndex(index_rg.get(), pq_md->schema(), j);
265+
}
266+
}
267+
}
268+
}
269+
}
270+
END_PARQUET_CATCH_EXCEPTIONS
271+
272+
// Note that very small batch sizes probably make fuzzing slower
273+
for (auto batch_size : std::vector<std::optional<int>>{std::nullopt, 13, 300}) {
274+
ArrowReaderProperties properties;
275+
if (batch_size) {
276+
properties.set_batch_size(batch_size.value());
277+
}
278+
279+
std::unique_ptr<ParquetFileReader> pq_file_reader;
280+
BEGIN_PARQUET_CATCH_EXCEPTIONS
281+
pq_file_reader = ParquetFileReader::Open(file, reader_properties, pq_md);
282+
END_PARQUET_CATCH_EXCEPTIONS
283+
284+
std::unique_ptr<FileReader> reader;
285+
RETURN_NOT_OK(FileReader::Make(pool, std::move(pq_file_reader), properties, &reader));
286+
st &= FuzzReadData(std::move(reader));
287+
}
288+
return st;
289+
}
290+
291+
} // namespace parquet::fuzzing::internal

0 commit comments

Comments
 (0)