Skip to content

Commit fa4e593

Browse files
authored
GH-48335: [C++][Parquet] Fuzz encrypted files (#48336)
### Rationale for this change Currently, if given an encrypted file (whether valid or invalid), the Parquet fuzz target will bail out almost immediately since it does not have any decryption key configured. This prevents the fuzzer from covering any significant part of the Parquet encryption codebase. ### What changes are included in this PR? 1. Improve the fuzz target so as to be able to read encrypted files present in the seed corpus (except those that require a external AAD or external key material) 2. Generate encrypted files in the seed corpus that the fuzz target is able to successfully decrypt ### Are these changes tested? In expanded unit test, and locally. ### Are there any user-facing changes? No. * GitHub Issue: #48335 Authored-by: Antoine Pitrou <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
1 parent 85ea78d commit fa4e593

14 files changed

+618
-312
lines changed

cpp/src/arrow/util/fuzz_internal.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ void LogFuzzStatus(const Status& st, const uint8_t* data, int64_t size) {
4949
return value;
5050
}();
5151

52-
if (kVerbosity >= 1) {
52+
if (!st.ok() && kVerbosity >= 1) {
5353
ARROW_LOG(WARNING) << "Fuzzing input with size=" << size
5454
<< " failed: " << st.ToString();
5555
} else if (st.IsOutOfMemory()) {

cpp/src/parquet/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ endif()
151151
# Library config
152152

153153
set(PARQUET_SRCS
154+
arrow/fuzz_internal.cc
154155
arrow/path_internal.cc
155156
arrow/reader.cc
156157
arrow/reader_internal.cc

cpp/src/parquet/arrow/arrow_reader_writer_test.cc

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
#include "arrow/type_fwd.h"
5252
#include "arrow/type_traits.h"
5353
#include "arrow/util/checked_cast.h"
54-
#include "arrow/util/config.h" // for ARROW_CSV definition
54+
#include "arrow/util/config.h" // for ARROW_CSV and PARQUET_REQUIRE_ENCRYPTION
5555
#include "arrow/util/decimal.h"
5656
#include "arrow/util/future.h"
5757
#include "arrow/util/key_value_metadata.h"
@@ -65,6 +65,7 @@
6565
#include "parquet/api/reader.h"
6666
#include "parquet/api/writer.h"
6767

68+
#include "parquet/arrow/fuzz_internal.h"
6869
#include "parquet/arrow/reader.h"
6970
#include "parquet/arrow/reader_internal.h"
7071
#include "parquet/arrow/schema.h"
@@ -5830,29 +5831,53 @@ TEST(TestArrowReadWrite, MultithreadedWrite) {
58305831
}
58315832

58325833
TEST(TestArrowReadWrite, FuzzReader) {
5834+
using ::parquet::fuzzing::internal::FuzzReader;
5835+
58335836
constexpr size_t kMaxFileSize = 1024 * 1024 * 1;
5837+
58345838
auto check_bad_file = [&](const std::string& file_name) {
58355839
SCOPED_TRACE(file_name);
58365840
auto path = test::get_data_file(file_name, /*is_good=*/false);
58375841
PARQUET_ASSIGN_OR_THROW(auto source, ::arrow::io::MemoryMappedFile::Open(
58385842
path, ::arrow::io::FileMode::READ));
58395843
PARQUET_ASSIGN_OR_THROW(auto buffer, source->Read(kMaxFileSize));
5840-
auto s = internal::FuzzReader(buffer->data(), buffer->size());
5844+
auto s = FuzzReader(buffer->data(), buffer->size());
58415845
ASSERT_NOT_OK(s);
58425846
};
5847+
5848+
auto check_good_file = [&](const std::string& file_name, bool expect_error = false) {
5849+
SCOPED_TRACE(file_name);
5850+
auto path = test::get_data_file(file_name, /*is_good=*/true);
5851+
PARQUET_ASSIGN_OR_THROW(auto source, ::arrow::io::MemoryMappedFile::Open(
5852+
path, ::arrow::io::FileMode::READ));
5853+
PARQUET_ASSIGN_OR_THROW(auto buffer, source->Read(kMaxFileSize));
5854+
auto s = FuzzReader(buffer->data(), buffer->size());
5855+
if (expect_error) {
5856+
ASSERT_NOT_OK(s);
5857+
} else {
5858+
ASSERT_OK(s);
5859+
}
5860+
};
5861+
58435862
check_bad_file("PARQUET-1481.parquet");
58445863
check_bad_file("ARROW-GH-41317.parquet");
58455864
check_bad_file("ARROW-GH-41321.parquet");
58465865
check_bad_file("ARROW-RS-GH-6229-LEVELS.parquet");
58475866
check_bad_file("ARROW-RS-GH-6229-DICTHEADER.parquet");
5848-
{
5849-
auto path = test::get_data_file("alltypes_plain.parquet", /*is_good=*/true);
5850-
PARQUET_ASSIGN_OR_THROW(auto source, ::arrow::io::MemoryMappedFile::Open(
5851-
path, ::arrow::io::FileMode::READ));
5852-
PARQUET_ASSIGN_OR_THROW(auto buffer, source->Read(kMaxFileSize));
5853-
auto s = internal::FuzzReader(buffer->data(), buffer->size());
5854-
ASSERT_OK(s);
5855-
}
5867+
5868+
check_good_file("alltypes_plain.parquet");
5869+
check_good_file("data_index_bloom_encoding_stats.parquet");
5870+
check_good_file("data_index_bloom_encoding_with_length.parquet");
5871+
#ifdef PARQUET_REQUIRE_ENCRYPTION
5872+
// Encrypted files in the testing repo should be ok, except those
5873+
// that require external key material or an explicitly-supplied AAD.
5874+
check_good_file("uniform_encryption.parquet.encrypted");
5875+
check_good_file("encrypt_columns_and_footer_aad.parquet.encrypted");
5876+
check_good_file("encrypt_columns_and_footer.parquet.encrypted");
5877+
check_good_file("encrypt_columns_plaintext_footer.parquet.encrypted");
5878+
#else
5879+
check_good_file("uniform_encryption.parquet.encrypted", /*expect_error=*/true);
5880+
#endif
58565881
}
58575882

58585883
// Test writing table with a closed writer, should not segfault (GH-37969).

cpp/src/parquet/arrow/fuzz.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,10 @@
1717

1818
#include "arrow/status.h"
1919
#include "arrow/util/fuzz_internal.h"
20-
#include "parquet/arrow/reader.h"
20+
#include "parquet/arrow/fuzz_internal.h"
2121

2222
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
23-
auto status = parquet::arrow::internal::FuzzReader(data, static_cast<int64_t>(size));
23+
auto status = parquet::fuzzing::internal::FuzzReader(data, static_cast<int64_t>(size));
2424
arrow::internal::LogFuzzStatus(status, data, static_cast<int64_t>(size));
2525
return 0;
2626
}

0 commit comments

Comments
 (0)