apache
diff --git a/‎cpp/src/arrow/util/fuzz_internal.cc‎
Lines changed: 1 addition & 1 deletion b/‎cpp/src/arrow/util/fuzz_internal.cc‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎cpp/src/parquet/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎cpp/src/parquet/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎cpp/src/parquet/arrow/arrow_reader_writer_test.cc‎
Lines changed: 26 additions & 9 deletions b/‎cpp/src/parquet/arrow/arrow_reader_writer_test.cc‎
Lines changed: 26 additions & 9 deletions
diff --git a/‎cpp/src/parquet/arrow/fuzz.cc‎
Lines changed: 2 additions & 2 deletions b/‎cpp/src/parquet/arrow/fuzz.cc‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎cpp/src/parquet/arrow/fuzz_internal.cc‎
Lines changed: 290 additions & 0 deletions b/‎cpp/src/parquet/arrow/fuzz_internal.cc‎
Lines changed: 290 additions & 0 deletions
@@ -49,7 +49,7 @@ void LogFuzzStatus(const Status& st, const uint8_t* data, int64_t size) {
     return value;
   }();
 
-  if (kVerbosity >= 1) {
+  if (!st.ok() && kVerbosity >= 1) {
     ARROW_LOG(WARNING) << "Fuzzing input with size=" << size
                        << " failed: " << st.ToString();
   } else if (st.IsOutOfMemory()) {
 
@@ -151,6 +151,7 @@ endif()
 # Library config
 
 set(PARQUET_SRCS
+    arrow/fuzz_internal.cc
     arrow/path_internal.cc
     arrow/reader.cc
     arrow/reader_internal.cc
 
@@ -65,6 +65,7 @@
 #include "parquet/api/reader.h"
 #include "parquet/api/writer.h"
 
+#include "parquet/arrow/fuzz_internal.h"
 #include "parquet/arrow/reader.h"
 #include "parquet/arrow/reader_internal.h"
 #include "parquet/arrow/schema.h"
@@ -5830,29 +5831,45 @@ TEST(TestArrowReadWrite, MultithreadedWrite) {
 }
 
 TEST(TestArrowReadWrite, FuzzReader) {
+  using ::parquet::fuzzing::internal::FuzzReader;  // lets both helpers below call FuzzReader unqualified
+
   constexpr size_t kMaxFileSize = 1024 * 1024 * 1;
+
   auto check_bad_file = [&](const std::string& file_name) {
     SCOPED_TRACE(file_name);
     auto path = test::get_data_file(file_name, /*is_good=*/false);
     PARQUET_ASSIGN_OR_THROW(auto source, ::arrow::io::MemoryMappedFile::Open(
                                              path, ::arrow::io::FileMode::READ));
     PARQUET_ASSIGN_OR_THROW(auto buffer, source->Read(kMaxFileSize));
-    auto s = internal::FuzzReader(buffer->data(), buffer->size());
+    auto s = FuzzReader(buffer->data(), buffer->size());
     ASSERT_NOT_OK(s);
   };
+
+  auto check_good_file = [&](const std::string& file_name) {  // counterpart of check_bad_file: FuzzReader must succeed
+    SCOPED_TRACE(file_name);
+    auto path = test::get_data_file(file_name, /*is_good=*/true);
+    PARQUET_ASSIGN_OR_THROW(auto source, ::arrow::io::MemoryMappedFile::Open(
+                                             path, ::arrow::io::FileMode::READ));
+    PARQUET_ASSIGN_OR_THROW(auto buffer, source->Read(kMaxFileSize));  // requests at most kMaxFileSize bytes
+    auto s = FuzzReader(buffer->data(), buffer->size());
+    ASSERT_OK(s);
+  };
+
   check_bad_file("PARQUET-1481.parquet");
   check_bad_file("ARROW-GH-41317.parquet");
   check_bad_file("ARROW-GH-41321.parquet");
   check_bad_file("ARROW-RS-GH-6229-LEVELS.parquet");
   check_bad_file("ARROW-RS-GH-6229-DICTHEADER.parquet");
-  {
-    auto path = test::get_data_file("alltypes_plain.parquet", /*is_good=*/true);
-    PARQUET_ASSIGN_OR_THROW(auto source, ::arrow::io::MemoryMappedFile::Open(
-                                             path, ::arrow::io::FileMode::READ));
-    PARQUET_ASSIGN_OR_THROW(auto buffer, source->Read(kMaxFileSize));
-    auto s = internal::FuzzReader(buffer->data(), buffer->size());
-    ASSERT_OK(s);
-  }
+
+  check_good_file("alltypes_plain.parquet");
+  check_good_file("data_index_bloom_encoding_stats.parquet");
+  check_good_file("data_index_bloom_encoding_with_length.parquet");
+  // Encrypted files in the testing repo should be ok, except those
+  // that require external key material or an explicitly-supplied AAD.
+  check_good_file("uniform_encryption.parquet.encrypted");
+  check_good_file("encrypt_columns_and_footer_aad.parquet.encrypted");
+  check_good_file("encrypt_columns_and_footer.parquet.encrypted");
+  check_good_file("encrypt_columns_plaintext_footer.parquet.encrypted");
 }
 
 // Test writing table with a closed writer, should not segfault (GH-37969).
 
@@ -17,10 +17,10 @@
 
 #include "arrow/status.h"
 #include "arrow/util/fuzz_internal.h"
-#include "parquet/arrow/reader.h"
+#include "parquet/arrow/fuzz_internal.h"
 
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
-  auto status = parquet::arrow::internal::FuzzReader(data, static_cast<int64_t>(size));
+  auto status = parquet::fuzzing::internal::FuzzReader(data, static_cast<int64_t>(size));
   arrow::internal::LogFuzzStatus(status, data, static_cast<int64_t>(size));
   return 0;
 }
@@ -0,0 +1,290 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/arrow/fuzz_internal.h"
+
+#include <cstdint>
+#include <random>
+#include <string_view>
+#include <unordered_map>
+
+#include "arrow/io/memory.h"
+#include "arrow/table.h"
+#include "arrow/util/base64.h"
+#include "arrow/util/fuzz_internal.h"
+#include "arrow/util/string.h"
+#include "parquet/arrow/reader.h"
+#include "parquet/bloom_filter.h"
+#include "parquet/bloom_filter_reader.h"
+#include "parquet/page_index.h"
+#include "parquet/properties.h"
+
+namespace parquet::fuzzing::internal {
+
+using ::arrow::MemoryPool;
+using ::arrow::Status;
+using ::arrow::Table;
+using ::arrow::util::SecureString;
+using ::parquet::arrow::FileReader;
+
+namespace {
+
+constexpr std::string_view kInlineKeyPrefix = "inline:";  // prefix of key metadata that carries the key itself, base64-encoded
+
+// Key id -> key for the parquet-testing encrypted files, see https://github.com/apache/parquet-testing/blob/master/data/README.md#encrypted-files
+const std::unordered_map<std::string, SecureString> kTestingKeys = {
+    {"kf", SecureString("0123456789012345")},
+    {"kc1", SecureString("1234567890123450")},
+    {"kc2", SecureString("1234567890123451")},
+};
+
+}  // namespace
+
+EncryptionKey MakeEncryptionKey(int key_len) {  // Builds a key_len-byte pseudo-random key plus key metadata that embeds it
+  // Keep the engine persistent to generate a different key every time
+  static auto gen = []() { return std::default_random_engine(/*seed=*/42); }();  // constant seed, no external entropy
+
+  std::uniform_int_distribution<unsigned int> chars_dist(0, 255);  // one draw per key byte
+  std::string key(key_len, '\x00');
+  for (auto& c : key) {
+    c = static_cast<uint8_t>(chars_dist(gen));
+  }
+
+  std::string key_metadata(kInlineKeyPrefix);
+  key_metadata += ::arrow::util::base64_encode(key);  // "inline:" + base64(key), the form GetKey() decodes
+
+  return {SecureString(std::move(key)), std::move(key_metadata)};
+}
+
+class FuzzDecryptionKeyRetriever : public DecryptionKeyRetriever {  // Maps a key id to a key for the fuzz reader
+ public:
+  SecureString GetKey(const std::string& key_id) override {  // throws ParquetException when key_id matches neither form below
+    // Is it one of the keys used in parquet-testing?
+    auto it = kTestingKeys.find(key_id);
+    if (it != kTestingKeys.end()) {
+      return it->second;
+    }
+    // Is it a key generated by MakeEncryptionKey?
+    if (::arrow::internal::StartsWith(key_id, kInlineKeyPrefix)) {
+      return SecureString(
+          ::arrow::util::base64_decode(key_id.substr(kInlineKeyPrefix.length())));  // decode what follows the prefix
+    }
+    throw ParquetException("Unknown fuzz encryption key_id");
+  }
+};
+
+std::shared_ptr<DecryptionKeyRetriever> MakeKeyRetriever() {  // Factory returning a new FuzzDecryptionKeyRetriever
+  return std::make_shared<FuzzDecryptionKeyRetriever>();
+}
+
+namespace {
+
+Status FuzzReadData(std::unique_ptr<FileReader> reader) {  // Reads and fully validates each row group; returns the combined status
+  auto st = Status::OK();
+  for (int i = 0; i < reader->num_row_groups(); ++i) {
+    std::shared_ptr<Table> table;
+    auto row_group_status = reader->ReadRowGroup(i, &table);
+    if (row_group_status.ok()) {  // table is only validated after a successful read
+      row_group_status &= table->ValidateFull();
+    }
+    st &= row_group_status;  // folded into st; the loop continues so later row groups are still exercised
+  }
+  return st;
+}
+
+template <typename DType>
+Status FuzzReadTypedColumnIndex(const TypedColumnIndex<DType>* index) {  // Calls the typed min/max accessors; index is dereferenced unchecked
+  index->min_values();  // result discarded: only the call itself is exercised
+  index->max_values();
+  return Status::OK();  // never reports an error through the returned Status
+}
+
+Status FuzzReadColumnIndex(const ColumnIndex* index, const ColumnDescriptor* descr) {  // Calls the generic accessors, then the typed ones for descr's physical type
+  Status st;
+  BEGIN_PARQUET_CATCH_EXCEPTIONS  // presumably turns a thrown ParquetException into a returned Status — confirm in parquet/exception.h
+  index->definition_level_histograms();
+  index->repetition_level_histograms();
+  index->null_pages();
+  index->null_counts();
+  index->non_null_page_indices();
+  index->encoded_min_values();
+  index->encoded_max_values();
+  switch (descr->physical_type()) {  // NOTE(review): each dynamic_cast result below is passed on unchecked; it is nullptr if index is not of that type
+    case Type::BOOLEAN:
+      st &= FuzzReadTypedColumnIndex(dynamic_cast<const BoolColumnIndex*>(index));
+      break;
+    case Type::INT32:
+      st &= FuzzReadTypedColumnIndex(dynamic_cast<const Int32ColumnIndex*>(index));
+      break;
+    case Type::INT64:
+      st &= FuzzReadTypedColumnIndex(dynamic_cast<const Int64ColumnIndex*>(index));
+      break;
+    case Type::INT96:
+      st &= FuzzReadTypedColumnIndex(
+          dynamic_cast<const TypedColumnIndex<Int96Type>*>(index));
+      break;
+    case Type::FLOAT:
+      st &= FuzzReadTypedColumnIndex(dynamic_cast<const FloatColumnIndex*>(index));
+      break;
+    case Type::DOUBLE:
+      st &= FuzzReadTypedColumnIndex(dynamic_cast<const DoubleColumnIndex*>(index));
+      break;
+    case Type::FIXED_LEN_BYTE_ARRAY:
+      st &= FuzzReadTypedColumnIndex(dynamic_cast<const FLBAColumnIndex*>(index));
+      break;
+    case Type::BYTE_ARRAY:
+      st &= FuzzReadTypedColumnIndex(dynamic_cast<const ByteArrayColumnIndex*>(index));
+      break;
+    case Type::UNDEFINED:  // no typed accessors are called for this case
+      break;
+  }
+  END_PARQUET_CATCH_EXCEPTIONS
+  return st;
+}
+
+Status FuzzReadPageIndex(RowGroupPageIndexReader* reader, const SchemaDescriptor* schema,
+                         int column) {  // Reads the offset index and column index of one column, when present
+  Status st;
+  BEGIN_PARQUET_CATCH_EXCEPTIONS  // presumably turns a thrown ParquetException into a returned Status — confirm in parquet/exception.h
+  auto offset_index = reader->GetOffsetIndex(column);
+  if (offset_index) {  // a null offset index is skipped
+    offset_index->page_locations();
+    offset_index->unencoded_byte_array_data_bytes();
+  }
+  auto col_index = reader->GetColumnIndex(column);
+  if (col_index) {  // a null column index is skipped
+    st &= FuzzReadColumnIndex(col_index.get(), schema->Column(column));
+  }
+  END_PARQUET_CATCH_EXCEPTIONS
+  return st;
+}
+
+ReaderProperties MakeFuzzReaderProperties(MemoryPool* pool) {  // Reader properties on `pool` with decryption wired to the fuzz key retriever
+  FileDecryptionProperties::Builder builder;
+  builder.key_retriever(MakeKeyRetriever());
+  builder.plaintext_files_allowed();  // presumably lets unencrypted files be read as well — confirm against the builder's documentation
+  // XXX Cannot set an AAD prefix as that would fail on files
+  // that store their own AAD prefix.
+  auto decryption_properties = builder.build();
+
+  ReaderProperties properties(pool);
+  properties.file_decryption_properties(decryption_properties);
+  return properties;
+}
+
+}  // namespace
+
+Status FuzzReader(const uint8_t* data, int64_t size) {  // Fuzz entry point: decodes metadata, bloom filters, page indexes, then the data at several batch sizes
+  Status st;
+
+  auto buffer = std::make_shared<::arrow::Buffer>(data, size);
+  auto file = std::make_shared<::arrow::io::BufferReader>(buffer);
+  auto pool = ::arrow::internal::fuzzing_memory_pool();
+  auto reader_properties = MakeFuzzReaderProperties(pool);
+
+  std::default_random_engine rng(/*seed*/ 42);  // constant seed; drives the bloom filter probes below
+
+  // Read Parquet file metadata only once, which will reduce iteration time slightly
+  std::shared_ptr<FileMetaData> pq_md;
+  BEGIN_PARQUET_CATCH_EXCEPTIONS {  // presumably turns a thrown ParquetException into a returned Status — confirm in parquet/exception.h
+    int num_row_groups, num_columns;
+    auto pq_file_reader = ParquetFileReader::Open(file, reader_properties);
+    {
+      // Read some additional metadata (often lazy-decoded, such as statistics)
+      pq_md = pq_file_reader->metadata();
+      num_row_groups = pq_md->num_row_groups();
+      num_columns = pq_md->num_columns();
+      for (int i = 0; i < num_row_groups; ++i) {
+        auto rg = pq_md->RowGroup(i);
+        rg->sorting_columns();
+        for (int j = 0; j < num_columns; ++j) {
+          auto col = rg->ColumnChunk(j);
+          col->encoded_statistics();  // results of these accessors are discarded; only the calls matter
+          col->statistics();
+          col->geo_statistics();
+          col->size_statistics();
+          col->key_value_metadata();
+          col->encodings();
+          col->encoding_stats();
+        }
+      }
+    }
+    {
+      // Read and decode bloom filters
+      try {
+        auto& bloom_reader = pq_file_reader->GetBloomFilterReader();
+        std::uniform_int_distribution<uint64_t> hash_dist;  // default-constructed: spans the full uint64_t range
+        for (int i = 0; i < num_row_groups; ++i) {
+          auto bloom_rg = bloom_reader.RowGroup(i);
+          for (int j = 0; j < num_columns; ++j) {
+            std::unique_ptr<BloomFilter> bloom;
+            bloom = bloom_rg->GetColumnBloomFilter(j);
+            // If the column has a bloom filter, find a bunch of random hashes
+            if (bloom != nullptr) {
+              for (int k = 0; k < 100; ++k) {
+                bloom->FindHash(hash_dist(rng));
+              }
+            }
+          }
+        }
+      } catch (const ParquetException& exc) {
+        // XXX we just want to ignore encrypted bloom filters and validate the
+        // rest of the file; there is no better way of doing this until GH-46597
+        // is done.
+        // (also see GH-48334 for reading encrypted bloom filters)
+        if (std::string_view(exc.what())
+                .find("BloomFilter decryption is not yet supported") ==
+            std::string_view::npos) {
+          throw;  // any other ParquetException is rethrown to the enclosing catch macro
+        }
+      }
+    }
+    {
+      // Read and decode page indexes
+      auto index_reader = pq_file_reader->GetPageIndexReader();
+      for (int i = 0; i < num_row_groups; ++i) {
+        auto index_rg = index_reader->RowGroup(i);
+        if (index_rg) {  // row groups with a null page index reader are skipped
+          for (int j = 0; j < num_columns; ++j) {
+            st &= FuzzReadPageIndex(index_rg.get(), pq_md->schema(), j);
+          }
+        }
+      }
+    }
+  }
+  END_PARQUET_CATCH_EXCEPTIONS
+
+  // Note that very small batch sizes probably make fuzzing slower
+  for (auto batch_size : std::vector<std::optional<int>>{std::nullopt, 13, 300}) {  // nullopt leaves ArrowReaderProperties' batch size untouched
+    ArrowReaderProperties properties;
+    if (batch_size) {
+      properties.set_batch_size(batch_size.value());
+    }
+
+    std::unique_ptr<ParquetFileReader> pq_file_reader;
+    BEGIN_PARQUET_CATCH_EXCEPTIONS
+    pq_file_reader = ParquetFileReader::Open(file, reader_properties, pq_md);  // passes the metadata obtained above
+    END_PARQUET_CATCH_EXCEPTIONS
+
+    std::unique_ptr<FileReader> reader;
+    RETURN_NOT_OK(FileReader::Make(pool, std::move(pq_file_reader), properties, &reader));  // a failure here returns at once instead of being folded into st
+    st &= FuzzReadData(std::move(reader));
+  }
+  return st;  // combined status of the page index reads and every data pass
+}
+
+}  // namespace parquet::fuzzing::internal