Merged
98 commits
f1c6dc0
Parquet: Implement skeleton for BloomFilter
mapleFU Aug 26, 2023
6ebd6da
tiny fixing
mapleFU Aug 26, 2023
70c9267
tiny update test
mapleFU Aug 26, 2023
48350d8
trying to fix ci
mapleFU Aug 26, 2023
d2a659e
fix lint
mapleFU Aug 26, 2023
41236d8
fix some style problem
mapleFU Aug 26, 2023
8afba81
add file roundtrip test
mapleFU Aug 26, 2023
96c6691
add file roundtrip test
mapleFU Aug 26, 2023
c131341
fix document and ci
mapleFU Aug 26, 2023
220b58e
Update: tiny style fix
mapleFU Aug 26, 2023
ad96c48
Merge branch 'main' into parquet/support-write-bloom-filter
mapleFU Sep 2, 2023
b756241
Bloom Filter Resolve comments:
mapleFU Sep 2, 2023
f43505b
make space writing a batched writing
mapleFU Sep 2, 2023
3497f4a
update bloom_filter builder interface
mapleFU Sep 2, 2023
fecd0f0
update BloomFilterBuilder arguments
mapleFU Sep 2, 2023
29cc1c1
fix compile
mapleFU Sep 2, 2023
ffbb491
try to satisfy win compiler
mapleFU Sep 2, 2023
4d63428
change all to vector
mapleFU Sep 2, 2023
f689716
Merge branch 'main' into parquet/support-write-bloom-filter
mapleFU Sep 11, 2023
8e9cb16
resolve comment
mapleFU Sep 11, 2023
7fd47be
Merge branch 'main' into parquet/support-write-bloom-filter
mapleFU Oct 2, 2023
7c4ff4e
Merge branch 'main' into parquet/support-write-bloom-filter
mapleFU Oct 10, 2023
feccee9
fix some comment
mapleFU Oct 10, 2023
90245e7
add cached version test
mapleFU Oct 10, 2023
d924e36
cleaning the code for column-props
mapleFU Oct 10, 2023
0340193
optimize get bf
mapleFU Oct 10, 2023
b78eed0
Merge branch 'main' into parquet/support-write-bloom-filter
mapleFU Mar 16, 2024
23828e1
comment minor fix
mapleFU Mar 16, 2024
6fd57dc
fix comment and add bloom-filter-length
mapleFU Mar 16, 2024
86a8760
Fix a bf bug
mapleFU Mar 16, 2024
f8e724c
trying to use std::map for RowGroup filter
mapleFU Mar 17, 2024
447badf
trying to fix msvc compile
mapleFU Mar 17, 2024
0c1065c
fix comment
mapleFU Mar 17, 2024
5225e08
add test case for 2 row-groups
mapleFU Mar 17, 2024
a779982
add test case for dictionary
mapleFU Mar 17, 2024
4195406
minor update style for file_writer.cc
mapleFU Mar 17, 2024
ed267bd
Merge branch 'main' into parquet/support-write-bloom-filter
mapleFU Mar 26, 2024
478889d
resolve comment
mapleFU Mar 26, 2024
2992072
fix comment for boolean col, and add test
mapleFU Mar 26, 2024
4852261
trying to add bloom boolean test
mapleFU Mar 26, 2024
add1afd
fix test
mapleFU Mar 26, 2024
f627e30
Merge branch 'main' into parquet/support-write-bloom-filter
mapleFU Apr 8, 2024
bb8d4a5
fix some comments
mapleFU Apr 8, 2024
ad0f1af
Merge branch 'parquet/support-write-bloom-filter' of github.com:maple…
mapleFU Apr 8, 2024
e1de5bc
fix lint
mapleFU Apr 8, 2024
430742a
switch to anonymous namespace
mapleFU Apr 9, 2024
00f176e
fix comment for column_writer.cc
mapleFU Apr 26, 2024
17f4951
fix comment in other parts
mapleFU Apr 26, 2024
de27ce4
Merge branch 'main' into parquet/support-write-bloom-filter
mapleFU Apr 26, 2024
259f15b
Merge branch 'main' into parquet/support-write-bloom-filter
mapleFU Apr 26, 2024
057b542
Merge branch 'main' into parquet/support-write-bloom-filter
mapleFU Jun 10, 2024
34a4c28
trying to fix the ci build
mapleFU Jun 10, 2024
70e3508
Merge branch 'main' into parquet/support-write-bloom-filter
mapleFU Jul 3, 2024
c587568
resolve comments
mapleFU Jul 3, 2024
2223423
Merge branch 'main' into parquet/support-write-bloom-filter
mapleFU Nov 11, 2024
22030db
change the bloom filter from vector to map
mapleFU Nov 11, 2024
e9c550a
fix lint
mapleFU Nov 11, 2024
23fb3fa
fix lint
mapleFU Nov 14, 2024
d892819
fix comment
mapleFU Nov 15, 2024
ef3291d
Merge branch 'main' into parquet/support-write-bloom-filter
mapleFU Dec 20, 2024
7aee7dd
Merge branch 'main' into parquet/support-write-bloom-filter
mapleFU Jan 13, 2025
c5b1fb1
Resolve comments
mapleFU Jan 13, 2025
0898466
Merge branch 'main' into parquet/support-write-bloom-filter
mapleFU Feb 5, 2025
71f5906
Merge branch 'main' into parquet/support-write-bloom-filter
mapleFU Feb 7, 2025
d57ceea
minor fix
mapleFU Feb 7, 2025
26c2d07
address some comments
mapleFU Feb 7, 2025
d422ffa
Merge branch 'main' into parquet/support-write-bloom-filter
mapleFU Mar 10, 2025
e6bc6e1
Minor fix
mapleFU Mar 10, 2025
dfaf0e8
try to fix lint
mapleFU Mar 10, 2025
0bafe78
Merge branch 'main' into parquet/support-write-bloom-filter
mapleFU Mar 26, 2025
ce30ebc
Resolve comment part1
mapleFU Apr 24, 2025
8286783
Merge branch 'main' into parquet/support-write-bloom-filter
mapleFU Apr 27, 2025
b079acb
Merge branch 'main' into parquet/support-write-bloom-filter
mapleFU May 28, 2025
3a5a491
Extract a BloomFilterWriterImpl, and supports binary-view type
mapleFU May 28, 2025
cccb9a8
test for string_view type
mapleFU May 28, 2025
3cf9425
Merge branch 'main' into parquet/support-write-bloom-filter
mapleFU Jun 3, 2025
aac454e
Fix comment part1
mapleFU Jun 3, 2025
83999cd
add writer_internal for bf
mapleFU Jun 3, 2025
fa0c9b1
try to fix ci
mapleFU Jun 3, 2025
351da07
Trying to fix lint
mapleFU Jun 4, 2025
12364d0
Remove duplicate code
mapleFU Jun 6, 2025
8dec902
Merge branch 'parquet/support-write-bloom-filter' of https://github.c…
mapleFU Jun 6, 2025
40c9079
Merge branch 'main' into parquet/support-write-bloom-filter
mapleFU Jun 19, 2025
d32c40b
Apply suggestions
mapleFU Jun 20, 2025
a662563
apply suggestions
mapleFU Jun 20, 2025
61b6dff
fix lint
mapleFU Jun 20, 2025
18f1a47
Update: remove some , and fix include
mapleFU Jun 20, 2025
2bfa278
Merge remote-tracking branch 'origin' into parquet/support-write-bloo…
wgtmac Nov 20, 2025
f03a327
address comments
wgtmac Nov 20, 2025
4aeff8b
Merge remote-tracking branch 'origin' into parquet/support-write-bloo…
wgtmac Nov 26, 2025
0f50418
address comments
wgtmac Nov 26, 2025
789d130
address comments
wgtmac Dec 5, 2025
6dc8d88
add template definition back
wgtmac Dec 7, 2025
0940cd8
Merge branch 'main' into parquet/support-write-bloom-filter
wgtmac Jan 14, 2026
a126e03
polish test and address feedback
wgtmac Jan 14, 2026
e560a28
address more comments
wgtmac Jan 14, 2026
bf5e859
update table
wgtmac Jan 14, 2026
0638b11
update table
wgtmac Jan 14, 2026
3 changes: 2 additions & 1 deletion cpp/src/parquet/CMakeLists.txt
@@ -156,6 +156,7 @@ set(PARQUET_SRCS
arrow/writer.cc
bloom_filter.cc
bloom_filter_reader.cc
bloom_filter_builder.cc
column_reader.cc
column_scanner.cc
column_writer.cc
@@ -335,7 +336,7 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/parquet_version.h"
add_parquet_test(internals-test
SOURCES
bloom_filter_test.cc
bloom_filter_reader_test.cc
bloom_filter_parquet_test.cc
properties_test.cc
statistics_test.cc
encoding_test.cc
113 changes: 110 additions & 3 deletions cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -66,6 +66,8 @@
#include "parquet/arrow/schema.h"
#include "parquet/arrow/test_util.h"
#include "parquet/arrow/writer.h"
#include "parquet/bloom_filter.h"
#include "parquet/bloom_filter_reader.h"
#include "parquet/column_writer.h"
#include "parquet/file_writer.h"
#include "parquet/page_index.h"
@@ -5256,7 +5258,7 @@ auto encode_double = [](double value) {

} // namespace

class ParquetPageIndexRoundTripTest : public ::testing::Test {
class ParquetIndexRoundTripTest {
public:
void WriteFile(const std::shared_ptr<WriterProperties>& writer_properties,
const std::shared_ptr<::arrow::Table>& table) {
@@ -5280,10 +5282,17 @@ class ParquetPageIndexRoundTripTest : public ::testing::Test {
ASSERT_OK_AND_ASSIGN(buffer_, sink->Finish());
}

protected:
std::shared_ptr<Buffer> buffer_;
};

class ParquetPageIndexRoundTripTest : public ::testing::Test,
public ParquetIndexRoundTripTest {
public:
void ReadPageIndexes(int expect_num_row_groups, int expect_num_pages,
const std::set<int>& expect_columns_without_index = {}) {
auto read_properties = default_arrow_reader_properties();
auto reader = ParquetFileReader::Open(std::make_shared<BufferReader>(buffer_));
auto reader = ParquetFileReader::Open(std::make_shared<BufferReader>(this->buffer_));

auto metadata = reader->metadata();
ASSERT_EQ(expect_num_row_groups, metadata->num_row_groups());
@@ -5348,7 +5357,6 @@ class ParquetPageIndexRoundTripTest : public ::testing::Test {
}

protected:
std::shared_ptr<Buffer> buffer_;
std::vector<ColumnIndexObject> column_indexes_;
};

@@ -5584,5 +5592,104 @@ TEST_F(ParquetPageIndexRoundTripTest, EnablePerColumn) {
/*null_counts=*/{0}}));
}

class ParquetBloomFilterRoundTripTest : public ::testing::Test,
public ParquetIndexRoundTripTest {
public:
void ReadBloomFilters(int expect_num_row_groups,
const std::set<int>& expect_columns_without_filter = {}) {
auto read_properties = default_arrow_reader_properties();
auto reader = ParquetFileReader::Open(std::make_shared<BufferReader>(buffer_));

auto metadata = reader->metadata();
ASSERT_EQ(expect_num_row_groups, metadata->num_row_groups());

auto& bloom_filter_reader = reader->GetBloomFilterReader();

for (int rg = 0; rg < metadata->num_row_groups(); ++rg) {
auto row_group_reader = bloom_filter_reader.RowGroup(rg);
ASSERT_NE(row_group_reader, nullptr);

for (int col = 0; col < metadata->num_columns(); ++col) {
bool expect_no_bloom_filter = expect_columns_without_filter.find(col) !=
expect_columns_without_filter.cend();

auto bloom_filter = row_group_reader->GetColumnBloomFilter(col);
if (expect_no_bloom_filter) {
ASSERT_EQ(bloom_filter, nullptr);
} else {
bloom_filters_.push_back(std::move(bloom_filter));
Review comment (Member): What about changing bloom_filters_ to be an output parameter to function ReadBloomFilters instead of a class member variable?
}
}
}
}

template <typename ArrowType>
void verifyBloomFilter(const BloomFilter* bloom_filter,
const ::arrow::ChunkedArray& chunked_array) {
auto iter = ::arrow::stl::Begin<ArrowType>(chunked_array);
auto end = ::arrow::stl::End<ArrowType>(chunked_array);
while (iter != end) {
auto value = *iter;
if (value == std::nullopt) {
++iter;
continue;
}
if constexpr (std::is_same_v<ArrowType, ::arrow::StringType>) {
ByteArray ba(value.value());
EXPECT_TRUE(bloom_filter->FindHash(bloom_filter->Hash(&ba)));
} else {
EXPECT_TRUE(bloom_filter->FindHash(bloom_filter->Hash(value.value())));
}
++iter;
}
}

protected:
std::vector<std::unique_ptr<BloomFilter>> bloom_filters_;
};

TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTrip) {
Review comment (Member): The three test cases below share a lot of common logic (with exactly same data). Should we refactor them to eliminate the duplicate?
BloomFilterOptions options;
options.ndv = 100;
auto writer_properties = WriterProperties::Builder()
.set_bloom_filter_options(options)
->max_row_group_length(4)
->build();
auto schema = ::arrow::schema(
{::arrow::field("c0", ::arrow::int64()), ::arrow::field("c1", ::arrow::utf8())});
auto table = ::arrow::TableFromJSON(schema, {R"([
[1, "a" ],
[2, "b" ],
[3, "c" ],
[null, "d"],
[5, null],
[6, "f" ]
])"});
WriteFile(writer_properties, table);

ReadBloomFilters(/*expect_num_row_groups=*/2);
ASSERT_EQ(4, bloom_filters_.size());
{
ASSERT_NE(nullptr, bloom_filters_[0]);
auto col = table->column(0)->Slice(0, 4);
verifyBloomFilter<::arrow::Int64Type>(bloom_filters_[0].get(), *col);
}
{
ASSERT_NE(nullptr, bloom_filters_[1]);
auto col = table->column(1)->Slice(0, 4);
verifyBloomFilter<::arrow::StringType>(bloom_filters_[1].get(), *col);
}
{
ASSERT_NE(nullptr, bloom_filters_[2]);
auto col = table->column(0)->Slice(4, 2);
verifyBloomFilter<::arrow::Int64Type>(bloom_filters_[2].get(), *col);
}
{
ASSERT_NE(nullptr, bloom_filters_[3]);
auto col = table->column(1)->Slice(4, 2);
verifyBloomFilter<::arrow::StringType>(bloom_filters_[3].get(), *col);
}
}

} // namespace arrow
} // namespace parquet
6 changes: 6 additions & 0 deletions cpp/src/parquet/bloom_filter.h
@@ -167,6 +167,12 @@ class PARQUET_EXPORT BloomFilter {

virtual ~BloomFilter() = default;

// Variants taking a const pointer argument, to facilitate templated code
uint64_t Hash(const int32_t* value) const { return Hash(*value); }
uint64_t Hash(const int64_t* value) const { return Hash(*value); }
uint64_t Hash(const float* value) const { return Hash(*value); }
uint64_t Hash(const double* value) const { return Hash(*value); }

protected:
// Hash strategy available for Bloom filter.
enum class HashStrategy : uint32_t { XXHASH = 0 };
142 changes: 142 additions & 0 deletions cpp/src/parquet/bloom_filter_builder.cc
@@ -0,0 +1,142 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// This module defines the BloomFilterBuilder, which collects the bloom
// filters of columns in a Parquet file while the file is being written.

#include "parquet/bloom_filter_builder.h"

#include <map>
#include <utility>
#include <vector>

#include "arrow/io/interfaces.h"

#include "parquet/bloom_filter.h"
#include "parquet/exception.h"
#include "parquet/metadata.h"
#include "parquet/properties.h"

namespace parquet {

class BloomFilterBuilderImpl : public BloomFilterBuilder {
public:
explicit BloomFilterBuilderImpl(const SchemaDescriptor* schema,
WriterProperties properties)
Review comment (Member): What's the point of making a copy here?
Reply (Member Author): Anyway underlying builder doesn't hold a reference here?
Reply (Member): The builder cannot outlive the FileWriter, so why not simply follow other places like this https://github.com/search?q=repo%3Aapache%2Farrow+%22const+WriterProperties*%22&type=code
Reply (Member Author): done
: schema_(schema), properties_(std::move(properties)) {}
/// Append a new row group to host all incoming bloom filters.
void AppendRowGroup() override;
Review comment (Contributor): This isn't actually appending a new row-group, just marking that a row-group is starting so filters should be reset?
Reply (Member Author): Yes. Parquet uses row-group level bloom filter, so this just setup a new row-group for filters

BloomFilter* GetOrCreateBloomFilter(
int32_t column_ordinal, const BloomFilterOptions& bloom_filter_options) override;

/// Serialize all bloom filters with header and bitset in the order of row group and
/// column id. Column encryption is not implemented yet. The side effect is that it
/// deletes all bloom filters after they have been flushed.
void WriteTo(::arrow::io::OutputStream* sink, BloomFilterLocation* location) override;

void Finish() override { finished_ = true; }

private:
/// Make sure the column ordinal is not out of bounds and the builder is in a good state.
void CheckState(int32_t column_ordinal) const {
if (finished_) {
Review comment (Member): Suggested change: `if (finished_) {` -> `if (finished_) [[unlikely]] {`
Reply (Member): CheckState might be called for multiple times so adding [[unlikely]] is something worth doing?
Reply (Member Author, mapleFU, Apr 24, 2025): It's checked once per rowgroup, so I don't think this would be heavy. And I suspect that compiler can already well handle this under -O2: https://godbolt.org/z/6qvevr3G1
throw ParquetException("BloomFilterBuilder is already finished.");
Review comment (Contributor): maybe make this message more accurately reflect the user error (WriteTo called multiple times)?
Reply (Member Author): Updated
}
if (column_ordinal < 0 || column_ordinal >= schema_->num_columns()) {
throw ParquetException("Invalid column ordinal: ", column_ordinal);
}
if (row_group_bloom_filters_.empty()) {
throw ParquetException("No row group appended to BloomFilterBuilder.");
}
if (schema_->Column(column_ordinal)->physical_type() == Type::BOOLEAN) {
throw ParquetException("BloomFilterBuilder does not support Boolean.");
}
}

const SchemaDescriptor* schema_;
WriterProperties properties_;
bool finished_ = false;

// vector: row_group_ordinal
// map: column_ordinal -> bloom filter
std::vector<std::map<int32_t, std::unique_ptr<BloomFilter>>> row_group_bloom_filters_;
};

std::unique_ptr<BloomFilterBuilder> BloomFilterBuilder::Make(
const SchemaDescriptor* schema, const WriterProperties& properties) {
return std::unique_ptr<BloomFilterBuilder>(
new BloomFilterBuilderImpl(schema, properties));
}

void BloomFilterBuilderImpl::AppendRowGroup() { row_group_bloom_filters_.emplace_back(); }

BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(
int32_t column_ordinal, const BloomFilterOptions& bloom_filter_options) {
CheckState(column_ordinal);
std::unique_ptr<BloomFilter>& bloom_filter =
row_group_bloom_filters_.back()[column_ordinal];
if (bloom_filter == nullptr) {
auto block_split_bloom_filter =
std::make_unique<BlockSplitBloomFilter>(properties_.memory_pool());
block_split_bloom_filter->Init(BlockSplitBloomFilter::OptimalNumOfBytes(
Review comment (Contributor): There was a recent discussion on the parquet mailing list about bloom filters and what good writers should do. My take-away was:
  1. Knowing NDV up-front typically requires two passes which I don't think we are doing. It might be better to take FPP and a byte size, and work out ndv if necessary.
  2. An extension of this idea (I think someone tried in Java, maybe it was you?) is to have multiple byte sizes (e.g. at log_2 intervals with an FPP), write to all of them, and then choose the smallest one that is reasonably sparse.
  3. In either case it would likely be a good idea to evaluate the final bloom filters for sparsity before choosing to write them (this might be another config parameter).
Reply (Member Author): Personally I think the best way is to buffer the hash values and make a decision later when there are too many hash values or the buffer is too large. But I think we can first make a "static" config and enhance it later.
Reply (Member): I have reviewed that PR and it could be a followup change. Writer implementation has the freedom to try smart things. FYI, parquet-java also discards the bloom filter if dictionary encoding is applied to all data pages, though I don't think we should do the same thing.
bloom_filter_options.ndv, bloom_filter_options.fpp));
bloom_filter = std::move(block_split_bloom_filter);
}
return bloom_filter.get();
}

void BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink,
BloomFilterLocation* location) {
if (!finished_) {
throw ParquetException("Cannot call WriteTo() on an unfinished BloomFilterBuilder.");
}
if (row_group_bloom_filters_.empty()) {
// Return quickly if there is no bloom filter
return;
}

for (size_t row_group_ordinal = 0; row_group_ordinal < row_group_bloom_filters_.size();
++row_group_ordinal) {
const auto& row_group_bloom_filters = row_group_bloom_filters_[row_group_ordinal];
// the whole row group has no bloom filter
if (row_group_bloom_filters.empty()) {
continue;
}
bool has_valid_bloom_filter = false;
int num_columns = schema_->num_columns();
std::vector<std::optional<IndexLocation>> locations(num_columns, std::nullopt);
Review comment (Contributor): is it maybe better to make this a map? I expect the number of columns with a bloom filter to be relatively small compared to the number of overall columns.
Reply (Member Author): 🤔 This reuses some structure from PageIndex; however, IndexLocation is just 9 bytes, and even for a parquet file with 10000 columns the cost here is low (about 200 KiB), so I think we can keep vector here?

// serialize bloom filter by ascending order of column id
for (int32_t column_id = 0; column_id < num_columns; ++column_id) {
auto iter = row_group_bloom_filters.find(column_id);
if (iter != row_group_bloom_filters.cend() && iter->second != nullptr) {
PARQUET_ASSIGN_OR_THROW(int64_t offset, sink->Tell());
iter->second->WriteTo(sink);
PARQUET_ASSIGN_OR_THROW(int64_t pos, sink->Tell());
has_valid_bloom_filter = true;
locations[column_id] = IndexLocation{offset, static_cast<int32_t>(pos - offset)};
}
}
if (has_valid_bloom_filter) {
location->bloom_filter_location.emplace(row_group_ordinal, std::move(locations));
}
}
}

} // namespace parquet
72 changes: 72 additions & 0 deletions cpp/src/parquet/bloom_filter_builder.h
@@ -0,0 +1,72 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// This module defines the BloomFilterBuilder, which collects the bloom
// filters of columns in a Parquet file while the file is being written.

#pragma once

#include "arrow/io/interfaces.h"
#include "parquet/types.h"

namespace parquet {

class BloomFilter;
class SchemaDescriptor;
struct BloomFilterOptions;
struct BloomFilterLocation;

namespace schema {
class ColumnPath;
}

/// \brief Interface for collecting bloom filter of a parquet file.
class PARQUET_EXPORT BloomFilterBuilder {
public:
/// \brief API convenience to create a BloomFilterBuilder.
static std::unique_ptr<BloomFilterBuilder> Make(const SchemaDescriptor* schema,
const WriterProperties& properties);

/// Append a new row group to host all incoming bloom filters.
virtual void AppendRowGroup() = 0;

/// \brief Get the BloomFilter from column ordinal.
///
/// \param column_ordinal Column ordinal in schema, which is only for leaf columns.
/// \param bloom_filter_options The options (e.g. number of distinct values and
/// false positive rate) used to create a BloomFilter.
///
/// \return BloomFilter for the column and its memory ownership belongs to the
/// BloomFilterBuilder.
virtual BloomFilter* GetOrCreateBloomFilter(
int32_t column_ordinal, const BloomFilterOptions& bloom_filter_options) = 0;

/// \brief Write the bloom filter to sink.
///
/// \param[out] sink The output stream to write the bloom filter.
/// \param[out] location The locations of all bloom filters relative to the start of the sink.
virtual void WriteTo(::arrow::io::OutputStream* sink,
BloomFilterLocation* location) = 0;

/// \brief Complete the bloom filter builder; no more writes are allowed afterwards.
virtual void Finish() = 0;

virtual ~BloomFilterBuilder() = default;
};

} // namespace parquet