Better tests for Bloom filter operations that allow varying CG sizes (#683)

sleeepyjack · web-flow · commit e445e99f8e4e · 2025-03-05T01:49:06.000+01:00
Closes #676
diff --git a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh
@@ -271,13 +271,12 @@ class bloom_filter_impl {
       auto const num_keys = cuco::detail::distance(first, last);
       if (num_keys == 0) { return; }
 
-      auto constexpr cg_size    = add_optimal_cg_size();
       auto constexpr block_size = cuco::detail::default_block_size();
       void const* kernel        = reinterpret_cast<void const*>(
-        detail::bloom_filter_ns::add<cg_size, block_size, InputIt, bloom_filter_impl>);
+        detail::bloom_filter_ns::add<block_size, InputIt, bloom_filter_impl>);
       auto const grid_size = cuco::detail::max_occupancy_grid_size(block_size, kernel);
 
-      detail::bloom_filter_ns::add<cg_size, block_size>
+      detail::bloom_filter_ns::add<block_size>
         <<<grid_size, block_size, 0, stream.get()>>>(first, num_keys, *this);
     }
   }
diff --git a/include/cuco/detail/bloom_filter/kernels.cuh b/include/cuco/detail/bloom_filter/kernels.cuh
@@ -26,7 +26,7 @@ namespace cuco::detail::bloom_filter_ns {
 
 CUCO_SUPPRESS_KERNEL_WARNINGS
 
-template <int32_t CGSize, int32_t BlockSize, class InputIt, class Ref>
+template <int32_t BlockSize, class InputIt, class Ref>
 CUCO_KERNEL __launch_bounds__(BlockSize) void add(InputIt first,
                                                   cuco::detail::index_type n,
                                                   Ref ref)
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -144,4 +144,4 @@ ConfigureTest(HYPERLOGLOG_TEST
 ConfigureTest(BLOOM_FILTER_TEST
     bloom_filter/unique_sequence_test.cu
     bloom_filter/arrow_policy_test.cu
-    )
+    bloom_filter/variable_cg_test.cu)
diff --git a/tests/bloom_filter/unique_sequence_test.cu b/tests/bloom_filter/unique_sequence_test.cu
@@ -27,6 +27,8 @@
 #include <catch2/catch_template_test_macros.hpp>
 #include <catch2/generators/catch_generators.hpp>
 
+#include <exception>
+
 using size_type = int32_t;
 
 template <typename Filter>
@@ -96,8 +98,14 @@ TEMPLATE_TEST_CASE_SIG(
     cuco::bloom_filter<Key, cuco::extent<size_t>, cuda::thread_scope_device, Policy>;
   constexpr size_type num_keys{400};
 
-  uint32_t pattern_bits =
-    GENERATE(Policy::words_per_block, Policy::words_per_block + 1, Policy::words_per_block + 2);
+  uint32_t pattern_bits = Policy::words_per_block + GENERATE(0, 1, 2, 3, 4);
+
+  // some parameter combinations might be invalid so we skip them
+  try {
+    [[maybe_unused]] auto policy = Policy{pattern_bits};
+  } catch (std::exception const& e) {
+    SKIP(e.what());
+  }
 
   auto filter = filter_type{1000, {}, {pattern_bits}};
 
diff --git a/tests/bloom_filter/variable_cg_test.cu b/tests/bloom_filter/variable_cg_test.cu
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <test_utils.hpp>
+
+#include <cuco/bloom_filter.cuh>
+
+#include <cuda/functional>
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/fill.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/sequence.h>
+
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/generators/catch_generators.hpp>
+
+#include <cstdint>
+#include <exception>
+
+using size_type = int32_t;
+
+template <int32_t AddCGSize, int32_t ContainsCGSize, typename Filter>  // CG sizes for add_if_n / contains_if_n
+void test_variable_cg_size(Filter& filter, size_type num_keys)
+{  // Inserts num_keys keys by launching the kernels directly, then requires that every key is found.
+  constexpr int32_t block_size = 128;  // threads per block for every launch below
+  constexpr int32_t grid_size  = 128;  // blocks per launch
+
+  using Key = typename Filter::key_type;
+
+  auto ref = filter.ref();  // reference object handed to each kernel as its last argument
+
+  // Generate num_keys consecutive keys in a device vector
+  thrust::device_vector<Key> keys(num_keys);
+  thrust::sequence(thrust::device, keys.begin(), keys.end());
+
+  thrust::device_vector<bool> contained(num_keys, false);  // one query result per key
+
+  auto const always_true = thrust::constant_iterator<bool>{true};  // presumably the stencil; confirm
+
+  SECTION("Check if fallback kernels work for varying combinations of CG sizes.")
+  {
+    cuco::detail::bloom_filter_ns::add_if_n<AddCGSize, block_size>
+      <<<grid_size, block_size>>>(keys.begin(), num_keys, always_true, cuda::std::identity{}, ref);
+    cuco::detail::bloom_filter_ns::contains_if_n<ContainsCGSize, block_size>
+      <<<grid_size, block_size>>>(
+        keys.begin(), num_keys, always_true, cuda::std::identity{}, contained.begin(), ref);
+    REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), cuda::std::identity{}));  // all found
+  }
+
+  filter.clear();  // presumably resets the filter contents before the next scenario; confirm
+  thrust::fill(contained.begin(), contained.end(), false);  // reset output vector
+
+  SECTION("Check if adaptive add kernel works with fallback contains kernel.")
+  {
+    cuco::detail::bloom_filter_ns::add<block_size>  // no CG size template argument on this kernel
+      <<<grid_size, block_size>>>(keys.begin(), num_keys, ref);
+    cuco::detail::bloom_filter_ns::contains_if_n<ContainsCGSize, block_size>
+      <<<grid_size, block_size>>>(
+        keys.begin(), num_keys, always_true, cuda::std::identity{}, contained.begin(), ref);
+    REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), cuda::std::identity{}));  // all found
+  }
+
+  // TODO: also test adaptive add + adaptive contains and fallback add + adaptive contains (requires #673)
+}
+
+TEMPLATE_TEST_CASE_SIG(
+  "bloom_filter variable CG size tests",
+  "",
+  ((int32_t AddCGSize, int32_t ContainsCGSize, class Key, class Policy),
+   AddCGSize,
+   ContainsCGSize,
+   Key,
+   Policy),
+  (1, 4, int32_t, cuco::default_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 1>),  // add CG 1, contains CG 4
+  (1, 4, int32_t, cuco::default_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 8>),
+  (1, 4, int32_t, cuco::default_filter_policy<cuco::xxhash_64<int32_t>, uint64_t, 1>),
+  (1, 4, int32_t, cuco::default_filter_policy<cuco::xxhash_64<int32_t>, uint64_t, 8>),
+  (4, 1, int32_t, cuco::default_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 1>),  // add CG 4, contains CG 1
+  (4, 1, int32_t, cuco::default_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 8>),
+  (4, 1, int32_t, cuco::default_filter_policy<cuco::xxhash_64<int32_t>, uint64_t, 1>),
+  (4, 1, int32_t, cuco::default_filter_policy<cuco::xxhash_64<int32_t>, uint64_t, 8>))
+{
+  using filter_type =
+    cuco::bloom_filter<Key, cuco::extent<size_t>, cuda::thread_scope_device, Policy>;
+  constexpr size_type num_keys{400};  // number of keys inserted and then queried
+
+  uint32_t pattern_bits = Policy::words_per_block + GENERATE(0, 1, 2, 3, 4);  // one run per offset
+
+  // Skip this run if constructing Policy with the chosen pattern_bits throws
+  try {
+    [[maybe_unused]] auto policy = Policy{pattern_bits};
+  } catch (std::exception const& e) {
+    SKIP(e.what());
+  }
+
+  auto filter = filter_type{1000, {}, {pattern_bits}};  // 1000 is presumably the block count; confirm
+
+  test_variable_cg_size<AddCGSize, ContainsCGSize>(filter, num_keys);
+}

Original file line number	Diff line number	Diff line change
`@@ -271,13 +271,12 @@ class bloom_filter_impl {`
`271`	`271`	`auto const num_keys = cuco::detail::distance(first, last);`
`272`	`272`	`if (num_keys == 0) { return; }`
`273`	`273`
`274`		`- auto constexpr cg_size = add_optimal_cg_size();`
`275`	`274`	`auto constexpr block_size = cuco::detail::default_block_size();`
`276`	`275`	`void const* kernel = reinterpret_cast<void const*>(`
`277`		`- detail::bloom_filter_ns::add<cg_size, block_size, InputIt, bloom_filter_impl>);`
	`276`	`+ detail::bloom_filter_ns::add<block_size, InputIt, bloom_filter_impl>);`
`278`	`277`	`auto const grid_size = cuco::detail::max_occupancy_grid_size(block_size, kernel);`
`279`	`278`
`280`		`- detail::bloom_filter_ns::add<cg_size, block_size>`
	`279`	`+ detail::bloom_filter_ns::add<block_size>`
`281`	`280`	`<<<grid_size, block_size, 0, stream.get()>>>(first, num_keys, *this);`
`282`	`281`	`}`
`283`	`282`	`}`