Bloom filter optimizations (1/5): Less noisy benchmarks (#669)

sleeepyjack · web-flow · commit ac4ba6b9e342 · 2025-02-13T01:01:23.000+01:00
This PR is part 1/5 of the Bloom filter optimization project; the five
PRs must be merged in numerical order.

This PR introduces two changes that reduce the noise level of the
benchmark measurements drastically:

- Generate input keys on-the-fly via `thrust::counting_iterator` rather
than loading pre-generated keys from global memory.
- Increase the number of input keys (`BF_N`: 400M -> 1B) to make the
kernels run longer (former runtimes were in the single-digit ms range,
which was too noisy).
diff --git a/benchmarks/bloom_filter/add_bench.cu b/benchmarks/bloom_filter/add_bench.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,12 +21,11 @@
 #include <benchmark_utils.hpp>
 
 #include <cuco/bloom_filter.cuh>
-#include <cuco/utility/key_generator.cuh>
 
 #include <nvbench/nvbench.cuh>
 
 #include <cuda/std/limits>
-#include <thrust/device_vector.h>
+#include <thrust/iterator/counting_iterator.h>
 
 #include <cstdint>
 #include <exception>
@@ -61,25 +60,19 @@ void bloom_filter_add(nvbench::state& state,
     (filter_size_mb * 1024 * 1024) /
     (sizeof(typename filter_type::word_type) * filter_type::words_per_block);
 
-  thrust::device_vector<Key> keys(num_keys);
-
-  key_generator gen;
-  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+  thrust::counting_iterator<Key> keys(0);
 
   state.add_element_count(num_keys);
 
   filter_type filter{num_sub_filters, {}, {static_cast<uint32_t>(pattern_bits)}};
 
   state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
 
   add_fpr_summary(state, filter);
 
   state.exec([&](nvbench::launch& launch) {
-    filter.add_async(keys.begin(), keys.end(), {launch.get_stream()});
+    filter.add_async(keys, keys + num_keys, {launch.get_stream()});
   });
 }
 
@@ -106,25 +99,19 @@ void arrow_bloom_filter_add(nvbench::state& state, nvbench::type_list<Key, Dist>
                                                                                  // configurations
   }
 
-  thrust::device_vector<Key> keys(num_keys);
-
-  key_generator gen;
-  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+  thrust::counting_iterator<Key> keys(0);
 
   state.add_element_count(num_keys);
 
   filter_type filter{num_sub_filters};
 
   state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
 
   add_fpr_summary(state, filter);
 
   state.exec([&](nvbench::launch& launch) {
-    filter.add_async(keys.begin(), keys.end(), {launch.get_stream()});
+    filter.add_async(keys, keys + num_keys, {launch.get_stream()});
   });
 }
 
diff --git a/benchmarks/bloom_filter/contains_bench.cu b/benchmarks/bloom_filter/contains_bench.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,12 +21,12 @@
 #include <benchmark_utils.hpp>
 
 #include <cuco/bloom_filter.cuh>
-#include <cuco/utility/key_generator.cuh>
 
 #include <nvbench/nvbench.cuh>
 
 #include <cuda/std/limits>
 #include <thrust/device_vector.h>
+#include <thrust/iterator/counting_iterator.h>
 
 #include <exception>
 
@@ -63,28 +63,22 @@ void bloom_filter_contains(
     (filter_size_mb * 1024 * 1024) /
     (sizeof(typename filter_type::word_type) * filter_type::words_per_block);
 
-  thrust::device_vector<Key> keys(num_keys);
+  thrust::counting_iterator<Key> keys(0);
   thrust::device_vector<bool> result(num_keys, false);
 
-  key_generator gen;
-  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
-
   state.add_element_count(num_keys);
 
   filter_type filter{num_sub_filters, {}, {static_cast<uint32_t>(pattern_bits)}};
 
   state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
 
   add_fpr_summary(state, filter);
 
-  filter.add(keys.begin(), keys.end());
+  filter.add(keys, keys + num_keys);
 
   state.exec([&](nvbench::launch& launch) {
-    filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()});
+    filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()});
   });
 }
 
@@ -113,28 +107,22 @@ void arrow_bloom_filter_contains(nvbench::state& state, nvbench::type_list<Key,
                                                                                  // configurations
   }
 
-  thrust::device_vector<Key> keys(num_keys);
+  thrust::counting_iterator<Key> keys(0);
   thrust::device_vector<bool> result(num_keys, false);
 
-  key_generator gen;
-  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
-
   state.add_element_count(num_keys);
 
   filter_type filter{num_sub_filters};
 
   state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
 
   add_fpr_summary(state, filter);
 
-  filter.add(keys.begin(), keys.end());
+  filter.add(keys, keys + num_keys);
 
   state.exec([&](nvbench::launch& launch) {
-    filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()});
+    filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()});
   });
 }
 
diff --git a/benchmarks/bloom_filter/defaults.hpp b/benchmarks/bloom_filter/defaults.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -30,7 +30,7 @@ using BF_KEY  = nvbench::int64_t;
 using BF_HASH = cuco::xxhash_64<char>;
 using BF_WORD = nvbench::uint32_t;
 
-static constexpr auto BF_N               = 400'000'000;
+static constexpr auto BF_N               = 1'000'000'000;
 static constexpr auto BF_SIZE_MB         = 2'000;
 static constexpr auto BF_WORDS_PER_BLOCK = 8;