Skip to content

Commit ac4ba6b

Browse files
authored
Bloom filter optimizations (1/5): Less noisy benchmarks (#669)
This PR is part 1/5 of the Bloom filter optimization project and must be merged in the correct order. It introduces two changes that drastically reduce the noise level of the benchmark measurements:
- Generate the input data on the fly rather than loading the keys from global memory.
- Increase the number of input keys so that the kernels run longer (previous runtimes were in the single-digit millisecond range, which was too noisy).
1 parent 4f03dcc commit ac4ba6b

File tree

3 files changed

+16
-41
lines changed

3 files changed

+16
-41
lines changed

benchmarks/bloom_filter/add_bench.cu

Lines changed: 6 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -21,12 +21,11 @@
2121
#include <benchmark_utils.hpp>
2222

2323
#include <cuco/bloom_filter.cuh>
24-
#include <cuco/utility/key_generator.cuh>
2524

2625
#include <nvbench/nvbench.cuh>
2726

2827
#include <cuda/std/limits>
29-
#include <thrust/device_vector.h>
28+
#include <thrust/iterator/counting_iterator.h>
3029

3130
#include <cstdint>
3231
#include <exception>
@@ -61,25 +60,19 @@ void bloom_filter_add(nvbench::state& state,
6160
(filter_size_mb * 1024 * 1024) /
6261
(sizeof(typename filter_type::word_type) * filter_type::words_per_block);
6362

64-
thrust::device_vector<Key> keys(num_keys);
65-
66-
key_generator gen;
67-
gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
63+
thrust::counting_iterator<Key> keys(0);
6864

6965
state.add_element_count(num_keys);
7066

7167
filter_type filter{num_sub_filters, {}, {static_cast<uint32_t>(pattern_bits)}};
7268

7369
state.collect_dram_throughput();
74-
state.collect_l1_hit_rates();
7570
state.collect_l2_hit_rates();
76-
state.collect_loads_efficiency();
77-
state.collect_stores_efficiency();
7871

7972
add_fpr_summary(state, filter);
8073

8174
state.exec([&](nvbench::launch& launch) {
82-
filter.add_async(keys.begin(), keys.end(), {launch.get_stream()});
75+
filter.add_async(keys, keys + num_keys, {launch.get_stream()});
8376
});
8477
}
8578

@@ -106,25 +99,19 @@ void arrow_bloom_filter_add(nvbench::state& state, nvbench::type_list<Key, Dist>
10699
// configurations
107100
}
108101

109-
thrust::device_vector<Key> keys(num_keys);
110-
111-
key_generator gen;
112-
gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
102+
thrust::counting_iterator<Key> keys(0);
113103

114104
state.add_element_count(num_keys);
115105

116106
filter_type filter{num_sub_filters};
117107

118108
state.collect_dram_throughput();
119-
state.collect_l1_hit_rates();
120109
state.collect_l2_hit_rates();
121-
state.collect_loads_efficiency();
122-
state.collect_stores_efficiency();
123110

124111
add_fpr_summary(state, filter);
125112

126113
state.exec([&](nvbench::launch& launch) {
127-
filter.add_async(keys.begin(), keys.end(), {launch.get_stream()});
114+
filter.add_async(keys, keys + num_keys, {launch.get_stream()});
128115
});
129116
}
130117

benchmarks/bloom_filter/contains_bench.cu

Lines changed: 8 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -21,12 +21,12 @@
2121
#include <benchmark_utils.hpp>
2222

2323
#include <cuco/bloom_filter.cuh>
24-
#include <cuco/utility/key_generator.cuh>
2524

2625
#include <nvbench/nvbench.cuh>
2726

2827
#include <cuda/std/limits>
2928
#include <thrust/device_vector.h>
29+
#include <thrust/iterator/counting_iterator.h>
3030

3131
#include <exception>
3232

@@ -63,28 +63,22 @@ void bloom_filter_contains(
6363
(filter_size_mb * 1024 * 1024) /
6464
(sizeof(typename filter_type::word_type) * filter_type::words_per_block);
6565

66-
thrust::device_vector<Key> keys(num_keys);
66+
thrust::counting_iterator<Key> keys(0);
6767
thrust::device_vector<bool> result(num_keys, false);
6868

69-
key_generator gen;
70-
gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
71-
7269
state.add_element_count(num_keys);
7370

7471
filter_type filter{num_sub_filters, {}, {static_cast<uint32_t>(pattern_bits)}};
7572

7673
state.collect_dram_throughput();
77-
state.collect_l1_hit_rates();
7874
state.collect_l2_hit_rates();
79-
state.collect_loads_efficiency();
80-
state.collect_stores_efficiency();
8175

8276
add_fpr_summary(state, filter);
8377

84-
filter.add(keys.begin(), keys.end());
78+
filter.add(keys, keys + num_keys);
8579

8680
state.exec([&](nvbench::launch& launch) {
87-
filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()});
81+
filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()});
8882
});
8983
}
9084

@@ -113,28 +107,22 @@ void arrow_bloom_filter_contains(nvbench::state& state, nvbench::type_list<Key,
113107
// configurations
114108
}
115109

116-
thrust::device_vector<Key> keys(num_keys);
110+
thrust::counting_iterator<Key> keys(0);
117111
thrust::device_vector<bool> result(num_keys, false);
118112

119-
key_generator gen;
120-
gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
121-
122113
state.add_element_count(num_keys);
123114

124115
filter_type filter{num_sub_filters};
125116

126117
state.collect_dram_throughput();
127-
state.collect_l1_hit_rates();
128118
state.collect_l2_hit_rates();
129-
state.collect_loads_efficiency();
130-
state.collect_stores_efficiency();
131119

132120
add_fpr_summary(state, filter);
133121

134-
filter.add(keys.begin(), keys.end());
122+
filter.add(keys, keys + num_keys);
135123

136124
state.exec([&](nvbench::launch& launch) {
137-
filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()});
125+
filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()});
138126
});
139127
}
140128

benchmarks/bloom_filter/defaults.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -30,7 +30,7 @@ using BF_KEY = nvbench::int64_t;
3030
using BF_HASH = cuco::xxhash_64<char>;
3131
using BF_WORD = nvbench::uint32_t;
3232

33-
static constexpr auto BF_N = 400'000'000;
33+
static constexpr auto BF_N = 1'000'000'000;
3434
static constexpr auto BF_SIZE_MB = 2'000;
3535
static constexpr auto BF_WORDS_PER_BLOCK = 8;
3636

0 commit comments

Comments
 (0)