Skip to content

Commit d829576

Browse files
PointKernelpre-commit-ci[bot]sleeepyjack
authored
Migrate set retrieve to use the OA implementation (#637)
This PR updates the legacy set retrieve to use the new open-addressing solution. It enhances open-addressing retrieve by eliminating the use of coalesced groups to reduce register pressure, resulting in approximately 10% to 40% speedups in multiset retrieve benchmarks. --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Daniel Jünger <[email protected]>
1 parent 644e553 commit d829576

File tree

8 files changed

+432
-571
lines changed

8 files changed

+432
-571
lines changed

benchmarks/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ ConfigureBench(STATIC_SET_BENCH
5252
static_set/contains_bench.cu
5353
static_set/find_bench.cu
5454
static_set/insert_bench.cu
55+
static_set/retrieve_bench.cu
5556
static_set/retrieve_all_bench.cu
5657
static_set/size_bench.cu
5758
static_set/rehash_bench.cu)
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
/*
2+
* Copyright (c) 2024, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include <benchmark_defaults.hpp>
18+
#include <benchmark_utils.hpp>
19+
20+
#include <cuco/static_set.cuh>
21+
#include <cuco/utility/key_generator.cuh>
22+
23+
#include <nvbench/nvbench.cuh>
24+
25+
#include <thrust/device_vector.h>
26+
#include <thrust/transform.h>
27+
28+
using namespace cuco::benchmark;
29+
using namespace cuco::utility;
30+
31+
/**
32+
* @brief A benchmark evaluating `cuco::static_set::retrieve` performance
33+
*/
34+
template <typename Key, typename Dist>
35+
void static_set_retrieve(nvbench::state& state, nvbench::type_list<Key, Dist>)
36+
{
37+
auto const num_keys = state.get_int64("NumInputs");
38+
auto const occupancy = state.get_float64("Occupancy");
39+
auto const matching_rate = state.get_float64("MatchingRate");
40+
41+
std::size_t const size = num_keys / occupancy;
42+
43+
thrust::device_vector<Key> keys(num_keys);
44+
45+
key_generator gen;
46+
gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
47+
48+
gen.dropout(keys.begin(), keys.end(), matching_rate);
49+
50+
state.add_element_count(num_keys);
51+
52+
cuco::static_set<Key> set{size, cuco::empty_key<Key>{-1}};
53+
set.insert(keys.begin(), keys.end());
54+
55+
auto const output_size = set.count(keys.begin(), keys.end());
56+
thrust::device_vector<Key> output_match(output_size);
57+
auto output_probe_begin = thrust::discard_iterator{};
58+
59+
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
60+
set.retrieve(
61+
keys.begin(), keys.end(), output_probe_begin, output_match.begin(), {launch.get_stream()});
62+
});
63+
}
64+
65+
NVBENCH_BENCH_TYPES(static_set_retrieve,
66+
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
67+
nvbench::type_list<distribution::uniform>))
68+
.set_name("static_set_retrieve_uniform_occupancy")
69+
.set_type_axes_names({"Key", "Distribution"})
70+
.set_max_noise(defaults::MAX_NOISE)
71+
.add_int64_axis("NumInputs", {defaults::N})
72+
.add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE)
73+
.add_float64_axis("MatchingRate", {defaults::MATCHING_RATE})
74+
.add_int64_axis("Multiplicity", {defaults::MULTIPLICITY});
75+
76+
NVBENCH_BENCH_TYPES(static_set_retrieve,
77+
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
78+
nvbench::type_list<distribution::uniform>))
79+
.set_name("static_set_retrieve_uniform_matching_rate")
80+
.set_type_axes_names({"Key", "Distribution"})
81+
.set_max_noise(defaults::MAX_NOISE)
82+
.add_int64_axis("NumInputs", {defaults::N})
83+
.add_float64_axis("Occupancy", {defaults::OCCUPANCY})
84+
.add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE)
85+
.add_int64_axis("Multiplicity", {defaults::MULTIPLICITY});
86+
87+
NVBENCH_BENCH_TYPES(static_set_retrieve,
88+
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
89+
nvbench::type_list<distribution::uniform>))
90+
.set_name("static_set_retrieve_uniform_multiplicity")
91+
.set_type_axes_names({"Key", "Distribution"})
92+
.set_max_noise(defaults::MAX_NOISE)
93+
.add_int64_axis("NumInputs", {defaults::N})
94+
.add_float64_axis("Occupancy", {defaults::OCCUPANCY})
95+
.add_float64_axis("MatchingRate", {defaults::MATCHING_RATE})
96+
.add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE);

include/cuco/detail/open_addressing/kernels.cuh

Lines changed: 70 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -415,77 +415,6 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first,
415415
}
416416
}
417417

418-
/**
419-
* @brief Retrieves the equivalent container elements of all keys in the range `[input_probe,
420-
* input_probe + n)`.
421-
*
422-
* If key `k = *(input_probe + i)` has one or more matches in the container, copies `k` to
423-
* `output_probe` and associated slot contents to `output_match`, respectively. The output order is
424-
* unspecified.
425-
*
426-
* @tparam IsOuter Flag indicating whether it's an outer count or not
427-
* @tparam block_size The size of the thread block
428-
* @tparam InputProbeIt Device accessible input iterator
429-
* @tparam OutputProbeIt Device accessible input iterator whose `value_type` is
430-
* convertible to the `InputProbeIt`'s `value_type`
431-
* @tparam OutputMatchIt Device accessible input iterator whose `value_type` is
432-
* convertible to the container's `value_type`
433-
* @tparam AtomicCounter Integral atomic type that follows the same semantics as
434-
* `cuda::(std::)atomic(_ref)`
435-
* @tparam Ref Type of non-owning device ref allowing access to storage
436-
*
437-
* @param input_probe Beginning of the sequence of input keys
438-
* @param n Number of the keys to query
439-
* @param output_probe Beginning of the sequence of keys corresponding to matching elements in
440-
* `output_match`
441-
* @param output_match Beginning of the sequence of matching elements
442-
* @param atomic_counter Pointer to an atomic object of integral type that is used to count the
443-
* number of output elements
444-
* @param ref Non-owning container device ref used to access the slot storage
445-
*/
446-
template <bool IsOuter,
447-
int32_t BlockSize,
448-
class InputProbeIt,
449-
class OutputProbeIt,
450-
class OutputMatchIt,
451-
class AtomicCounter,
452-
class Ref>
453-
CUCO_KERNEL __launch_bounds__(BlockSize) void retrieve(InputProbeIt input_probe,
454-
cuco::detail::index_type n,
455-
OutputProbeIt output_probe,
456-
OutputMatchIt output_match,
457-
AtomicCounter* atomic_counter,
458-
Ref ref)
459-
{
460-
namespace cg = cooperative_groups;
461-
462-
auto const block = cg::this_thread_block();
463-
auto constexpr tiles_in_block = BlockSize / Ref::cg_size;
464-
// make sure all but the last block are always occupied
465-
auto const items_per_block = detail::int_div_ceil(n, tiles_in_block * gridDim.x) * tiles_in_block;
466-
467-
auto const block_begin_offset = block.group_index().x * items_per_block;
468-
auto const block_end_offset = min(n, block_begin_offset + items_per_block);
469-
470-
if (block_begin_offset < block_end_offset) {
471-
if constexpr (IsOuter) {
472-
ref.retrieve_outer<BlockSize>(block,
473-
input_probe + block_begin_offset,
474-
input_probe + block_end_offset,
475-
output_probe,
476-
output_match,
477-
*atomic_counter);
478-
} else {
479-
ref.retrieve<BlockSize>(block,
480-
input_probe + block_begin_offset,
481-
input_probe + block_end_offset,
482-
output_probe,
483-
output_match,
484-
*atomic_counter);
485-
}
486-
}
487-
}
488-
489418
/**
490419
* @brief Inserts all elements in the range `[first, last)`.
491420
*
@@ -642,6 +571,76 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void count(InputIt first,
642571
if (threadIdx.x == 0) { count->fetch_add(block_count, cuda::std::memory_order_relaxed); }
643572
}
644573

574+
/**
575+
* @brief Retrieves the equivalent container elements of all keys in the range `[input_probe,
576+
* input_probe + n)`.
577+
*
578+
* If key `k = *(input_probe + i)` has one or more matches in the container, copies `k` to
579+
* `output_probe` and associated slot contents to `output_match`, respectively. The output order is
580+
* unspecified.
581+
*
582+
* @tparam IsOuter Flag indicating whether it's an outer count or not
583+
* @tparam block_size The size of the thread block
584+
* @tparam InputProbeIt Device accessible input iterator
585+
* @tparam OutputProbeIt Device accessible input iterator whose `value_type` is
586+
* convertible to the `InputProbeIt`'s `value_type`
587+
* @tparam OutputMatchIt Device accessible input iterator whose `value_type` is
588+
* convertible to the container's `value_type`
589+
* @tparam AtomicCounter Integral atomic type that follows the same semantics as
590+
* `cuda::(std::)atomic(_ref)`
591+
* @tparam Ref Type of non-owning device ref allowing access to storage
592+
*
593+
* @param input_probe Beginning of the sequence of input keys
594+
* @param n Number of the keys to query
595+
* @param output_probe Beginning of the sequence of keys corresponding to matching elements in
596+
* `output_match`
597+
* @param output_match Beginning of the sequence of matching elements
598+
* @param atomic_counter Pointer to an atomic object of integral type that is used to count the
599+
* number of output elements
600+
* @param ref Non-owning container device ref used to access the slot storage
601+
*/
602+
template <bool IsOuter,
603+
int32_t BlockSize,
604+
class InputProbeIt,
605+
class OutputProbeIt,
606+
class OutputMatchIt,
607+
class AtomicCounter,
608+
class Ref>
609+
CUCO_KERNEL void retrieve(InputProbeIt input_probe,
610+
cuco::detail::index_type n,
611+
OutputProbeIt output_probe,
612+
OutputMatchIt output_match,
613+
AtomicCounter* atomic_counter,
614+
Ref ref)
615+
{
616+
namespace cg = cooperative_groups;
617+
618+
auto const block = cg::this_thread_block();
619+
auto constexpr tiles_in_block = BlockSize / Ref::cg_size;
620+
621+
auto const block_begin_offset = block.group_index().x * tiles_in_block;
622+
auto const block_end_offset =
623+
min(n, static_cast<cuco::detail::index_type>(block_begin_offset + tiles_in_block));
624+
625+
if (block_begin_offset < block_end_offset) {
626+
if constexpr (IsOuter) {
627+
ref.retrieve_outer<BlockSize>(block,
628+
input_probe + block_begin_offset,
629+
input_probe + block_end_offset,
630+
output_probe,
631+
output_match,
632+
atomic_counter);
633+
} else {
634+
ref.retrieve<BlockSize>(block,
635+
input_probe + block_begin_offset,
636+
input_probe + block_end_offset,
637+
output_probe,
638+
output_match,
639+
atomic_counter);
640+
}
641+
}
642+
}
643+
645644
/**
646645
* @brief Calculates the number of filled slots for the given bucket storage.
647646
*

0 commit comments

Comments
 (0)