/*
 * Copyright (c) 2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <test_utils.hpp>

#include <cuco/detail/utility/cuda.hpp>
#include <cuco/static_multimap.cuh>

#include <cuda/atomic>
#include <cuda/functional>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>

#include <catch2/catch_template_test_macros.hpp>

#include <cstddef>

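// Scalar (cg_size == 1) validation kernel: each thread walks the input keys in a
// grid-stride loop and uses the scalar for_each overload to visit all slots matching
// its key. A slot counts only if the stored value also equals the key (pairs are
// inserted as {k, k}); any count that differs from the expected multiplicity bumps
// the error counter.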
template <class Ref, class InputIt, class AtomicErrorCounter>
CUCO_KERNEL void for_each_check_scalar(Ref ref,
                                       InputIt first,
                                       std::size_t n,
                                       std::size_t multiplicity,
                                       AtomicErrorCounter* error_counter)
{
  static_assert(Ref::cg_size == 1, "Scalar test must have cg_size==1");
  auto const loop_stride = cuco::detail::grid_stride();
  auto idx               = cuco::detail::global_thread_id();

  while (idx < n) {
    auto const& key     = *(first + idx);
    std::size_t matches = 0;
    ref.for_each(key, [&] __device__(auto const slot) {
      auto const [slot_key, slot_value] = slot;
      if (ref.key_eq()(key, slot_key) and ref.key_eq()(slot_key, slot_value)) { matches++; }
    });
    if (matches != multiplicity) { error_counter->fetch_add(1, cuda::memory_order_relaxed); }
    idx += loop_stride;
  }
}

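// Cooperative-group validation kernel: each tile of Ref::cg_size threads processes one
// key. Per-thread match counts are summed with a CG reduction and rank 0 of the tile
// reports a mismatch against the expected multiplicity. With Synced == true, the
// overload taking an extra group callback is exercised; here the callback simply calls
// group.sync().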
template <bool Synced, class Ref, class InputIt, class AtomicErrorCounter>
CUCO_KERNEL void for_each_check_cooperative(Ref ref,
                                            InputIt first,
                                            std::size_t n,
                                            std::size_t multiplicity,
                                            AtomicErrorCounter* error_counter)
{
  auto const loop_stride = cuco::detail::grid_stride() / Ref::cg_size;
  auto idx               = cuco::detail::global_thread_id() / Ref::cg_size;

  while (idx < n) {
    auto const tile =
      cooperative_groups::tiled_partition<Ref::cg_size>(cooperative_groups::this_thread_block());
    auto const& key            = *(first + idx);
    std::size_t thread_matches = 0;
    if constexpr (Synced) {
      ref.for_each(
        tile,
        key,
        [&] __device__(auto const slot) {
          auto const [slot_key, slot_value] = slot;
          if (ref.key_eq()(key, slot_key) and ref.key_eq()(slot_key, slot_value)) {
            thread_matches++;
          }
        },
        [] __device__(auto const& group) { group.sync(); });
    } else {
      ref.for_each(tile, key, [&] __device__(auto const slot) {
        auto const [slot_key, slot_value] = slot;
        if (ref.key_eq()(key, slot_key) and ref.key_eq()(slot_key, slot_value)) {
          thread_matches++;
        }
      });
    }
    auto const tile_matches =
      cooperative_groups::reduce(tile, thread_matches, cooperative_groups::plus<std::size_t>());
    if (tile_matches != multiplicity and tile.thread_rank() == 0) {
      error_counter->fetch_add(1, cuda::memory_order_relaxed);
    }
    idx += loop_stride;
  }
}

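// The test is instantiated for each combination of key type (int32_t/int64_t), probing
// scheme (double hashing/linear probing), and cooperative-group size (1/2).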
TEMPLATE_TEST_CASE_SIG(
  "static_multimap for_each tests",
  "",
  ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize),
  (int32_t, cuco::test::probe_sequence::double_hashing, 1),
  (int32_t, cuco::test::probe_sequence::double_hashing, 2),
  (int64_t, cuco::test::probe_sequence::double_hashing, 1),
  (int64_t, cuco::test::probe_sequence::double_hashing, 2),
  (int32_t, cuco::test::probe_sequence::linear_probing, 1),
  (int32_t, cuco::test::probe_sequence::linear_probing, 2),
  (int64_t, cuco::test::probe_sequence::linear_probing, 1),
  (int64_t, cuco::test::probe_sequence::linear_probing, 2))
{
  constexpr size_t num_unique_keys{400};
  constexpr size_t key_multiplicity{5};
  constexpr size_t num_keys{num_unique_keys * key_multiplicity};

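  // Select the probing scheme under test, using cuco's default hash function in both cases.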
  using probe = std::conditional_t<Probe == cuco::test::probe_sequence::linear_probing,
                                   cuco::linear_probing<CGSize, cuco::default_hash_function<Key>>,
                                   cuco::double_hashing<CGSize, cuco::default_hash_function<Key>>>;

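  // Build the multimap; -1 is the sentinel marking empty key and value slots, and the
  // brace-initialized arguments default-construct the remaining constructor parameters.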
  auto set = cuco::experimental::static_multimap{num_keys,
                                                 cuco::empty_key<Key>{-1},
                                                 cuco::empty_value<Key>{-1},
                                                 {},
                                                 probe{},
                                                 {},
                                                 cuco::storage<2>{}};

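  // Generate num_keys keys where every unique key in [0, num_unique_keys) appears exactly
  // key_multiplicity times, and insert them as {key, key} pairs so each value mirrors its key.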
  auto unique_keys_begin  = thrust::counting_iterator<Key>(0);
  auto gen_duplicate_keys = cuda::proclaim_return_type<Key>(
    [] __device__(auto const& k) { return static_cast<Key>(k % num_unique_keys); });
  auto keys_begin = thrust::make_transform_iterator(unique_keys_begin, gen_duplicate_keys);

  auto const pairs_begin = thrust::make_transform_iterator(
    keys_begin, cuda::proclaim_return_type<cuco::pair<Key, Key>>([] __device__(auto i) {
      return cuco::pair<Key, Key>{i, i};
    }));

  set.insert(pairs_begin, pairs_begin + num_keys);

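  // The error counter lives in pinned host memory (cudaMallocHost) so device kernels can
  // increment it and the host can read it directly after synchronizing; the system-scope
  // cuda::atomic keeps host and device updates coherent.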
  using error_counter_type = cuda::atomic<std::size_t, cuda::thread_scope_system>;
  error_counter_type* error_counter;
  CUCO_CUDA_TRY(cudaMallocHost(&error_counter, sizeof(error_counter_type)));
  new (error_counter) error_counter_type{0};

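  // Launch enough blocks to cover one cooperative group of CGSize threads per unique key.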
  auto const grid_size  = cuco::detail::grid_size(num_unique_keys, CGSize);
  auto const block_size = cuco::detail::default_block_size();

  // test scalar for_each
  if constexpr (CGSize == 1) {
    for_each_check_scalar<<<grid_size, block_size>>>(
      set.ref(cuco::for_each), unique_keys_begin, num_unique_keys, key_multiplicity, error_counter);
    CUCO_CUDA_TRY(cudaDeviceSynchronize());
    REQUIRE(error_counter->load() == 0);
    error_counter->store(0);
  }

  // test CG for_each
  for_each_check_cooperative<false><<<grid_size, block_size>>>(
    set.ref(cuco::for_each), unique_keys_begin, num_unique_keys, key_multiplicity, error_counter);
  CUCO_CUDA_TRY(cudaDeviceSynchronize());
  REQUIRE(error_counter->load() == 0);
  error_counter->store(0);

  // test synchronized CG for_each
  for_each_check_cooperative<true><<<grid_size, block_size>>>(
    set.ref(cuco::for_each), unique_keys_begin, num_unique_keys, key_multiplicity, error_counter);
  CUCO_CUDA_TRY(cudaDeviceSynchronize());
  REQUIRE(error_counter->load() == 0);

  CUCO_CUDA_TRY(cudaFreeHost(error_counter));
}