Skip to content

Commit b00f413

Browse files
committed
Merge remote-tracking branch 'upstream' into cg-overhaul
2 parents 4dcdba0 + 28b4625 commit b00f413

File tree

11 files changed

+129
-146
lines changed

11 files changed

+129
-146
lines changed

include/cuco/detail/bitwise_compare.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ __host__ __device__ constexpr std::size_t alignment()
8383
* @return If the bits in the object representations of lhs and rhs are identical.
8484
*/
8585
template <typename T>
86-
__host__ __device__ constexpr bool bitwise_compare(T const& lhs, T const& rhs)
86+
__host__ __device__ constexpr bool bitwise_compare(T lhs, T rhs)
8787
{
8888
static_assert(
8989
cuco::is_bitwise_comparable_v<T>,

include/cuco/detail/open_addressing/functors.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ struct slot_is_filled {
7373
* @param empty_sentinel Key sentinel indicating an empty slot
7474
* @param erased_sentinel Key sentinel indicating an erased slot
7575
*/
76-
explicit constexpr slot_is_filled(T const& empty_sentinel, T const& erased_sentinel) noexcept
76+
explicit constexpr slot_is_filled(T empty_sentinel, T erased_sentinel) noexcept
7777
: empty_sentinel_{empty_sentinel}, erased_sentinel_{erased_sentinel}
7878
{
7979
}
@@ -88,7 +88,7 @@ struct slot_is_filled {
8888
* @return `true` if slot is filled
8989
*/
9090
template <typename S>
91-
__device__ constexpr bool operator()(S const& slot) const noexcept
91+
__device__ constexpr bool operator()(S slot) const noexcept
9292
{
9393
auto const key = [&]() {
9494
if constexpr (HasPayload) {

include/cuco/detail/open_addressing/kernels.cuh

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_if_n(InputIt first,
7878

7979
while (idx < n) {
8080
if (pred(*(stencil + idx))) {
81-
typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
82-
*(first + idx)};
81+
typename cuda::std::iterator_traits<InputIt>::value_type const insert_element{*(first + idx)};
8382
if constexpr (CGSize == 1) {
8483
if (ref.insert(insert_element)) { thread_num_successes++; };
8584
} else {
@@ -138,8 +137,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_if_n(
138137

139138
while (idx < n) {
140139
if (pred(*(stencil + idx))) {
141-
typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
142-
*(first + idx)};
140+
typename cuda::std::iterator_traits<InputIt>::value_type const insert_element{*(first + idx)};
143141
if constexpr (CGSize == 1) {
144142
ref.insert(insert_element);
145143
} else {
@@ -175,7 +173,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void erase(InputIt first,
175173
auto idx = cuco::detail::global_thread_id() / CGSize;
176174

177175
while (idx < n) {
178-
typename cuda::std::iterator_traits<InputIt>::value_type const& erase_element{*(first + idx)};
176+
typename cuda::std::iterator_traits<InputIt>::value_type const erase_element{*(first + idx)};
179177
if constexpr (CGSize == 1) {
180178
ref.erase(erase_element);
181179
} else {
@@ -216,7 +214,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void for_each_n(InputIt first,
216214
auto idx = cuco::detail::global_thread_id() / CGSize;
217215

218216
while (idx < n) {
219-
typename cuda::std::iterator_traits<InputIt>::value_type const& key{*(first + idx)};
217+
typename cuda::std::iterator_traits<InputIt>::value_type const key{*(first + idx)};
220218
if constexpr (CGSize == 1) {
221219
ref.for_each(key, callback_op);
222220
} else {
@@ -280,7 +278,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
280278
while ((idx - thread_idx / CGSize) < n) { // the whole thread block falls into the same iteration
281279
if constexpr (CGSize == 1) {
282280
if (idx < n) {
283-
typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
281+
typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
284282
/*
285283
* The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
286284
* sector stores from L2 to global memory. By writing results to shared memory and then
@@ -294,7 +292,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
294292
} else {
295293
auto const tile = cg::tiled_partition<CGSize, cg::thread_block>(block);
296294
if (idx < n) {
297-
typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
295+
typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
298296
auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false;
299297
if (tile.thread_rank() == 0) { *(output_begin + idx) = found; }
300298
}
@@ -396,8 +394,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first,
396394
while ((idx - thread_idx / CGSize) < n) { // the whole thread block falls into the same iteration
397395
if constexpr (CGSize == 1) {
398396
if (idx < n) {
399-
typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
400-
auto const found = ref.find(key);
397+
typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
398+
auto const found = ref.find(key);
401399
/*
402400
* The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
403401
* sector stores from L2 to global memory. By writing results to shared memory and then
@@ -411,8 +409,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first,
411409
} else {
412410
auto const tile = cg::tiled_partition<CGSize, cg::thread_block>(block);
413411
if (idx < n) {
414-
typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
415-
auto const found = ref.find(tile, key);
412+
typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
413+
auto const found = ref.find(tile, key);
416414

417415
if (tile.thread_rank() == 0) {
418416
*(output_begin + idx) = pred(*(stencil + idx)) ? output(found) : sentinel;
@@ -486,7 +484,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_and_find(InputIt first,
486484
while ((idx - thread_idx / CGSize) < n) { // the whole thread block falls into the same iteration
487485
if constexpr (CGSize == 1) {
488486
if (idx < n) {
489-
typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
487+
typename cuda::std::iterator_traits<InputIt>::value_type const insert_element{
490488
*(first + idx)};
491489
auto const [iter, inserted] = ref.insert_and_find(insert_element);
492490
/*
@@ -506,7 +504,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_and_find(InputIt first,
506504
} else {
507505
auto const tile = cg::tiled_partition<CGSize, cg::thread_block>(cg::this_thread_block());
508506
if (idx < n) {
509-
typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
507+
typename cuda::std::iterator_traits<InputIt>::value_type const insert_element{
510508
*(first + idx)};
511509
auto const [iter, inserted] = ref.insert_and_find(tile, insert_element);
512510
if (tile.thread_rank() == 0) {
@@ -557,7 +555,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void count(InputIt first,
557555
auto idx = cuco::detail::global_thread_id() / CGSize;
558556

559557
while (idx < n) {
560-
typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
558+
typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
561559
if constexpr (CGSize == 1) {
562560
if constexpr (IsOuter) {
563561
thread_count += max(ref.count(key), outer_min_count);
@@ -617,7 +615,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void count_each(InputIt first,
617615
size_type constexpr outer_min_count = 1;
618616

619617
while (idx < n) {
620-
typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
618+
typename cuda::std::iterator_traits<InputIt>::value_type const key = *(first + idx);
621619
if constexpr (CGSize == 1) {
622620
if constexpr (IsOuter) {
623621
*(output_begin + idx) = max(ref.count(key), size_type{outer_min_count});

include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh

Lines changed: 32 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -372,7 +372,7 @@ class open_addressing_ref_impl {
372372
* @return True if the given element is successfully inserted
373373
*/
374374
template <typename Value>
375-
__device__ bool insert(Value const& value) noexcept
375+
__device__ bool insert(Value value) noexcept
376376
{
377377
static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
378378

@@ -428,7 +428,7 @@ class open_addressing_ref_impl {
428428
*/
429429
template <bool SupportsErase, typename Value, typename ParentCG>
430430
__device__ bool insert(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
431-
Value const& value) noexcept
431+
Value value) noexcept
432432
{
433433
auto const val = this->heterogeneous_value(value);
434434
auto const key = this->extract_key(val);
@@ -513,7 +513,7 @@ class open_addressing_ref_impl {
513513
* insertion is successful or not.
514514
*/
515515
template <typename Value>
516-
__device__ cuda::std::pair<iterator, bool> insert_and_find(Value const& value) noexcept
516+
__device__ cuda::std::pair<iterator, bool> insert_and_find(Value value) noexcept
517517
{
518518
static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
519519
#if __CUDA_ARCH__ < 700
@@ -589,7 +589,7 @@ class open_addressing_ref_impl {
589589
*/
590590
template <typename Value, typename ParentCG>
591591
__device__ cuda::std::pair<iterator, bool> insert_and_find(
592-
cooperative_groups::thread_block_tile<cg_size, ParentCG> group, Value const& value) noexcept
592+
cooperative_groups::thread_block_tile<cg_size, ParentCG> group, Value value) noexcept
593593
{
594594
#if __CUDA_ARCH__ < 700
595595
// Spinning to ensure that the write to the value part took place requires
@@ -680,12 +680,12 @@ class open_addressing_ref_impl {
680680
*
681681
* @tparam ProbeKey Input type which is convertible to 'key_type'
682682
*
683-
* @param value The element to erase
683+
* @param key The element to erase
684684
*
685685
* @return True if the given element is successfully erased
686686
*/
687687
template <typename ProbeKey>
688-
__device__ bool erase(ProbeKey const& key) noexcept
688+
__device__ bool erase(ProbeKey key) noexcept
689689
{
690690
static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
691691

@@ -726,13 +726,13 @@ class open_addressing_ref_impl {
726726
* @tparam ParentCG Type of parent Cooperative Group
727727
*
728728
* @param group The Cooperative Group used to perform group erase
729-
* @param value The element to erase
729+
* @param key The element to erase
730730
*
731731
* @return True if the given element is successfully erased
732732
*/
733733
template <typename ProbeKey, typename ParentCG>
734734
__device__ bool erase(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
735-
ProbeKey const& key) noexcept
735+
ProbeKey key) noexcept
736736
{
737737
auto probing_iter =
738738
probing_scheme_.template make_iterator<bucket_size>(group, key, storage_ref_.extent());
@@ -790,7 +790,7 @@ class open_addressing_ref_impl {
790790
* @return A boolean indicating whether the probe key is present
791791
*/
792792
template <typename ProbeKey>
793-
[[nodiscard]] __device__ bool contains(ProbeKey const& key) const noexcept
793+
[[nodiscard]] __device__ bool contains(ProbeKey key) const noexcept
794794
{
795795
static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
796796
auto probing_iter =
@@ -830,8 +830,7 @@ class open_addressing_ref_impl {
830830
*/
831831
template <typename ProbeKey, typename ParentCG>
832832
[[nodiscard]] __device__ bool contains(
833-
cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
834-
ProbeKey const& key) const noexcept
833+
cooperative_groups::thread_block_tile<cg_size, ParentCG> group, ProbeKey key) const noexcept
835834
{
836835
auto probing_iter =
837836
probing_scheme_.template make_iterator<bucket_size>(group, key, storage_ref_.extent());
@@ -871,7 +870,7 @@ class open_addressing_ref_impl {
871870
* @return An iterator to the position at which the equivalent key is stored
872871
*/
873872
template <typename ProbeKey>
874-
[[nodiscard]] __device__ iterator find(ProbeKey const& key) const noexcept
873+
[[nodiscard]] __device__ iterator find(ProbeKey key) const noexcept
875874
{
876875
static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
877876
auto probing_iter =
@@ -915,8 +914,7 @@ class open_addressing_ref_impl {
915914
*/
916915
template <typename ProbeKey, typename ParentCG>
917916
[[nodiscard]] __device__ iterator
918-
find(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
919-
ProbeKey const& key) const noexcept
917+
find(cooperative_groups::thread_block_tile<cg_size, ParentCG> group, ProbeKey key) const noexcept
920918
{
921919
auto probing_iter =
922920
probing_scheme_.template make_iterator<bucket_size>(group, key, storage_ref_.extent());
@@ -964,7 +962,7 @@ class open_addressing_ref_impl {
964962
* @return Number of occurrences found by the current thread
965963
*/
966964
template <typename ProbeKey>
967-
[[nodiscard]] __device__ size_type count(ProbeKey const& key) const noexcept
965+
[[nodiscard]] __device__ size_type count(ProbeKey key) const noexcept
968966
{
969967
if constexpr (not allows_duplicates) {
970968
return static_cast<size_type>(this->contains(key));
@@ -1013,8 +1011,7 @@ class open_addressing_ref_impl {
10131011
*/
10141012
template <typename ProbeKey, typename ParentCG>
10151013
[[nodiscard]] __device__ size_type
1016-
count(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
1017-
ProbeKey const& key) const noexcept
1014+
count(cooperative_groups::thread_block_tile<cg_size, ParentCG> group, ProbeKey key) const noexcept
10181015
{
10191016
auto probing_iter =
10201017
probing_scheme_.template make_iterator<bucket_size>(group, key, storage_ref_.extent());
@@ -1369,7 +1366,7 @@ class open_addressing_ref_impl {
13691366
* @param callback_op Function to apply to every matched slot
13701367
*/
13711368
template <class ProbeKey, class CallbackOp>
1372-
__device__ void for_each(ProbeKey const& key, CallbackOp&& callback_op) const noexcept
1369+
__device__ void for_each(ProbeKey key, CallbackOp&& callback_op) const noexcept
13731370
{
13741371
static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
13751372
auto probing_iter =
@@ -1420,7 +1417,7 @@ class open_addressing_ref_impl {
14201417
*/
14211418
template <class ProbeKey, class CallbackOp, typename ParentCG>
14221419
__device__ void for_each(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
1423-
ProbeKey const& key,
1420+
ProbeKey key,
14241421
CallbackOp&& callback_op) const noexcept
14251422
{
14261423
auto probing_iter =
@@ -1485,7 +1482,7 @@ class open_addressing_ref_impl {
14851482
*/
14861483
template <class ProbeKey, class CallbackOp, class SyncOp, typename ParentCG>
14871484
__device__ void for_each(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
1488-
ProbeKey const& key,
1485+
ProbeKey key,
14891486
CallbackOp&& callback_op,
14901487
SyncOp&& sync_op) const noexcept
14911488
{
@@ -1545,7 +1542,7 @@ class open_addressing_ref_impl {
15451542
* @return The key
15461543
*/
15471544
template <typename Value>
1548-
[[nodiscard]] __host__ __device__ constexpr auto extract_key(Value const& value) const noexcept
1545+
[[nodiscard]] __host__ __device__ constexpr auto extract_key(Value value) const noexcept
15491546
{
15501547
if constexpr (has_payload) {
15511548
return thrust::raw_reference_cast(value).first;
@@ -1566,8 +1563,7 @@ class open_addressing_ref_impl {
15661563
* @return The payload
15671564
*/
15681565
template <typename Value, typename Enable = cuda::std::enable_if_t<has_payload and sizeof(Value)>>
1569-
[[nodiscard]] __host__ __device__ constexpr auto extract_payload(
1570-
Value const& value) const noexcept
1566+
[[nodiscard]] __host__ __device__ constexpr auto extract_payload(Value value) const noexcept
15711567
{
15721568
return thrust::raw_reference_cast(value).second;
15731569
}
@@ -1582,7 +1578,7 @@ class open_addressing_ref_impl {
15821578
* @return The converted object
15831579
*/
15841580
template <typename T>
1585-
[[nodiscard]] __device__ constexpr value_type native_value(T const& value) const noexcept
1581+
[[nodiscard]] __device__ constexpr value_type native_value(T value) const noexcept
15861582
{
15871583
if constexpr (has_payload) {
15881584
return {static_cast<key_type>(this->extract_key(value)), this->extract_payload(value)};
@@ -1602,7 +1598,7 @@ class open_addressing_ref_impl {
16021598
* @return The converted object
16031599
*/
16041600
template <typename T>
1605-
[[nodiscard]] __device__ constexpr auto heterogeneous_value(T const& value) const noexcept
1601+
[[nodiscard]] __device__ constexpr auto heterogeneous_value(T value) const noexcept
16061602
{
16071603
if constexpr (has_payload and not cuda::std::is_same_v<T, value_type>) {
16081604
using mapped_type = decltype(this->empty_value_sentinel());
@@ -1624,7 +1620,7 @@ class open_addressing_ref_impl {
16241620
*
16251621
* @return The sentinel value used to represent an erased slot
16261622
*/
1627-
[[nodiscard]] __device__ constexpr value_type const erased_slot_sentinel() const noexcept
1623+
[[nodiscard]] __device__ constexpr value_type erased_slot_sentinel() const noexcept
16281624
{
16291625
if constexpr (has_payload) {
16301626
return cuco::pair{this->erased_key_sentinel(), this->empty_value_sentinel()};
@@ -1685,8 +1681,8 @@ class open_addressing_ref_impl {
16851681
*/
16861682
template <typename Value>
16871683
[[nodiscard]] __device__ constexpr insert_result back_to_back_cas(value_type* address,
1688-
value_type const& expected,
1689-
Value const& desired) noexcept
1684+
value_type expected,
1685+
Value desired) noexcept
16901686
{
16911687
using mapped_type = cuda::std::decay_t<decltype(this->empty_value_sentinel())>;
16921688

@@ -1736,8 +1732,9 @@ class open_addressing_ref_impl {
17361732
* @return Result of this operation, i.e., success/continue/duplicate
17371733
*/
17381734
template <typename Value>
1739-
[[nodiscard]] __device__ constexpr insert_result cas_dependent_write(
1740-
value_type* address, value_type const& expected, Value const& desired) noexcept
1735+
[[nodiscard]] __device__ constexpr insert_result cas_dependent_write(value_type* address,
1736+
value_type expected,
1737+
Value desired) noexcept
17411738
{
17421739
using mapped_type = cuda::std::decay_t<decltype(this->empty_value_sentinel())>;
17431740

@@ -1778,8 +1775,8 @@ class open_addressing_ref_impl {
17781775
*/
17791776
template <typename Value>
17801777
[[nodiscard]] __device__ insert_result attempt_insert(value_type* address,
1781-
value_type const& expected,
1782-
Value const& desired) noexcept
1778+
value_type expected,
1779+
Value desired) noexcept
17831780
{
17841781
if constexpr (sizeof(value_type) <= 8) {
17851782
return packed_cas(address, expected, desired);
@@ -1811,8 +1808,8 @@ class open_addressing_ref_impl {
18111808
*/
18121809
template <typename Value>
18131810
[[nodiscard]] __device__ insert_result attempt_insert_stable(value_type* address,
1814-
value_type const& expected,
1815-
Value const& desired) noexcept
1811+
value_type expected,
1812+
Value desired) noexcept
18161813
{
18171814
if constexpr (sizeof(value_type) <= 8) {
18181815
return packed_cas(address, expected, desired);
@@ -1833,7 +1830,7 @@ class open_addressing_ref_impl {
18331830
* @param sentinel The slot sentinel value
18341831
*/
18351832
template <typename T>
1836-
__device__ void wait_for_payload(T& slot, T const& sentinel) const noexcept
1833+
__device__ void wait_for_payload(T& slot, T sentinel) const noexcept
18371834
{
18381835
auto ref = cuda::atomic_ref<T, Scope>{slot};
18391836
T current;

0 commit comments

Comments
 (0)