Skip to content

Commit 9832776

Browse files
Initial migration of cuco hasher to cudax (#4898)
* Adds xxhash32 implementation and tests * Uses cuda::std::span for dynamic hashing --------- Co-authored-by: Michael Schellenberger Costa <miscco@nvidia.com>
1 parent 1675115 commit 9832776

File tree

6 files changed

+532
-1
lines changed

6 files changed

+532
-1
lines changed
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of CUDA Experimental in CUDA C++ Core Libraries,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_UTILS_CUH
12+
#define _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_UTILS_CUH
13+
14+
#include <cuda/__cccl_config>
15+
16+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17+
# pragma GCC system_header
18+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19+
# pragma clang system_header
20+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21+
# pragma system_header
22+
#endif // no system header
23+
24+
#include <cuda/std/__memory/assume_aligned.h>
25+
#include <cuda/std/cstddef>
26+
#include <cuda/std/cstring>
27+
28+
#include <cuda/std/__cccl/prologue.h>
29+
30+
namespace cuda::experimental::cuco::__detail
31+
{
32+
33+
//! @brief Loads one 4- or 8-byte chunk at block index @p __index from a byte stream.
//!
//! Dispatches on the runtime alignment of the source address so that
//! `memcpy` + `assume_aligned` can be lowered to the widest safe load
//! instruction, falling back to an unaligned byte-wise copy.
//!
//! @tparam _Tp The chunk type to load; must be exactly 4 or 8 bytes wide
//! @tparam _Extent Integral type of the chunk index
//! @param __bytes Pointer to the start of the byte stream
//! @param __index Index of the chunk to load, in units of `sizeof(_Tp)`
//! @return The value read from `__bytes + __index * sizeof(_Tp)`
template <typename _Tp, typename _Extent>
[[nodiscard]] _CCCL_API constexpr _Tp __load_chunk(::cuda::std::byte const* const __bytes, _Extent __index) noexcept
{
  static_assert(sizeof(_Tp) == 4 || sizeof(_Tp) == 8, "__load_chunk must be used with types of size 4 or 8 bytes");

  _Tp __chunk;

  auto __ptr     = __bytes + __index * sizeof(_Tp);
  auto __uintptr = reinterpret_cast<_CUDA_VSTD::uintptr_t>(__ptr);

  if (alignof(_Tp) == 8 && ((__uintptr % 8) == 0))
  {
    _CUDA_VSTD::memcpy(&__chunk, _CUDA_VSTD::assume_aligned<8>(__ptr), sizeof(_Tp));
  }
  else if ((__uintptr % 4) == 0)
  {
    _CUDA_VSTD::memcpy(&__chunk, _CUDA_VSTD::assume_aligned<4>(__ptr), sizeof(_Tp));
  }
  else if ((__uintptr % 2) == 0)
  {
    _CUDA_VSTD::memcpy(&__chunk, _CUDA_VSTD::assume_aligned<2>(__ptr), sizeof(_Tp));
  }
  else
  {
    // Fix: copy from the addressed chunk (__ptr), not from the stream start
    // (__bytes). The previous code ignored __index on this path, so a fully
    // unaligned pointer with a non-zero index loaded the wrong bytes.
    _CUDA_VSTD::memcpy(&__chunk, __ptr, sizeof(_Tp));
  }
  return __chunk;
}
61+
62+
}; // namespace cuda::experimental::cuco::__detail
63+
64+
#include <cuda/std/__cccl/epilogue.h>
65+
66+
#endif // _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_UTILS_CUH
Lines changed: 311 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,311 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of CUDA Experimental in CUDA C++ Core Libraries,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
/*
12+
* _XXHash_32 implementation from
13+
* https://github.com/Cyan4973/xxHash
14+
* -----------------------------------------------------------------------------
15+
* xxHash - Extremely Fast Hash algorithm
16+
* Header File
17+
* Copyright (C) 2012-2021 Yann Collet
18+
*
19+
* BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
20+
*
21+
* Redistribution and use in source and binary forms, with or without
22+
* modification, are permitted provided that the following conditions are
23+
* met:
24+
*
25+
* * Redistributions of source code must retain the above copyright
26+
* notice, this list of conditions and the following disclaimer.
27+
* * Redistributions in binary form must reproduce the above
28+
* copyright notice, this list of conditions and the following disclaimer
29+
* in the documentation and/or other materials provided with the
30+
* distribution.
31+
*
32+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
33+
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
34+
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
35+
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
36+
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
37+
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38+
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
39+
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
40+
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
41+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
42+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43+
*/
44+
45+
#ifndef _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_XXHASH_CUH
46+
#define _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_XXHASH_CUH
47+
48+
#include <cuda/__cccl_config>
49+
50+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
51+
# pragma GCC system_header
52+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
53+
# pragma clang system_header
54+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
55+
# pragma system_header
56+
#endif // no system header
57+
58+
#include <cuda/__utility/static_for.h>
59+
#include <cuda/std/__bit/bit_cast.h>
60+
#include <cuda/std/__bit/rotate.h>
61+
#include <cuda/std/array>
62+
#include <cuda/std/cstddef>
63+
#include <cuda/std/cstdint>
64+
#include <cuda/std/span>
65+
66+
#include <cuda/experimental/__cuco/detail/hash_functions/utils.cuh>
67+
68+
#include <cuda/std/__cccl/prologue.h>
69+
70+
namespace cuda::experimental::cuco::__detail
71+
{
72+
73+
//! @brief A `_XXHash_32` hash function to hash the given argument on host and device.
//!
//! Implements the 32-bit XXH32 digest (see https://github.com/Cyan4973/xxHash). Statically
//! sized keys are hashed via a `bit_cast` into a type-erased byte holder so all loads are
//! whole `uint32_t` blocks; dynamically sized input is hashed through the `span` overload.
//!
//! @tparam _Key The type of the values to hash
template <typename _Key>
struct _XXHash_32
{
private:
  // Magic constants fixed by the XXH32 specification; changing any of them changes every hash value.
  static constexpr _CUDA_VSTD::uint32_t __prime1 = 0x9e3779b1u;
  static constexpr _CUDA_VSTD::uint32_t __prime2 = 0x85ebca77u;
  static constexpr _CUDA_VSTD::uint32_t __prime3 = 0xc2b2ae3du;
  static constexpr _CUDA_VSTD::uint32_t __prime4 = 0x27d4eb2fu;
  static constexpr _CUDA_VSTD::uint32_t __prime5 = 0x165667b1u;

  // XXH32 consumes the input as 4-byte blocks; the main loop processes four blocks (16 bytes) per iteration.
  static constexpr size_t __block_size = 4;
  static constexpr size_t __chunk_size = 16;

  //! @brief Type erased holder of all the bytes
  //!
  //! Mirrors `_Key`'s alignment (and, via the `bit_cast` in `operator()`, its size) so a key can
  //! be reinterpreted as `__num_blocks` whole `uint32_t` blocks plus `__tail_size` trailing bytes.
  //! NOTE(review): `_HasChunks` is true when there is at least one full 4-byte *block*
  //! (`_KeySize >= __block_size`), not one full 16-byte chunk — confirm the name is intentional.
  template <size_t _KeySize,
            size_t _Alignment,
            bool _HasChunks = (_KeySize >= __block_size),
            bool _HasTail   = (_KeySize % __block_size)>
  struct alignas(_Alignment) _Byte_holder
  {
    //! The number of trailing bytes that do not fit into a uint32_t
    static constexpr size_t __tail_size = _KeySize % __block_size;

    //! The number of uint32_t blocks
    static constexpr size_t __num_blocks = _KeySize / __block_size;

    //! The number of 16-byte chunks
    static constexpr size_t __num_chunks = _KeySize / __chunk_size;

    alignas(_Alignment) _CUDA_VSTD::uint32_t __blocks[__num_blocks];
    unsigned char __bytes[__tail_size];
  };

  //! @brief Type erased holder of small types < __block_size
  //! Keys smaller than one 4-byte block carry only trailing bytes, no `__blocks` array.
  template <size_t _KeySize, size_t _Alignment>
  struct alignas(_Alignment) _Byte_holder<_KeySize, _Alignment, false, true>
  {
    //! The number of trailing bytes that do not fit into a uint32_t
    static constexpr size_t __tail_size = _KeySize % __block_size;

    //! The number of uint32_t blocks
    static constexpr size_t __num_blocks = _KeySize / __block_size;

    //! The number of 16-byte chunks
    static constexpr size_t __num_chunks = _KeySize / __chunk_size;

    alignas(_Alignment) unsigned char __bytes[__tail_size];
  };

  //! @brief Type erased holder of types without trailing bytes
  //! Keys whose size is a multiple of 4 bytes carry only whole blocks, no `__bytes` array.
  template <size_t _KeySize, size_t _Alignment>
  struct alignas(_Alignment) _Byte_holder<_KeySize, _Alignment, true, false>
  {
    //! The number of trailing bytes that do not fit into a uint32_t
    static constexpr size_t __tail_size = _KeySize % __block_size;

    //! The number of uint32_t blocks
    static constexpr size_t __num_blocks = _KeySize / __block_size;

    //! The number of 16-byte chunks
    static constexpr size_t __num_chunks = _KeySize / __chunk_size;

    alignas(_Alignment) _CUDA_VSTD::uint32_t __blocks[__num_blocks];
  };

public:
  //! @brief Constructs a XXH32 hash function with the given `seed`.
  //! @param seed A custom number to randomize the resulting hash value
  _CCCL_API constexpr _XXHash_32(_CUDA_VSTD::uint32_t __seed = 0)
      : __seed_{__seed}
  {}

  //! @brief Returns a hash value for its argument, as a value of type `_CUDA_VSTD::uint32_t`.
  //! @param __key The input argument to hash
  //! @return The resulting hash value for `__key`
  [[nodiscard]] _CCCL_API constexpr _CUDA_VSTD::uint32_t operator()(_Key const& __key) const noexcept
  {
    using _Holder = _Byte_holder<sizeof(_Key), alignof(_Key)>;
    // explicit copy to avoid emitting a bunch of LDG.8 instructions
    const _Key __copy{__key};
    return __compute_hash(_CUDA_VSTD::bit_cast<_Holder>(__copy));
  }

  //! @brief Returns a hash value for a contiguous sequence of keys.
  //! @tparam _Extent The span's extent
  //! @param __keys The keys to hash as one contiguous byte range
  //! @return The resulting hash value over all bytes of `__keys`
  template <size_t _Extent>
  [[nodiscard]] _CCCL_API constexpr _CUDA_VSTD::uint32_t
  operator()(_CUDA_VSTD::span<_Key, _Extent> __keys) const noexcept
  {
    // TODO: optimize when _Extent is known at compile time i.e
    // _Extent != _CUDA_VSTD::dynamic_extent, dispatch to bit_cast based implementation
    return __compute_hash_span(__keys);
  }

private:
  //! @brief Computes the XXH32 digest over the bytes held by @p __holder.
  //!
  //! All loop trip counts are compile-time constants of `_Holder`, so the three
  //! stages (16-byte chunks, remaining 4-byte blocks, trailing bytes) are
  //! selected with `if constexpr` and fully unrollable.
  //!
  //! @tparam _Holder A `_Byte_holder` instantiation matching the key's size/alignment
  //! @return The resulting hash value
  template <class _Holder>
  [[nodiscard]] _CCCL_API constexpr _CUDA_VSTD::uint32_t __compute_hash(_Holder __holder) const noexcept
  {
    size_t __offset            = 0; // index of the next unconsumed uint32_t block
    _CUDA_VSTD::uint32_t __h32 = {};

    // process data in 16-byte chunks
    if constexpr (_Holder::__num_chunks > 0)
    {
      // the four XXH32 accumulator lanes with their specified seeds
      _CUDA_VSTD::array<_CUDA_VSTD::uint32_t, 4> __v;
      __v[0] = __seed_ + __prime1 + __prime2;
      __v[1] = __seed_ + __prime2;
      __v[2] = __seed_;
      __v[3] = __seed_ - __prime1;

      for (size_t __i = 0; __i < _Holder::__num_chunks; ++__i)
      {
        // one XXH32 round per lane; __offset advances by 4 blocks per chunk
        cuda::static_for<4>([&](auto i) {
          __v[i] += __holder.__blocks[__offset++] * __prime2;
          __v[i] = _CUDA_VSTD::rotl(__v[i], 13);
          __v[i] *= __prime1;
        });
      }
      // converge the four lanes into a single 32-bit state
      __h32 = _CUDA_VSTD::rotl(__v[0], 1) + _CUDA_VSTD::rotl(__v[1], 7) + _CUDA_VSTD::rotl(__v[2], 12)
            + _CUDA_VSTD::rotl(__v[3], 18);
    }
    else
    {
      __h32 = __seed_ + __prime5;
    }

    // mix in the input length; sizeof(_Holder) equals sizeof(_Key) here, since the
    // bit_cast in operator() would otherwise be ill-formed
    __h32 += _CUDA_VSTD::uint32_t{sizeof(_Holder)};

    // remaining data can be processed in 4-byte chunks
    // NOTE(review): this compares a block *count* against a byte count (16). It is harmless
    // because the loop below self-guards on __offset < __num_blocks, but
    // `__num_blocks % (__chunk_size / __block_size)` looks like the intended condition — confirm.
    if constexpr (_Holder::__num_blocks % __chunk_size > 0)
    {
      for (; __offset < _Holder::__num_blocks; ++__offset)
      {
        __h32 += __holder.__blocks[__offset] * __prime3;
        __h32 = _CUDA_VSTD::rotl(__h32, 17) * __prime4;
      }
    }

    // the following loop is only needed if the size of the key is not a multiple of the block size
    if constexpr (_Holder::__tail_size > 0)
    {
      for (size_t __i = 0; __i < _Holder::__tail_size; ++__i)
      {
        __h32 += (static_cast<_CUDA_VSTD::uint32_t>(__holder.__bytes[__i]) & 255) * __prime5;
        __h32 = _CUDA_VSTD::rotl(__h32, 11) * __prime1;
      }
    }

    return __finalize(__h32);
  }

  //! @brief Computes the XXH32 digest over the raw bytes of a dynamically sized key range.
  //!
  //! Runtime analogue of `__compute_hash`: same three stages, but with all trip counts
  //! derived from `__keys.size_bytes()` and loads done through `__load_chunk`.
  //! NOTE(review): not `constexpr`, although the span `operator()` calling it is — confirm
  //! whether that is intentional (`reinterpret_cast` in `__load_chunk` blocks constant evaluation).
  //!
  //! @param __keys The keys whose bytes are hashed
  //! @return The resulting hash value
  [[nodiscard]] _CCCL_API _CUDA_VSTD::uint32_t __compute_hash_span(_CUDA_VSTD::span<_Key> __keys) const noexcept
  {
    auto __bytes      = _CUDA_VSTD::as_bytes(__keys).data();
    auto const __size = __keys.size_bytes();

    size_t __offset            = 0; // byte offset of the next unconsumed input
    _CUDA_VSTD::uint32_t __h32 = {};

    // data can be processed in 16-byte chunks
    if (__size >= 16)
    {
      // __limit is the last byte offset at which a full 16-byte chunk still fits
      auto const __limit = __size - 16;
      _CUDA_VSTD::array<_CUDA_VSTD::uint32_t, 4> __v;

      // the four XXH32 accumulator lanes with their specified seeds
      __v[0] = __seed_ + __prime1 + __prime2;
      __v[1] = __seed_ + __prime2;
      __v[2] = __seed_;
      __v[3] = __seed_ - __prime1;

      for (; __offset <= __limit; __offset += 16)
      {
        // pipeline 4*4byte computations
        auto const __pipeline_offset = __offset / 4;
        cuda::static_for<4>([&](auto i) {
          __v[i] += __load_chunk<_CUDA_VSTD::uint32_t>(__bytes, __pipeline_offset + i) * __prime2;
          __v[i] = _CUDA_VSTD::rotl(__v[i], 13);
          __v[i] *= __prime1;
        });
      }

      // converge the four lanes into a single 32-bit state
      __h32 = _CUDA_VSTD::rotl(__v[0], 1) + _CUDA_VSTD::rotl(__v[1], 7) + _CUDA_VSTD::rotl(__v[2], 12)
            + _CUDA_VSTD::rotl(__v[3], 18);
    }
    else
    {
      __h32 = __seed_ + __prime5;
    }

    // mix in the input length (truncated to 32 bits, as XXH32 specifies)
    __h32 += __size;

    // remaining data can be processed in 4-byte chunks
    // (for __size < 16, __size % 16 == __size, so this also covers short inputs of 4..15 bytes)
    if ((__size % 16) >= 4)
    {
      for (; __offset <= __size - 4; __offset += 4)
      {
        __h32 += __load_chunk<_CUDA_VSTD::uint32_t>(__bytes, __offset / 4) * __prime3;
        __h32 = _CUDA_VSTD::rotl(__h32, 17) * __prime4;
      }
    }

    // the following loop is only needed if the size of the key is not a multiple of the block size
    if (__size % 4)
    {
      while (__offset < __size)
      {
        __h32 += (::cuda::std::to_integer<_CUDA_VSTD::uint32_t>(__bytes[__offset]) & 255) * __prime5;
        __h32 = _CUDA_VSTD::rotl(__h32, 11) * __prime1;
        ++__offset;
      }
    }

    return __finalize(__h32);
  }

  //! @brief Final XXH32 avalanche: mixes the state so every input bit affects every output bit.
  //! @param __h The accumulated 32-bit state
  //! @return The finished hash value
  [[nodiscard]] _CCCL_API constexpr _CUDA_VSTD::uint32_t __finalize(_CUDA_VSTD::uint32_t __h) const noexcept
  {
    __h ^= __h >> 15;
    __h *= __prime2;
    __h ^= __h >> 13;
    __h *= __prime3;
    __h ^= __h >> 16;
    return __h;
  }

  // Per-instance seed mixed into the initial state; defaults to 0.
  _CUDA_VSTD::uint32_t __seed_;
};
306+
307+
} // namespace cuda::experimental::cuco::__detail
308+
309+
#include <cuda/std/__cccl/epilogue.h>
310+
311+
#endif // _CUDAX__CUCO_DETAIL_HASH_FUNCTIONS_XXHASH_CUH

0 commit comments

Comments
 (0)