Skip to content

Commit 669c84b

Browse files
committed
Resolve conflicts
2 parents fbe6249 + 6bbaf67 commit 669c84b

25 files changed

+1128
-190
lines changed
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
{
2+
"shutdownAction": "stopContainer",
3+
"image": "rapidsai/devcontainers:25.12-cpp-gcc13-cuda13.0-ubuntu24.04",
4+
"hostRequirements": {
5+
"gpu": true
6+
},
7+
"initializeCommand": [
8+
"/bin/bash",
9+
"-c",
10+
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
11+
],
12+
"containerEnv": {
13+
"SCCACHE_REGION": "us-east-2",
14+
"SCCACHE_BUCKET": "rapids-sccache-devs",
15+
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
16+
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
17+
"DEVCONTAINER_NAME": "cuda13.0-gcc13",
18+
"CUCO_CUDA_VERSION": "13.0",
19+
"CUCO_HOST_COMPILER": "gcc",
20+
"CUCO_HOST_COMPILER_VERSION": "13"
21+
},
22+
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
23+
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
24+
"mounts": [
25+
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
26+
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
27+
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
28+
],
29+
"customizations": {
30+
"vscode": {
31+
"extensions": [
32+
"llvm-vs-code-extensions.vscode-clangd"
33+
],
34+
"settings": {
35+
"clangd.arguments": [
36+
"--compile-commands-dir=${workspaceFolder}/build/latest"
37+
]
38+
}
39+
}
40+
},
41+
"name": "cuda13.0-gcc13"
42+
}
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
{
2+
"shutdownAction": "stopContainer",
3+
"image": "rapidsai/devcontainers:25.12-cpp-llvm20-cuda13.0ext-ubuntu24.04",
4+
"hostRequirements": {
5+
"gpu": true
6+
},
7+
"initializeCommand": [
8+
"/bin/bash",
9+
"-c",
10+
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
11+
],
12+
"containerEnv": {
13+
"SCCACHE_REGION": "us-east-2",
14+
"SCCACHE_BUCKET": "rapids-sccache-devs",
15+
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
16+
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
17+
"DEVCONTAINER_NAME": "cuda13.0-llvm20",
18+
"CUCO_CUDA_VERSION": "13.0",
19+
"CUCO_HOST_COMPILER": "llvm",
20+
"CUCO_HOST_COMPILER_VERSION": "20"
21+
},
22+
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
23+
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
24+
"mounts": [
25+
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
26+
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
27+
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
28+
],
29+
"customizations": {
30+
"vscode": {
31+
"extensions": [
32+
"llvm-vs-code-extensions.vscode-clangd"
33+
],
34+
"settings": {
35+
"clangd.arguments": [
36+
"--compile-commands-dir=${workspaceFolder}/build/latest"
37+
]
38+
}
39+
}
40+
},
41+
"name": "cuda13.0-llvm20"
42+
}

.devcontainer/devcontainer.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"shutdownAction": "stopContainer",
3-
"image": "rapidsai/devcontainers:25.12-cpp-gcc13-cuda12.9-ubuntu24.04",
3+
"image": "rapidsai/devcontainers:25.12-cpp-gcc13-cuda13.0-ubuntu24.04",
44
"hostRequirements": {
55
"gpu": true
66
},
@@ -14,8 +14,8 @@
1414
"SCCACHE_BUCKET": "rapids-sccache-devs",
1515
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
1616
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
17-
"DEVCONTAINER_NAME": "cuda12.9-gcc13",
18-
"CUCO_CUDA_VERSION": "12.9",
17+
"DEVCONTAINER_NAME": "cuda13.0-gcc13",
18+
"CUCO_CUDA_VERSION": "13.0",
1919
"CUCO_HOST_COMPILER": "gcc",
2020
"CUCO_HOST_COMPILER_VERSION": "13"
2121
},
@@ -38,5 +38,5 @@
3838
}
3939
}
4040
},
41-
"name": "cuda12.9-gcc13"
41+
"name": "cuda13.0-gcc13"
4242
}

.devcontainer/make_devcontainers.sh

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,12 @@ update_devcontainer() {
4545
local devcontainer_version="$8"
4646

4747
local IMAGE_ROOT="rapidsai/devcontainers:${devcontainer_version}-cpp-"
48-
local image="${IMAGE_ROOT}${compiler_name}${compiler_version}-cuda${cuda_version}-${os}"
48+
# Add 'ext' suffix only for LLVM compilers with CUDA 13.0
49+
local cuda_suffix=""
50+
if [[ "$cuda_version" == "13.0" && "$compiler_name" == "llvm" ]]; then
51+
cuda_suffix="ext"
52+
fi
53+
local image="${IMAGE_ROOT}${compiler_name}${compiler_version}-cuda${cuda_version}${cuda_suffix}-${os}"
4954

5055
jq --arg image "$image" --arg name "$name" \
5156
--arg cuda_version "$cuda_version" --arg compiler_name "$compiler_name" \

CMakeLists.txt

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
1818
set(rapids-cmake-version 25.12)
1919
if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake)
2020
file(DOWNLOAD
21-
https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${rapids-cmake-version}/RAPIDS.cmake
21+
https://raw.githubusercontent.com/rapidsai/rapids-cmake/release/${rapids-cmake-version}/RAPIDS.cmake
2222
${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake)
2323
endif()
2424
include(${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake)
@@ -116,6 +116,13 @@ function(cuco_set_common_compile_options target_name)
116116
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
117117
target_compile_options(${target_name} PRIVATE -Xcompiler -Wno-subobject-linkage)
118118
endif()
119+
120+
# Add Clang-specific warning suppression for deprecated literal operators
121+
# (Catch2 and cuco code still use deprecated syntax)
122+
# Only for Clang 15+ which introduced this warning
123+
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 15.0)
124+
target_compile_options(${target_name} PRIVATE -Xcompiler -Wno-deprecated-literal-operator)
125+
endif()
119126
endfunction()
120127

121128
###################################################################################################

benchmarks/benchmark_utils.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,8 @@ template <class OutputIt>
5959
struct lazy_discard {
6060
OutputIt it;
6161

62-
using index_type = typename thrust::iterator_traits<OutputIt>::difference_type;
63-
using value_type = typename thrust::iterator_traits<OutputIt>::value_type;
62+
using index_type = typename cuda::std::iterator_traits<OutputIt>::difference_type;
63+
using value_type = typename cuda::std::iterator_traits<OutputIt>::value_type;
6464

6565
__device__ void device_dispatch(index_type index, value_type const& value) const
6666
{

ci/matrix.yml

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16-
cuda_oldest: &cuda_oldest '12.0'
17-
cuda_newest: &cuda_newest '12.9'
16+
cuda_12_0: &cuda_12_0 '12.0'
17+
cuda_12_9: &cuda_12_9 '12.9'
18+
cuda_13_0: &cuda_13_0 '13.0'
1819

1920
# The GPUs to test on
2021
# Note: This assumes that the appropriate gpu_build_archs are set to include building for the GPUs listed here
@@ -42,8 +43,11 @@ devcontainer_version: '25.12'
4243
# Configurations that will run for every PR
4344
pull_request:
4445
nvcc:
45-
- {cuda: *cuda_oldest, os: 'ubuntu22.04', cpu: 'amd64', compiler: {name: 'gcc', version: '11', exe: 'g++'}, gpu_build_archs: '70', std: [17], jobs: ['build', 'test']}
46-
- {cuda: *cuda_newest, os: 'ubuntu24.04', cpu: 'amd64', compiler: {name: 'gcc', version: '13', exe: 'g++'}, gpu_build_archs: '70', std: [17], jobs: ['build', 'test']}
47-
- {cuda: *cuda_newest, os: 'ubuntu24.04', cpu: 'amd64', compiler: {name: 'gcc', version: '13', exe: 'g++'}, gpu_build_archs: '80,90', std: [17], jobs: ['build']}
48-
- {cuda: *cuda_oldest, os: 'ubuntu20.04', cpu: 'amd64', compiler: {name: 'llvm', version: '14', exe: 'clang++'}, gpu_build_archs: '70', std: [17], jobs: ['build']}
49-
- {cuda: *cuda_newest, os: 'ubuntu22.04', cpu: 'amd64', compiler: {name: 'llvm', version: '18', exe: 'clang++'}, gpu_build_archs: '90', std: [17], jobs: ['build']}
46+
- {cuda: *cuda_12_0, os: 'ubuntu22.04', cpu: 'amd64', compiler: {name: 'gcc', version: '11', exe: 'g++'}, gpu_build_archs: '70', std: [17], jobs: ['build', 'test']}
47+
- {cuda: *cuda_12_9, os: 'ubuntu24.04', cpu: 'amd64', compiler: {name: 'gcc', version: '13', exe: 'g++'}, gpu_build_archs: '70', std: [17], jobs: ['build', 'test']}
48+
- {cuda: *cuda_12_9, os: 'ubuntu24.04', cpu: 'amd64', compiler: {name: 'gcc', version: '13', exe: 'g++'}, gpu_build_archs: '80,90', std: [17], jobs: ['build']}
49+
- {cuda: *cuda_12_0, os: 'ubuntu20.04', cpu: 'amd64', compiler: {name: 'llvm', version: '14', exe: 'clang++'}, gpu_build_archs: '70', std: [17], jobs: ['build']}
50+
- {cuda: *cuda_12_9, os: 'ubuntu22.04', cpu: 'amd64', compiler: {name: 'llvm', version: '18', exe: 'clang++'}, gpu_build_archs: '90', std: [17], jobs: ['build']}
51+
- {cuda: *cuda_13_0, os: 'ubuntu24.04', cpu: 'amd64', compiler: {name: 'gcc', version: '13', exe: 'g++'}, gpu_build_archs: '80', std: [17], jobs: ['build']}
52+
- {cuda: *cuda_13_0, os: 'ubuntu24.04', cpu: 'amd64', compiler: {name: 'gcc', version: '13', exe: 'g++'}, gpu_build_archs: '80,90', std: [17], jobs: ['build']}
53+
- {cuda: *cuda_13_0, os: 'ubuntu24.04', cpu: 'amd64', compiler: {name: 'llvm', version: '20', exe: 'clang++'}, gpu_build_archs: '90', std: [17], jobs: ['build']}

cmake/header_testing.cmake

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -63,16 +63,9 @@ function(cuco_add_header_tests)
6363
# Create executable test for this specific header
6464
add_executable(${headertest_target} ${header_src})
6565
target_link_libraries(${headertest_target} PRIVATE cuco::cuco CUDA::cudart)
66-
67-
target_compile_options(${headertest_target} PRIVATE
68-
$<$<COMPILE_LANGUAGE:CUDA>:--expt-extended-lambda>
69-
--compiler-options=-Wall --compiler-options=-Wextra
70-
--compiler-options=-Werror -Wno-deprecated-gpu-targets
71-
)
72-
73-
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
74-
target_compile_options(${headertest_target} PRIVATE -Xcompiler -Wno-subobject-linkage)
75-
endif()
66+
67+
# Use common compile options (includes all compiler-specific warning suppressions)
68+
cuco_set_common_compile_options(${headertest_target})
7669

7770
set_target_properties(${headertest_target} PROPERTIES
7871
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests/headers"

doxygen/Doxyfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1508,7 +1508,7 @@ FORMULA_MACROFILE =
15081508
# The default value is: NO.
15091509
# This tag requires that the tag GENERATE_HTML is set to YES.
15101510

1511-
USE_MATHJAX = NO
1511+
USE_MATHJAX = YES
15121512

15131513
# When MathJax is enabled you can set the default output format to be used for
15141514
# the MathJax output. See the MathJax site (see:

include/cuco/bloom_filter.cuh

Lines changed: 92 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ namespace cuco {
4242
* - Host-side "bulk" operations
4343
* - Device-side "singular" operations
4444
*
45-
* The host-side bulk operations include `add`, `contains`, etc. These APIs should be used when
45+
* The host-side bulk operations include add(), contains(), etc. These APIs should be used when
4646
* there are a large number of keys to add or lookup. For example, given a range of keys
4747
* specified by device-accessible iterators, the bulk `add` function will add all keys into
4848
* the filter.
@@ -124,7 +124,7 @@ class bloom_filter {
124124
* @brief Erases all information from the filter.
125125
*
126126
* @note This function synchronizes the given stream. For asynchronous execution use
127-
* `clear_async`.
127+
* clear_async().
128128
*
129129
* @param stream CUDA stream used for device memory operations and kernel launches
130130
*/
@@ -142,7 +142,7 @@ class bloom_filter {
142142
* @brief Adds all keys in the range `[first, last)` to the filter.
143143
*
144144
* @note This function synchronizes the given stream. For asynchronous execution use
145-
* `add_async`.
145+
* add_async().
146146
*
147147
* @tparam InputIt Device-accessible random access input key iterator
148148
* @param first Beginning of the sequence of keys
@@ -173,7 +173,7 @@ class bloom_filter {
173173
*
174174
* @note The key `*(first + i)` is added if `pred( *(stencil + i) )` returns `true`.
175175
* @note This function synchronizes the given stream and returns the number of successful
176-
* insertions. For asynchronous execution use `add_if_async`.
176+
* insertions. For asynchronous execution use add_if_async().
177177
*
178178
* @tparam InputIt Device-accessible random access input key iterator
179179
* @tparam StencilIt Device-accessible random-access iterator whose `value_type` is
@@ -227,7 +227,7 @@ class bloom_filter {
227227
* filter.
228228
*
229229
* @note This function synchronizes the given stream. For asynchronous execution use
230-
* `contains_async`.
230+
* contains_async().
231231
*
232232
* @tparam InputIt Device-accessible random access input key iterator
233233
* @tparam OutputIt Device-accessible output iterator assignable from `bool`
@@ -269,7 +269,7 @@ class bloom_filter {
269269
*
270270
* @note The key `*(first + i)` is queried if `pred( *(stencil + i) )` returns `true`.
271271
* @note This function synchronizes the given stream. For asynchronous execution use
272-
* `contains_if_async`.
272+
* contains_if_async().
273273
*
274274
* @tparam InputIt Device-accessible random access input key iterator
275275
* @tparam StencilIt Device-accessible random-access iterator whose `value_type` is
@@ -325,6 +325,91 @@ class bloom_filter {
325325
cuda::stream_ref stream = cuda::stream_ref{
326326
cudaStream_t{nullptr}}) const noexcept;
327327

328+
/**
329+
* @brief Merge another bloom filter into this.
330+
*
331+
* @note Modifies `this` in place.
332+
* @note This function synchronizes the given stream. For asynchronous execution use
333+
* merge_async().
334+
*
335+
* @note This performs the set union of the two filters. Let \f$f : X \to B\f$ denote the
336+
* construction of a bloom filter on some set \f$X\f$, and let \f$A\f$ and \f$B\f$ be two sets,
337+
* then it holds that \f$f(A \cup B) = f(A) \cup f(B)\f$.
338+
*
339+
* @param other Other filter with matching type to this. The policy object must be equal to that
340+
* of this filter, otherwise behavior is undefined.
341+
* @param stream CUDA stream used for device memory operations and kernel launches.
342+
*
343+
* @throws cuco::logic_error If the other filter does not have the same number of blocks as this.
344+
*/
345+
__host__ constexpr void merge(bloom_filter<Key, Extent, Scope, Policy, Allocator> const& other,
346+
cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
347+
348+
/**
349+
* @brief Asynchronously merge another bloom filter into this.
350+
*
351+
* @note Modifies `this` in place.
352+
*
353+
* @note This performs the set union of the two filters. Let \f$f : X \to B\f$ denote the
354+
* construction of a bloom filter on some set \f$X\f$, and let \f$A\f$ and \f$B\f$ be two sets,
355+
* then it holds that \f$f(A \cup B) = f(A) \cup f(B)\f$.
356+
*
357+
* @param other Other filter with matching type to this. The policy object must be equal to that
358+
* of this filter, otherwise behavior is undefined.
359+
* @param stream CUDA stream used for device memory operations and kernel launches.
360+
*
361+
* @throws cuco::logic_error If the other filter does not have the same number of blocks as this.
362+
*/
363+
__host__ constexpr void merge_async(
364+
bloom_filter<Key, Extent, Scope, Policy, Allocator> const& other,
365+
cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
366+
367+
/**
368+
* @brief Intersect another bloom filter into this.
369+
*
370+
* @note Modifies `this` in place.
371+
* @note This function synchronizes the given stream. For asynchronous execution use
372+
* intersect_async().
373+
*
374+
* @note This performs the set intersection of the two filters. Unlike merge(), this operation
375+
* does not distribute over filter construction and therefore only approximates the bloom filter
376+
* of the intersection of the input sets. In other words, let \f$f : X \to B\f$ denote the
377+
* construction of a bloom filter on some set \f$X\f$, and let \f$A\f$ and \f$B\f$ be two sets,
378+
* then \f$f(A \cap B) \ne f(A) \cap f(B)\f$. Despite this, it is guaranteed that for all \f$x \in
379+
* (A \cap B)\f$, it holds \f$x \in f(A) \cap f(B)\f$.
380+
*
381+
* @param other Other filter with matching type to this. The policy object must be equal to that
382+
* of this filter, otherwise behavior is undefined.
383+
* @param stream CUDA stream used for device memory operations and kernel launches.
384+
*
385+
* @throws cuco::logic_error If the other filter does not have the same number of blocks as this.
386+
*/
387+
__host__ constexpr void intersect(
388+
bloom_filter<Key, Extent, Scope, Policy, Allocator> const& other,
389+
cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
390+
391+
/**
392+
* @brief Asynchronously intersect another bloom filter into this.
393+
*
394+
* @note Modifies `this` in place.
395+
*
396+
* @note This performs the set intersection of the two filters. Unlike merge_async(), this
397+
* operation does not distribute over filter construction and therefore only approximates the
398+
* bloom filter of the intersection of the input sets. In other words, let \f$f : X \to B\f$
399+
* denote the construction of a bloom filter on some set \f$X\f$, and let \f$A\f$ and \f$B\f$ be
400+
* two sets, then \f$f(A \cap B) \ne f(A) \cap f(B)\f$. Despite this, it is guaranteed that for
401+
* all \f$x \in (A \cap B)\f$, it holds \f$x \in f(A) \cap f(B)\f$.
402+
*
403+
* @param other Other filter with matching type to this. The policy object must be equal to that
404+
* of this filter, otherwise behavior is undefined.
405+
* @param stream CUDA stream used for device memory operations and kernel launches.
406+
*
407+
* @throws cuco::logic_error If the other filter does not have the same number of blocks as this.
408+
*/
409+
__host__ constexpr void intersect_async(
410+
bloom_filter<Key, Extent, Scope, Policy, Allocator> const& other,
411+
cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
412+
328413
/**
329414
* @brief Gets a pointer to the underlying filter storage.
330415
*
@@ -369,4 +454,4 @@ class bloom_filter {
369454
};
370455
} // namespace cuco
371456

372-
#include <cuco/detail/bloom_filter/bloom_filter.inl>
457+
#include <cuco/detail/bloom_filter/bloom_filter.inl>

0 commit comments

Comments
 (0)