Skip to content

Commit c3bf1dd

Browse files
committed
Merge remote-tracking branch 'upstream/dev' into hll-add-if
2 parents 3f68e2e + c5b9c66 commit c3bf1dd

File tree

8 files changed

+501
-20
lines changed

8 files changed

+501
-20
lines changed

doxygen/Doxyfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1508,7 +1508,7 @@ FORMULA_MACROFILE =
15081508
# The default value is: NO.
15091509
# This tag requires that the tag GENERATE_HTML is set to YES.
15101510

1511-
USE_MATHJAX = NO
1511+
USE_MATHJAX = YES
15121512

15131513
# When MathJax is enabled you can set the default output format to be used for
15141514
# the MathJax output. See the MathJax site (see:

include/cuco/bloom_filter.cuh

Lines changed: 92 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ namespace cuco {
4242
* - Host-side "bulk" operations
4343
* - Device-side "singular" operations
4444
*
45-
* The host-side bulk operations include `add`, `contains`, etc. These APIs should be used when
45+
* The host-side bulk operations include add(), contains(), etc. These APIs should be used when
4646
* there are a large number of keys to add or lookup. For example, given a range of keys
4747
* specified by device-accessible iterators, the bulk `add` function will add all keys into
4848
* the filter.
@@ -124,7 +124,7 @@ class bloom_filter {
124124
* @brief Erases all information from the filter.
125125
*
126126
* @note This function synchronizes the given stream. For asynchronous execution use
127-
* `clear_async`.
127+
* clear_async().
128128
*
129129
* @param stream CUDA stream used for device memory operations and kernel launches
130130
*/
@@ -142,7 +142,7 @@ class bloom_filter {
142142
* @brief Adds all keys in the range `[first, last)` to the filter.
143143
*
144144
* @note This function synchronizes the given stream. For asynchronous execution use
145-
* `add_async`.
145+
* add_async().
146146
*
147147
* @tparam InputIt Device-accessible random access input key iterator
148148
* @param first Beginning of the sequence of keys
@@ -173,7 +173,7 @@ class bloom_filter {
173173
*
174174
* @note The key `*(first + i)` is added if `pred( *(stencil + i) )` returns `true`.
175175
* @note This function synchronizes the given stream and returns the number of successful
176-
* insertions. For asynchronous execution use `add_if_async`.
176+
* insertions. For asynchronous execution use add_if_async().
177177
*
178178
* @tparam InputIt Device-accessible random access input key iterator
179179
* @tparam StencilIt Device-accessible random-access iterator whose `value_type` is
@@ -227,7 +227,7 @@ class bloom_filter {
227227
* filter.
228228
*
229229
* @note This function synchronizes the given stream. For asynchronous execution use
230-
* `contains_async`.
230+
* contains_async().
231231
*
232232
* @tparam InputIt Device-accessible random access input key iterator
233233
* @tparam OutputIt Device-accessible output iterator assignable from `bool`
@@ -269,7 +269,7 @@ class bloom_filter {
269269
*
270270
* @note The key `*(first + i)` is queried if `pred( *(stencil + i) )` returns `true`.
271271
* @note This function synchronizes the given stream. For asynchronous execution use
272-
* `contains_if_async`.
272+
* contains_if_async().
273273
*
274274
* @tparam InputIt Device-accessible random access input key iterator
275275
* @tparam StencilIt Device-accessible random-access iterator whose `value_type` is
@@ -325,6 +325,91 @@ class bloom_filter {
325325
cuda::stream_ref stream = cuda::stream_ref{
326326
cudaStream_t{nullptr}}) const noexcept;
327327

328+
/**
329+
* @brief Merge another bloom filter into this.
330+
*
331+
* @note Modifies `this` in place.
332+
* @note This function synchronizes the given stream. For asynchronous execution use
333+
* merge_async().
334+
*
335+
* @note This performs the set union of the two filters. Let \f$f : X \to B\f$ denote the
336+
* construction of a bloom filter on some set \f$X\f$, and let \f$A\f$ and \f$B\f$ be two sets,
337+
* then it holds that \f$f(A \cup B) = f(A) \cup f(B)\f$.
338+
*
339+
* @param other Other filter with matching type to this. The policy object must be equal to that
340+
* of this filter, otherwise behavior is undefined.
341+
* @param stream CUDA stream used for device memory operations and kernel launches.
342+
*
343+
* @throws cuco::logic_error If the other filter does not have the same number of blocks as this.
344+
*/
345+
__host__ constexpr void merge(bloom_filter<Key, Extent, Scope, Policy, Allocator> const& other,
346+
cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
347+
348+
/**
349+
* @brief Asynchronously merge another bloom filter into this.
350+
*
351+
* @note Modifies `this` in place.
352+
*
353+
* @note This performs the set union of the two filters. Let \f$f : X \to B\f$ denote the
354+
* construction of a bloom filter on some set \f$X\f$, and let \f$A\f$ and \f$B\f$ be two sets,
355+
* then it holds that \f$f(A \cup B) = f(A) \cup f(B)\f$.
356+
*
357+
* @param other Other filter with matching type to this. The policy object must be equal to that
358+
* of this filter, otherwise behavior is undefined.
359+
* @param stream CUDA stream used for device memory operations and kernel launches.
360+
*
361+
* @throws cuco::logic_error If the other filter does not have the same number of blocks as this.
362+
*/
363+
__host__ constexpr void merge_async(
364+
bloom_filter<Key, Extent, Scope, Policy, Allocator> const& other,
365+
cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
366+
367+
/**
368+
* @brief Intersect another bloom filter into this.
369+
*
370+
* @note Modifies `this` in place.
371+
* @note This function synchronizes the given stream. For asynchronous execution use
372+
* intersect_async().
373+
*
374+
* @note This performs the set intersection of the two filters. Unlike merge(), this operation
375+
* does not distribute over filter construction and therefore only approximates the bloom filter
376+
* of the intersection of the input sets. In other words, let \f$f : X \to B\f$ denote the
377+
* construction of a bloom filter on some set \f$X\f$, and let \f$A\f$ and \f$B\f$ be two sets,
378+
* then \f$f(A \cap B) \ne f(A) \cap f(B)\f$. Despite this, it is guaranteed that for all \f$x \in
379+
* (A \cap B)\f$, it holds \f$x \in f(A) \cap f(B)\f$.
380+
*
381+
* @param other Other filter with matching type to this. The policy object must be equal to that
382+
* of this filter, otherwise behavior is undefined.
383+
* @param stream CUDA stream used for device memory operations and kernel launches.
384+
*
385+
* @throws cuco::logic_error If the other filter does not have the same number of blocks as this.
386+
*/
387+
__host__ constexpr void intersect(
388+
bloom_filter<Key, Extent, Scope, Policy, Allocator> const& other,
389+
cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
390+
391+
/**
392+
* @brief Asynchronously intersect another bloom filter into this.
393+
*
394+
* @note Modifies `this` in place.
395+
*
396+
* @note This performs the set intersection of the two filters. Unlike merge_async(), this
397+
* operation does not distribute over filter construction and therefore only approximates the
398+
* bloom filter of the intersection of the input sets. In other words, let \f$f : X \to B\f$
399+
* denote the construction of a bloom filter on some set \f$X\f$, and let \f$A\f$ and \f$B\f$ be
400+
* two sets, then \f$f(A \cap B) \ne f(A) \cap f(B)\f$. Despite this, it is guaranteed that for
401+
* all \f$x \in (A \cap B)\f$, it holds \f$x \in f(A) \cap f(B)\f$.
402+
*
403+
* @param other Other filter with matching type to this. The policy object must be equal to that
404+
* of this filter, otherwise behavior is undefined.
405+
* @param stream CUDA stream used for device memory operations and kernel launches.
406+
*
407+
* @throws cuco::logic_error If the other filter does not have the same number of blocks as this.
408+
*/
409+
__host__ constexpr void intersect_async(
410+
bloom_filter<Key, Extent, Scope, Policy, Allocator> const& other,
411+
cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
412+
328413
/**
329414
* @brief Gets a pointer to the underlying filter storage.
330415
*
@@ -369,4 +454,4 @@ class bloom_filter {
369454
};
370455
} // namespace cuco
371456

372-
#include <cuco/detail/bloom_filter/bloom_filter.inl>
457+
#include <cuco/detail/bloom_filter/bloom_filter.inl>

include/cuco/bloom_filter_ref.cuh

Lines changed: 94 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ class bloom_filter_ref {
9797
* @brief Erases all information from the filter.
9898
*
9999
* @note This function synchronizes the given stream. For asynchronous execution use
100-
* `clear_async`.
100+
* clear_async().
101101
*
102102
* @param stream CUDA stream used for device memory operations and kernel launches
103103
*/
@@ -114,7 +114,7 @@ class bloom_filter_ref {
114114
/**
115115
* @brief Device function that adds a key to the filter.
116116
*
117-
* @tparam ProbeKey Input type that is implicitly convertible to `key_type`
117+
* @tparam ProbeKey Input type that is implicitly convertible to @ref key_type
118118
*
119119
* @param key The key to be added
120120
*/
@@ -124,7 +124,7 @@ class bloom_filter_ref {
124124
/**
125125
* @brief Device function that cooperatively adds a key to the filter.
126126
*
127-
* @note Best performance is achieved if the size of the CG is equal to `words_per_block`.
127+
* @note Best performance is achieved if the size of the CG is equal to @ref words_per_block.
128128
*
129129
* @tparam CG Cooperative Group type
130130
* @tparam ProbeKey Input key type
@@ -139,7 +139,7 @@ class bloom_filter_ref {
139139
* @brief Device function that adds all keys in the range `[first, last)` to the filter.
140140
*
141141
* @note Best performance is achieved if the size of the CG is larger than or equal to
142-
* `words_per_block`.
142+
* @ref words_per_block.
143143
*
144144
* @tparam CG Cooperative Group type
145145
* @tparam InputIt Device-accessible random access input key iterator
@@ -155,7 +155,7 @@ class bloom_filter_ref {
155155
* @brief Adds all keys in the range `[first, last)` to the filter.
156156
*
157157
* @note This function synchronizes the given stream. For asynchronous execution use
158-
* `add_async`.
158+
* add_async().
159159
*
160160
* @tparam InputIt Device-accessible random access input key iterator
161161
*
@@ -187,7 +187,7 @@ class bloom_filter_ref {
187187
*
188188
* @note The key `*(first + i)` is added if `pred( *(stencil + i) )` returns `true`.
189189
* @note This function synchronizes the given stream and returns the number of successful
190-
* insertions. For asynchronous execution use `add_if_async`.
190+
* insertions. For asynchronous execution use add_if_async().
191191
*
192192
* @tparam InputIt Device-accessible random access input key iterator
193193
* @tparam StencilIt Device-accessible random-access iterator whose `value_type` is
@@ -275,7 +275,7 @@ class bloom_filter_ref {
275275
* filter.
276276
*
277277
* @note This function synchronizes the given stream. For asynchronous execution use
278-
* `contains_async`.
278+
* contains_async().
279279
*
280280
* @tparam InputIt Device-accessible random access input iterator where
281281
* <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
@@ -321,7 +321,7 @@ class bloom_filter_ref {
321321
*
322322
* @note The key `*(first + i)` is queried if `pred( *(stencil + i) )` returns `true`.
323323
* @note This function synchronizes the given stream. For asynchronous execution use
324-
* `contains_if_async`.
324+
* contains_if_async().
325325
*
326326
* @tparam InputIt Device-accessible random access input iterator where
327327
* <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
@@ -381,6 +381,91 @@ class bloom_filter_ref {
381381
cuda::stream_ref stream = cuda::stream_ref{
382382
cudaStream_t{nullptr}}) const noexcept;
383383

384+
/**
385+
* @brief Merge another bloom filter into this.
386+
*
387+
* @note Modifies `this` in place.
388+
* @note This function synchronizes the given stream. For asynchronous execution use
389+
* merge_async().
390+
*
391+
* @note This performs the set union of the two filters. Let \f$f : X \to B\f$ denote the
392+
* construction of a bloom filter on some set \f$X\f$, and let \f$A\f$ and \f$B\f$ be two sets,
393+
* then it holds that \f$f(A \cup B) = f(A) \cup f(B)\f$.
394+
*
395+
* @param other Other filter with matching type to this. The policy object must be equal to that
396+
* of this filter, otherwise behavior is undefined.
397+
* @param stream CUDA stream used for device memory operations and kernel launches.
398+
*
399+
* @throws cuco::logic_error If the other filter does not have the same number of blocks as this.
400+
*/
401+
__host__ constexpr void merge(bloom_filter_ref<Key, Extent, Scope, Policy> const& other,
402+
cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
403+
404+
/**
405+
* @brief Asynchronously merge another bloom filter into this.
406+
*
407+
* @note Modifies `this` in place.
408+
*
409+
* @note This performs the set union of the two filters. Let \f$f : X \to B\f$ denote the
410+
* construction of a bloom filter on some set \f$X\f$, and let \f$A\f$ and \f$B\f$ be two sets,
411+
* then it holds that \f$f(A \cup B) = f(A) \cup f(B)\f$.
412+
*
413+
* @param other Other filter with matching type to this. The policy object must be equal to that
414+
* of this filter, otherwise behavior is undefined.
415+
* @param stream CUDA stream used for device memory operations and kernel launches.
416+
*
417+
* @throws cuco::logic_error If the other filter does not have the same number of blocks as this.
418+
*/
419+
__host__ constexpr void merge_async(bloom_filter_ref<Key, Extent, Scope, Policy> const& other,
420+
cuda::stream_ref stream = cuda::stream_ref{
421+
cudaStream_t{nullptr}});
422+
423+
/**
424+
* @brief Intersect another bloom filter into this.
425+
*
426+
* @note Modifies `this` in place.
427+
* @note This function synchronizes the given stream. For asynchronous execution use
428+
* intersect_async().
429+
*
430+
* @note This performs the set intersection of the two filters. Unlike merge(), this operation
431+
* does not distribute over filter construction and therefore only approximates the bloom filter
432+
* of the intersection of the input sets. In other words, let \f$f : X \to B\f$ denote the
433+
* construction of a bloom filter on some set \f$X\f$, and let \f$A\f$ and \f$B\f$ be two sets,
434+
* then \f$f(A \cap B) \ne f(A) \cap f(B)\f$. Despite this, it is guaranteed that for all \f$x \in
435+
* (A \cap B)\f$, it holds \f$x \in f(A) \cap f(B)\f$.
436+
*
437+
* @param other Other filter with matching type to this. The policy object must be equal to that
438+
* of this filter, otherwise behavior is undefined.
439+
* @param stream CUDA stream used for device memory operations and kernel launches.
440+
*
441+
* @throws cuco::logic_error If the other filter does not have the same number of blocks as this.
442+
*/
443+
__host__ constexpr void intersect(bloom_filter_ref<Key, Extent, Scope, Policy> const& other,
444+
cuda::stream_ref stream = cuda::stream_ref{
445+
cudaStream_t{nullptr}});
446+
447+
/**
448+
* @brief Asynchronously intersect another bloom filter into this.
449+
*
450+
* @note Modifies `this` in place.
451+
*
452+
* @note This performs the set intersection of the two filters. Unlike merge_async(), this
453+
* operation does not distribute over filter construction and therefore only approximates the
454+
* bloom filter of the intersection of the input sets. In other words, let \f$f : X \to B\f$
455+
* denote the construction of a bloom filter on some set \f$X\f$, and let \f$A\f$ and \f$B\f$ be
456+
* two sets, then \f$f(A \cap B) \ne f(A) \cap f(B)\f$. Despite this, it is guaranteed that for
457+
* all \f$x \in (A \cap B)\f$, it holds \f$x \in f(A) \cap f(B)\f$.
458+
*
459+
* @param other Other filter with matching type to this. The policy object must be equal to that
460+
* of this filter, otherwise behavior is undefined.
461+
* @param stream CUDA stream used for device memory operations and kernel launches.
462+
*
463+
* @throws cuco::logic_error If the other filter does not have the same number of blocks as this.
464+
*/
465+
__host__ constexpr void intersect_async(bloom_filter_ref<Key, Extent, Scope, Policy> const& other,
466+
cuda::stream_ref stream = cuda::stream_ref{
467+
cudaStream_t{nullptr}});
468+
384469
/**
385470
* @brief Gets a pointer to the underlying filter storage.
386471
*
@@ -407,4 +492,4 @@ class bloom_filter_ref {
407492
};
408493
} // namespace cuco
409494

410-
#include <cuco/detail/bloom_filter/bloom_filter_ref.inl>
495+
#include <cuco/detail/bloom_filter/bloom_filter_ref.inl>

include/cuco/detail/bloom_filter/bloom_filter.inl

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,34 @@ __host__ constexpr void bloom_filter<Key, Extent, Scope, Policy, Allocator>::con
129129
ref_.contains_if_async(first, last, stencil, pred, output_begin, stream);
130130
}
131131

132+
template <class Key, class Extent, cuda::thread_scope Scope, class Policy, class Allocator>
133+
__host__ constexpr void bloom_filter<Key, Extent, Scope, Policy, Allocator>::merge(
134+
bloom_filter<Key, Extent, Scope, Policy, Allocator> const& other, cuda::stream_ref stream)
135+
{
136+
ref_.merge(other.ref_, stream);
137+
}
138+
139+
template <class Key, class Extent, cuda::thread_scope Scope, class Policy, class Allocator>
140+
__host__ constexpr void bloom_filter<Key, Extent, Scope, Policy, Allocator>::merge_async(
141+
bloom_filter<Key, Extent, Scope, Policy, Allocator> const& other, cuda::stream_ref stream)
142+
{
143+
ref_.merge_async(other.ref_, stream);
144+
}
145+
146+
template <class Key, class Extent, cuda::thread_scope Scope, class Policy, class Allocator>
147+
__host__ constexpr void bloom_filter<Key, Extent, Scope, Policy, Allocator>::intersect(
148+
bloom_filter<Key, Extent, Scope, Policy, Allocator> const& other, cuda::stream_ref stream)
149+
{
150+
ref_.intersect(other.ref_, stream);
151+
}
152+
153+
template <class Key, class Extent, cuda::thread_scope Scope, class Policy, class Allocator>
154+
__host__ constexpr void bloom_filter<Key, Extent, Scope, Policy, Allocator>::intersect_async(
155+
bloom_filter<Key, Extent, Scope, Policy, Allocator> const& other, cuda::stream_ref stream)
156+
{
157+
ref_.intersect_async(other.ref_, stream);
158+
}
159+
132160
template <class Key, class Extent, cuda::thread_scope Scope, class Policy, class Allocator>
133161
[[nodiscard]] __host__ constexpr
134162
typename bloom_filter<Key, Extent, Scope, Policy, Allocator>::word_type*
@@ -169,4 +197,4 @@ template <class Key, class Extent, cuda::thread_scope Scope, class Policy, class
169197
return ref_;
170198
}
171199

172-
} // namespace cuco
200+
} // namespace cuco

0 commit comments

Comments
 (0)