@@ -415,77 +415,6 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first,
415415 }
416416}
417417
418- /* *
419- * @brief Retrieves the equivalent container elements of all keys in the range `[input_probe,
420- * input_probe + n)`.
421- *
422- * If key `k = *(input_probe + i)` has one or more matches in the container, copies `k` to
423- * `output_probe` and associated slot contents to `output_match`, respectively. The output order is
424- * unspecified.
425- *
426- * @tparam IsOuter Flag indicating whether it's an outer count or not
427- * @tparam block_size The size of the thread block
428- * @tparam InputProbeIt Device accessible input iterator
429- * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is
430- * convertible to the `InputProbeIt`'s `value_type`
431- * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is
432- * convertible to the container's `value_type`
433- * @tparam AtomicCounter Integral atomic type that follows the same semantics as
434- * `cuda::(std::)atomic(_ref)`
435- * @tparam Ref Type of non-owning device ref allowing access to storage
436- *
437- * @param input_probe Beginning of the sequence of input keys
438- * @param n Number of the keys to query
439- * @param output_probe Beginning of the sequence of keys corresponding to matching elements in
440- * `output_match`
441- * @param output_match Beginning of the sequence of matching elements
442- * @param atomic_counter Pointer to an atomic object of integral type that is used to count the
443- * number of output elements
444- * @param ref Non-owning container device ref used to access the slot storage
445- */
446- template <bool IsOuter,
447- int32_t BlockSize,
448- class InputProbeIt ,
449- class OutputProbeIt ,
450- class OutputMatchIt ,
451- class AtomicCounter ,
452- class Ref >
453- CUCO_KERNEL __launch_bounds__ (BlockSize) void retrieve(InputProbeIt input_probe,
454- cuco::detail::index_type n,
455- OutputProbeIt output_probe,
456- OutputMatchIt output_match,
457- AtomicCounter* atomic_counter,
458- Ref ref)
459- {
460- namespace cg = cooperative_groups;
461-
462- auto const block = cg::this_thread_block ();
463- auto constexpr tiles_in_block = BlockSize / Ref::cg_size;
464- // make sure all but the last block are always occupied
465- auto const items_per_block = detail::int_div_ceil (n, tiles_in_block * gridDim .x ) * tiles_in_block;
466-
467- auto const block_begin_offset = block.group_index ().x * items_per_block;
468- auto const block_end_offset = min (n, block_begin_offset + items_per_block);
469-
470- if (block_begin_offset < block_end_offset) {
471- if constexpr (IsOuter) {
472- ref.retrieve_outer <BlockSize>(block,
473- input_probe + block_begin_offset,
474- input_probe + block_end_offset,
475- output_probe,
476- output_match,
477- *atomic_counter);
478- } else {
479- ref.retrieve <BlockSize>(block,
480- input_probe + block_begin_offset,
481- input_probe + block_end_offset,
482- output_probe,
483- output_match,
484- *atomic_counter);
485- }
486- }
487- }
488-
489418/* *
490419 * @brief Inserts all elements in the range `[first, last)`.
491420 *
@@ -642,6 +571,76 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void count(InputIt first,
642571 if (threadIdx .x == 0 ) { count->fetch_add (block_count, cuda::std::memory_order_relaxed); }
643572}
644573
574+ /* *
575+ * @brief Retrieves the equivalent container elements of all keys in the range `[input_probe,
576+ * input_probe + n)`.
577+ *
578+ * If key `k = *(input_probe + i)` has one or more matches in the container, copies `k` to
579+ * `output_probe` and associated slot contents to `output_match`, respectively. The output order is
580+ * unspecified.
581+ *
582+ * @tparam IsOuter Flag indicating whether it's an outer count or not
583+ * @tparam block_size The size of the thread block
584+ * @tparam InputProbeIt Device accessible input iterator
585+ * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is
586+ * convertible to the `InputProbeIt`'s `value_type`
587+ * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is
588+ * convertible to the container's `value_type`
589+ * @tparam AtomicCounter Integral atomic type that follows the same semantics as
590+ * `cuda::(std::)atomic(_ref)`
591+ * @tparam Ref Type of non-owning device ref allowing access to storage
592+ *
593+ * @param input_probe Beginning of the sequence of input keys
594+ * @param n Number of the keys to query
595+ * @param output_probe Beginning of the sequence of keys corresponding to matching elements in
596+ * `output_match`
597+ * @param output_match Beginning of the sequence of matching elements
598+ * @param atomic_counter Pointer to an atomic object of integral type that is used to count the
599+ * number of output elements
600+ * @param ref Non-owning container device ref used to access the slot storage
601+ */
602+ template <bool IsOuter,
603+ int32_t BlockSize,
604+ class InputProbeIt ,
605+ class OutputProbeIt ,
606+ class OutputMatchIt ,
607+ class AtomicCounter ,
608+ class Ref >
609+ CUCO_KERNEL void retrieve (InputProbeIt input_probe,
610+ cuco::detail::index_type n,
611+ OutputProbeIt output_probe,
612+ OutputMatchIt output_match,
613+ AtomicCounter* atomic_counter,
614+ Ref ref)
615+ {
616+ namespace cg = cooperative_groups;
617+
618+ auto const block = cg::this_thread_block ();
619+ auto constexpr tiles_in_block = BlockSize / Ref::cg_size;
620+
621+ auto const block_begin_offset = block.group_index ().x * tiles_in_block;
622+ auto const block_end_offset =
623+ min (n, static_cast <cuco::detail::index_type>(block_begin_offset + tiles_in_block));
624+
625+ if (block_begin_offset < block_end_offset) {
626+ if constexpr (IsOuter) {
627+ ref.retrieve_outer <BlockSize>(block,
628+ input_probe + block_begin_offset,
629+ input_probe + block_end_offset,
630+ output_probe,
631+ output_match,
632+ atomic_counter);
633+ } else {
634+ ref.retrieve <BlockSize>(block,
635+ input_probe + block_begin_offset,
636+ input_probe + block_end_offset,
637+ output_probe,
638+ output_match,
639+ atomic_counter);
640+ }
641+ }
642+ }
643+
645644/* *
646645 * @brief Calculates the number of filled slots for the given bucket storage.
647646 *
0 commit comments