@@ -1420,5 +1420,73 @@ class operator_impl<
14201420 return ref_.impl_ .count (group, key);
14211421 }
14221422};
1423+
1424+ template <typename Key,
1425+ typename T,
1426+ cuda::thread_scope Scope,
1427+ typename KeyEqual,
1428+ typename ProbingScheme,
1429+ typename StorageRef,
1430+ typename ... Operators>
1431+ class operator_impl <
1432+ op::retrieve_tag,
1433+ static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>> {
1434+ using base_type = static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef>;
1435+ using ref_type = static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>;
1436+ using key_type = typename base_type::key_type;
1437+ using value_type = typename base_type::value_type;
1438+ using iterator = typename base_type::iterator;
1439+ using const_iterator = typename base_type::const_iterator;
1440+
1441+ static constexpr auto cg_size = base_type::cg_size;
1442+ static constexpr auto bucket_size = base_type::bucket_size;
1443+
1444+ public:
1445+ /* *
1446+ * @brief Retrieves all the slots corresponding to all keys in the range `[input_probe_begin,
1447+ * input_probe_end)`.
1448+ *
1449+ * If key `k = *(first + i)` exists in the container, copies `k` to `output_probe` and associated
1450+ * slot content to `output_match`, respectively. The output order is unspecified.
1451+ *
1452+ * Behavior is undefined if the size of the output range exceeds the number of retrieved slots.
1453+ * Use `count()` to determine the size of the output range.
1454+ *
1455+ * @tparam BlockSize Size of the thread block this operation is executed in
1456+ * @tparam InputProbeIt Device accessible input iterator whose `value_type` is
1457+ * convertible to the container's `key_type`
1458+ * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is
1459+ * convertible to the container's `key_type`
1460+ * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is
1461+ * convertible to the container's `value_type`
1462+ * @tparam AtomicCounter Atomic counter type that follows the same semantics as
1463+ * `cuda::atomic(_ref)`
1464+ *
1465+ * @param block Thread block this operation is executed in
1466+ * @param input_probe_begin Beginning of the input sequence of keys
1467+ * @param input_probe_end End of the input sequence of keys
1468+ * @param output_probe Beginning of the sequence of keys corresponding to matching elements in
1469+ * `output_match`
1470+ * @param output_match Beginning of the sequence of matching elements
1471+ * @param atomic_counter Counter that is used to determine the next free position in the output
1472+ * sequences
1473+ */
1474+ template <int32_t BlockSize,
1475+ class InputProbeIt ,
1476+ class OutputProbeIt ,
1477+ class OutputMatchIt ,
1478+ class AtomicCounter >
1479+ __device__ void retrieve (cooperative_groups::thread_block const & block,
1480+ InputProbeIt input_probe_begin,
1481+ InputProbeIt input_probe_end,
1482+ OutputProbeIt output_probe,
1483+ OutputMatchIt output_match,
1484+ AtomicCounter* atomic_counter) const
1485+ {
1486+ auto const & ref_ = static_cast <ref_type const &>(*this );
1487+ ref_.impl_ .retrieve <BlockSize>(
1488+ block, input_probe_begin, input_probe_end, output_probe, output_match, atomic_counter);
1489+ }
1490+ };
14231491} // namespace detail
14241492} // namespace cuco
0 commit comments