@@ -14302,6 +14302,241 @@ named_barrier_signal(uint8_t barrier_id, uint8_t producer_consumer_mode,
1430214302
1430314303/// @} sycl_esimd_memory_nbarrier
1430414304
14305+ /// @defgroup sycl_esimd_mask_compressed Mask compressed APIs.
14306+ /// @ingroup sycl_esimd_memory
14307+
14308+ /// @addtogroup sycl_esimd_mask_compressed
14309+ /// @{
14310+
14311+ /// template <typename T, int N,
14312+ /// typename PropertyListT = oneapi::experimental::empty_properties_t>
14313+ /// simd<T, N>
14314+ /// mask_expand_load(const T *p, simd_mask<N> mask, PropertyListT props = {});
14315+ /// Mask expand load from USM memory location.
14316+ /// The function reads data from a memory location using following algorithm:
14317+ ///
14318+ /// \code{.cpp}
14319+ ///
14320+ /// int Index = 0;
14321+ /// for (int i = 0; i < N; ++i) {
14322+ /// if (Mask[i]) {
14323+ /// Result[i] = *(p + Index);
14324+ /// ++Index;
14325+ /// }
14326+ /// }
14327+ /// \endcode
14328+ ///
14329+ ///
14330+ /// @tparam T is the element type.
14331+ /// @tparam N is the data size.
14332+ /// @param p is the base address for this operation.
14333+ /// @param mask is the mask determining which elements will be read.
14334+ /// @param props The compile-time properties. Only cache hint
14335+ /// properties are used.
14336+ ///
14337+ template <typename T, int N,
14338+ typename PropertyListT = oneapi::experimental::empty_properties_t>
14339+ __ESIMD_API std::enable_if_t<
14340+ ext::oneapi::experimental::is_property_list_v<PropertyListT>, simd<T, N>>
14341+ mask_expand_load(const T *p, simd_mask<N> mask, PropertyListT props = {}) {
14342+ // offsets::value contains binary masks that for every location at index i it
14343+ // contains i 1's i.e. 0,1,3,7,...
14344+ using offsets = typename detail::GenerateCompressedBitmask<N>::value;
14345+ // Performing '&' operation with packed mask will leave at index i a bitmask
14346+ // with number of 1's corresponding to a number of elements to be loaded so
14347+ // far (number of 1's in the mask preceding the index i). Number of 1's
14348+ // becomes an index for compressed store/expanded load operation.
14349+ simd<uint32_t, N> offset =
14350+ cbit(simd<uint32_t, N>(offsets::value) & pack_mask(mask));
14351+ return gather(p, offset * sizeof(T), mask, props);
14352+ }
14353+
14354+ /// template <typename T, int N, typename AccessorTy,
14355+ /// typename PropertyListT = oneapi::experimental::empty_properties_t>
14356+ /// simd<T, N>
14357+ /// mask_expand_load(AccessorTy acc, simd_mask<N> mask, PropertyListT props =
14358+ /// {});
14359+ /// Mask expand load from accessor memory (could be local or device
14360+ /// accessor). The function reads data from a memory location using following
14361+ /// algorithm:
14362+ ///
14363+ /// \code{.cpp}
14364+ ///
14365+ /// int Index = 0;
14366+ /// for (int i = 0; i < N; ++i) {
14367+ /// if (Mask[i])
14368+ /// Result[i] = acc[global_offset + Index++];
14369+ /// }
14370+ /// \endcode
14371+ ///
14372+ ///
14373+ /// @tparam T is the element type.
14374+ /// @tparam N is the data size.
14375+ /// @param acc is the accessor to read from.
14376+ /// @param global_offset is the global offset in bytes.
14377+ /// @param mask is the mask determining which elements will be read.
14378+ /// @param props The compile-time properties. Only cache hint
14379+ /// properties are used.
14380+ ///
14381+ template <typename T, int N, typename AccessorTy,
14382+ typename PropertyListT = oneapi::experimental::empty_properties_t>
14383+ __ESIMD_API std::enable_if_t<
14384+ ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
14385+ detail::is_accessor_with_v<AccessorTy,
14386+ detail::accessor_mode_cap::can_read>,
14387+ simd<T, N>>
14388+ mask_expand_load(AccessorTy acc, uint32_t global_offset, simd_mask<N> mask,
14389+ PropertyListT props = {}) {
14390+ // offsets::value contains binary masks that for every location at index i it
14391+ // contains i 1's i.e. 0,1,3,7,...
14392+ using offsets = typename detail::GenerateCompressedBitmask<N>::value;
14393+ // Performing '&' operation with packed mask will leave at index i a bitmask
14394+ // with number of 1's corresponding to a number of elements to be loaded so
14395+ // far (number of 1's in the mask preceding the index i). Number of 1's
14396+ // becomes an index for compressed store/expanded load operation.
14397+ simd<uint32_t, N> offset =
14398+ cbit(simd<uint32_t, N>(offsets::value) & pack_mask(mask));
14399+ return gather<T>(acc, offset * sizeof(T) + global_offset, mask, props);
14400+ }
14401+
14402+ /// template <typename T, int N,
14403+ /// typename PropertyListT = oneapi::experimental::empty_properties_t>
14404+ /// simd<T, N>
14405+ /// mask_compress_store(T *p, simd<T, N> vals, simd_mask<N> mask,
14406+ /// PropertyListT props = {});
14407+ /// Mask compressed store to USM memory location.
14408+ /// The function stores data to a memory location using following algorithm:
14409+ ///
14410+ /// \code{.cpp}
14411+ ///
14412+ /// int Index = 0;
14413+ /// for (int i = 0; i < N; ++i) {
14414+ /// if (Mask[i]) {
14415+ /// *(p + Index) = val[i];
14416+ /// ++Index;
14417+ /// }
14418+ /// }
14419+ /// \endcode
14420+ ///
14421+ ///
14422+ /// @tparam T is the element type.
14423+ /// @tparam N is the data size.
14424+ /// @param p is the base address for this operation.
14425+ /// @param vals is the data to store.
14426+ /// @param mask is the mask determining which elements will be stored.
14427+ /// @param props The compile-time properties. Only cache hint
14428+ /// properties are used.
14429+ ///
14430+ template <typename T, int N,
14431+ typename PropertyListT = oneapi::experimental::empty_properties_t>
14432+ __ESIMD_API std::enable_if_t<
14433+ ext::oneapi::experimental::is_property_list_v<PropertyListT>>
14434+ mask_compress_store(T *p, simd<T, N> vals, simd_mask<N> mask,
14435+ PropertyListT props = {}) {
14436+ // offsets::value contains binary masks that for every location at index i it
14437+ // contains i 1's i.e. 0,1,3,7,...
14438+ using offsets = typename detail::GenerateCompressedBitmask<N>::value;
14439+ // Performing '&' operation with packed mask will leave at index i a bitmask
14440+ // with number of 1's corresponding to a number of elements to be loaded so
14441+ // far (number of 1's in the mask preceding the index i). Number of 1's
14442+ // becomes an index for compressed store/expanded load operation.
14443+ simd<uint32_t, N> offset =
14444+ cbit(simd<uint32_t, N>(offsets::value) & pack_mask(mask));
14445+ scatter(p, offset * sizeof(T), vals, mask, props);
14446+ }
14447+
14448+ /// template <typename T, int N, typename AccessorTy,
14449+ /// typename PropertyListT = oneapi::experimental::empty_properties_t>
14450+ /// simd<T, N>
14451+ /// mask_compress_store(AccessorTy acc, simd<T, N> vals, simd_mask<N> mask,
14452+ /// PropertyListT props = {});
14453+ /// Mask compressed store to accessor memory (could be local or device
14454+ /// accessor).
14455+ /// The function stores data to a memory location using following algorithm:
14456+ ///
14457+ /// \code{.cpp}
14458+ ///
14459+ /// int Index = 0;
14460+ /// for (int i = 0; i < N; ++i) {
14461+ /// if (Mask[i])
14462+ /// acc[global_offset + Index++] = val[i];
14463+ /// }
14464+ /// \endcode
14465+ ///
14466+ ///
14467+ /// @tparam T is the element type.
14468+ /// @tparam N is the data size.
14469+ /// @param acc is the accessor to write to.
14470+ /// @param global_offset is the global offset in bytes.
14471+ /// @param vals is the data to store.
14472+ /// @param mask is the mask determining which elements will be stored.
14473+ /// @param props The compile-time properties. Only cache hint
14474+ /// properties are used.
14475+ ///
14476+ template <typename T, int N, typename AccessorTy,
14477+ typename PropertyListT = oneapi::experimental::empty_properties_t>
14478+ __ESIMD_API std::enable_if_t<
14479+ ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
14480+ detail::is_accessor_with_v<AccessorTy,
14481+ detail::accessor_mode_cap::can_write>>
14482+ mask_compress_store(AccessorTy acc, uint32_t global_offset, simd<T, N> vals,
14483+ simd_mask<N> mask, PropertyListT props = {}) {
14484+ // offsets::value contains binary masks that for every location at index i it
14485+ // contains i 1's i.e. 0,1,3,7,...
14486+ using offsets = typename detail::GenerateCompressedBitmask<N>::value;
14487+ // Performing '&' operation with packed mask will leave at index i a bitmask
14488+ // with number of 1's corresponding to a number of elements to be loaded so
14489+ // far (number of 1's in the mask preceding the index i). Number of 1's
14490+ // becomes an index for compressed store/expanded load operation.
14491+ simd<uint32_t, N> offset =
14492+ cbit(simd<uint32_t, N>(offsets::value) & pack_mask(mask));
14493+ scatter<T, N>(acc, offset * sizeof(T) + global_offset, vals, mask, props);
14494+ }
14495+
14496+ /// template <typename T, int N, int M>
14497+ /// simd<T, N>
14498+ /// mask_compress_store(simd<T, M> &dst, uint32_t global_offset, simd<T, N>
14499+ /// vals, simd_mask<N> mask);
14500+ /// Mask compressed store to another vector. The function reads data to a
14501+ /// vector using following algorithm:
14502+ ///
14503+ /// \code{.cpp}
14504+ ///
14505+ /// int Index = 0;
14506+ /// for (int i = 0; i < N; ++i) {
14507+ /// if (Mask[i])
14508+ /// dst[global_offset + Index++] = vals[i];
14509+ /// }
14510+ /// \endcode
14511+ ///
14512+ ///
14513+ /// @tparam T is the element type.
14514+ /// @tparam N is the data size.
14515+ /// @tparam M is the source data size.
14516+ /// @param dst is the vector to write to.
14517+ /// @param global_offset is an offset to use for all writes.
14518+ /// @param vals is the data to store.
14519+ /// @param mask is the mask determining which elements will be stored.
14520+ ///
14521+ template <typename T, int N, int M>
14522+ __ESIMD_API std::enable_if_t<M >= N>
14523+ mask_compress_store(simd<T, M> &dst, uint32_t global_offset, simd<T, N> vals,
14524+ simd_mask<N> mask) {
14525+ // offsets::value contains binary masks that for every location at index i it
14526+ // contains i 1's i.e. 0,1,3,7,...
14527+ using offsets = typename detail::GenerateCompressedBitmask<N>::value;
14528+ // Performing '&' operation with packed mask will leave at index i a bitmask
14529+ // with number of 1's corresponding to a number of elements to be loaded so
14530+ // far (number of 1's in the mask preceding the index i). Number of 1's
14531+ // becomes an index for compressed store/expanded load operation.
14532+ simd<uint32_t, N> offset =
14533+ cbit(simd<uint32_t, N>(offsets::value) & pack_mask(mask));
14534+
14535+ simd<uint16_t, N> Indices = global_offset + offset;
14536+ dst.iupdate(Indices, vals, mask);
14537+ }
14538+
14539+ /// @} sycl_esimd_mask_compressed
1430514540/// @} sycl_esimd_memory
1430614541
1430714542/// @cond EXCLUDE
0 commit comments