@@ -474,8 +474,16 @@ lsc_gather(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
474474// / Supported platforms: DG2, PVC
475475// / VISA instruction: lsc_load.ugm
476476// /
477- // / Collects elements located at specified address and returns them
478- // / as a single \ref simd object.
477+ // / Accesses a contiguous block of memory of `NElts * S` bytes starting from
478+ // / the given address, where S is the byte size of an "element" defined by
479+ // / the \c DS template parameter. The maximum size of the accessed block is
480+ // / 512 bytes for PVC and 256 bytes for ACM (DG2).
481+ // / When \c DS equals \c lsc_data_size::u64, the address must be 8-byte aligned,
482+ // / otherwise it must be 4-byte aligned. Allowed values for the data size are
483+ // / \c lsc_data_size::u32 and \c lsc_data_size::u64. Allowed NElts values are
484+ // / 1, 2, 3, 4, 8, 16, 32, 64.
485+ // / Note that to access 512 bytes, DS must be \c lsc_data_size::u64 and \c NElts
486+ // / must be 64.
479487// /
480488// / @tparam T is element type.
481489// / @tparam NElts is the number of elements to load per address.
@@ -492,22 +500,34 @@ template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
492500 cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
493501__ESIMD_API __ESIMD_NS::simd<T, NElts>
494502lsc_block_load (const T *p, __ESIMD_NS::simd_mask<1 > pred = 1 ) {
495- detail::check_lsc_vector_size<NElts>();
496503 detail::check_lsc_data_size<T, DS>();
497504 detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
498505 constexpr uint16_t _AddressScale = 1 ;
499506 constexpr int _ImmOffset = 0 ;
500507 constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
501- static_assert (_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64 ,
502- " Transposed load is supported only for data size u32 or u64" );
503- constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
504508 constexpr detail::lsc_data_order _Transposed =
505509 detail::lsc_data_order::transpose;
506510 constexpr int N = 1 ;
507511 __ESIMD_NS::simd<uintptr_t , N> addrs = reinterpret_cast <uintptr_t >(p);
508- return __esimd_lsc_load_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
509- _VS, _Transposed, N>(pred.data (),
510- addrs.data ());
512+ constexpr int SmallIntFactor =
513+ (_DS == lsc_data_size::u16 ) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1 );
514+ static_assert (NElts % SmallIntFactor == 0 ,
515+ " Number of elements is not supported by Transposed load" );
516+
517+ detail::check_lsc_vector_size<NElts / SmallIntFactor>();
518+ constexpr detail::lsc_vector_size _VS =
519+ detail::to_lsc_vector_size<NElts / SmallIntFactor>();
520+ if constexpr (SmallIntFactor == 1 ) {
521+ return __esimd_lsc_load_stateless<T, L1H, L3H, _AddressScale, _ImmOffset,
522+ _DS, _VS, _Transposed, N>(pred.data (),
523+ addrs.data ());
524+ } else {
525+ __ESIMD_NS::simd<uint32_t , NElts / SmallIntFactor> result =
526+ __esimd_lsc_load_stateless<uint32_t , L1H, L3H, _AddressScale,
527+ _ImmOffset, lsc_data_size::u32 , _VS,
528+ _Transposed, N>(pred.data (), addrs.data ());
529+ return result.template bit_cast_view <T>();
530+ }
511531}
512532
513533// / Accessor-based transposed gather with 1 channel.
@@ -516,6 +536,8 @@ lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1) {
516536// /
517537// / Collects elements located at surface and returns them
518538// / as a single \ref simd object.
539+ // / See comments in the \ref lsc_block_load API for description and parameter
540+ // / constraints.
519541// /
520542// / @tparam T is element type.
521543// / @tparam NElts is the number of elements to load per address.
@@ -541,22 +563,36 @@ lsc_block_load(AccessorTy acc, uint32_t offset,
541563 return lsc_block_load<T, NElts, DS, L1H, L3H>(
542564 __ESIMD_DNS::accessorToPointer<T>(acc, offset), pred);
543565#else
544- detail::check_lsc_vector_size<NElts>();
545566 detail::check_lsc_data_size<T, DS>();
546567 detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
547568 constexpr uint16_t _AddressScale = 1 ;
548569 constexpr int _ImmOffset = 0 ;
549570 constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
550- static_assert (_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64 ,
551- " Transposed load is supported only for data size u32 or u64" );
552- constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
553571 constexpr detail::lsc_data_order _Transposed =
554572 detail::lsc_data_order::transpose;
555573 constexpr int N = 1 ;
556574 __ESIMD_NS::simd<uint32_t , N> offsets = offset;
557575 auto si = __ESIMD_NS::get_surface_index (acc);
558- return __esimd_lsc_load_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
559- _Transposed, N>(pred.data (), offsets.data (), si);
576+ constexpr int SmallIntFactor =
577+ (_DS == lsc_data_size::u16 ) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1 );
578+ static_assert (NElts % SmallIntFactor == 0 ,
579+ " Number of elements is not supported by Transposed load" );
580+ detail::check_lsc_vector_size<NElts / SmallIntFactor>();
581+ constexpr detail::lsc_vector_size _VS =
582+ detail::to_lsc_vector_size<NElts / SmallIntFactor>();
583+
584+ if constexpr (SmallIntFactor == 1 ) {
585+ return __esimd_lsc_load_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
586+ _VS, _Transposed, N>(pred.data (),
587+ offsets.data (), si);
588+ } else {
589+
590+ __ESIMD_NS::simd<uint32_t , NElts / SmallIntFactor> result =
591+ __esimd_lsc_load_bti<uint32_t , L1H, L3H, _AddressScale, _ImmOffset,
592+ lsc_data_size::u32 , _VS, _Transposed, N>(
593+ pred.data (), offsets.data (), si);
594+ return result.template bit_cast_view <T>();
595+ }
560596#endif
561597}
562598
@@ -622,6 +658,7 @@ __ESIMD_API void lsc_prefetch(const T *p) {
622658 constexpr uint16_t _AddressScale = 1 ;
623659 constexpr int _ImmOffset = 0 ;
624660 constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
661+
625662 static_assert (
626663 _DS == lsc_data_size::u32 || _DS == lsc_data_size::u64 ,
627664 " Transposed prefetch is supported only for data size u32 or u64" );
@@ -630,6 +667,7 @@ __ESIMD_API void lsc_prefetch(const T *p) {
630667 detail::lsc_data_order::transpose;
631668 constexpr int N = 1 ;
632669 __ESIMD_NS::simd_mask<N> pred = 1 ;
670+
633671 __ESIMD_NS::simd<uintptr_t , N> addrs = reinterpret_cast <uintptr_t >(p);
634672 __esimd_lsc_prefetch_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
635673 _VS, _Transposed, N>(pred.data (),
@@ -894,6 +932,8 @@ lsc_scatter(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
894932// / VISA instruction: lsc_store.ugm
895933// /
896934// / Scatters elements to a specific address.
935+ // / See comments in the \ref lsc_block_load API for description and parameter
936+ // / constraints.
897937// /
898938// / @tparam T is element type.
899939// / @tparam NElts is the number of elements to store per address.
@@ -910,29 +950,44 @@ template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
910950 cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
911951__ESIMD_API void lsc_block_store (T *p, __ESIMD_NS::simd<T, NElts> vals,
912952 __ESIMD_NS::simd_mask<1 > pred = 1 ) {
913- detail::check_lsc_vector_size<NElts>();
914953 detail::check_lsc_data_size<T, DS>();
915954 detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
916955 constexpr uint16_t _AddressScale = 1 ;
917956 constexpr int _ImmOffset = 0 ;
918957 constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
919- static_assert (_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64 ,
920- " Transposed store is supported only for data size u32 or u64" );
921- constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
922958 constexpr detail::lsc_data_order _Transposed =
923959 detail::lsc_data_order::transpose;
924960 constexpr int N = 1 ;
925961 __ESIMD_NS::simd<uintptr_t , N> addrs = reinterpret_cast <uintptr_t >(p);
926- __esimd_lsc_store_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
927- _Transposed, N>(pred.data (), addrs.data (),
928- vals.data ());
962+ constexpr int SmallIntFactor =
963+ (_DS == lsc_data_size::u16 ) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1 );
964+ static_assert (NElts % SmallIntFactor == 0 ,
965+ " Number of elements is not supported by Transposed store" );
966+ detail::check_lsc_vector_size<NElts / SmallIntFactor>();
967+ constexpr detail::lsc_vector_size _VS =
968+ detail::to_lsc_vector_size<NElts / SmallIntFactor>();
969+ if constexpr (SmallIntFactor == 1 ) {
970+
971+ __esimd_lsc_store_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
972+ _VS, _Transposed, N>(pred.data (), addrs.data (),
973+ vals.data ());
974+ } else {
975+ __ESIMD_NS::simd<uint32_t , NElts / SmallIntFactor> tmp =
976+ vals.template bit_cast_view <uint32_t >();
977+
978+ __esimd_lsc_store_stateless<uint32_t , L1H, L3H, _AddressScale, _ImmOffset,
979+ lsc_data_size::u32 , _VS, _Transposed, N>(
980+ pred.data (), addrs.data (), tmp.data ());
981+ }
929982}
930983
931984// / Accessor-based transposed scatter with 1 channel.
932985// / Supported platforms: DG2, PVC
933986// / VISA instruction: lsc_store.ugm
934987// /
935988// / Scatters elements to surface.
989+ // / See comments in the \ref lsc_block_load API for description and parameter
990+ // / constraints.
936991// /
937992// / @tparam T is element type.
938993// / @tparam NElts is the number of elements to store per address.
@@ -958,23 +1013,36 @@ lsc_block_store(AccessorTy acc, uint32_t offset,
9581013 lsc_block_store<T, NElts, DS, L1H>(
9591014 __ESIMD_DNS::accessorToPointer<T>(acc, offset), vals, pred);
9601015#else
961- detail::check_lsc_vector_size<NElts>();
9621016 detail::check_lsc_data_size<T, DS>();
9631017 detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
9641018 constexpr uint16_t _AddressScale = 1 ;
9651019 constexpr int _ImmOffset = 0 ;
9661020 constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
967- static_assert (_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64 ,
968- " Transposed store is supported only for data size u32 or u64" );
969- constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
9701021 constexpr detail::lsc_data_order _Transposed =
9711022 detail::lsc_data_order::transpose;
9721023 constexpr int N = 1 ;
1024+
9731025 __ESIMD_NS::simd<uint32_t , N> offsets = offset;
9741026 auto si = __ESIMD_NS::get_surface_index (acc);
975- __esimd_lsc_store_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
976- _Transposed, N>(pred.data (), offsets.data (),
977- vals.data (), si);
1027+ constexpr int SmallIntFactor =
1028+ (_DS == lsc_data_size::u16 ) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1 );
1029+
1030+ detail::check_lsc_vector_size<NElts / SmallIntFactor>();
1031+ static_assert (NElts % SmallIntFactor == 0 ,
1032+ " Number of elements is not supported by Transposed store" );
1033+ constexpr detail::lsc_vector_size _VS =
1034+ detail::to_lsc_vector_size<NElts / SmallIntFactor>();
1035+ if constexpr (SmallIntFactor > 1 ) {
1036+ __ESIMD_NS::simd<uint32_t , NElts / SmallIntFactor> Tmp =
1037+ vals.template bit_cast_view <uint32_t >();
1038+ __esimd_lsc_store_bti<uint32_t , L1H, L3H, _AddressScale, _ImmOffset,
1039+ lsc_data_size::u32 , _VS, _Transposed, N>(
1040+ pred.data (), offsets.data (), Tmp.data (), si);
1041+ } else {
1042+ __esimd_lsc_store_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
1043+ _Transposed, N>(pred.data (), offsets.data (),
1044+ vals.data (), si);
1045+ }
9781046#endif
9791047}
9801048
0 commit comments