intel · rishi-yadav · Oct 20, 2025 · Copilot · Oct 20, 2025 · Copilot
diff --git a/test/unit/cute/intel_xe/CMakeLists.txt b/test/unit/cute/intel_xe/CMakeLists.txt
@@ -47,6 +47,9 @@ cutlass_test_unit_add_executable(
   copy_scatter.cpp
   mma.cpp
   tiled_mma.cpp
+  xe_copy_prefetch_2d.cpp
+  xe_vnni_2d.cpp
+  xe_transpose_2d.cpp
 )
 else()
 cutlass_test_unit_add_executable(

diff --git a/test/unit/cute/intel_xe/xe_copy_prefetch_2d.cpp b/test/unit/cute/intel_xe/xe_copy_prefetch_2d.cpp
@@ -0,0 +1,163 @@
+/***************************************************************************************************
+ * Copyright (C) 2025 Intel Corporation, All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF POSSIBILITY OF SUCH DAMAGE.
- * OF THIS SOFTWARE, EVEN IF ADVISED OF POSSIBILITY OF SUCH DAMAGE.
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- * OF THIS SOFTWARE, EVEN IF ADVISED OF POSSIBILITY OF SUCH DAMAGE.
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#include "cutlass/detail/layout.hpp"
+
+#include <cute/tensor.hpp>
+#include <cute/atom/copy_atom.hpp>
+#include <cute/atom/copy_traits_xe_2d.hpp>
+#include <cute/arch/copy_xe_2d.hpp>
+#include <sycl/sycl.hpp>
+#include <cute/util/compat.hpp>
+
+#include "cutlass_unit_test.h"
+#include "utils.hpp"
+
+using namespace cute;
+using namespace cutlass;
+using namespace compat::experimental;
+
+#define SUBGROUP_SIZE (16)
+
+#if (IGC_VERSION_MAJOR > 2) || (IGC_VERSION_MAJOR == 2 && IGC_VERSION_MINOR >= 18) 
+
+// Kernel name for unique identification
+template<class SrcTensor> 
+class XEPrefetch2DKernelName;
+
+// Device kernel for XE_PREFETCH_2D testing  
+template <class SrcTensor, int Bits, int Height, int Width>
+void xe_prefetch_2d_kernel(SrcTensor src) {
+  using namespace cute;
+  using Element = typename SrcTensor::value_type;
+
+  // Only execute with the first subgroup to avoid race conditions
+  if (sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_group(0) == 0) {
+    // Get thread/subgroup information
+    auto local_id = int(sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_local_id(0));
+
+    // Create block 2D prefetch inside kernel (device-only operation)
+    using PrefetchOp = XE_PREFETCH_2D<Bits, Height, Width>;
+    auto tiled_prefetch = make_block_2d_copy(PrefetchOp{}, src);
+
+    // Get thread slice of the tiled prefetch
+    auto thr_prefetch = tiled_prefetch.get_slice(local_id);
+
+    // Create coordinate tensor for a single tile
+    auto coord_shape = make_shape(Int<Height>{}, Int<Width * Bits / sizeof_bits_v<Element>>{});
+    Tensor coord_tile = make_identity_tensor(coord_shape);
+
+    // Partition source coordinates for prefetch
+    auto thr_src_coord = thr_prefetch.partition_S(coord_tile);
+
+    // Create dummy destination fragment (prefetch ignores destination)
+    auto thr_dst_frag = thr_prefetch.partition_fragment_D(coord_tile);
+
+    // Perform the prefetch operation
+    copy(tiled_prefetch, thr_src_coord, thr_dst_frag);
+
+    // Synchronize to ensure all threads complete their operations
+    sycl::group_barrier(sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_group());
+  }
+}
+
+// Host test function template for XE_PREFETCH_2D
+template <typename Element, int Bits, int Height, int Width>
+void test_xe_prefetch_2d() {
+  using namespace cute;
+
+  // Matrix dimensions - must be compatible with block 2D constraints
+  constexpr int M = Height;
+  constexpr int N = Width * sizeof_bits_v<Element> / Bits;
-  constexpr int N = Width * sizeof_bits_v<Element> / Bits;
+  constexpr int N = (Width * Bits) / sizeof_bits_v<Element>;
-  constexpr int N = Width * sizeof_bits_v<Element> / Bits;
+  constexpr int N = (Width * Bits) / sizeof_bits_v<Element>;
+
+  // Ensure proper alignment (required for block 2D operations)
+  constexpr int elem_alignment = 16 / sizeof(Element);  
+  constexpr int aligned_N = ((N + elem_alignment - 1) / elem_alignment) * elem_alignment;
+
+  // Allocate and initialize host data
+  cutlass::host_vector<Element> host_src(M * aligned_N);
+
+  // Initialize source with test pattern
+  for (size_t i = 0; i < host_src.size(); ++i) {
+    host_src[i] = static_cast<Element>(i % 256);
+  }
+
+  // Copy to device
+  cutlass::device_vector<Element> device_src = host_src;
+
+  // Create tensors with proper layout
+  Tensor tensor_src = 
+        make_tensor(make_gmem_ptr(device_src.data()),
+                    make_layout(Shape<Int<M>, Int<aligned_N>>{}, Stride<Int<aligned_N>, _1>{}));
+
+  // Launch kernel - prefetch happens on device
+  auto blockDim = compat::dim3(SUBGROUP_SIZE);
+  auto gridDim = compat::dim3(1);
+
+  launch<xe_prefetch_2d_kernel<decltype(tensor_src), Bits, Height, Width>,
+         XEPrefetch2DKernelName<decltype(tensor_src)>>(
+    launch_policy{
+      gridDim, blockDim,
+      kernel_properties{sycl_exp::sub_group_size<SUBGROUP_SIZE>}
+    },
+    tensor_src);
+
+  compat::wait_and_throw();
+
+  // Note: XE_PREFETCH_2D just prefetches to cache, no verification needed
+  EXPECT_TRUE(true) << "XE_PREFETCH_2D operation completed successfully";
+}
+
+TEST(PVC_CuTe_Xe, XE_PREFETCH_2D_uint8) {
+  test_xe_prefetch_2d<uint8_t, 8, 2, 64>();
+  test_xe_prefetch_2d<uint8_t, 8, 4, 64>();
+}
+
+TEST(PVC_CuTe_Xe, XE_PREFETCH_2D_int16) {
+  test_xe_prefetch_2d<int16_t, 16, 2, 32>();
+  test_xe_prefetch_2d<int16_t, 16, 4, 32>();
+}
+
+TEST(PVC_CuTe_Xe, XE_PREFETCH_2D_float) {
+  test_xe_prefetch_2d<float, 32, 2, 16>();
+  test_xe_prefetch_2d<float, 32, 4, 16>();
+}
+
+#else
+
+// For the fallback case
+#include "cutlass_unit_test.h"
+
+TEST(PVC_CuTe_Xe, XE_PREFETCH_2D_SKIPPED) {
+  GTEST_SKIP() << "XE_PREFETCH_2D tests require IGC version 2.18 or higher. skipped";
+}
+
+#endif
diff --git a/test/unit/cute/intel_xe/xe_transpose_2d.cpp b/test/unit/cute/intel_xe/xe_transpose_2d.cpp
@@ -0,0 +1,100 @@
+/***************************************************************************************************
+ * Copyright (C) 2025 Intel Corporation, All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#include <cute/tensor.hpp>
+#include <cute/atom/copy_atom.hpp>
+#include <cute/atom/copy_traits_xe_2d.hpp>
+#include <cute/arch/copy_xe_2d.hpp>
+#include <sycl/sycl.hpp>
+#include "cutlass_unit_test.h"
+
+using namespace cute;
+
+#if (IGC_VERSION_MAJOR > 2) || (IGC_VERSION_MAJOR == 2 && IGC_VERSION_MINOR >= 18) 
+
+TEST(PVC_CuTe_Xe, XE_LOAD_2D_TRANSPOSE_API_Declaration) {
+  // Template: XE_LOAD_2D_TRANSPOSE<Bits, Height, Width>
+  // Constraints: Bits == 32 || Bits == 64, Width <= 8
+  // For 64-bit: Height == 8 && Width < 4
+
+  // Test 32-bit transpose operations
+  using TransposeOp_32bit_2x4 = XE_LOAD_2D_TRANSPOSE<32, 2, 4>;
+  using TransposeOp_32bit_4x8 = XE_LOAD_2D_TRANSPOSE<32, 4, 8>;
+  using TransposeOp_32bit_8x2 = XE_LOAD_2D_TRANSPOSE<32, 8, 2>;
+
+  // Test 64-bit transpose operations (limited constraints)
+  using TransposeOp_64bit_8x2 = XE_LOAD_2D_TRANSPOSE<64, 8, 2>;
+  using TransposeOp_64bit_8x3 = XE_LOAD_2D_TRANSPOSE<64, 8, 3>;
+
+  // Test that the operations have the required static members from XE_Copy_Op_2D_Base
+  static_assert(TransposeOp_32bit_2x4::AtomHeight == 2);
+  static_assert(TransposeOp_32bit_2x4::AtomWidth == 4);
+  static_assert(TransposeOp_32bit_2x4::CopyBits == 32);
+
+  static_assert(TransposeOp_32bit_4x8::AtomHeight == 4);
+  static_assert(TransposeOp_32bit_4x8::AtomWidth == 8);
+  static_assert(TransposeOp_32bit_4x8::CopyBits == 32);
+
+  static_assert(TransposeOp_64bit_8x2::AtomHeight == 8);
+  static_assert(TransposeOp_64bit_8x2::AtomWidth == 2);
+  static_assert(TransposeOp_64bit_8x2::CopyBits == 64);
+
+  EXPECT_TRUE(true) << "XE_LOAD_2D_TRANSPOSE API types declared successfully";
+}
+
+TEST(PVC_CuTe_Xe, XE_LOAD_2D_TRANSPOSE_Constraints) {
+  // Test that the compile-time constraints are enforced
+
+  // Valid 32-bit operations
+  using Valid32_1 = XE_LOAD_2D_TRANSPOSE<32, 1, 1>;
+  using Valid32_2 = XE_LOAD_2D_TRANSPOSE<32, 16, 8>; // Width <= 8
+
+  // Valid 64-bit operations (Height == 8 && Width < 4)
+  using Valid64_1 = XE_LOAD_2D_TRANSPOSE<64, 8, 1>;
+  using Valid64_2 = XE_LOAD_2D_TRANSPOSE<64, 8, 2>;
+  using Valid64_3 = XE_LOAD_2D_TRANSPOSE<64, 8, 3>;
+
+  static_assert(Valid32_1::CopyBits == 32);
+  static_assert(Valid32_2::CopyBits == 32);
+  static_assert(Valid64_1::CopyBits == 64);
+  static_assert(Valid64_2::CopyBits == 64);
+  static_assert(Valid64_3::CopyBits == 64);
+
+  EXPECT_TRUE(true) << "XE_LOAD_2D_TRANSPOSE constraint validation successful";
+}
+
+#else
+
+TEST(PVC_CuTe_Xe, XE_LOAD_2D_TRANSPOSE_SKIPPED) {
+  GTEST_SKIP() << "XE_LOAD_2D_TRANSPOSE tests require IGC version 2.18 or higher. skipped";
+}
+
+#endif
diff --git a/test/unit/cute/intel_xe/xe_vnni_2d.cpp b/test/unit/cute/intel_xe/xe_vnni_2d.cpp
@@ -0,0 +1,70 @@
+/***************************************************************************************************
+ * Copyright (C) 2025 Intel Corporation, All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF POSSIBILITY OF SUCH DAMAGE.
- * OF THIS SOFTWARE, EVEN IF ADVISED OF POSSIBILITY OF SUCH DAMAGE.
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- * OF THIS SOFTWARE, EVEN IF ADVISED OF POSSIBILITY OF SUCH DAMAGE.
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#include <cute/tensor.hpp>
+#include <cute/atom/copy_atom.hpp>
+#include <cute/atom/copy_traits_xe_2d.hpp>
+#include <cute/arch/copy_xe_2d.hpp>
+#include <sycl/sycl.hpp>
+#include "cutlass_unit_test.h"
+
+using namespace cute;
+
+#if (IGC_VERSION_MAJOR > 2) || (IGC_VERSION_MAJOR == 2 && IGC_VERSION_MINOR >= 18) 
+
+TEST(PVC_CuTe_Xe, XE_LOAD_2D_VNNI_API_Declaration) {
+  // Template: XE_LOAD_2D_VNNI<Bits, Height, Width, BlockWidth = Width>
+
+  // Test that the VNNI operation types can be declared
+  using VNNIOp_8bit_2x32 = XE_LOAD_2D_VNNI<8, 2, 32>;
+  using VNNIOp_8bit_4x32 = XE_LOAD_2D_VNNI<8, 4, 32>;
+  using VNNIOp_16bit_2x16 = XE_LOAD_2D_VNNI<16, 2, 16>;
+  using VNNIOp_16bit_4x16 = XE_LOAD_2D_VNNI<16, 4, 16>;
+
+  // Test that the operations have the required static members from XE_Copy_Op_2D_Base
+  static_assert(VNNIOp_8bit_2x32::AtomHeight == 2);
+  static_assert(VNNIOp_8bit_2x32::AtomWidth == 32);
+  static_assert(VNNIOp_8bit_2x32::CopyBits == 8);
+
+  static_assert(VNNIOp_16bit_2x16::AtomHeight == 2);
+  static_assert(VNNIOp_16bit_2x16::AtomWidth == 16);
+  static_assert(VNNIOp_16bit_2x16::CopyBits == 16);
+
+  EXPECT_TRUE(true) << "XE_LOAD_2D_VNNI API types declared successfully";
+}
+
+#else
+
+TEST(PVC_CuTe_Xe, XE_LOAD_2D_VNNI_SKIPPED) {
+  GTEST_SKIP() << "XE_LOAD_2D_VNNI tests require IGC version 2.18 or higher. skipped";
+}
+
+#endif