Support mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16

jiejanezhang · jiejanezhang · commit b1c973c953f1 · 2025-07-30T17:17:15.000+08:00
diff --git a/clang/lib/DPCT/RulesAsm/AsmMigration.cpp b/clang/lib/DPCT/RulesAsm/AsmMigration.cpp
@@ -1560,13 +1560,21 @@ class SYCLGen : public SYCLGenBase {
         InMatrixType[0] = "uint32_t"; // A type is .f16/.bf16x2
         InMatrixType[1] = "uint32_t"; // B type is .f16/.bf16x2
 
-        // If A matrix type is f16, then C&D matrix types can only be f32
+        // If A matrix type is f16, then C & D matrix types can be f32 or f16
         if (CType->getKind() == InlineAsmBuiltinType::f32) {
           NumVecElements[0] = 4; // A
           NumVecElements[1] = 2; // B
           NumVecElements[2] = 4; // C
           NumVecElements[3] = 4; // D
-        } else
+        } 
+        // C & D matrix types can also be f16.
+        else if (CType->getKind() == InlineAsmBuiltinType::f16) {
+          NumVecElements[0] = 4; // A
+          NumVecElements[1] = 2; // B
+          NumVecElements[2] = 2; // C
+          NumVecElements[3] = 2; // D
+        }
+        else
           return SYCLGenError();
       } else if (AType->getKind() == InlineAsmBuiltinType::s8) {
         InMatrixType[0] = "uint32_t"; // A type is .s8x4
diff --git a/clang/runtime/dpct-rt/include/dpct/math.hpp b/clang/runtime/dpct-rt/include/dpct/math.hpp
@@ -2671,6 +2671,72 @@ void mma(volatile void **d_mat_frag, void *a_mat_frag, void *b_mat_frag,
               static_cast<CDType>(ra[j + 4]) * static_cast<CDType>(rb[j + 4]);
         }
       }
+    } else if constexpr (std::is_same_v<CDType, sycl::half>) {
+      // Init D matrix fragment with C matrix fragment
+      *const_cast<sycl::half *>(d[0]) = c[0];
+      *const_cast<sycl::half *>(d[1]) = c[1];
+      *const_cast<sycl::half *>(d[2]) = c[2];
+      *const_cast<sycl::half *>(d[3]) = c[3];
+
+      // Each sub-group is responsible for computing a fragment size of 16*8
+      // elements of matrix D.
+      // Each work item computes 4 elements of matrix D by gathering
+      // their corresponding row & col matrix fragments of length k (16)
+      // from A & B matrices respectively using below mapping logic:
+      // row0 = (lane >> 2)    & row1 = (lane >> 2) + 8
+      // col0 = (lane % 4) * 2 & col1 = (lane % 4) * 2 + 1
+      // As each row & col fragment of A & B matrices is distributed across
+      // 4 work items, each iteration of below loop loads a partial fragment
+      // of matrix A (row) and matrix B (col) using the row & col offsets.
+      for (int i = 0; i < 4; i++) {
+        typename MMAType<ABType>::PackType recv_a[4], recv_b[4];
+
+        // Load partial fragment from row0 of matrix A ({a0, a1})
+        recv_a[0] = dpct::select_from_sub_group(sg, a[0], row_load_offset + i);
+        // Load partial fragment from row0 of matrix A ({a2, a3})
+        recv_a[1] = dpct::select_from_sub_group(sg, a[2], row_load_offset + i);
+        // Load partial fragment from row1 of matrix A ({a0, a1})
+        recv_a[2] = dpct::select_from_sub_group(sg, a[1], row_load_offset + i);
+        // Load partial fragment from row1 of matrix A ({a2, a3})
+        recv_a[3] = dpct::select_from_sub_group(sg, a[3], row_load_offset + i);
+
+        // Load partial fragment from col0 of matrix B ({b0, b1})
+        recv_b[0] = dpct::select_from_sub_group(sg, b[0], col_load_offset + i);
+        // Load partial fragment from col0 of matrix B ({b2, b3})
+        recv_b[1] = dpct::select_from_sub_group(sg, b[1], col_load_offset + i);
+        // Load partial fragment from col1 of matrix B ({b0, b1})
+        recv_b[2] =
+            dpct::select_from_sub_group(sg, b[0], col_load_offset + 4 + i);
+        // Load partial fragment from col1 of matrix B ({b2, b3})
+        recv_b[3] =
+            dpct::select_from_sub_group(sg, b[1], col_load_offset + 4 + i);
+
+        auto ra = reinterpret_cast<ABType *>(recv_a);
+        auto rb = reinterpret_cast<ABType *>(recv_b);
+
+        // Each work item calculates a partial product of A & B matrix
+        // fragments and adds it to the corresponding D matrix fragment:
+        // d0 += row0{ a0, a1, a2, a3 } * col0{ b0, b1, b2, b3 }
+        // d1 += row0{ a0, a1, a2, a3 } * col1{ b0, b1, b2, b3 }
+        // d2 += row1{ a0, a1, a2, a3 } * col0{ b0, b1, b2, b3 }
+        // d3 += row1{ a0, a1, a2, a3 } * col1{ b0, b1, b2, b3 }
+        // for (int j = 0; j < 4; j++) {
+        //    *d[0] +=
+        //        static_cast<CDType>(ra[j]) * static_cast<CDType>(rb[j]);
+        //    *d[1] += static_cast<CDType>(ra[j]) *
+        //             static_cast<CDType>(rb[j + 4]);
+        //    *d[2] += static_cast<CDType>(ra[j + 4]) *
+        //             static_cast<CDType>(rb[j]);
+        //    *d[3] += static_cast<CDType>(ra[j + 4]) *
+        //             static_cast<CDType>(rb[j + 4]);
+
+        for (int j = 0; j < 4; j++) {
+          *const_cast<sycl::half *>(d[0]) += ra[j] * rb[j];
+          *const_cast<sycl::half *>(d[1]) += ra[j] * rb[j + 4];
+          *const_cast<sycl::half *>(d[2]) += ra[j + 4] * rb[j];
+          *const_cast<sycl::half *>(d[3]) += ra[j + 4] * rb[j + 4];
+        }
+      }
     } else if constexpr (std::is_integral_v<ABType>) {
       // Init D matrix with fragments of C matrix
       *d[0] = c[0];
diff --git a/clang/test/dpct/asm/mma.cu b/clang/test/dpct/asm/mma.cu
@@ -100,6 +100,24 @@ __global__ void mma_kernel_m16n8k8(int *a, int *b, float *fc, float *fd) {
         "f"(fc[0]), "f"(fc[1]), "f"(fc[2]), "f"(fc[3]));
 }
 
+__global__ void mma_kernel_m16n8k16(int *a, int *b, int *c, int *d) {
+  // CHECK: {
+  // CHECK-NEXT:   volatile void *d_mat_frag_ct1[4] = { &fc[0], &fc[1]};
+  // CHECK-NEXT:   sycl::vec<uint32_t, 4> a_mat_frag_ct1(a[0], a[1], a[2], a[3]);
+  // CHECK-NEXT:   sycl::vec<uint32_t, 2> b_mat_frag_ct1(b[0], b[1]);
+  // CHECK-NEXT:   sycl::vec<uint32, 4> c_mat_frag_ct1(fc[0], fc[1]);
+  // CHECK-NEXT:   dpct::experimental::matrix::mma<16, 8, 16, sycl::half, sycl::half>(reinterpret_cast<volatile void **>(d_mat_frag_ct1), &a_mat_frag_ct1, &b_mat_frag_ct1, &c_mat_frag_ct1);
+  // CHECK-NEXT: }
+  asm("mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 "
+        " { %0, %1 }, "
+        " { %2, %3, %4, %5 }, "
+        " { %6, %7 }, "
+        " { %0, %1 };"
+        : "+r"(c[0]), "+r"(c[1])
+        : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]),
+          "r"(d[0]), "r"(d[1]));
+}
+
 __global__ void mma_kernel_m16n8k16(int *a, int *b, int *c, float *fc, int *d) {
   // CHECK: {
   // CHECK-NEXT:   volatile void *d_mat_frag_ct1[4] = { &fc[0], &fc[1], &fc[2], &fc[3] };