more enhancement

jiejanezhang · jiejanezhang · commit 5d749a53ddbe · 2025-07-31T11:02:27.000+08:00
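more enhancement: handle .f16 C/D (accumulator) operands for the m16n8k16 mma shape in the asm migration rule and in the mma helper in dpct/math.hpp, and list the new combination in the mma.cu test's support table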
diff --git a/clang/lib/DPCT/RulesAsm/AsmMigration.cpp b/clang/lib/DPCT/RulesAsm/AsmMigration.cpp
@@ -1488,6 +1488,8 @@ class SYCLGen : public SYCLGenBase {
     // Data types of A, B & C matrices respectively in the PTX arguments
     std::string InMatrixType[3];
 
+    InMatrixType[2] = CDType;
+
     if (Inst->hasAttr(InstAttr::m8n8k4)) {
       M = "8";
       N = "8";
@@ -1573,6 +1575,7 @@ class SYCLGen : public SYCLGenBase {
           NumVecElements[1] = 2; // B
           NumVecElements[2] = 2; // C
           NumVecElements[3] = 2; // D
+          InMatrixType[2] = "uint32_t";  // C type is f16*2
         }
         else
           return SYCLGenError();
@@ -1613,8 +1616,6 @@ class SYCLGen : public SYCLGenBase {
     } else
       return SYCLGenError();
 
-    InMatrixType[2] = CDType;
-
     // Check the register sizes for vector elements of A, B, C & D matrices
     for (unsigned InputOp = 0; InputOp < Inst->getNumInputOperands();
          InputOp++) {
diff --git a/clang/runtime/dpct-rt/include/dpct/math.hpp b/clang/runtime/dpct-rt/include/dpct/math.hpp
@@ -2673,10 +2673,14 @@ void mma(volatile void **d_mat_frag, void *a_mat_frag, void *b_mat_frag,
       }
     } else if constexpr (std::is_same_v<CDType, sycl::half>) {
       // Init D matrix fragment with C matrix fragment
-      *const_cast<sycl::half *>(d[0]) = c[0];
-      *const_cast<sycl::half *>(d[1]) = c[1];
-      *const_cast<sycl::half *>(d[2]) = c[2];
-      *const_cast<sycl::half *>(d[3]) = c[3];
+      sycl::half *d0 = const_cast<sycl::half *>(d[0]);
+      sycl::half *d1 = d0 + 1;
+      sycl::half *d2 = const_cast<sycl::half *>(d[1]);
+      sycl::half *d3 = d2 + 1;
+      *d0 = c[0];
+      *d1 = c[1];
+      *d2 = c[2];
+      *d3 = c[3];
 
       // Each sub-group is responsible for computing a fragment size of 16*8
       // elements of matrix D.
@@ -2731,10 +2735,10 @@ void mma(volatile void **d_mat_frag, void *a_mat_frag, void *b_mat_frag,
         //             static_cast<CDType>(rb[j + 4]);
 
         for (int j = 0; j < 4; j++) {
-          *const_cast<sycl::half *>(d[0]) += ra[j] * rb[j];
-          *const_cast<sycl::half *>(d[1]) += ra[j] * rb[j + 4];
-          *const_cast<sycl::half *>(d[2]) += ra[j + 4] * rb[j];
-          *const_cast<sycl::half *>(d[3]) += ra[j + 4] * rb[j + 4];
+          *d0 += ra[j] * rb[j];
+          *d1 += ra[j] * rb[j + 4];
+          *d2 += ra[j + 4] * rb[j];
+          *d3 += ra[j + 4] * rb[j + 4];
         }
       }
     } else if constexpr (std::is_integral_v<ABType>) {
diff --git a/clang/test/dpct/asm/mma.cu b/clang/test/dpct/asm/mma.cu
@@ -19,7 +19,8 @@ m8n8k16         .s8          .s8          .s32
 m16n8k8       .f16/.bf16  .f16/.bf16      .f32    
 m16n8k16        .f16         .f16         .f32
                 .bf16        .bf16        .f32
-                .s8          .s8          .s32    
+                .s8          .s8          .s32  
+                .f16         .f16         .f16     
 m16n8k32        .s8          .s8          .s32    
 
 Except for m8n8k4, all other shapes are supported for row/col layout of A/B matrices respectively.