@@ -127,7 +127,7 @@ namespace ggml_cuda_mma {
                 static_assert(I == -1 && J == -1, "template specialization not implemented");
             }
         }
-#endif
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
     };
 
     template <int I_, int J_>
@@ -182,10 +182,16 @@ namespace ggml_cuda_mma {
 
     template <int I, int J, typename T>
     static __device__ __forceinline__ void load_generic(tile<I, J, T> & t, const T * __restrict__ xs0, const int stride) {
+#if defined(AMD_MFMA_AVAILABLE)
+        int64_t * xi = (int64_t *) t.x;
+        const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
+        xi[0] = xs[0];
+#else
 #pragma unroll
         for (int l = 0; l < t.ne; ++l) {
             t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
         }
+#endif // defined(AMD_MFMA_AVAILABLE)
     }
 
     template <typename T>
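The added AMD_MFMA_AVAILABLE branch replaces the per-element loop with one 8-byte copy per lane: for an int tile such as tile<16, 8, int>, each lane of a 64-wide CDNA wavefront reads row threadIdx.x % t.I and the pair of ints starting at column 2 * (threadIdx.x / t.I). Below is a minimal host-side sketch of that lane-to-element mapping, not part of the patch; the helper names (lane_row, lane_col) and the printout are purely illustrative.

// Sketch only: reproduces the lane -> (row, int-column) arithmetic used by the
// AMD_MFMA_AVAILABLE branch of load_generic() for a tile<16, 8, int>.
#include <cstdio>

int main() {
    const int I = 16, J = 8;                   // tile shape in ints
    for (int lane = 0; lane < 64; ++lane) {    // one CDNA wavefront (wave64)
        const int lane_row = lane % I;         // (threadIdx.x % t.I)
        const int lane_col = 2 * (lane / I);   // 2 * (threadIdx.x / t.I), in ints
        // Each lane copies ints [lane_col, lane_col + 1] of row lane_row,
        // i.e. one int64_t per lane; 64 lanes * 8 bytes = 512 bytes = 16 x 8 ints.
        printf("lane %2d -> row %2d, ints %d..%d\n", lane, lane_row, lane_col, lane_col + 1);
    }
    return 0;
}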
@@ -220,11 +226,7 @@ namespace ggml_cuda_mma {
     template <typename T>
     static __device__ __forceinline__ void load_ldmatrix(
             tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) {
-#if defined(AMD_MMA_AVAILABLE)
-        int64_t * xi = (int64_t *) t.x;
-        const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
-        xi[0] = xs[0];
-#elif defined(NEW_MMA_AVAILABLE)
+#if defined(NEW_MMA_AVAILABLE)
         int * xi = (int *) t.x;
         const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2);
         asm volatile("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
@@ -235,23 +237,6 @@ namespace ggml_cuda_mma {
 #endif // NEW_MMA_AVAILABLE
     }
 
-    template <typename T>
-    static __device__ __forceinline__ void load_ldmatrix(
-            tile<32, 4, T> & t, const T * __restrict__ xs0, const int stride) {
-#if defined(AMD_MMA_AVAILABLE)
-        int64_t * xi = (int64_t *) t.x;
-        const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
-        xi[0] = xs[0];
-#elif defined(NEW_MMA_AVAILABLE)
-        GGML_UNUSED(t);
-        GGML_UNUSED(xs0);
-        GGML_UNUSED(stride);
-        NO_DEVICE_CODE;
-#else
-        load_generic(t, xs0, stride);
-#endif // AMD_MMA_AVAILABLE
-    }
-
     template <typename T>
     static __device__ __forceinline__ void load_ldmatrix_trans(
             tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) {
@@ -451,15 +436,23 @@ namespace ggml_cuda_mma {
 
     static __device__ __forceinline__ void mma(
             tile<16, 16, int> & D, const tile<16, 8, int> & A, const tile<16, 8, int> & B) {
-#if defined(AMD_MMA_AVAILABLE)
-#if defined(CDNA3)
+#if defined(AMD_MFMA_AVAILABLE)
         using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
-        int32x4_t * acc = (int32x4_t *) D.x;
-        acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0],
-                                                       ((int64_t *) B.x)[0],
+        int32x4_t * acc = (int32x4_t *) D.x;
+#if defined(CDNA3)
+        acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0],
+                                                       ((int64_t *) B.x)[0],
                                                        acc[0],
                                                        0, 0, 0);
 #elif defined(CDNA2) || defined(CDNA)
+        acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0],
+                                                      B.x[0],
+                                                      acc[0],
+                                                      0, 0, 0);
+        acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[1],
+                                                      B.x[1],
+                                                      acc[0],
+                                                      0, 0, 0);
 #endif
 #else
         GGML_UNUSED(D);
@@ -471,15 +464,23 @@ namespace ggml_cuda_mma {
 
     static __device__ __forceinline__ void mma(
             tile<32, 32, int> & D, const tile<32, 4, int> & A, const tile<32, 4, int> & B) {
-#if defined(AMD_MMA_AVAILABLE)
-#if defined(CDNA3)
+#if defined(AMD_MFMA_AVAILABLE)
         using int32x16_t = __attribute__((__vector_size__(16 * sizeof(int)))) int;
-        int32x16_t * acc = (int32x16_t *) D.x;
-        acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0],
-                                                       ((int64_t *) B.x)[0],
+        int32x16_t * acc = (int32x16_t *) D.x;
+#if defined(CDNA3)
+        acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0],
+                                                       ((int64_t *) B.x)[0],
                                                        acc[0],
                                                        0, 0, 0);
 #elif defined(CDNA2) || defined(CDNA)
+        acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[0],
+                                                     B.x[0],
+                                                     acc[0],
+                                                     0, 0, 0);
+        acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[1],
+                                                     B.x[1],
+                                                     acc[0],
+                                                     0, 0, 0);
 #endif
 #else
         GGML_UNUSED(D);
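As the two hunks above show, the CDNA3 builtins take eight packed int8 values per lane (an int64_t) and cover the full depth in one instruction (K = 32 for the 16x16 tile, K = 16 for the 32x32 tile), while the older CDNA/CDNA2 builtins take only four packed int8 per lane (an int), so the fallback issues the instruction twice and chains the accumulator. A rough host-side sketch of that equivalence follows; it uses plain scalar dot products instead of the real per-lane MFMA layout, and all names in it (dot_i8, the test data) are illustrative, not from the patch.

// Sketch: two half-depth accumulating steps equal one full-depth step,
// which is why the CDNA/CDNA2 path calls the K=16 (or K=8) builtin twice.
#include <cstdint>
#include <cstdio>

static int32_t dot_i8(const int8_t * a, const int8_t * b, int k, int32_t acc) {
    for (int i = 0; i < k; ++i) {
        acc += int32_t(a[i]) * int32_t(b[i]);   // int8 x int8 products, int32 accumulation
    }
    return acc;
}

int main() {
    int8_t a[32], b[32];
    for (int i = 0; i < 32; ++i) { a[i] = int8_t(i - 16); b[i] = int8_t(3 * i % 7 - 3); }

    const int32_t one_step  = dot_i8(a, b, 32, 0);            // single K=32 instruction (CDNA3)
    const int32_t two_steps = dot_i8(a + 16, b + 16, 16,      // second K=16 step ...
                                     dot_i8(a, b, 16, 0));    // ... chained onto the first (CDNA/CDNA2)
    printf("K=32 in one step: %d, in two K=16 steps: %d\n", one_step, two_steps);
    return 0;
}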