Revert "CUDA/HIP: optimize mmv paths taken for HIP devices (ggml-org#14324)"

Nexesenex · Nexesenex · commit 1316a4d3ea7a · 2025-06-24T18:50:07.000+02:00
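This reverts commit 0142961. The revert restricts `bf16_mma_hardware_available` to NVIDIA devices at or above `GGML_CUDA_CC_AMPERE` again, removes `fp32_mma_hardware_available`, and deletes the three `else if (GGML_CUDA_CC_IS_AMD(cc))` branches from `ggml_cuda_should_use_mmv`, so those code paths fall through to the existing `return ne11 <= 8;`.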
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
@@ -267,11 +267,7 @@ static bool fp16_mma_hardware_available(const int cc) {
 }
 
 static bool bf16_mma_hardware_available(const int cc) {
-    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
-}
-
-static bool fp32_mma_hardware_available(const int cc) {
-    return GGML_CUDA_CC_IS_CDNA(cc);
+    return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE;
 }
 
 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
diff --git a/ggml/src/ggml-cuda/mmv.cu b/ggml/src/ggml-cuda/mmv.cu
@@ -456,11 +456,6 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_
                     return ne11 <= 4;
                 }
                 return ne11 <= 3;
-            } else if (GGML_CUDA_CC_IS_AMD(cc)) {
-                if (fp32_mma_hardware_available(cc)) {
-                    return ne11 <= 3;
-                }
-                return ne11 <= 8;
             }
             return ne11 <= 8;
         case GGML_TYPE_F16:
@@ -473,14 +468,6 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_
                     return src0_small && ne11 <= 3;
                 }
                 return ne11 <= 8;
-            } else if (GGML_CUDA_CC_IS_AMD(cc)) {
-                if (fp16_mma_hardware_available(cc)) {
-                    if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
-                        return ne11 <= 5;
-                    }
-                    return ne11 <= 2;
-                }
-                return ne11 <= 8;
             }
             return ne11 <= 8;
         case GGML_TYPE_BF16:
@@ -493,11 +480,6 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_
                     return src0_small && ne11 <= 3;
                 }
                 return ne11 <= 8;
-            } else if (GGML_CUDA_CC_IS_AMD(cc)) {
-                if (bf16_mma_hardware_available(cc)) {
-                    return ne11 <= 3;
-                }
-                return ne11 <= 8;
             }
             return ne11 <= 8;
         default:

Original file line number	Diff line number	Diff line change
`@@ -267,11 +267,7 @@ static bool fp16_mma_hardware_available(const int cc) {`
`267`	`267`	`}`
`268`	`268`
`269`	`269`	`static bool bf16_mma_hardware_available(const int cc) {`
`270`		`- return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;`
`271`		`-}`
`272`		`-`
`273`		`-static bool fp32_mma_hardware_available(const int cc) {`
`274`		`- return GGML_CUDA_CC_IS_CDNA(cc);`
	`270`	`+ return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE;`
`275`	`271`	`}`
`276`	`272`
`277`	`273`	`// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.`