|
71 | 71 | #define GGML_CUDA_CC_QY1 210 |
72 | 72 | #define GGML_CUDA_CC_QY2 220 |
73 | 73 |
|
| 74 | +#ifdef __CUDA_ARCH_LIST__ |
| 75 | +constexpr bool ggml_cuda_has_arch_impl(int) { |
| 76 | + return false; |
| 77 | +} |
| 78 | + |
| 79 | +template<class ... Archs> |
| 80 | +constexpr bool ggml_cuda_has_arch_impl(const int arch, const int first, Archs... rest) { |
| 81 | + return arch == first || ggml_cuda_has_arch_impl(arch, rest...); |
| 82 | +} |
| 83 | + |
| 84 | +constexpr bool ggml_cuda_has_arch(const int arch) { |
| 85 | + return ggml_cuda_has_arch_impl(arch, __CUDA_ARCH_LIST__); |
| 86 | +} |
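Note on the mechanism: __CUDA_ARCH_LIST__ is defined by nvcc (CUDA 11.5 and newer) as a comma-separated list of every architecture the binary is compiled for, so the macro pastes straight into the variadic call above. A minimal sketch of how the recursion unrolls, assuming a hypothetical build for compute 6.1 and 8.6 only, with the definitions above in scope:

    // With -gencode entries for compute_61 and compute_86, nvcc defines
    // __CUDA_ARCH_LIST__ as 610,860, so ggml_cuda_has_arch(arch) becomes
    // ggml_cuda_has_arch_impl(arch, 610, 860), which unrolls to
    //   arch == 610 || (arch == 860 || false)
    // Everything is constexpr, so the result is checkable at compile time:
    static_assert( ggml_cuda_has_arch_impl(860, 610, 860), "860 is in the list");
    static_assert(!ggml_cuda_has_arch_impl(750, 610, 860), "750 is not in the list");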
| 87 | + |
| 88 | +static int ggml_cuda_highest_compiled_arch(const int arch) { |
| 89 | + switch (arch) { |
| 90 | + case 1200: if (ggml_cuda_has_arch(1200)) return 1200; [[fallthrough]]; |
| 91 | + case 1010: if (ggml_cuda_has_arch(1010)) return 1010; [[fallthrough]];
| 92 | + case 1000: if (ggml_cuda_has_arch(1000)) return 1000; [[fallthrough]]; |
| 93 | + case 900: if (ggml_cuda_has_arch( 900)) return 900; [[fallthrough]]; |
| 94 | + case 890: if (ggml_cuda_has_arch( 890)) return 890; [[fallthrough]]; |
| 95 | + case 870: if (ggml_cuda_has_arch( 870)) return 870; [[fallthrough]]; |
| 96 | + case 860: if (ggml_cuda_has_arch( 860)) return 860; [[fallthrough]]; |
| 97 | + case 800: if (ggml_cuda_has_arch( 800)) return 800; [[fallthrough]]; |
| 98 | + case 750: if (ggml_cuda_has_arch( 750)) return 750; [[fallthrough]]; |
| 99 | + case 720: if (ggml_cuda_has_arch( 720)) return 720; [[fallthrough]]; |
| 100 | + case 700: if (ggml_cuda_has_arch( 700)) return 700; [[fallthrough]]; |
| 101 | + case 620: if (ggml_cuda_has_arch( 620)) return 620; [[fallthrough]]; |
| 102 | + case 610: if (ggml_cuda_has_arch( 610)) return 610; [[fallthrough]]; |
| 103 | + case 600: if (ggml_cuda_has_arch( 600)) return 600; [[fallthrough]]; |
| 104 | + case 530: if (ggml_cuda_has_arch( 530)) return 530; [[fallthrough]]; |
| 105 | + case 520: if (ggml_cuda_has_arch( 520)) return 520; [[fallthrough]]; |
| 106 | + case 500: if (ggml_cuda_has_arch( 500)) return 500; [[fallthrough]]; |
| 107 | + case 370: if (ggml_cuda_has_arch( 370)) return 370; [[fallthrough]]; |
| 108 | + case 350: if (ggml_cuda_has_arch( 350)) return 350; [[fallthrough]]; |
| 109 | + case 320: if (ggml_cuda_has_arch( 320)) return 320; [[fallthrough]]; |
| 110 | + case 300: if (ggml_cuda_has_arch( 300)) return 300; [[fallthrough]]; |
| 111 | + case 210: if (ggml_cuda_has_arch( 210)) return 210; [[fallthrough]]; |
| 112 | + case 200: if (ggml_cuda_has_arch( 200)) return 200; [[fallthrough]]; |
| 113 | + case 130: if (ggml_cuda_has_arch( 130)) return 130; [[fallthrough]]; |
| 114 | + case 120: if (ggml_cuda_has_arch( 120)) return 120; [[fallthrough]]; |
| 115 | + case 110: if (ggml_cuda_has_arch( 110)) return 110; [[fallthrough]]; |
| 116 | + case 100: if (ggml_cuda_has_arch( 100)) return 100; |
| 117 | + GGML_ABORT("ggml was not compiled with any CUDA arch <= %d", arch); |
| 118 | + |
| 119 | + default: GGML_ABORT("unknown CUDA arch: %d", arch); |
| 120 | + } |
| 121 | +} |
| 122 | +#else |
| 123 | +static int ggml_cuda_highest_compiled_arch(const int arch) { |
| 124 | + return arch; |
| 125 | +} |
| 126 | +#endif // __CUDA_ARCH_LIST__ |
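A worked example of the fallback, under the assumption that only compute 6.1 and 7.5 were compiled in:

    // Hypothetical walk-through, assuming __CUDA_ARCH_LIST__ = 610,750 and the
    // definitions above in scope. An RTX 4090 reports cc 890: the switch enters
    // at case 890, ggml_cuda_has_arch(890) is false, and control falls through
    // 870, 860, 800 until case 750, where the check finally succeeds.
    #include <cstdio>

    int main() {
        std::printf("%d\n", ggml_cuda_highest_compiled_arch(890)); // prints 750
        return 0;
    }

The default branch only fires for a cc missing from the case list entirely, while the abort after case 100 fires when the device is older than every arch that was compiled.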
| 127 | + |
| 128 | +// --------------------------------------------------------------------------------------------------------- |
| 129 | + |
74 | 130 | #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses |
75 | 131 |
|
76 | 132 | #if defined(_MSC_VER) |
@@ -162,18 +218,18 @@ typedef float2 dfloat2; |
162 | 218 | #define FLASH_ATTN_AVAILABLE |
163 | 219 | #endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1) |
164 | 220 |
|
165 | | -static constexpr bool fast_fp16_available(const int cc) { |
166 | | - return cc >= GGML_CUDA_CC_PASCAL && cc != 610; |
| 221 | +static bool fast_fp16_available(const int cc) { |
| 222 | + return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL && cc != 610; |
167 | 223 | } |
168 | 224 |
|
169 | 225 | // Any FP16 tensor cores are available. |
170 | | -static constexpr bool fp16_mma_available(const int cc) { |
171 | | - return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA; |
| 226 | +static bool fp16_mma_available(const int cc) { |
| 227 | + return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA; |
172 | 228 | } |
173 | 229 |
|
174 | 230 | // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later. |
175 | | -static constexpr bool new_mma_available(const int cc) { |
176 | | - return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING; |
| 231 | +static bool new_mma_available(const int cc) { |
| 232 | + return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING; |
177 | 233 | } |
178 | 234 |
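The constexpr qualifier has to go because the functions now call ggml_cuda_highest_compiled_arch, a plain runtime function: they answer "does the code this binary contains for this GPU support the feature" rather than "does the hardware support it". A hedged illustration, assuming ggml's usual thresholds (PASCAL = 600, VOLTA = 700, TURING = 750) and a build for compute 5.2 only:

    // Hypothetical: __CUDA_ARCH_LIST__ = 520, device is an Ada GPU (cc 890).
    // ggml_cuda_highest_compiled_arch(890) falls back to 520, so:
    //   fast_fp16_available(890) -> 520 >= 600 fails                   -> false
    //   fp16_mma_available(890)  -> 890 < AMD offset, 520 >= 700 fails -> false
    //   new_mma_available(890)   -> 520 >= 750 fails                   -> false
    // The hardware could do all three, but the binary only ships Maxwell
    // code, so the runtime correctly refuses to take the fast paths.

Note that the cc != 610 exclusion in fast_fp16_available still tests the raw device arch rather than the mapped one: a 6.1 (GTX 10-series) GPU has heavily reduced FP16 throughput regardless of which architectures were compiled.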
|
179 | 235 | static constexpr __device__ int ggml_cuda_get_physical_warp_size() { |
|