diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 2ead001e2c610..af78fe4244ac8 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -158,6 +158,8 @@ option(GGML_CUDA                            "ggml: use CUDA"
 option(GGML_MUSA                            "ggml: use MUSA"                                  OFF)
 option(GGML_CUDA_FORCE_MMQ                  "ggml: use mmq kernels instead of cuBLAS"         OFF)
 option(GGML_CUDA_FORCE_CUBLAS               "ggml: always use cuBLAS instead of mmq kernels"  OFF)
+option(GGML_CUDA_NO_TURING_MMA              "ggml: disable the use of mma in mmq kernels"     OFF)
+
 set   (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                             "ggml: max. batch size for using peer access")
 option(GGML_CUDA_NO_PEER_COPY               "ggml: do not use peer to peer copies"            OFF)
diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
index ea824965aae2d..b609ee6e77cad 100644
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -77,6 +77,11 @@ if (CUDAToolkit_FOUND)
         add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
     endif()
 
+    if (GGML_CUDA_NO_TURING_MMA)
+        add_compile_definitions(GGML_CUDA_NO_TURING_MMA)
+        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
+    endif()
+
     if (GGML_CUDA_NO_VMM)
         add_compile_definitions(GGML_CUDA_NO_VMM)
     endif()
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 4e17fd211e1bb..0b37b32411c53 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -71,6 +71,10 @@
 #include <string>
 #include <vector>
 
+#ifdef GGML_CUDA_NO_TURING_MMA
+#define CUBLAS_COMPUTE_16F CUBLAS_COMPUTE_16F_PEDANTIC
+#endif
+
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
 [[noreturn]]
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index 650f7080677ad..9834830a945d9 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -9,6 +9,9 @@
 
 using namespace ggml_cuda_mma;
 
+#ifdef GGML_CUDA_NO_TURING_MMA
+#undef TURING_MMA_AVAILABLE
+#endif
 #define MMQ_DP4A_MAX_BATCH_SIZE 64 // Max. batch size to use for dp4a MMQ kernels when FP16 tensor cores are available.
 #define MMQ_ITER_K 256
 #define MMQ_NWARPS 8
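
The new option follows the usual CMake flow for the other GGML_CUDA_* switches. A minimal sketch of enabling it from the command line (build directory name and extra flags are illustrative; a CUDA toolchain is assumed):

    # configure with CUDA enabled and the Turing MMA path disabled (example invocation)
    cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_NO_TURING_MMA=ON
    cmake --build build --config Release

With the flag set, GGML_CUDA_FORCE_MMQ is also defined at compile time (per the ggml-cuda/CMakeLists.txt hunk above), so the MMQ kernels are used without the Turing mma instructions.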