Commit a5580a3

fix cuda and macos compile issues
1 parent 6463f5c commit a5580a3

File tree

6 files changed: 6 additions & 31 deletions


CMakeLists.txt

Lines changed: 4 additions & 0 deletions
@@ -79,6 +79,8 @@ file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-mma*.cu")
     list(APPEND GGML_SOURCES_CUDA ${SRCS})
     file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/mmq*.cu")
     list(APPEND GGML_SOURCES_CUDA ${SRCS})
+    file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/mmf*.cu")
+    list(APPEND GGML_SOURCES_CUDA ${SRCS})
     set(GGML_V3_CUDA_SOURCES otherarch/ggml_v3-cuda.cu otherarch/ggml_v3-cuda.h)
     set(GGML_V2_CUDA_SOURCES otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h)
     set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)
@@ -176,6 +178,8 @@ if (LLAMA_HIPBLAS)
     list(APPEND GGML_SOURCES_ROCM ${SRCS})
     file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/mmq*.cu")
     list(APPEND GGML_SOURCES_ROCM ${SRCS})
+    file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/mmf*.cu")
+    list(APPEND GGML_SOURCES_ROCM ${SRCS})
     add_compile_definitions(GGML_USE_HIP GGML_USE_CUDA SD_USE_CUDA GGML_HIP_NO_VMM)
     add_library(ggml-rocm ${GGML_SOURCES_CUDA})

Makefile

Lines changed: 1 addition & 0 deletions
@@ -191,6 +191,7 @@ endif
 # it is recommended to use the CMAKE file to build for cublas if you can - will likely work better
 OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-mma*.cu))
 OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/mmq*.cu))
+OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/mmf*.cu))
 OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
 OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
 OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
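
For context on why leaving a glob out breaks the CUDA build: the template-instances directory holds one explicit instantiation per .cu file, so if those files are never compiled, the declarations used elsewhere have no definitions and the link step fails with undefined symbols. A generic C++ sketch of that split-instantiation pattern (the file names and the kernel are illustrative, not the actual ggml sources):

    // kernels.h -- declaration visible everywhere (illustrative)
    template <typename T> void scale_rows(T * data, int n, T factor);
    extern template void scale_rows<float>(float *, int, float);

    // instance_f32.cpp -- the "template instance" translation unit; if this
    // file is left out of the build, uses of scale_rows<float> fail to link
    template <typename T> void scale_rows(T * data, int n, T factor) {
        for (int i = 0; i < n; ++i) data[i] *= factor;
    }
    template void scale_rows<float>(float *, int, float);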

otherarch/ttscpp/src/dia_model.cpp

Lines changed: 1 addition & 8 deletions
@@ -264,11 +264,6 @@ void dia_context::reset() {
 
 struct dia_context * build_new_dia_context(struct dia_model * model, int n_threads, bool use_cpu) {
     dia_context * dctx = new dia_context(model, n_threads);
-    if (!use_cpu) {
-#ifdef GGML_USE_METAL
-        dctx->backend = ggml_backend_metal_init();
-#endif
-    }
     dctx->backend_cpu = ggml_backend_cpu_init();
     dctx->set_threads();
     dctx->build_schedule();
@@ -280,9 +275,7 @@ static bool dia_kv_cache_init(struct dia_kv_cache * cache, dia_model * model, di
     ggml_backend_buffer_type_t buft = nullptr;
     // this will only really support cpu or metal for the time being;
     if (dctx->backend != nullptr) {
-#ifdef GGML_USE_METAL
-        buft = ggml_backend_metal_buffer_type();
-#endif
+
     } else {
         buft = ggml_backend_cpu_buffer_type();
     }
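
The orpheus and parler diffs below make the same change: with the #ifdef GGML_USE_METAL initialization removed, each context's backend pointer is never set on this path, so the KV-cache helpers always fall through to the CPU branch. A minimal C++ sketch of the selection logic the three files converge on (the helper name is illustrative, not part of the project; the ggml-backend calls are the ones visible in the diff):

    // Sketch, not the project's code: with no Metal init, backend stays
    // nullptr and the KV cache is always allocated on CPU buffer types.
    #include "ggml-backend.h"

    static ggml_backend_buffer_type_t pick_kv_buffer_type(ggml_backend_t backend) {
        if (backend != nullptr) {
            // previously: buft = ggml_backend_metal_buffer_type(); (Metal builds only)
            return nullptr; // kept only to mirror the diff's now-empty branch
        }
        return ggml_backend_cpu_buffer_type();
    }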

otherarch/ttscpp/src/orpheus_model.cpp

Lines changed: 0 additions & 8 deletions
@@ -138,11 +138,6 @@ struct ggml_tensor * build_attn_mask(ggml_context * ctx, orpheus_context * octx,
 
 orpheus_context * build_new_orpheus_context(orpheus_model * model, int n_threads, bool use_cpu) {
     orpheus_context * octx = new orpheus_context(model, n_threads);
-    if (!use_cpu) {
-#ifdef GGML_USE_METAL
-        octx->backend = ggml_backend_metal_init();
-#endif
-    }
     octx->backend_cpu = ggml_backend_cpu_init();
     octx->set_threads();
     octx->build_schedule();
@@ -153,9 +148,6 @@ orpheus_context * build_new_orpheus_context(orpheus_model * model, int n_threads
 void orpheus_runner::orpheus_kv_cache_init() {
     ggml_backend_buffer_type_t buft = nullptr;
     if (octx->backend != nullptr) {
-#ifdef GGML_USE_METAL
-        buft = ggml_backend_metal_buffer_type();
-#endif
     } else {
         buft = ggml_backend_cpu_buffer_type();
     }

otherarch/ttscpp/src/parler_model.cpp

Lines changed: 0 additions & 8 deletions
@@ -323,11 +323,6 @@ void parler_context::reset(int32_t n_output_heads) {
 
 struct parler_context * build_new_parler_context(struct parler_tts_model * model, int n_threads, bool use_cpu) {
     parler_context * pctx = new parler_context(model, n_threads);
-    if (!use_cpu) {
-#ifdef GGML_USE_METAL
-        pctx->backend = ggml_backend_metal_init();
-#endif
-    }
     pctx->eos_seen.reserve(model->n_output_heads);
     pctx->backend_cpu = ggml_backend_cpu_init();
     pctx->set_threads();
@@ -343,9 +338,6 @@ static bool parler_kv_cache_init(struct parler_kv_cache * cache, parler_tts_mode
     ggml_backend_buffer_type_t buft = nullptr;
     // this will only really support cpu or metal for the time being;
     if (pctx->backend != nullptr) {
-#ifdef GGML_USE_METAL
-        buft = ggml_backend_metal_buffer_type();
-#endif
     } else {
         buft = ggml_backend_cpu_buffer_type();
     }

otherarch/ttscpp/src/tts_model.cpp

Lines changed: 0 additions & 7 deletions
@@ -50,9 +50,6 @@ void runner_context::set_threads() {
 void runner_context::build_schedule(size_t max_nodes) {
     backend_cpu_buffer = ggml_backend_cpu_buffer_type();
     if (backend != nullptr) {
-#ifdef GGML_USE_METAL
-        backend_buffer = ggml_backend_metal_buffer_type();
-#endif
         std::vector<ggml_backend_buffer_type_t> bufs = {backend_buffer, backend_cpu_buffer};
         std::vector<ggml_backend_t> backs = {backend, backend_cpu};
         sched = ggml_backend_sched_new(backs.data(), bufs.data(), 2, max_nodes, false, false);
@@ -103,10 +100,6 @@ void tts_model::prep_buffers_and_context(bool cpu_only, float size_offset, uint3
         backend = ggml_backend_cpu_init();
         buffer = ggml_backend_cpu_buffer_type();
     } else {
-#ifdef GGML_USE_METAL
-        backend = ggml_backend_metal_init();
-        buffer = ggml_backend_metal_buffer_type();
-#endif
         // if use metal is not installed then we need to warn here
         if (!backend || !buffer) {
             TTS_ABORT("'GGML_USE_METAL' is not defined either set the model to use CPU only or install ggml with metal support.");
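
Taken together, the two hunks above mean the non-CPU path in tts_model.cpp no longer initializes any backend. A sketch of the resulting control flow in prep_buffers_and_context (paraphrased from the diff, not a verbatim copy; it assumes backend and buffer are not assigned anywhere else first):

    // Post-commit flow: only the CPU branch can succeed now.
    if (cpu_only) {
        backend = ggml_backend_cpu_init();
        buffer  = ggml_backend_cpu_buffer_type();
    } else {
        // the #ifdef GGML_USE_METAL block was removed, so backend and
        // buffer stay null here and the existing guard below aborts
        if (!backend || !buffer) {
            TTS_ABORT("'GGML_USE_METAL' is not defined either set the model to use CPU only or install ggml with metal support.");
        }
    }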
