5 changes: 5 additions & 0 deletions common/common.cpp
@@ -1378,6 +1378,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
+    if (arg == "--offload-only-active-experts" || arg == "-ooae") {
+        params.only_active_exps = true;
+        return true;
+    }
     if (arg == "--host") {
         CHECK_ARG
         params.hostname = argv[i];
@@ -2746,6 +2750,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.fused_up_gate = params.fused_up_gate;
     cparams.min_experts = params.min_experts;
     cparams.thresh_experts = params.thresh_experts;
+    cparams.only_active_experts = params.only_active_exps;
 
     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
1 change: 1 addition & 0 deletions common/common.h
@@ -223,6 +223,7 @@ struct gpt_params {
     bool repack_tensors = false; // repack tensors if interleaved variant is available
     bool use_thp = false; // use transparent huge pages (linux only)
     bool validate_quants = false; // if true, check for NaNs while loading the model
+    bool only_active_exps = false; // if true, offload only active experts (relevant only for hybrid CPU/GPU)
 
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V
1 change: 1 addition & 0 deletions ggml/include/ggml-backend.h
@@ -210,6 +210,7 @@ extern "C" {
 
     // enable or disable op offload for a given op
     GGML_API void ggml_backend_sched_set_op_offload(ggml_backend_sched_t sched, enum ggml_op op, bool on_or_off);
+    GGML_API void ggml_backend_sched_set_only_active_experts(ggml_backend_sched_t sched, bool on_or_off);
 
     //
     // Utils
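For context, a minimal sketch (not part of this diff) of how a caller would flip the new scheduler switch; the sched handle is assumed to come from ggml_backend_sched_new(), and the commented line shows the existing per-op setter for comparison:

    // Sketch only, not from this PR: enable only-active-experts offload on an
    // existing scheduler created elsewhere with ggml_backend_sched_new().
    ggml_backend_sched_set_only_active_experts(sched, /*on_or_off=*/true);
    // The per-op setter declared above works the same way, e.g.:
    // ggml_backend_sched_set_op_offload(sched, GGML_OP_MUL_MAT_ID, false);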
2 changes: 1 addition & 1 deletion ggml/src/CMakeLists.txt
@@ -1493,7 +1493,7 @@ add_library(ggml
     ../include/ggml-backend.h
     ggml.c
     ggml-alloc.c
-    ggml-backend.c
+    ggml-backend.cpp
     ggml-quants.c
     ggml-quants.h
     ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
179 changes: 137 additions & 42 deletions ggml/src/ggml-backend.c → ggml/src/ggml-backend.cpp

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions ggml/src/ggml-cuda.cu
@@ -4288,8 +4288,9 @@ GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const
         if (batch_size < min_batch_size) return false;
         int64_t n_experts_tot = op->src[0]->ne[2];
         int64_t n_experts_active = ids->ne[0];
-        //printf("%s(%s): op->ne[2] = %ld, n_experts_tot = %ld, n_experts_active = %ld, ids: %s, %ld x %ld x %ld x %ld\n", __func__, op->name, op->ne[2], n_experts_tot, n_experts_active, ids->name, ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3]);
-        return batch_size*n_experts_active >= min_batch_size*n_experts_tot;
+        bool should_offload = batch_size*n_experts_active >= min_batch_size*n_experts_tot;
+        //printf("%s(%s): op->ne[2] = %ld, n_experts_tot = %ld, n_experts_active = %ld, ids: %s, %ld x %ld x %ld x %ld -> %d (%ld, %ld)\n", __func__, op->name, op->ne[2], n_experts_tot, n_experts_active, ids->name, ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3], should_offload, batch_size*n_experts_active, min_batch_size*n_experts_tot);
+        return should_offload;
     }
 
     return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
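The predicate above scales the usual batch-size threshold by the fraction of experts the batch actually routes to. A worked example with assumed numbers (the real min_batch_size comes from the CUDA backend, not from this diff):

    // Sketch only, with assumed values: min_batch_size = 32, 64 experts total,
    // 8 experts active per token. Offload happens once
    //   batch_size * 8 >= 32 * 64, i.e. batch_size >= 256,
    // so an indirect MoE matmul needs a batch 8x larger than a dense one before
    // copying the expert weights to the GPU pays off.
    #include <cstdint>

    static bool moe_offload_worthwhile(int64_t batch_size, int64_t min_batch_size,
                                       int64_t n_experts_tot, int64_t n_experts_active) {
        return batch_size * n_experts_active >= min_batch_size * n_experts_tot;
    }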
1 change: 1 addition & 0 deletions include/llama.h
@@ -424,6 +424,7 @@ extern "C" {
         bool fused_up_gate; // whether to use fused up/gate op [EXPERIMENTAL]
         int min_experts;
         float thresh_experts;
+        bool only_active_experts;
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
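A minimal sketch (not part of this diff) of enabling the new context parameter through the public API; the model handle is assumed to have been loaded elsewhere, e.g. with llama_load_model_from_file():

    // Sketch only, not from this PR: equivalent of passing -ooae on the command line.
    llama_context_params cparams = llama_context_default_params();
    cparams.only_active_experts = true;
    llama_context * ctx = llama_new_context_with_model(model, cparams);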
6 changes: 6 additions & 0 deletions src/llama.cpp
@@ -18957,6 +18957,7 @@ struct llama_context_params llama_context_default_params() {
         /*.fused_up_gate =*/ true,
         /*.min_experts =*/ -1,
         /*.thtesh_experts =*/ 0.0f,
+        /*.only_active_experts =*/ false,
         /*.abort_callback =*/ nullptr,
         /*.abort_callback_data =*/ nullptr,
         /*.offload_policy =*/ nullptr,
@@ -19548,6 +19549,11 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
+    if (params.only_active_experts) {
+        LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting only active experts offload\n");
+        ggml_backend_sched_set_only_active_experts(ctx->sched, true);
+    }
+
     return ctx;
 }
 