
Commit 3c43f9d

Author: Iwan Kawrakow (committed)
Add a command line argument
1 parent 9ef79d3 commit 3c43f9d

File tree

6 files changed: +22 -3 lines changed

common/common.cpp
common/common.h
ggml/include/ggml-backend.h
ggml/src/ggml-backend.cpp
include/llama.h
src/llama.cpp


common/common.cpp

Lines changed: 5 additions & 0 deletions
@@ -1378,6 +1378,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
+    if (arg == "--offload-only-active-experts" || arg == "-ooae") {
+        params.only_active_exps = true;
+        return true;
+    }
     if (arg == "--host") {
         CHECK_ARG
         params.hostname = argv[i];
@@ -2746,6 +2750,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.fused_up_gate  = params.fused_up_gate;
     cparams.min_experts    = params.min_experts;
     cparams.thresh_experts = params.thresh_experts;
+    cparams.only_active_experts = params.only_active_exps;

     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
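Because the option is parsed by the shared gpt_params handler, any tool built on common.cpp should accept it. A hypothetical invocation (the binary name, model path and other flags are placeholders, not part of this commit; only --offload-only-active-experts / -ooae comes from this change):

    ./llama-server -m ./model.gguf -ngl 99 --offload-only-active-experts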

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -223,6 +223,7 @@ struct gpt_params {
     bool repack_tensors   = false; // repack tensors if interleaved variant is available
     bool use_thp          = false; // use transparent huge pages (linux only)
     bool validate_quants  = false; // if true, check for NaNs while loading the model
+    bool only_active_exps = false; // if true, offload only active experts (relevant only for hybrid CPU/GPU)

     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V

ggml/include/ggml-backend.h

Lines changed: 1 addition & 0 deletions
@@ -210,6 +210,7 @@ extern "C" {

     // enable or disable op offload for a given op
     GGML_API void ggml_backend_sched_set_op_offload(ggml_backend_sched_t sched, enum ggml_op op, bool on_or_off);
+    GGML_API void ggml_backend_sched_set_only_active_experts(ggml_backend_sched_t sched, bool on_or_off);

     //
     // Utils
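For code that drives the scheduler directly, the new setter sits next to the existing per-op offload switch. A minimal sketch, assuming a scheduler `sched` created elsewhere (its construction is not part of this commit):

    #include "ggml-backend.h"

    // Hypothetical helper: opt in to copying only the experts that are
    // actually used by the current batch when expert weights live in host memory.
    static void enable_active_expert_offload(ggml_backend_sched_t sched) {
        ggml_backend_sched_set_only_active_experts(sched, /*on_or_off=*/true);
    }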

ggml/src/ggml-backend.cpp

Lines changed: 8 additions & 3 deletions
@@ -1160,6 +1160,7 @@ struct ggml_backend_sched {

     uint32_t op_offload[(GGML_OP_COUNT + 31)/32];

+    bool only_active_experts;
     bool debug;
 };

@@ -1180,6 +1181,11 @@ void ggml_backend_sched_set_op_offload(ggml_backend_sched_t sched, enum ggml_op
     }
 }

+void ggml_backend_sched_set_only_active_experts(ggml_backend_sched_t sched, bool on_or_off) {
+    if (!sched) return;
+    sched->only_active_experts = on_or_off;
+}
+
 static inline bool ggml_backend_sched_offload_enabled(ggml_backend_sched_t sched, enum ggml_op op) {
     int int_op = (int)op;
     if (!sched || op < 0 || op >= GGML_OP_COUNT) return false;
@@ -1889,9 +1895,9 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
         } else {
             ggml_backend_synchronize(split_backend);
         }
-#if 1
+
         ggml_tensor * node = split->graph.nodes[0];
-        if (split->graph.n_nodes > 0 &&
+        if (sched->only_active_experts && split->graph.n_nodes > 0 &&
             ggml_backend_buffer_get_usage(input->buffer) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS &&
             ggml_backend_buffer_is_host(input->buffer) &&
             node->src[cur_arg] == input_cpy &&
@@ -1954,7 +1960,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
             copy_experts(first_id, last_id);
             if (node->op == GGML_OP_MOE_FUSED_UP_GATE) ++cur_arg;
         } else
-#endif
         // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
         // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
         if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
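The last two hunks replace the compile-time `#if 1 ... #endif` guard with the run-time `sched->only_active_experts` flag: the path that copies only the expert slices referenced by the current split (via `copy_experts`) is now taken only when the new option is enabled, and otherwise the scheduler falls back to the usual async/sync tensor copy.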

include/llama.h

Lines changed: 1 addition & 0 deletions
@@ -424,6 +424,7 @@ extern "C" {
         bool  fused_up_gate;   // whether to use fused up/gate op [EXPERIMENTAL]
         int   min_experts;
         float thresh_experts;
+        bool  only_active_experts;

         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
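Programs that use the C API directly, rather than the command-line flag, can set the new context parameter before creating the context. A minimal sketch with the model path as a placeholder (error handling, sampling, and llama_backend_init() are omitted for brevity; only the only_active_experts field comes from this commit):

    #include "llama.h"

    int main() {
        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (!model) return 1;

        llama_context_params cparams = llama_context_default_params();
        cparams.only_active_experts = true;  // new field added by this commit

        llama_context * ctx = llama_new_context_with_model(model, cparams);
        if (!ctx) { llama_free_model(model); return 1; }

        // ... run inference with ctx ...

        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }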

src/llama.cpp

Lines changed: 6 additions & 0 deletions
@@ -18957,6 +18957,7 @@ struct llama_context_params llama_context_default_params() {
         /*.fused_up_gate        =*/ true,
         /*.min_experts          =*/ -1,
         /*.thtesh_experts       =*/ 0.0f,
+        /*.only_active_experts  =*/ false,
         /*.abort_callback       =*/ nullptr,
         /*.abort_callback_data  =*/ nullptr,
         /*.offload_policy       =*/ nullptr,
@@ -19548,6 +19549,11 @@ struct llama_context * llama_new_context_with_model(
         }
     }

+    if (params.only_active_experts) {
+        LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting only active experts offload\n");
+        ggml_backend_sched_set_only_active_experts(ctx->sched, true);
+    }
+
     return ctx;
 }
