From 160fe837dc3b13d80fc7a180247e66c85e7c6f4a Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Sat, 16 Aug 2025 13:06:17 +0300 Subject: [PATCH 1/5] Offload only activated experts --- ggml/src/CMakeLists.txt | 2 +- ggml/src/{ggml-backend.c => ggml-backend.cpp} | 166 +++++++++++++----- ggml/src/ggml-cuda.cu | 5 +- 3 files changed, 128 insertions(+), 45 deletions(-) rename ggml/src/{ggml-backend.c => ggml-backend.cpp} (93%) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 5924805bd..1c9fcc248 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -1493,7 +1493,7 @@ add_library(ggml ../include/ggml-backend.h ggml.c ggml-alloc.c - ggml-backend.c + ggml-backend.cpp ggml-quants.c ggml-quants.h ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.cpp similarity index 93% rename from ggml/src/ggml-backend.c rename to ggml/src/ggml-backend.cpp index 07b879f12..769ed5d52 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.cpp @@ -3,12 +3,14 @@ #include "ggml-impl.h" #include "ggml-rpc.h" -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include #define IK_PRINT_TIMING 0 @@ -60,9 +62,7 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init( struct ggml_backend_buffer_i iface, ggml_backend_buffer_context_t context, size_t size) { - ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer)); - - (*buffer) = (struct ggml_backend_buffer) { + ggml_backend_buffer_t buffer = new ggml_backend_buffer { /* .interface = */ iface, /* .buft = */ buft, /* .context = */ context, @@ -442,6 +442,29 @@ static size_t ggml_backend_registry_count = 0; GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data); +#ifdef GGML_USE_CUDA +extern "C" GGML_CALL void ggml_backend_cuda_reg_devices(void); +#endif +#ifdef GGML_USE_SYCL +extern "C" void ggml_backend_sycl_reg_devices(void); +#endif +#ifdef GGML_USE_METAL +extern "C" GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); +extern "C" GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); +#endif +#ifdef GGML_USE_VULKAN +extern "C" GGML_CALL int ggml_backend_vk_reg_devices(void); +#endif +#ifdef GGML_USE_KOMPUTE +extern "C" GGML_CALL void ggml_backend_kompute_reg_devices(void); +#endif +#ifdef GGML_USE_CANN +extern "C" GGML_CALL int ggml_backend_cann_reg_devices(void); +#endif +#ifdef GGML_USE_RPC +extern "C" GGML_CALL void ggml_backend_rpc_reg_devices(void); +#endif + GGML_CALL static void ggml_backend_registry_init(void) { static bool initialized = false; @@ -455,37 +478,29 @@ GGML_CALL static void ggml_backend_registry_init(void) { // add forward decls here to avoid including the backend headers #ifdef GGML_USE_CUDA - extern GGML_CALL void ggml_backend_cuda_reg_devices(void); ggml_backend_cuda_reg_devices(); #endif #ifdef GGML_USE_SYCL - extern void ggml_backend_sycl_reg_devices(void); ggml_backend_sycl_reg_devices(); #endif #ifdef GGML_USE_METAL - extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); - extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL); #endif #ifdef GGML_USE_VULKAN - extern GGML_CALL int ggml_backend_vk_reg_devices(void); ggml_backend_vk_reg_devices(); #endif #ifdef GGML_USE_KOMPUTE - 
extern GGML_CALL void ggml_backend_kompute_reg_devices(void); ggml_backend_kompute_reg_devices(); #endif #ifdef GGML_USE_CANN - extern GGML_CALL int ggml_backend_cann_reg_devices(void); ggml_backend_cann_reg_devices(); #endif #ifdef GGML_USE_RPC - extern GGML_CALL void ggml_backend_rpc_reg_devices(void); ggml_backend_rpc_reg_devices(); #endif } @@ -495,11 +510,11 @@ GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn ini size_t id = ggml_backend_registry_count; - ggml_backend_registry[id] = (struct ggml_backend_reg) { + ggml_backend_registry[id] = ggml_backend_reg { /* .name = */ {0}, /* .fn = */ init_fn, /* .default_buffer_type = */ default_buffer_type, - /* .user_data = */ user_data, + /* .user_data = */ user_data }; snprintf(ggml_backend_registry[id].name, sizeof(ggml_backend_registry[id].name), "%s", name); @@ -804,13 +819,13 @@ struct ggml_backend_plan_cpu { GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; - struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); + struct ggml_backend_plan_cpu * cpu_plan = (ggml_backend_plan_cpu *)malloc(sizeof(struct ggml_backend_plan_cpu)); cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); cpu_plan->cgraph = *cgraph; // FIXME: deep copy if (cpu_plan->cplan.work_size > 0) { - cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size); + cpu_plan->cplan.work_data = (uint8_t *)malloc(cpu_plan->cplan.work_size); if (cpu_plan->cplan.work_data == NULL) { free(cpu_plan); return NULL; @@ -854,7 +869,7 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t } cpu_ctx->work_size = cplan.work_size; } - cplan.work_data = cpu_ctx->work_data; + cplan.work_data = (uint8_t *)cpu_ctx->work_data; cplan.abort_callback = cpu_ctx->abort_callback; cplan.abort_callback_data = cpu_ctx->abort_callback_data; @@ -915,7 +930,7 @@ static ggml_guid_t ggml_backend_cpu_guid(void) { } ggml_backend_t ggml_backend_cpu_init(void) { - struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context)); + struct ggml_backend_cpu_context * ctx = (ggml_backend_cpu_context *)malloc(sizeof(struct ggml_backend_cpu_context)); if (ctx == NULL) { return NULL; } @@ -926,13 +941,13 @@ ggml_backend_t ggml_backend_cpu_init(void) { ctx->abort_callback = NULL; ctx->abort_callback_data = NULL; - ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend)); + ggml_backend_t cpu_backend = (ggml_backend_t)malloc(sizeof(struct ggml_backend)); if (cpu_backend == NULL) { free(ctx); return NULL; } - *cpu_backend = (struct ggml_backend) { + *cpu_backend = ggml_backend { /* .guid = */ ggml_backend_cpu_guid(), /* .interface = */ cpu_backend_i, /* .context = */ ctx @@ -1630,7 +1645,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg i_split++; if (i_split >= sched->splits_capacity) { sched->splits_capacity *= 2; - sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split)); + sched->splits = (ggml_backend_sched_split *)realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split)); GGML_ASSERT(sched->splits != NULL); } GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS); @@ -1720,8 +1735,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg int graph_size = graph->n_nodes + 
sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2; if (sched->graph.size < graph_size) { sched->graph.size = graph_size; - sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *)); - sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *)); + sched->graph.nodes = (ggml_tensor **)realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *)); + sched->graph.leafs = (ggml_tensor **)realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *)); GGML_ASSERT(sched->graph.nodes != NULL); GGML_ASSERT(sched->graph.leafs != NULL); } @@ -1844,6 +1859,10 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s int split_backend_id = split->backend_id; ggml_backend_t split_backend = sched->backends[split_backend_id]; + //printf("Graph split %d has %d inputs:\n", i, split->n_inputs); + //for (int j = 0; j < split->n_inputs; j++) printf(" %s, %s\n", split->inputs[j]->name, + // split->inputs[j]->src[0] ? split->inputs[j]->src[0]->name : "none"); + // copy the input tensors to the split backend for (int j = 0; j < split->n_inputs; j++) { ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]); @@ -1865,6 +1884,69 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } else { ggml_backend_synchronize(split_backend); } +#if 1 + ggml_tensor * node = split->graph.nodes[0]; + if (split->graph.n_nodes > 0 && + ggml_backend_buffer_get_usage(input->buffer) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS && + ggml_backend_buffer_is_host(input->buffer) && + node->src[0] == input_cpy && + (node->op == GGML_OP_MUL_MAT_ID || node->op == GGML_OP_MOE_FUSED_UP_GATE)) { + + ggml_backend_synchronize(input_backend); + + // find the ids + ggml_tensor * ids_tensor = node->op == GGML_OP_MUL_MAT_ID ? node->src[2] : node->src[3]; + std::vector<int32_t> ids(ggml_nbytes(ids_tensor) / sizeof(int32_t)); + ggml_backend_tensor_get_async(split_backend, ids_tensor, ids.data(), 0, ggml_nbytes(ids_tensor)); + + ggml_backend_synchronize(split_backend); + + std::set<int32_t> unique_ids; + for (int64_t i1 = 0; i1 < ids_tensor->ne[1]; i1++) { + for (int64_t i0 = 0; i0 < ids_tensor->ne[0]; i0++) { + int32_t id = ids[i1 * ids_tensor->nb[1]/sizeof(int32_t) + i0 * ids_tensor->nb[0]/sizeof(int32_t)]; + unique_ids.insert(id); + } + } + + // group consecutive experts and copy them together + GGML_ASSERT(!unique_ids.empty()); + //printf("Offloading %ld out of %ld experts\n", unique_ids.size(), node->src[0]->ne[2]); + + auto it = unique_ids.begin(); + int32_t first_id = *it; + int32_t last_id = first_id; + + auto copy_experts = [&](int32_t first_id, int32_t last_id) { + const size_t expert_size = node->op == GGML_OP_MUL_MAT_ID ? input->nb[2] : input->nb[1]; + const size_t expert_offset = first_id * expert_size; + const size_t expert_size_copy = (last_id - first_id + 1) * expert_size; + const size_t padding = 512; + const size_t padding_end = last_id < input->ne[2] - 1 ?
std::min(expert_size, padding) : 0; + + ggml_backend_tensor_set_async(split_backend, + input_cpy, + (const uint8_t *)input->data + expert_offset, expert_offset, + // copy a bit extra to ensure there are no NaNs in the padding + expert_size_copy + padding_end); + }; + + for (++it; it != unique_ids.end(); ++it) { + const int32_t id = *it; + + if (id == last_id + 1) { + last_id = id; + continue; + } + + copy_experts(first_id, last_id); + + first_id = id; + last_id = id; + } + copy_experts(first_id, last_id); + } else +#endif // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) { @@ -1950,7 +2032,7 @@ ggml_backend_sched_t ggml_backend_sched_new( GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU - struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched)); + struct ggml_backend_sched * sched = (ggml_backend_sched *)calloc(1, sizeof(struct ggml_backend_sched)); for (int i = 0; i < (GGML_OP_COUNT + 31)/32; ++i) sched->op_offload[i] = 0xffffffff; @@ -1961,20 +2043,20 @@ ggml_backend_sched_t ggml_backend_sched_new( // initialize hash table // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead) sched->hash_set = ggml_hash_set_new(graph_size); - sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0])); - sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *)); + sched->hv_tensor_backend_ids = (int *)malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0])); + sched->hv_tensor_copies = (ggml_tensor **)malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *)); const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2; - sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0])); - sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0])); - sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0])); - sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0])); + sched->node_backend_ids = (int *)calloc(nodes_size, sizeof(sched->node_backend_ids[0])); + sched->leaf_backend_ids = (int *)calloc(nodes_size, sizeof(sched->leaf_backend_ids[0])); + sched->prev_node_backend_ids = (int *)calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0])); + sched->prev_leaf_backend_ids = (int *)calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0])); sched->context_buffer_size = GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false); - sched->context_buffer = malloc(sched->context_buffer_size); + sched->context_buffer = (char *)malloc(sched->context_buffer_size); const int initial_splits_capacity = 16; - sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0])); + sched->splits = (ggml_backend_sched_split *)calloc(initial_splits_capacity, sizeof(sched->splits[0])); sched->splits_capacity = initial_splits_capacity; for 
(int b = 0; b < n_backends; b++) { @@ -2219,8 +2301,8 @@ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) { struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size); - struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT - bool * node_init = calloc(hash_set.size, sizeof(node_init[0])); + struct ggml_tensor ** node_copies = (ggml_tensor **)calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT + bool * node_init = (bool *)calloc(hash_set.size, sizeof(node_init[0])); struct ggml_init_params params = { /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false), @@ -2238,7 +2320,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s free(node_init); ggml_free(ctx_allocated); ggml_free(ctx_unallocated); - return (struct ggml_backend_graph_copy) { + return { /* .buffer = */ NULL, /* .ctx_allocated = */ NULL, /* .ctx_unallocated = */ NULL, @@ -2261,7 +2343,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s free(node_init); ggml_free(ctx_allocated); ggml_free(ctx_unallocated); - return (struct ggml_backend_graph_copy) { + return { /* .buffer = */ NULL, /* .ctx_allocated = */ NULL, /* .ctx_unallocated = */ NULL, @@ -2290,7 +2372,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s free(node_copies); free(node_init); - return (struct ggml_backend_graph_copy) { + return { /* .buffer = */ buffer, /* .ctx_allocated = */ ctx_allocated, /* .ctx_unallocated = */ ctx_unallocated, diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index ac8246b92..5272a38bf 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -4288,8 +4288,9 @@ GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const if (batch_size < min_batch_size) return false; int64_t n_experts_tot = op->src[0]->ne[2]; int64_t n_experts_active = ids->ne[0]; - //printf("%s(%s): op->ne[2] = %ld, n_experts_tot = %ld, n_experts_active = %ld, ids: %s, %ld x %ld x %ld x %ld\n", __func__, op->name, op->ne[2], n_experts_tot, n_experts_active, ids->name, ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3]); - return batch_size*n_experts_active >= min_batch_size*n_experts_tot; + bool should_offload = batch_size*n_experts_active >= min_batch_size*n_experts_tot; + //printf("%s(%s): op->ne[2] = %ld, n_experts_tot = %ld, n_experts_active = %ld, ids: %s, %ld x %ld x %ld x %ld -> %d (%ld, %ld)\n", __func__, op->name, op->ne[2], n_experts_tot, n_experts_active, ids->name, ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3], should_offload, batch_size*n_experts_active, min_batch_size*n_experts_tot); + return should_offload; } return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS; From 5951266711bdc5b26f771664b6b764cc98abaf36 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Sat, 16 Aug 2025 13:54:31 +0300 Subject: [PATCH 2/5] This seems to do the trick for -fmoe --- ggml/src/ggml-backend.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 769ed5d52..1c3eea261 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1859,6 +1859,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s int split_backend_id = split->backend_id; ggml_backend_t split_backend 
= sched->backends[split_backend_id]; + int cur_arg = 0; + //printf("Graph split %d has %d inputs:\n", i, split->n_inputs); //for (int j = 0; j < split->n_inputs; j++) printf(" %s, %s\n", split->inputs[j]->name, // split->inputs[j]->src[0] ? split->inputs[j]->src[0]->name : "none"); @@ -1889,7 +1891,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s if (split->graph.n_nodes > 0 && ggml_backend_buffer_get_usage(input->buffer) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS && ggml_backend_buffer_is_host(input->buffer) && - node->src[0] == input_cpy && + node->src[cur_arg] == input_cpy && (node->op == GGML_OP_MUL_MAT_ID || node->op == GGML_OP_MOE_FUSED_UP_GATE)) { ggml_backend_synchronize(input_backend); @@ -1918,7 +1920,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s int32_t last_id = first_id; auto copy_experts = [&](int32_t first_id, int32_t last_id) { - const size_t expert_size = node->op == GGML_OP_MUL_MAT_ID ? input->nb[2] : input->nb[1]; + const size_t expert_size = (node->op == GGML_OP_MUL_MAT_ID || node->op == GGML_OP_MOE_FUSED_UP_GATE) ? input->nb[2] : input->nb[1]; const size_t expert_offset = first_id * expert_size; const size_t expert_size_copy = (last_id - first_id + 1) * expert_size; const size_t padding = 512; @@ -1929,6 +1931,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s (const uint8_t *)input->data + expert_offset, expert_offset, // copy a bit extra to ensure there are no NaNs in the padding expert_size_copy + padding_end); + }; for (++it; it != unique_ids.end(); ++it) { @@ -1945,6 +1948,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s last_id = id; } copy_experts(first_id, last_id); + if (node->op == GGML_OP_MOE_FUSED_UP_GATE) ++cur_arg; } else #endif // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events From 15911fa35cf747cfc8e0e551c0b29bd98f36ad30 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Sat, 16 Aug 2025 14:05:15 +0300 Subject: [PATCH 3/5] Do not recalculate activated experts for fused up/gate --- ggml/src/ggml-backend.cpp | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 1c3eea261..da5b832af 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1860,6 +1860,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s ggml_backend_t split_backend = sched->backends[split_backend_id]; int cur_arg = 0; + std::vector<int32_t> ids; + std::set<int32_t> unique_ids; //printf("Graph split %d has %d inputs:\n", i, split->n_inputs); //for (int j = 0; j < split->n_inputs; j++) printf(" %s, %s\n", split->inputs[j]->name, // split->inputs[j]->src[0] ? split->inputs[j]->src[0]->name : "none"); @@ -1894,26 +1896,27 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s (node->op == GGML_OP_MUL_MAT_ID || node->op == GGML_OP_MOE_FUSED_UP_GATE)) { - ggml_backend_synchronize(input_backend); + if (ids.empty()) { + // find the ids + ggml_tensor * ids_tensor = node->op == GGML_OP_MUL_MAT_ID ?
node->src[2] : node->src[3]; - std::vector ids(ggml_nbytes(ids_tensor) / sizeof(int32_t)); - ggml_backend_tensor_get_async(split_backend, ids_tensor, ids.data(), 0, ggml_nbytes(ids_tensor)); + ggml_backend_tensor_get_async(split_backend, ids_tensor, ids.data(), 0, ggml_nbytes(ids_tensor)); - ggml_backend_synchronize(split_backend); + ggml_backend_synchronize(split_backend); - std::set unique_ids; - for (int64_t i1 = 0; i1 < ids_tensor->ne[1]; i1++) { - for (int64_t i0 = 0; i0 < ids_tensor->ne[0]; i0++) { - int32_t id = ids[i1 * ids_tensor->nb[1]/sizeof(int32_t) + i0 * ids_tensor->nb[0]/sizeof(int32_t)]; - unique_ids.insert(id); + for (int64_t i1 = 0; i1 < ids_tensor->ne[1]; i1++) { + for (int64_t i0 = 0; i0 < ids_tensor->ne[0]; i0++) { + int32_t id = ids[i1 * ids_tensor->nb[1]/sizeof(int32_t) + i0 * ids_tensor->nb[0]/sizeof(int32_t)]; + unique_ids.insert(id); + } } - } - // group consecutive experts and copy them together - GGML_ASSERT(!unique_ids.empty()); - //printf("Offloading %ld out of %ld experts\n", unique_ids.size(), node->src[0]->ne[2]); + // group consecutive experts and copy them together + GGML_ASSERT(!unique_ids.empty()); + + } auto it = unique_ids.begin(); int32_t first_id = *it; From 9ef79d3073cfcb4262924a2eb685adca3187929f Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Sat, 16 Aug 2025 17:48:52 +0300 Subject: [PATCH 4/5] Log out of bounds access details --- ggml/src/ggml-backend.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index da5b832af..51c892fc8 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -200,6 +200,7 @@ size_t ggml_backend_get_max_size(ggml_backend_t backend) { void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + if (offset + size > ggml_nbytes(tensor)) fprintf(stderr, "%s(%s): offset = %zu, size = %zu, nbytes = %zu\n", __func__, tensor->name, offset, size, ggml_nbytes(tensor)); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); if (backend->iface.set_tensor_async == NULL) { From 3c43f9dc7dbcff20d4ec64b028dcb373adcca63f Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 2 Sep 2025 18:58:34 +0300 Subject: [PATCH 5/5] Add a command line argument --- common/common.cpp | 5 +++++ common/common.h | 1 + ggml/include/ggml-backend.h | 1 + ggml/src/ggml-backend.cpp | 11 ++++++++--- include/llama.h | 1 + src/llama.cpp | 6 ++++++ 6 files changed, 22 insertions(+), 3 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index dd45f83c4..024228a9c 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1378,6 +1378,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } return true; } + if (arg == "--offload-only-active-experts" || arg == "-ooae") { + params.only_active_exps = true; + return true; + } if (arg == "--host") { CHECK_ARG params.hostname = argv[i]; @@ -2746,6 +2750,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.fused_up_gate = params.fused_up_gate; cparams.min_experts = params.min_experts; cparams.thresh_experts = params.thresh_experts; + cparams.only_active_experts = params.only_active_exps; cparams.type_k = kv_cache_type_from_str(params.cache_type_k); cparams.type_v = kv_cache_type_from_str(params.cache_type_v); diff --git a/common/common.h b/common/common.h index 5ba6ad609..38027ca27 100644 --- 
a/common/common.h +++ b/common/common.h @@ -223,6 +223,7 @@ struct gpt_params { bool repack_tensors = false; // repack tensors if interleaved variant is available bool use_thp = false; // use transparent huge pages (linux only) bool validate_quants = false; // if true, check for NaNs while loading the model + bool only_active_exps = false; // if true, offload only active experts (relevant only for hybrid CPU/GPU) std::string cache_type_k = "f16"; // KV cache data type for the K std::string cache_type_v = "f16"; // KV cache data type for the V diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 1110ff3aa..6c843fa81 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -210,6 +210,7 @@ extern "C" { // enable or disable op offload for a given op GGML_API void ggml_backend_sched_set_op_offload(ggml_backend_sched_t sched, enum ggml_op op, bool on_or_off); + GGML_API void ggml_backend_sched_set_only_active_experts(ggml_backend_sched_t sched, bool on_or_off); // // Utils diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 51c892fc8..4e9ef4739 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1160,6 +1160,7 @@ struct ggml_backend_sched { uint32_t op_offload[(GGML_OP_COUNT + 31)/32]; + bool only_active_experts; bool debug; }; @@ -1180,6 +1181,11 @@ void ggml_backend_sched_set_op_offload(ggml_backend_sched_t sched, enum ggml_op } } +void ggml_backend_sched_set_only_active_experts(ggml_backend_sched_t sched, bool on_or_off) { + if (!sched) return; + sched->only_active_experts = on_or_off; +} + static inline bool ggml_backend_sched_offload_enabled(ggml_backend_sched_t sched, enum ggml_op op) { int int_op = (int)op; if (!sched || op < 0 || op >= GGML_OP_COUNT) return false; @@ -1889,9 +1895,9 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } else { ggml_backend_synchronize(split_backend); } -#if 1 + ggml_tensor * node = split->graph.nodes[0]; - if (split->graph.n_nodes > 0 && + if (sched->only_active_experts && split->graph.n_nodes > 0 && ggml_backend_buffer_get_usage(input->buffer) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS && ggml_backend_buffer_is_host(input->buffer) && node->src[cur_arg] == input_cpy && @@ -1954,7 +1960,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s copy_experts(first_id, last_id); if (node->op == GGML_OP_MOE_FUSED_UP_GATE) ++cur_arg; } else -#endif // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) { diff --git a/include/llama.h b/include/llama.h index 27b0298f9..28b54392c 100644 --- a/include/llama.h +++ b/include/llama.h @@ -424,6 +424,7 @@ extern "C" { bool fused_up_gate; // whether to use fused up/gate op [EXPERIMENTAL] int min_experts; float thresh_experts; + bool only_active_experts; // Abort callback // if it returns true, execution of llama_decode() will be aborted diff --git a/src/llama.cpp b/src/llama.cpp index d793d0062..8154bce85 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -18957,6 +18957,7 @@ struct llama_context_params llama_context_default_params() { /*.fused_up_gate =*/ true, /*.min_experts =*/ -1, /*.thtesh_experts =*/ 
0.0f, + /*.only_active_experts =*/ false, /*.abort_callback =*/ nullptr, /*.abort_callback_data =*/ nullptr, /*.offload_policy =*/ nullptr, @@ -19548,6 +19549,11 @@ struct llama_context * llama_new_context_with_model( } } + if (params.only_active_experts) { + LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting only active experts offload\n"); + ggml_backend_sched_set_only_active_experts(ctx->sched, true); + } + return ctx; }
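
As an aside for readers of the scheduler change above: the core trick in ggml_backend_sched_compute_splits is to read the routing ids tensor, collect the set of activated experts, group consecutive ids into runs, and issue one ggml_backend_tensor_set_async per run instead of copying the whole expert tensor to the GPU. The sketch below is a simplified, standalone illustration of that grouping, not code taken from the patches; for_each_expert_run, copy_range, and the 1 MiB expert_size are made-up stand-ins for the copy_experts lambda and the real per-expert stride (input->nb[2]).

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <set>

// Group the sorted set of activated expert ids into runs of consecutive ids and
// invoke copy_range(first, last) once per run, mirroring the copy_experts lambda
// in ggml_backend_sched_compute_splits.
static void for_each_expert_run(const std::set<int32_t> & unique_ids,
                                const std::function<void(int32_t, int32_t)> & copy_range) {
    if (unique_ids.empty()) return;
    auto it = unique_ids.begin();
    int32_t first_id = *it;
    int32_t last_id  = first_id;
    for (++it; it != unique_ids.end(); ++it) {
        if (*it == last_id + 1) { last_id = *it; continue; } // extend the current consecutive run
        copy_range(first_id, last_id);                       // flush the finished run
        first_id = last_id = *it;
    }
    copy_range(first_id, last_id);                           // flush the final run
}

int main() {
    // Hypothetical routing result: experts 3,4,5, 9 and 12,13 were activated for this batch.
    const std::set<int32_t> unique_ids = {3, 4, 5, 9, 12, 13};
    const size_t expert_size = 1u << 20; // assumed per-expert slice size in bytes (input->nb[2] in the patch)
    for_each_expert_run(unique_ids, [&](int32_t first, int32_t last) {
        const size_t offset = (size_t)first * expert_size;
        const size_t size   = (size_t)(last - first + 1) * expert_size;
        // The patch issues ggml_backend_tensor_set_async(split_backend, input_cpy,
        // (const uint8_t *)input->data + offset, offset, size [+ padding]) at this point.
        std::printf("copy experts [%d..%d]: offset = %zu, size = %zu\n", (int)first, (int)last, offset, size);
    });
    return 0;
}

With the assumed ids it prints three copy ranges ([3..5], [9..9] and [12..13]), which is how the series cuts host-to-device traffic when only a few experts are active; in an actual run the behaviour is opt-in via the --offload-only-active-experts / -ooae flag added in the last patch.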