Skip to content

Commit 7fd47f1

Browse files
author
Judd
committed
MoE CPU offloading
1 parent d0431e2 commit 7fd47f1

File tree

10 files changed

+110
-38
lines changed

10 files changed

+110
-38
lines changed

README.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ pure C++ implementation based on [@ggerganov](https://github.com/ggerganov)'s [g
1313

1414
**What's New:**
1515

16-
* 2024-07-14: [ggml updated](https://github.com/ggml-org/llama.cpp/tree/0f2bbe656473177538956d22b6842bcaa0449fab) again
16+
* 2025-02-19: MoE CPU offloading
17+
* 2025-02-17: [ggml updated](https://github.com/ggml-org/llama.cpp/tree/0f2bbe656473177538956d22b6842bcaa0449fab) again
1718
* 2025-02-10: [GPU acceleration](./docs/gpu.md) 🔥
1819
* 2025-01-25: MiniCPM Embedding & ReRanker
1920
* 2025-01-21: DeepSeek-R1-Distill-Llama & Qwen
@@ -32,7 +33,7 @@ pure C++ implementation based on [@ggerganov](https://github.com/ggerganov)'s [g
3233

3334
## Features
3435

35-
* [x] Accelerated memory-efficient CPU inference with int4/int8 quantization, optimized KV cache and parallel computing;
36+
* [x] Accelerated memory-efficient CPU/GPU inference with int4/int8 quantization, optimized KV cache and parallel computing;
3637
* [x] Use OOP to address the similarities between different _Transformer_ based models;
3738
* [x] Streaming generation with typewriter effect;
3839
* [x] Continuous chatting (content length is virtually unlimited)

README_ja.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
## 特徴
1414

15-
* [x] int4/int8 量子化、最適化された KV キャッシュ、並列計算によるメモリ効率の高い CPU 推論の加速
15+
* [x] int4/int8 量子化、最適化された KV キャッシュ、並列計算によるメモリ効率の高い CPU/GPU 推論の加速
1616
* [x] OOP を使用して、異なる _Transformer_ ベースのモデル間の類似性に対処
1717
* [x] タイプライター効果を伴うストリーミング生成
1818
* [x] 継続的なチャット(コンテンツの長さは事実上無制限)

README_zh.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
## 特点
1414

15-
- [x] 内存高效、加速 CPU 推理:使用 int4/int8 量化、优化的 KV 缓存和并行计算。
15+
- [x] 内存高效、加速 CPU/GPU 推理:使用 int4/int8 量化、优化的 KV 缓存和并行计算。
1616
- [x] 面向对象编程:关注基于 Transformer 的模型之间的相似性。
1717
- [x] 流式生成:打字机效果。
1818
- [x] 连续聊天:内容长度几乎无限。

src/backend.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ namespace chatllm
119119
total[usage] += size;
120120
ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(get_allocator(usage), size);
121121

122-
CHATLLM_CHECK(buf) << __FUNCTION__ << "() failed to allocate buffer";
122+
CHATLLM_CHECK(buf) << __FUNCTION__ << "() failed to allocate buffer of size " << size;
123123

124124
auto r = new BackendBuffer(buf);
125125
buffers.emplace_back(r);
@@ -261,6 +261,11 @@ namespace chatllm
261261
alloc_of_tensor.insert_or_assign(tensor, allocator);
262262
}
263263

264+
void LayerAllocatorManager::override_to_cpu_only(bool flag)
265+
{
266+
cpu_override = flag;
267+
}
268+
264269
int LayerAllocatorManager::get_mapped_layer_id(int layer_id)
265270
{
266271
int id = layer_id;
@@ -276,7 +281,7 @@ namespace chatllm
276281
default:
277282
break;
278283
}
279-
if ((id < 0) || (id >= (int)allocators.size()))
284+
if (cpu_override || (id < 0) || (id >= (int)allocators.size()))
280285
id = (int)allocators.size() - 1;
281286

282287
return id;
@@ -734,6 +739,11 @@ namespace chatllm
734739
backend_context->layer_allocators.move_to_layer(layer_id);
735740
}
736741

742+
void ComputeContext::backend_cpu_override(bool flag)
743+
{
744+
backend_context->layer_allocators.override_to_cpu_only(flag);
745+
}
746+
737747
BackendBufAllocator *ComputeContext::get_allocator(void)
738748
{
739749
return backend_context->layer_allocators.get_allocator();

src/backend.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,8 @@ namespace chatllm
166166

167167
void register_tensor_allocator(ggml::tensor *tensor, LayerBufAllocator *allocator);
168168

169+
void override_to_cpu_only(bool flag);
170+
169171
protected:
170172
int get_mapped_layer_id(int layer_id);
171173
public:
@@ -175,6 +177,7 @@ namespace chatllm
175177
int epilog_layer_backend_map_to_layer_id = -1;
176178
int cur_layer = MiscLayer::Prolog;
177179
std::map<ggml::tensor *, LayerBufAllocator *> alloc_of_tensor;
180+
bool cpu_override = false;
178181
};
179182

180183
class ComputeManager
@@ -319,6 +322,12 @@ namespace chatllm
319322
class ComputeContext
320323
{
321324
public:
325+
// additional user options
326+
struct UserOptions
327+
{
328+
bool moe_on_cpu = false;
329+
};
330+
322331
ComputeContext(BackendContext *backend_context);
323332

324333
virtual struct ggml_context *get_ctx() = 0;
@@ -328,6 +337,7 @@ namespace chatllm
328337
virtual void cb_op_tensor(ggml::tensor *tensor);
329338

330339
virtual void move_to_layer(int layer_id);
340+
virtual void backend_cpu_override(bool flag);
331341

332342
BackendBufAllocator *get_allocator(void);
333343
BackendBufAllocator *get_allocator(ggml::tensor *tensor);
@@ -352,6 +362,9 @@ namespace chatllm
352362

353363
BackendContext *get_backend_context(void) { return backend_context; }
354364

365+
public:
366+
UserOptions user_options;
367+
355368
protected:
356369
virtual ggml_backend_sched_t get_sched(void);
357370

src/chat.h

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,51 @@ namespace chatllm
280280
ggml::type dtype;
281281
};
282282

283+
class LayerMover
284+
{
285+
public:
286+
LayerMover(InitContext *ctx, int layer_id): ctx(ctx)
287+
{
288+
ctx->move_to_layer(layer_id);
289+
}
290+
291+
operator InitContext *() const
292+
{
293+
return ctx;
294+
}
295+
private:
296+
InitContext *ctx;
297+
};
298+
299+
class CPUMover
300+
{
301+
public:
302+
CPUMover(ComputeContext *ctx, bool activated): ctx(ctx), activated(activated)
303+
{
304+
if (activated)
305+
ctx->backend_cpu_override(true);
306+
}
307+
308+
~CPUMover()
309+
{
310+
if (activated)
311+
ctx->backend_cpu_override(false);
312+
}
313+
314+
operator InitContext *() const
315+
{
316+
return dynamic_cast<InitContext *>(ctx);
317+
}
318+
319+
operator ComputeContext *() const
320+
{
321+
return ctx;
322+
}
323+
private:
324+
ComputeContext *ctx;
325+
const bool activated;
326+
};
327+
283328
class ChunkInterceptor;
284329

285330
class BaseStreamer
@@ -844,10 +889,11 @@ namespace chatllm
844889
int max_length;
845890
std::string layer_spec;
846891
std::string gpu_layers;
847-
extra_args(int max_length, const std::string &layer_spec, const std::string &gpu_layers)
848-
: max_length(max_length), layer_spec(layer_spec), gpu_layers(gpu_layers)
892+
bool moe_on_cpu;
893+
extra_args(int max_length, const std::string &layer_spec, const std::string &gpu_layers, bool moe_on_cpu)
894+
: max_length(max_length), layer_spec(layer_spec), gpu_layers(gpu_layers), moe_on_cpu(moe_on_cpu)
849895
{}
850-
extra_args() : extra_args(-1, "", "") {}
896+
extra_args() : extra_args(-1, "", "", false) {}
851897
};
852898

853899
ModelObject(const std::string &path);

src/layers.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1450,6 +1450,8 @@ namespace chatllm
14501450

14511451
ggml::tensor * logits = gate.forward(ctx, hidden_states); // [qlen, num_experts]
14521452

1453+
CPUMover mover(ctx, ctx->user_options.moe_on_cpu);
1454+
14531455
ggml::tensor * probs = ggml::soft_max(ctx, logits); // [qlen, num_experts]
14541456

14551457
// select experts

src/layers.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1781,14 +1781,18 @@ namespace chatllm
17811781
BaseSparseMLP() = default;
17821782
BaseSparseMLP(InitContext *ctx, int hidden_size, int intermediate_size, int num_local_experts, int num_experts_per_tok,
17831783
ActFunc act, bool gate_use_bias)
1784-
: num_local_experts(num_local_experts), num_experts_per_tok(num_experts_per_tok),
1784+
:
1785+
num_local_experts(num_local_experts), num_experts_per_tok(num_experts_per_tok),
17851786
gate(ctx, hidden_size, num_local_experts, gate_use_bias),
1787+
mover(new CPUMover(ctx, ctx->user_options.moe_on_cpu)),
17861788
experts_gate(ctx, hidden_size, intermediate_size, num_local_experts),
17871789
experts_down(ctx, intermediate_size, hidden_size, num_local_experts),
17881790
experts_up (ctx, hidden_size, intermediate_size, num_local_experts),
17891791
act(act),
17901792
norm_topk_prob(true)
17911793
{
1794+
delete mover;
1795+
mover = nullptr;
17921796
}
17931797

17941798
using Block::forward;
@@ -1819,6 +1823,7 @@ namespace chatllm
18191823
const int num_local_experts;
18201824
const int num_experts_per_tok;
18211825
Linear gate;
1826+
CPUMover *mover; // when `+moe_on_cpu` is set, all things are done on CPU except for `gate`
18221827
MultiLinear experts_gate;
18231828
MultiLinear experts_down;
18241829
MultiLinear experts_up;

src/main.cpp

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ struct Args
7373
int save_session_rounds = -1;
7474
int beam_size = -1;
7575
int log_level = 4;
76+
bool moe_on_cpu = false;
7677
};
7778

7879
#define MULTI_LINE_END_MARKER_W L"\\."
@@ -125,6 +126,7 @@ void usage(const std::string &prog)
125126
<< "Performance options:\n"
126127
<< " -n, --threads N number of threads for inference (default: number of cores)\n"
127128
<< " -ngl, --n_gpu_layers N number of model layers to offload to each GPU (default: GPU not used)\n"
129+
<< "  +moe_on_cpu             always use CPU for sparse operations (MoE) (default: off)\n"
128130
<< "Sampling options:\n"
129131
<< " --sampling ALG sampling algorithm (ALG = greedy | top_p | tfs) (default: top_p) \n"
130132
<< " where, tfs = Tail Free Sampling\n"
@@ -232,6 +234,12 @@ static size_t parse_args(Args &args, const std::vector<std::string> &argv)
232234
args.field.push_back(f(argv[c].c_str())); \
233235
}
234236

237+
#define handle_flag(field) \
238+
else if ((strcmp(arg, "+" #field) == 0)) \
239+
{ \
240+
args.field = true; \
241+
}
242+
235243
size_t c = 1;
236244

237245
try
@@ -271,14 +279,9 @@ static size_t parse_args(Args &args, const std::vector<std::string> &argv)
271279
{
272280
args.reversed_role = true;
273281
}
274-
else if (strcmp(arg, "+rag_dump") == 0)
275-
{
276-
args.rag_dump = true;
277-
}
278-
else if (strcmp(arg, "+rerank_rewrite") == 0)
279-
{
280-
args.rerank_rewrite = true;
281-
}
282+
handle_flag(rag_dump)
283+
handle_flag(rerank_rewrite)
284+
handle_flag(moe_on_cpu)
282285
else if (strcmp(arg, "--format") == 0)
283286
{
284287
c++;
@@ -655,6 +658,9 @@ static void run_qa_ranker(Args &args, chatllm::Pipeline &pipeline, TextStreamer
655658
gen_config.set_ai_prefix(args.ai_prefix); gen_config.dump_dot = args.dump_dot; \
656659
gen_config.emb_rank_query_sep = args.emb_rank_query_sep;
657660

661+
#define DEF_ExtraArgs(pipe_args, args) \
662+
chatllm::ModelObject::extra_args pipe_args(args.max_length, args.layer_spec, args.n_gpu_layers, args.moe_on_cpu)
663+
658664
chatllm::BaseStreamer *get_streamer_for_log(void);
659665

660666
void log_internal(int level, const char * text)
@@ -1003,7 +1009,7 @@ int main(int argc, const char **argv)
10031009

10041010
try
10051011
{
1006-
chatllm::ModelObject::extra_args pipe_args(args.max_length, args.layer_spec, args.n_gpu_layers);
1012+
DEF_ExtraArgs(pipe_args, args);
10071013
TextStreamer streamer(nullptr);
10081014
streamer.log_level = args.log_level;
10091015
log_streamer = &streamer;
@@ -1240,7 +1246,7 @@ int chatllm_start(struct chatllm_obj *obj, f_chatllm_print f_print, f_chatllm_en
12401246

12411247
try
12421248
{
1243-
chatllm::ModelObject::extra_args pipe_args(args.max_length, args.layer_spec, args.n_gpu_layers);
1249+
DEF_ExtraArgs(pipe_args, args);
12441250

12451251
if ((args.embedding_model_path.size() < 1) || (args.vector_stores.empty()))
12461252
{

src/models.cpp

Lines changed: 7 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,10 @@ namespace chatllm
2626
struct RuntimeConfig
2727
{
2828
std::string gpu_layers;
29-
RuntimeConfig(const std::string &gpu_layers): gpu_layers(gpu_layers) {}
29+
bool moe_on_cpu;
30+
RuntimeConfig(const std::string &gpu_layers, bool moe_on_cpu):
31+
gpu_layers(gpu_layers), moe_on_cpu(moe_on_cpu)
32+
{}
3033
};
3134

3235
class ForwardContext : public ComputeContext
@@ -44,22 +47,6 @@ namespace chatllm
4447
ggml_cgraph *gf;
4548
};
4649

47-
class LayerMover
48-
{
49-
public:
50-
LayerMover(InitContext *ctx, int layer_id): ctx(ctx)
51-
{
52-
ctx->move_to_layer(layer_id);
53-
}
54-
55-
operator InitContext *() const
56-
{
57-
return ctx;
58-
}
59-
private:
60-
InitContext *ctx;
61-
};
62-
6350
static ForwardContext *dbg_ctx = nullptr;
6451
static std::unordered_map<ggml::tensor *, std::string> inspected_set;
6552
static ggml::tensor *dbg_w = nullptr;
@@ -1151,6 +1138,7 @@ namespace chatllm
11511138
{
11521139
std::vector<BackendContext::gpu_cfg> gpu_cfgs;
11531140
parse_gpu_layers(gpu_cfgs, rt_config.gpu_layers);
1141+
w_ctx_.user_options.moe_on_cpu = rt_config.moe_on_cpu;
11541142
backend_context.init(gpu_cfgs, config_.num_hidden_layers, GRAPH_SIZE);
11551143
}
11561144

@@ -1209,6 +1197,7 @@ namespace chatllm
12091197
}
12101198

12111199
ForwardContext ctx(&backend_context);
1200+
ctx.user_options = w_ctx_.user_options;
12121201

12131202
ctx.gctx = GGMLContext({.mem_size = backend_context.buf_compute_meta.size(), .mem_buffer = backend_context.buf_compute_meta.data(), .no_alloc = true});
12141203
ctx.gf = ggml::new_graph_custom(&ctx, GRAPH_SIZE, false);
@@ -1933,7 +1922,7 @@ namespace chatllm
19331922
config.num_hidden_layers = (int)layers.size();
19341923
}
19351924

1936-
RuntimeConfig rt_config(args.gpu_layers);
1925+
RuntimeConfig rt_config(args.gpu_layers, args.moe_on_cpu);
19371926

19381927
// load model
19391928
ConditionalGeneration *model = new ConditionalGeneration(config, rt_config);

0 commit comments

Comments
 (0)