@@ -1106,6 +1106,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
+    if (arg == "--cpu-moe" || arg == "-cmoe") {
+        params.tensor_buft_overrides.push_back({strdup("\\.ffn_(up|down|gate)_exps"), ggml_backend_cpu_buffer_type()});
+        return true;
+    }
+    if (arg == "--n-cpu-moe" || arg == "-ncmoe") {
+        CHECK_ARG
+        int32_t n_layers = std::stoi(argv[i]);
+        if (n_layers < 0) {
+            fprintf(stderr, "error: Invalid value for --n-cpu-moe: %d (must be >= 0)\n", n_layers);
+            invalid_param = true;
+            return true;
+        }
+        for (int32_t l = 0; l < n_layers; ++l) {
+            std::string pattern = "blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate)_exps)";
+            params.tensor_buft_overrides.push_back({strdup(pattern.c_str()), ggml_backend_cpu_buffer_type()});
+        }
+        return true;
+    }
     if (arg == "--no-mmap") {
         params.use_mmap = false;
         return true;
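
To illustrate the effect of the patterns built above, here is a small standalone sketch (not part of the patch) that applies them to typical MoE GGUF tensor names. It assumes the loader matches buffer-type override patterns with std::regex_search, as mainline llama.cpp does; the tensor names are illustrative.

// Standalone sketch: which tensor names the patterns generated by
// "--n-cpu-moe 2" would match. Assumes std::regex_search semantics for the
// override patterns (as in mainline llama.cpp); the tensor names below are
// typical MoE GGUF names, used here for illustration only.
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> patterns;
    for (int l = 0; l < 2; ++l) { // same loop as the --n-cpu-moe handler
        patterns.push_back("blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate)_exps)");
    }

    const char * names[] = {
        "blk.0.ffn_up_exps.weight",   // expert FFN, layer 0 -> stays on CPU
        "blk.1.ffn_gate_exps.weight", // expert FFN, layer 1 -> stays on CPU
        "blk.2.ffn_down_exps.weight", // layer 2 is not covered by N=2
        "blk.0.attn_q.weight",        // attention weights are never matched
    };

    for (const char * name : names) {
        bool matched = false;
        for (const std::string & p : patterns) {
            if (std::regex_search(name, std::regex(p))) {
                matched = true;
                break;
            }
        }
        printf("%-27s -> %s\n", name, matched ? "CPU buffer" : "default placement");
    }
}

Note that the --cpu-moe pattern, \.ffn_(up|down|gate)_exps, carries no block index, so a single override matches the expert tensors of every layer at once; --n-cpu-moe instead emits one anchored pattern per layer.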
@@ -1820,6 +1838,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
         options.push_back({ "*",           "       --no-mmap",              "do not memory-map model (slower load but may reduce pageouts if not using mlock)" });
     }
     options.push_back({ "*",           "       --run-time-repack",      "repack tensors if interleaved variant is available"});
+    options.push_back({ "*",           "       --cpu-moe",              "keep all MoE weights in CPU memory"});
+    options.push_back({ "*",           "       --n-cpu-moe N",          "keep MoE weights of the first N layers in CPU memory"});
     options.push_back({ "*",           "       --numa TYPE",            "attempt optimizations that help on some NUMA systems\n"
                                                                        "  - distribute: spread execution evenly over all nodes\n"
                                                                        "  - isolate: only spawn threads on CPUs on the node that execution started on\n"
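
For completeness, a minimal sketch of how the accumulated overrides would typically reach the model loader. Mainline llama.cpp null-terminates the array before assigning it to the model params; whether this fork follows the same convention is an assumption here, and the snippet uses stand-in types (buft_override in place of llama_model_tensor_buft_override) so it compiles on its own.

// Sketch of the hand-off convention (assumption: mirrors mainline llama.cpp,
// where the model params point at a null-terminated override array).
// Stand-in types are used so the sketch is self-contained.
#include <cstdio>
#include <cstring>
#include <vector>

struct buft_override {    // stand-in for llama_model_tensor_buft_override
    const char * pattern; // regex pattern, heap-allocated via strdup
    void *       buft;    // stand-in for ggml_backend_buffer_type_t
};

int main() {
    std::vector<buft_override> overrides;
    overrides.push_back({strdup("\\.ffn_(up|down|gate)_exps"), nullptr});

    // Append a {nullptr, nullptr} sentinel: the loader walks the raw array
    // until it hits a null pattern, so the vector must be terminated before
    // its data() pointer is handed over.
    overrides.push_back({nullptr, nullptr});

    for (const buft_override * o = overrides.data(); o->pattern; ++o) {
        printf("override: %s\n", o->pattern);
    }
}

As in the patch itself, the strdup'd pattern strings are never freed; they simply live for the lifetime of the process, since the loader may consult them at any point during model load.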