
Commit 3c43f9d

Author: Iwan Kawrakow (committed)
Add a command line argument
1 parent 9ef79d3 commit 3c43f9d

File tree

6 files changed: +22 -3 lines changed

common/common.cpp
common/common.h
ggml/include/ggml-backend.h
ggml/src/ggml-backend.cpp
include/llama.h
src/llama.cpp


common/common.cpp

Lines changed: 5 additions & 0 deletions
@@ -1378,6 +1378,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
+    if (arg == "--offload-only-active-experts" || arg == "-ooae") {
+        params.only_active_exps = true;
+        return true;
+    }
     if (arg == "--host") {
         CHECK_ARG
         params.hostname = argv[i];
@@ -2746,6 +2750,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.fused_up_gate  = params.fused_up_gate;
     cparams.min_experts    = params.min_experts;
     cparams.thresh_experts = params.thresh_experts;
+    cparams.only_active_experts = params.only_active_exps;

     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
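Because the option is parsed by the shared gpt_params handler, any tool built on common.cpp should accept it. A hypothetical invocation (the binary name, model path and other flags are placeholders, not part of this commit; only --offload-only-active-experts / -ooae comes from this change):

    ./llama-server -m ./model.gguf -ngl 99 --offload-only-active-experts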

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -223,6 +223,7 @@ struct gpt_params {
     bool repack_tensors   = false; // repack tensors if interleaved variant is available
     bool use_thp          = false; // use transparent huge pages (linux only)
     bool validate_quants  = false; // if true, check for NaNs while loading the model
+    bool only_active_exps = false; // if true, offload only active experts (relevant only for hybrid CPU/GPU)

     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V

ggml/include/ggml-backend.h

Lines changed: 1 addition & 0 deletions
@@ -210,6 +210,7 @@ extern "C" {

     // enable or disable op offload for a given op
     GGML_API void ggml_backend_sched_set_op_offload(ggml_backend_sched_t sched, enum ggml_op op, bool on_or_off);
+    GGML_API void ggml_backend_sched_set_only_active_experts(ggml_backend_sched_t sched, bool on_or_off);

     //
     // Utils
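For code that drives the scheduler directly, the new setter sits next to the existing per-op offload switch. A minimal sketch, assuming a scheduler `sched` created elsewhere (its construction is not part of this commit):

    #include "ggml-backend.h"

    // Hypothetical helper: opt in to copying only the experts that are
    // actually used by the current batch when expert weights live in host memory.
    static void enable_active_expert_offload(ggml_backend_sched_t sched) {
        ggml_backend_sched_set_only_active_experts(sched, /*on_or_off=*/true);
    }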

ggml/src/ggml-backend.cpp

Lines changed: 8 additions & 3 deletions
@@ -1160,6 +1160,7 @@ struct ggml_backend_sched {

     uint32_t op_offload[(GGML_OP_COUNT + 31)/32];

+    bool only_active_experts;
     bool debug;
 };

@@ -1180,6 +1181,11 @@ void ggml_backend_sched_set_op_offload(ggml_backend_sched_t sched, enum ggml_op
     }
 }

+void ggml_backend_sched_set_only_active_experts(ggml_backend_sched_t sched, bool on_or_off) {
+    if (!sched) return;
+    sched->only_active_experts = on_or_off;
+}
+
 static inline bool ggml_backend_sched_offload_enabled(ggml_backend_sched_t sched, enum ggml_op op) {
     int int_op = (int)op;
     if (!sched || op < 0 || op >= GGML_OP_COUNT) return false;
@@ -1889,9 +1895,9 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
         } else {
             ggml_backend_synchronize(split_backend);
         }
-#if 1
+
         ggml_tensor * node = split->graph.nodes[0];
-        if (split->graph.n_nodes > 0 &&
+        if (sched->only_active_experts && split->graph.n_nodes > 0 &&
             ggml_backend_buffer_get_usage(input->buffer) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS &&
             ggml_backend_buffer_is_host(input->buffer) &&
             node->src[cur_arg] == input_cpy &&
@@ -1954,7 +1960,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
             copy_experts(first_id, last_id);
             if (node->op == GGML_OP_MOE_FUSED_UP_GATE) ++cur_arg;
         } else
-#endif
         // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
         // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
         if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
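The last two hunks replace the compile-time `#if 1 ... #endif` guard with the run-time `sched->only_active_experts` flag: the path that copies only the expert slices referenced by the current split (via `copy_experts`) is now taken only when the new option is enabled, and otherwise the scheduler falls back to the usual async/sync tensor copy.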

include/llama.h

Lines changed: 1 addition & 0 deletions
@@ -424,6 +424,7 @@ extern "C" {
         bool  fused_up_gate;   // whether to use fused up/gate op [EXPERIMENTAL]
         int   min_experts;
         float thresh_experts;
+        bool  only_active_experts;

         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
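Programs that use the C API directly, rather than the command-line flag, can set the new context parameter before creating the context. A minimal sketch with the model path as a placeholder (error handling, sampling, and llama_backend_init() are omitted for brevity; only the only_active_experts field comes from this commit):

    #include "llama.h"

    int main() {
        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (!model) return 1;

        llama_context_params cparams = llama_context_default_params();
        cparams.only_active_experts = true;  // new field added by this commit

        llama_context * ctx = llama_new_context_with_model(model, cparams);
        if (!ctx) { llama_free_model(model); return 1; }

        // ... run inference with ctx ...

        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }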

src/llama.cpp

Lines changed: 6 additions & 0 deletions
@@ -18957,6 +18957,7 @@ struct llama_context_params llama_context_default_params() {
         /*.fused_up_gate        =*/ true,
         /*.min_experts          =*/ -1,
         /*.thtesh_experts       =*/ 0.0f,
+        /*.only_active_experts  =*/ false,
         /*.abort_callback       =*/ nullptr,
         /*.abort_callback_data  =*/ nullptr,
         /*.offload_policy       =*/ nullptr,
@@ -19548,6 +19549,11 @@ struct llama_context * llama_new_context_with_model(
         }
     }

+    if (params.only_active_experts) {
+        LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting only active experts offload\n");
+        ggml_backend_sched_set_only_active_experts(ctx->sched, true);
+    }
+
     return ctx;
 }
