5 changes: 5 additions & 0 deletions common/common.cpp
@@ -1378,6 +1378,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
+    if (arg == "--offload-only-active-experts" || arg == "-ooae") {
+        params.only_active_exps = true;
+        return true;
+    }
     if (arg == "--host") {
         CHECK_ARG
         params.hostname = argv[i];
@@ -2746,6 +2750,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.fused_up_gate = params.fused_up_gate;
     cparams.min_experts = params.min_experts;
     cparams.thresh_experts = params.thresh_experts;
+    cparams.only_active_experts = params.only_active_exps;
 
     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
1 change: 1 addition & 0 deletions common/common.h
@@ -223,6 +223,7 @@ struct gpt_params {
     bool repack_tensors = false; // repack tensors if interleaved variant is available
     bool use_thp = false; // use transparent huge pages (linux only)
     bool validate_quants = false; // if true, check for NaNs while loading the model
+    bool only_active_exps = false; // if true, offload only active experts (relevant only for hybrid CPU/GPU)
 
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V
1 change: 1 addition & 0 deletions ggml/include/ggml-backend.h
@@ -210,6 +210,7 @@ extern "C" {
 
     // enable or disable op offload for a given op
     GGML_API void ggml_backend_sched_set_op_offload(ggml_backend_sched_t sched, enum ggml_op op, bool on_or_off);
+    GGML_API void ggml_backend_sched_set_only_active_experts(ggml_backend_sched_t sched, bool on_or_off);
 
     //
     // Utils
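For context, a minimal sketch (not part of this diff) of how a caller would flip the new scheduler switch; the sched handle is assumed to come from ggml_backend_sched_new(), and the commented line shows the existing per-op setter for comparison:

    // Sketch only, not from this PR: enable only-active-experts offload on an
    // existing scheduler created elsewhere with ggml_backend_sched_new().
    ggml_backend_sched_set_only_active_experts(sched, /*on_or_off=*/true);
    // The per-op setter declared above works the same way, e.g.:
    // ggml_backend_sched_set_op_offload(sched, GGML_OP_MUL_MAT_ID, false);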
2 changes: 1 addition & 1 deletion ggml/src/CMakeLists.txt
@@ -1493,7 +1493,7 @@ add_library(ggml
     ../include/ggml-backend.h
     ggml.c
     ggml-alloc.c
-    ggml-backend.c
+    ggml-backend.cpp
     ggml-quants.c
     ggml-quants.h
     ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
179 changes: 137 additions & 42 deletions ggml/src/ggml-backend.c → ggml/src/ggml-backend.cpp

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions ggml/src/ggml-cuda.cu
@@ -4288,8 +4288,9 @@ GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const
         if (batch_size < min_batch_size) return false;
         int64_t n_experts_tot = op->src[0]->ne[2];
         int64_t n_experts_active = ids->ne[0];
-        //printf("%s(%s): op->ne[2] = %ld, n_experts_tot = %ld, n_experts_active = %ld, ids: %s, %ld x %ld x %ld x %ld\n", __func__, op->name, op->ne[2], n_experts_tot, n_experts_active, ids->name, ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3]);
-        return batch_size*n_experts_active >= min_batch_size*n_experts_tot;
+        bool should_offload = batch_size*n_experts_active >= min_batch_size*n_experts_tot;
+        //printf("%s(%s): op->ne[2] = %ld, n_experts_tot = %ld, n_experts_active = %ld, ids: %s, %ld x %ld x %ld x %ld -> %d (%ld, %ld)\n", __func__, op->name, op->ne[2], n_experts_tot, n_experts_active, ids->name, ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3], should_offload, batch_size*n_experts_active, min_batch_size*n_experts_tot);
+        return should_offload;
     }
 
     return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
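The predicate above scales the usual batch-size threshold by the fraction of experts the batch actually routes to. A worked example with assumed numbers (the real min_batch_size comes from the CUDA backend, not from this diff):

    // Sketch only, with assumed values: min_batch_size = 32, 64 experts total,
    // 8 experts active per token. Offload happens once
    //   batch_size * 8 >= 32 * 64, i.e. batch_size >= 256,
    // so an indirect MoE matmul needs a batch 8x larger than a dense one before
    // copying the expert weights to the GPU pays off.
    #include <cstdint>

    static bool moe_offload_worthwhile(int64_t batch_size, int64_t min_batch_size,
                                       int64_t n_experts_tot, int64_t n_experts_active) {
        return batch_size * n_experts_active >= min_batch_size * n_experts_tot;
    }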
1 change: 1 addition & 0 deletions include/llama.h
@@ -424,6 +424,7 @@ extern "C" {
         bool fused_up_gate; // whether to use fused up/gate op [EXPERIMENTAL]
         int min_experts;
         float thresh_experts;
+        bool only_active_experts;
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
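A minimal sketch (not part of this diff) of enabling the new context parameter through the public API; the model handle is assumed to have been loaded elsewhere, e.g. with llama_load_model_from_file():

    // Sketch only, not from this PR: equivalent of passing -ooae on the command line.
    llama_context_params cparams = llama_context_default_params();
    cparams.only_active_experts = true;
    llama_context * ctx = llama_new_context_with_model(model, cparams);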
6 changes: 6 additions & 0 deletions src/llama.cpp
@@ -18957,6 +18957,7 @@ struct llama_context_params llama_context_default_params() {
         /*.fused_up_gate =*/ true,
         /*.min_experts =*/ -1,
         /*.thtesh_experts =*/ 0.0f,
+        /*.only_active_experts =*/ false,
         /*.abort_callback =*/ nullptr,
         /*.abort_callback_data =*/ nullptr,
         /*.offload_policy =*/ nullptr,
@@ -19548,6 +19549,11 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
+    if (params.only_active_experts) {
+        LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting only active experts offload\n");
+        ggml_backend_sched_set_only_active_experts(ctx->sched, true);
+    }
+
     return ctx;
 }
 