
Commit 8669c3d

Authored by Iwan Kawrakow (ikawrakow)
GPU offload policy (#405)
* Adding GPU offload policy
* Minor

Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent 504fb89 commit 8669c3d

File tree

7 files changed, +77 -2 lines


common/common.cpp

Lines changed: 17 additions & 0 deletions
@@ -1213,6 +1213,17 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
+    if (arg == "--offload-policy" || arg == "-op") {
+        CHECK_ARG
+        auto p = string_split_pairs<int,int>(argv[i], ',');
+        if (p.empty()) {
+            fprintf(stderr, "error: Invalid offload policy argument: %s\n", argv[i]);
+            invalid_param = true;
+        } else {
+            params.offload_policy.insert(params.offload_policy.end(), p.begin(), p.end());
+        }
+        return true;
+    }
     if (arg == "--host") {
         CHECK_ARG
         params.hostname = argv[i];
@@ -2222,6 +2233,10 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         return iparams;
     }
 
+    for (auto [op, on_off] : params.offload_policy) {
+        llama_set_offload_policy(lctx, op, on_off);
+    }
+
     if (!params.control_vectors.empty()) {
         if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
         if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
@@ -2418,6 +2433,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
 
+    if (!params.offload_policy.empty()) cparams.offload_policy = (void *)&params.offload_policy;
+
     return cparams;
 }

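For reference, a minimal standalone sketch of how the new -op / --offload-policy argument is presumably consumed, assuming string_split_pairs<int,int> turns a comma-separated list such as "26,0,27,1" into consecutive (op, on/off) integer pairs; the split_pairs helper below is illustrative only and not part of the commit.

    #include <cstdio>
    #include <sstream>
    #include <string>
    #include <utility>
    #include <vector>

    // Hypothetical stand-in for string_split_pairs<int,int>(arg, ','): parse a flat
    // comma-separated integer list and pair up consecutive values.
    static std::vector<std::pair<int,int>> split_pairs(const std::string & arg) {
        std::vector<int> values;
        std::stringstream ss(arg);
        std::string item;
        while (std::getline(ss, item, ',')) values.push_back(std::stoi(item));
        std::vector<std::pair<int,int>> result;
        if (values.size() % 2 != 0) return result; // malformed input -> empty, caller reports the error
        for (size_t j = 0; j + 1 < values.size(); j += 2) result.emplace_back(values[j], values[j + 1]);
        return result;
    }

    int main() {
        for (auto [op, on_off] : split_pairs("26,0,27,1")) {
            printf("op %d -> %s\n", op, on_off ? "ON" : "OFF");
        }
    }
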
common/common.h

Lines changed: 1 addition & 0 deletions
@@ -143,6 +143,7 @@ struct gpt_params {
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
+    std::vector<std::pair<int,int>> offload_policy;
 
     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
     std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale

ggml/include/ggml-backend.h

Lines changed: 3 additions & 0 deletions
@@ -208,6 +208,9 @@ extern "C" {
     // Set a callback to be called for each resulting node during graph compute
     GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
 
+    // enable or disable op offload for a given op
+    GGML_API void ggml_backend_sched_set_op_offload(ggml_backend_sched_t sched, enum ggml_op op, bool on_or_off);
+
     //
     // Utils
     //

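A hedged usage sketch of the new scheduler API, assuming an existing ggml_backend_sched_t created elsewhere; the helper name apply_offload_policy is hypothetical and mirrors how the llama.cpp layer below applies (op, on/off) pairs, with an out-of-range op meaning "all ops".

    #include "ggml-backend.h"

    // Illustrative helper (not part of this commit): apply a list of (op, on/off)
    // pairs to a scheduler. Values outside [0, GGML_OP_COUNT) toggle every op at once.
    static void apply_offload_policy(ggml_backend_sched_t sched,
                                     const int * ops, const int * flags, int n) {
        for (int i = 0; i < n; ++i) {
            ggml_backend_sched_set_op_offload(sched, (enum ggml_op) ops[i], flags[i] != 0);
        }
    }
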
ggml/src/ggml-backend.c

Lines changed: 29 additions & 1 deletion
@@ -1104,9 +1104,34 @@ struct ggml_backend_sched {
     char * context_buffer;
     size_t context_buffer_size;
 
+    uint32_t op_offload[(GGML_OP_COUNT + 31)/32];
+
     bool debug;
 };
 
+void ggml_backend_sched_set_op_offload(ggml_backend_sched_t sched, enum ggml_op op, bool on_or_off) {
+    int int_op = (int)op;
+    if (!sched) return;
+    if (int_op < 0 || int_op >= (int)GGML_OP_COUNT) {
+        uint32_t mask = on_or_off ? 0xffffffff : 0;
+        for (int i = 0; i < (GGML_OP_COUNT + 31)/32; ++i) sched->op_offload[i] = mask;
+        return;
+    }
+    int i = int_op >> 5;
+    int j = int_op & 31;
+    if (on_or_off) {
+        sched->op_offload[i] |= (1u << j);
+    } else {
+        sched->op_offload[i] &= (~(1u << j));
+    }
+}
+
+static inline bool ggml_backend_sched_offload_enabled(ggml_backend_sched_t sched, enum ggml_op op) {
+    int int_op = (int)op;
+    if (!sched || op < 0 || op >= GGML_OP_COUNT) return false;
+    return sched->op_offload[int_op >> 5] & (1u << (int_op & 31));
+}
+
 #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
 #define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
 #define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
@@ -1181,6 +1206,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     }
 
     // operations with weights are preferably run on the same backend as the weights
+    bool offload_enabled = ggml_backend_sched_offload_enabled(sched, tensor->op);
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = tensor->src[i];
         if (src == NULL) {
@@ -1189,7 +1215,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
             int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
             // check if a backend with higher prio wants to offload the op
-            if (src_backend_id == sched->n_backends - 1) {
+            if (offload_enabled && src_backend_id == sched->n_backends - 1) {
                 for (int b = 0; b < src_backend_id; b++) {
                     if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
                         SET_CAUSE(tensor, "1.off");
@@ -1888,6 +1914,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
 
     struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
 
+    for (int i = 0; i < (GGML_OP_COUNT + 31)/32; ++i) sched->op_offload[i] = 0xffffffff;
+
     sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
     sched->n_backends = n_backends;
     sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;

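The scheduler stores the per-op policy as a bitset: word index = op >> 5, bit index = op & 31, with every bit set at creation so offload stays enabled by default. A standalone sketch of the same bookkeeping, using a made-up N_OPS in place of GGML_OP_COUNT:

    #include <cstdint>
    #include <cstdio>

    constexpr int N_OPS = 90; // stand-in for GGML_OP_COUNT
    static uint32_t bits[(N_OPS + 31)/32];

    static void set_bit(int op, bool on) {
        if (on) bits[op >> 5] |=  (1u << (op & 31));
        else    bits[op >> 5] &= ~(1u << (op & 31));
    }

    static bool get_bit(int op) {
        return (bits[op >> 5] >> (op & 31)) & 1u;
    }

    int main() {
        for (auto & w : bits) w = 0xffffffff;   // default: every op may be offloaded
        set_bit(42, false);                     // turn a single op off
        printf("op 42: %d, op 43: %d\n", (int)get_bit(42), (int)get_bit(43)); // prints 0, 1
    }
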
ggml/src/ggml-cuda.cu

Lines changed: 3 additions & 1 deletion
@@ -3391,6 +3391,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
     struct ggml_tensor * a = op->src[0];
     struct ggml_tensor * b = op->op == GGML_OP_MOE_FUSED_UP_GATE ? op->src[2] : op->src[1];
     if (op->op == GGML_OP_MOE_FUSED_UP_GATE && a->type != op->src[1]->type) {
+        printf("%s: returning false for GGML_OP_MOE_FUSED_UP_GATE because src0->type != src1->type\n", __func__);
         return false;
     }
     //==================================================================
@@ -3399,6 +3400,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
     //}
     //==================================================================
     if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16 && !ggml_is_quantized(a->type)) {
+        printf("%s: returning false for op %d because (case 1)\n", __func__, (int)op->op);
         return false;
     }
     if (op->op == GGML_OP_MUL_MAT && a->ne[3] != b->ne[3]) {
@@ -3621,7 +3623,7 @@ GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const
     const int min_batch_size = 32;
 
     return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
-           (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
+           (op->ne[2] >= min_batch_size && (op->op == GGML_OP_MUL_MAT_ID || op->op == GGML_OP_MOE_FUSED_UP_GATE));
 
     GGML_UNUSED(backend);
 }

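The CUDA offload heuristic above now treats the fused MoE up/gate op like GGML_OP_MUL_MAT_ID: the batch dimension it checks is ne[2] rather than ne[1]. A standalone restatement with the tensor replaced by plain dimensions (op_is_moe_style and op_is_get_rows are illustrative stand-ins for the op checks):

    #include <cstdint>
    #include <cstdio>

    static bool should_offload(int64_t ne1, int64_t ne2, bool op_is_moe_style, bool op_is_get_rows) {
        const int min_batch_size = 32;
        return (ne1 >= min_batch_size && !op_is_get_rows) ||
               (ne2 >= min_batch_size && op_is_moe_style);
    }

    int main() {
        // a MOE_FUSED_UP_GATE node with 64 rows in ne[2] now qualifies for offload
        printf("%d\n", (int)should_offload(/*ne1=*/1, /*ne2=*/64, /*moe=*/true, /*get_rows=*/false));
    }
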
include/llama.h

Lines changed: 3 additions & 0 deletions
@@ -408,6 +408,7 @@ extern "C" {
         // currently works only with CPU execution
         ggml_abort_callback abort_callback;
         void * abort_callback_data;
+        void * offload_policy;
     };
 
     // model quantization parameters
@@ -523,6 +524,8 @@ extern "C" {
             struct llama_model * model,
             struct llama_context_params params);
 
+    LLAMA_API void llama_set_offload_policy(struct llama_context * lctx, int op, bool on_or_off);
+
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
src/llama.cpp
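A hedged sketch of calling the new public entry point from application code, assuming a context created with llama_new_context_with_model; passing an op outside [0, GGML_OP_COUNT), here -1, applies the setting to all ops, matching the implementation in src/llama.cpp below.

    #include "ggml.h"
    #include "llama.h"

    // Illustrative only: disable cross-backend offload globally, then re-enable it
    // for plain matrix multiplications.
    static void offload_mul_mat_only(struct llama_context * ctx) {
        llama_set_offload_policy(ctx, -1, false);                   // all ops OFF
        llama_set_offload_policy(ctx, (int) GGML_OP_MUL_MAT, true); // MUL_MAT back ON
    }
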

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19980,6 +19980,7 @@ struct llama_context_params llama_context_default_params() {
1998019980
/*.thtesh_experts =*/ 0.0f,
1998119981
/*.abort_callback =*/ nullptr,
1998219982
/*.abort_callback_data =*/ nullptr,
19983+
/*.offload_policy =*/ nullptr,
1998319984
};
1998419985

1998519986
return result;
@@ -20574,6 +20575,19 @@ struct llama_context * llama_new_context_with_model(
2057420575
}
2057520576
}
2057620577

20578+
if (params.offload_policy) {
20579+
const std::vector<std::pair<int, int>>& policy = *(const std::vector<std::pair<int, int>>*)params.offload_policy;
20580+
for (auto [op, on_off] : policy) {
20581+
if (op < 0 || op >= int(GGML_OP_COUNT)) {
20582+
LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting offload policy for all ops to %s\n", on_off ? "ON" : "OFF");
20583+
} else {
20584+
LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting offload policy for op %s to %s\n",
20585+
ggml_op_name(ggml_op(op)), on_off ? "ON" : "OFF");
20586+
}
20587+
ggml_backend_sched_set_op_offload(ctx->sched, ggml_op(op), on_off);
20588+
}
20589+
}
20590+
2057720591
return ctx;
2057820592
}
2057920593

@@ -23222,3 +23236,10 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
2322223236
fputs(text, stderr);
2322323237
fflush(stderr);
2322423238
}
23239+
23240+
void llama_set_offload_policy(struct llama_context * lctx, int op, bool on_or_off) {
23241+
if (!lctx || !lctx->sched) return;
23242+
const char * op_name = op < 0 || op >= int(GGML_OP_COUNT) ? "all ops" : ggml_op_name(ggml_op(op));
23243+
printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXX offload(%s) = %d\n", op_name, on_or_off);
23244+
ggml_backend_sched_set_op_offload(lctx->sched, ggml_op(op), on_or_off);
23245+
}

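Besides the -op flag, the policy can be passed programmatically through llama_context_params: the new offload_policy field is an opaque pointer to a caller-owned std::vector<std::pair<int,int>> that is read once inside llama_new_context_with_model. A hedged sketch (the helper below is not part of the commit):

    #include <utility>
    #include <vector>
    #include "ggml.h"
    #include "llama.h"

    static llama_context * new_context_with_policy(llama_model * model) {
        // Keep the vector alive at least until llama_new_context_with_model returns.
        static std::vector<std::pair<int,int>> policy = {
            { -1, 0 },                    // first entry: offload OFF for all ops
            { (int) GGML_OP_MUL_MAT, 1 }, // then: back ON for plain matmuls
        };
        llama_context_params cparams = llama_context_default_params();
        cparams.offload_policy = (void *) &policy;
        return llama_new_context_with_model(model, cparams);
    }
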