@@ -1104,9 +1104,34 @@ struct ggml_backend_sched {
     char * context_buffer;
     size_t context_buffer_size;
 
+    uint32_t op_offload[(GGML_OP_COUNT + 31)/32];
+
     bool debug;
 };
 
+void ggml_backend_sched_set_op_offload(ggml_backend_sched_t sched, enum ggml_op op, bool on_or_off) {
+    if (!sched) return;
+    int int_op = (int) op;
+    if (int_op < 0 || int_op >= (int) GGML_OP_COUNT) { // out-of-range op: toggle every op at once
+        uint32_t mask = on_or_off ? 0xffffffff : 0;
+        for (int i = 0; i < (GGML_OP_COUNT + 31)/32; ++i) sched->op_offload[i] = mask;
+        return;
+    }
+    int i = int_op >> 5;  // word index in the bitmap
+    int j = int_op & 31;  // bit index within the word
+    if (on_or_off) {
+        sched->op_offload[i] |=  (1u << j);
+    } else {
+        sched->op_offload[i] &= ~(1u << j);
+    }
+}
+
+static inline bool ggml_backend_sched_offload_enabled(ggml_backend_sched_t sched, enum ggml_op op) {
+    int int_op = (int) op;
+    if (!sched || int_op < 0 || int_op >= (int) GGML_OP_COUNT) return false;
+    return (sched->op_offload[int_op >> 5] & (1u << (int_op & 31))) != 0;
+}
+
 #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
 #define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
 #define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
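A minimal usage sketch of the new setter (assuming the declaration is also exposed in the public ggml-backend header; `GGML_OP_MUL_MAT` is used purely as an illustrative op):

```c
#include "ggml.h"
#include "ggml-backend.h"

// sketch: keep MUL_MAT on the backend that owns the weights by disabling the
// "offload to a higher-priority backend" path for that op only; every other
// op keeps the default behavior (offload enabled)
static void configure_offload(ggml_backend_sched_t sched) {
    ggml_backend_sched_set_op_offload(sched, GGML_OP_MUL_MAT, false);
}
```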
@@ -1181,6 +1206,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     }
 
     // operations with weights are preferably run on the same backend as the weights
+    bool offload_enabled = ggml_backend_sched_offload_enabled(sched, tensor->op);
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = tensor->src[i];
         if (src == NULL) {
@@ -1189,7 +1215,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
             int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
             // check if a backend with higher prio wants to offload the op
-            if (src_backend_id == sched->n_backends - 1) {
+            if (offload_enabled && src_backend_id == sched->n_backends - 1) {
                 for (int b = 0; b < src_backend_id; b++) {
                     if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
                         SET_CAUSE(tensor, "1.off");
@@ -1888,6 +1914,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
 
     struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
 
+    for (int i = 0; i < (GGML_OP_COUNT + 31)/32; ++i) sched->op_offload[i] = 0xffffffff; // offload enabled for all ops by default
+
     sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
     sched->n_backends = n_backends;
     sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
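Because the setter treats an out-of-range op as "all ops", the whole bitmap can also be flipped with a single call after construction; a sketch of that usage (`GGML_OP_COUNT` is not a valid op, so it hits the fill-all branch):

```c
// sketch: disable weight-based offload for every op, then restore the default
ggml_backend_sched_set_op_offload(sched, GGML_OP_COUNT, false);
ggml_backend_sched_set_op_offload(sched, GGML_OP_COUNT, true);
```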