@@ -117,6 +117,31 @@ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_
     { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
 };
 
+// RAII helper for temporary buffer assignment
+struct buffer_guard {
+    explicit buffer_guard(ggml_tensor * t, ggml_backend_buffer_type_t buft) : t(t) {
+        t->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
+    }
+    ~buffer_guard() {
+        if (t->buffer) {
+            ggml_backend_buffer_free(t->buffer);
+            t->buffer = nullptr;
+        }
+    }
+    ggml_tensor * t;
+};
+
+// cache for operation support checks
+struct op_support_key {
+    ggml_backend_dev_t dev;
+    ggml_backend_buffer_type_t buft;
+    ggml_op op;
+    bool operator<(const op_support_key & other) const {
+        return std::tie(dev, buft, op) < std::tie(other.dev, other.buft, other.op);
+    }
+};
+static std::map<op_support_key, bool> g_op_support_cache;
+
 std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
     return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
 }
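The guard introduced in this hunk is what makes the later change to weight_buft_supported safe: the dummy buffer is detached on every exit path, not only on the one that originally freed it by hand. A minimal standalone sketch of that behaviour follows, using hypothetical stand-in types (demo_tensor, demo_buffer, demo_guard) instead of the real ggml handles, so it is an illustration rather than part of the change:

#include <cstdio>

// hypothetical stand-ins for ggml_tensor and its backend buffer, for illustration only
struct demo_buffer { };
struct demo_tensor { demo_buffer * buffer = nullptr; };

static demo_buffer * demo_alloc()               { return new demo_buffer(); }
static void          demo_free(demo_buffer * b) { delete b; }

// same shape as buffer_guard above: attach on construction, detach on destruction
struct demo_guard {
    explicit demo_guard(demo_tensor * t) : t(t) { t->buffer = demo_alloc(); }
    ~demo_guard() {
        if (t->buffer) {
            demo_free(t->buffer);
            t->buffer = nullptr;
        }
    }
    demo_tensor * t;
};

static bool probe(demo_tensor & w, bool supported) {
    demo_guard guard(&w);   // dummy buffer attached only for the duration of the probe
    if (!supported) {
        return false;       // early return: the destructor still detaches the buffer
    }
    return true;
}

int main() {
    demo_tensor w;
    probe(w, false);
    printf("%s\n", w.buffer == nullptr ? "buffer detached" : "buffer leaked");
    return 0;
}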
@@ -135,6 +160,12 @@ static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::st
 static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
     GGML_ASSERT(w != nullptr);
 
+    op_support_key key { dev, buft, op };
+    auto it = g_op_support_cache.find(key);
+    if (it != g_op_support_cache.end()) {
+        return it->second;
+    }
+
     if (op == GGML_OP_NONE) {
         return true;
     }
@@ -245,10 +276,10 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
 
     // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
     GGML_ASSERT(w->buffer == nullptr);
-    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
+    buffer_guard guard(w, buft);
     bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
-    ggml_backend_buffer_free(w->buffer);
-    w->buffer = nullptr;
+
+    g_op_support_cache[key] = op_supported;
 
     return op_supported;
 }
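Together with the key introduced in the first hunk, the write-back above is plain memoization: the first probe for a given (device, buffer type, op) triple pays for the dummy allocation and the supports_op call, and every later probe for the same triple returns the cached answer. A standalone sketch of the pattern follows, using plain ints as stand-ins for the ggml handle types, so it is an illustration rather than part of the change:

#include <cstdio>
#include <map>
#include <tuple>

// stand-in key: ints instead of ggml_backend_dev_t, ggml_backend_buffer_type_t and ggml_op
struct demo_key {
    int dev;
    int buft;
    int op;
    bool operator<(const demo_key & other) const {
        // std::tie gives the lexicographic ordering std::map needs
        return std::tie(dev, buft, op) < std::tie(other.dev, other.buft, other.op);
    }
};

static std::map<demo_key, bool> demo_cache;

static bool demo_supported(const demo_key & key) {
    auto it = demo_cache.find(key);
    if (it != demo_cache.end()) {
        return it->second;            // repeat probes are answered from the cache
    }
    bool supported = (key.op != 0);   // placeholder for the real supports_op probe
    demo_cache[key] = supported;
    return supported;
}

int main() {
    printf("%d\n", demo_supported({0, 1, 2}));   // computed and memoized
    printf("%d\n", demo_supported({0, 1, 2}));   // served from the cache
    return 0;
}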
@@ -262,7 +293,9 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
     for (const auto & cur : buft_list) {
         ggml_backend_dev_t cur_dev = cur.first;
         ggml_backend_buffer_type_t cur_buft = cur.second;
-        if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
+        bool should_offload = ggml_backend_dev_type(cur_dev) != GGML_BACKEND_DEVICE_TYPE_CPU ?
+            ggml_backend_dev_offload_op(cur_dev, tensor) : true;
+        if (should_offload && weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
             return cur_buft;
         }
     }
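The ternary above reads more directly as a short-circuit: a CPU device is always considered, and a non-CPU device is only considered if it is willing to take the tensor. A tiny standalone sketch of that equivalence, with hypothetical stand-in predicates in place of ggml_backend_dev_type and ggml_backend_dev_offload_op:

#include <cassert>

// hypothetical stand-ins for the two device queries used above, for illustration only
static bool is_cpu_device(int dev)             { return dev == 0; }
static bool device_wants_offload(int /*dev*/)  { return false; }

// equivalent short-circuit form of the should_offload ternary in the hunk above
static bool should_offload(int dev) {
    return is_cpu_device(dev) || device_wants_offload(dev);
}

int main() {
    assert(should_offload(0));    // CPU device: always eligible
    assert(!should_offload(1));   // non-CPU device that declines offload is skipped
    return 0;
}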