common/arg.cpp (5 changes: 3 additions & 2 deletions)
@@ -2344,8 +2344,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                     }
                     throw std::invalid_argument("unknown buffer type");
                 }
-                // FIXME: this leaks memory
-                params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
+                // store pattern to ensure lifetime for the C-string
+                params.tensor_buft_override_names.push_back(tensor_name);
+                params.tensor_buft_overrides.push_back({params.tensor_buft_override_names.back().c_str(), buft_list.at(buffer_type)});
             }
         }
     ));
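
Annotation (not part of the diff): the previous code strdup()'d every --override-tensor pattern and never freed the copy, hence the FIXME; the new code keeps the std::string itself alive in the params and stores its c_str() pointer instead. A minimal standalone sketch of the before/after, using a hypothetical override_entry stand-in rather than the real llama_model_tensor_buft_override:

    #include <cstring>
    #include <deque>
    #include <string>
    #include <vector>

    struct override_entry { const char * pattern; int buft; }; // stand-in type, for illustration only

    int main() {
        std::deque<std::string>     names;     // owned storage with the same lifetime as the params
        std::vector<override_entry> overrides;

        std::string tensor_name = "blk\\.0\\.ffn_up"; // example pattern, not taken from the PR

        // before: strdup() allocates a copy that nothing ever free()s, so it leaks
        // overrides.push_back({strdup(tensor_name.c_str()), 0});

        // after: keep the string alive and point at it
        names.push_back(tensor_name);
        overrides.push_back({names.back().c_str(), 0});
        return 0;
    }
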
common/common.h (2 changes: 2 additions & 0 deletions)
@@ -8,6 +8,7 @@
 #include <string>
 #include <string_view>
 #include <vector>
+#include <deque>
 #include <sstream>
 
 #ifdef _WIN32
@@ -281,6 +282,7 @@ struct common_params {
     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
+    std::deque<std::string> tensor_buft_override_names; // storage for tensor override patterns
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
 
     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
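
Annotation (not part of the diff): a std::deque is presumably chosen over std::vector for tensor_buft_override_names because deque::push_back never relocates existing elements, so c_str() pointers already stored in tensor_buft_overrides remain valid, whereas vector::push_back may reallocate and move the strings. A small self-contained illustration of that guarantee:

    #include <cstdio>
    #include <deque>
    #include <string>

    int main() {
        std::deque<std::string> patterns;
        patterns.push_back("first-pattern-kept-alive");
        const char * p = patterns.back().c_str(); // pointer handed out, as in common/arg.cpp above

        for (int i = 0; i < 1000; ++i) {
            patterns.push_back("more");           // existing elements are never moved
        }

        std::printf("%s\n", p);                   // still valid; with std::vector this could dangle
        return 0;
    }
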
ggml/src/ggml-alloc.c (16 changes: 14 additions & 2 deletions)
@@ -66,10 +66,22 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
 }
 
+// return the next offset aligned to the specified power-of-two boundary
+// optimized to avoid expensive modulo operations for common alignments
 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
     assert(alignment && !(alignment & (alignment - 1))); // power of 2
-    size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
-    return offset + align;
+
+    uintptr_t addr = (uintptr_t) buffer + offset;
+
+    switch (alignment) {
+        case 16: return offset + ((-addr) & 15);
+        case 32: return offset + ((-addr) & 31);
+        case 64: return offset + ((-addr) & 63);
+        default: {
+            size_t mask = alignment - 1;
+            return offset + ((-addr) & mask);
+        }
+    }
 }
 
 // tallocr
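
Annotation (not part of the diff): for a power-of-two alignment a, the branch-free padding (-addr) & (a - 1) equals the old modulo-based (a - addr % a) % a. A quick standalone check of that equivalence (pad_mod and pad_mask are illustrative helpers, not ggml functions):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    static size_t pad_mod(uintptr_t addr, size_t a)  { return (a - (addr % a)) % a; } // old formula
    static size_t pad_mask(uintptr_t addr, size_t a) { return (0 - addr) & (a - 1); } // new formula, needs power-of-two a

    int main() {
        const size_t alignments[] = {16, 32, 64, 128, 256};
        for (size_t a : alignments) {
            for (uintptr_t addr = 0; addr < 4096; ++addr) {
                assert(pad_mod(addr, a) == pad_mask(addr, a));
            }
        }
        std::puts("padding formulas agree for power-of-two alignments");
        return 0;
    }
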
src/llama-context.cpp (10 changes: 7 additions & 3 deletions)
@@ -87,9 +87,13 @@ llama_context::llama_context(
     // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
     // ref: https://github.com/ggerganov/llama.cpp/pull/5021
     // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_context_kv_self
-    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
-        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
-        cparams.n_batch = GGML_KQ_MASK_PAD;
+    bool gpu_backend = model.params.n_gpu_layers > 0;
+    if (gpu_backend && cparams.causal_attn) {
+        uint32_t padded = GGML_PAD(cparams.n_batch, GGML_KQ_MASK_PAD);
+        if (padded != cparams.n_batch) {
+            LLAMA_LOG_WARN("%s: n_batch padded from %u to %u due to GPU requirements\n", __func__, cparams.n_batch, padded);
+            cparams.n_batch = padded;
+        }
     }
 
     cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
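
Annotation (not part of the diff): the old check only raised n_batch to GGML_KQ_MASK_PAD when it was below that value; the new code rounds n_batch up to a multiple of it, and only when layers are offloaded (n_gpu_layers > 0) and causal attention is enabled. A sketch of the rounding step, assuming GGML_PAD(x, n) rounds x up to the next multiple of n; pad_up and the constant 64 below are illustrative, not quoted from the ggml headers:

    #include <cstdint>
    #include <cstdio>

    static uint32_t pad_up(uint32_t x, uint32_t n) { return (x + n - 1) / n * n; } // assumed GGML_PAD behavior

    int main() {
        const uint32_t kq_mask_pad = 64; // illustrative value only
        const uint32_t batches[]   = {1, 63, 64, 100, 512, 777};
        for (uint32_t n_batch : batches) {
            std::printf("n_batch %4u -> padded %4u\n", n_batch, pad_up(n_batch, kq_mask_pad));
        }
        return 0;
    }
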
src/llama-model.cpp (41 changes: 37 additions & 4 deletions)
@@ -117,6 +117,31 @@ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_
     { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
 };
 
+// RAII helper for temporary buffer assignment
+struct buffer_guard {
+    explicit buffer_guard(ggml_tensor * t, ggml_backend_buffer_type_t buft) : t(t) {
+        t->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
+    }
+    ~buffer_guard() {
+        if (t->buffer) {
+            ggml_backend_buffer_free(t->buffer);
+            t->buffer = nullptr;
+        }
+    }
+    ggml_tensor * t;
+};
+
+// cache for operation support checks
+struct op_support_key {
+    ggml_backend_dev_t dev;
+    ggml_backend_buffer_type_t buft;
+    ggml_op op;
+    bool operator<(const op_support_key & other) const {
+        return std::tie(dev, buft, op) < std::tie(other.dev, other.buft, other.op);
+    }
+};
+static std::map<op_support_key, bool> g_op_support_cache;
+
 std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
     return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
 }
@@ -135,6 +160,12 @@ static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::st
 static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
     GGML_ASSERT(w != nullptr);
 
+    op_support_key key { dev, buft, op };
+    auto it = g_op_support_cache.find(key);
+    if (it != g_op_support_cache.end()) {
+        return it->second;
+    }
+
     if (op == GGML_OP_NONE) {
         return true;
     }
@@ -245,10 +276,10 @@
 
     // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
     GGML_ASSERT(w->buffer == nullptr);
-    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
+    buffer_guard guard(w, buft);
     bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
-    ggml_backend_buffer_free(w->buffer);
-    w->buffer = nullptr;
+
+    g_op_support_cache[key] = op_supported;
 
     return op_supported;
 }
@@ -262,7 +293,9 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
     for (const auto & cur : buft_list) {
         ggml_backend_dev_t cur_dev = cur.first;
         ggml_backend_buffer_type_t cur_buft = cur.second;
-        if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
+        bool should_offload = ggml_backend_dev_type(cur_dev) != GGML_BACKEND_DEVICE_TYPE_CPU ?
+            ggml_backend_dev_offload_op(cur_dev, tensor) : true;
+        if (should_offload && weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
             return cur_buft;
         }
     }
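
Annotation (not part of the diff): buffer_guard replaces the manual alloc/free pair so the dummy buffer is released on every exit path out of weight_buft_supported, and g_op_support_cache memoizes the supports_op probe per (device, buffer type, op) triple. The RAII shape reduced to a standalone sketch with stand-in types (fake_tensor, guard and probe are hypothetical, not the ggml API):

    #include <cstdio>

    struct fake_tensor { void * buffer = nullptr; }; // stand-in for ggml_tensor

    struct guard {                                   // same pattern as buffer_guard above
        explicit guard(fake_tensor * t) : t(t) { t->buffer = new int(0); } // pretend allocation
        ~guard() {
            if (t->buffer) {
                delete static_cast<int *>(t->buffer);
                t->buffer = nullptr;
                std::puts("dummy buffer released");
            }
        }
        fake_tensor * t;
    };

    static bool probe(fake_tensor & w, bool early_exit) {
        guard g(&w);                      // buffer exists only for the duration of the check
        if (early_exit) { return false; } // released here as well, not just on the normal path
        return true;
    }

    int main() {
        fake_tensor w;
        probe(w, true);
        probe(w, false);
        return 0;
    }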