
Commit 2c2decb

Optimize alignment and buffer management
1 parent 088e00b

3 files changed: +58 -9 lines changed


ggml/src/ggml-alloc.c

Lines changed: 14 additions & 2 deletions
@@ -66,10 +66,22 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
 }
 
+// return the next offset aligned to the specified power-of-two boundary
+// optimized to avoid expensive modulo operations for common alignments
 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
     assert(alignment && !(alignment & (alignment - 1))); // power of 2
-    size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
-    return offset + align;
+
+    uintptr_t addr = (uintptr_t) buffer + offset;
+
+    switch (alignment) {
+        case 16: return offset + ((-addr) & 15);
+        case 32: return offset + ((-addr) & 31);
+        case 64: return offset + ((-addr) & 63);
+        default: {
+            size_t mask = alignment - 1;
+            return offset + ((-addr) & mask);
+        }
+    }
 }
 
 // tallocr
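
A note on the hunk above: for a power-of-two alignment a, the padding to the next a-byte boundary is (-addr) & (a - 1); the switch hard-codes that mask for the 16/32/64-byte cases the comment calls common, and the default branch computes it generically, matching the old (a - (addr % a)) % a expression without the two modulo operations. A small standalone check of that equivalence (hypothetical test harness, not part of this commit):

#include <cassert>
#include <cstdint>
#include <cstdio>

// old formula: padding needed to reach the next 'alignment' boundary, two modulo ops
static size_t pad_mod(uintptr_t addr, size_t alignment) {
    return (alignment - (addr % alignment)) % alignment;
}

// new formula: same padding via a mask; valid only when 'alignment' is a power of two
static size_t pad_mask(uintptr_t addr, size_t alignment) {
    return (size_t)((-addr) & (alignment - 1));
}

int main() {
    const size_t alignments[] = {16, 32, 64, 128};
    for (size_t a : alignments) {
        for (uintptr_t addr = 0; addr < 4096; ++addr) {
            assert(pad_mod(addr, a) == pad_mask(addr, a));
        }
    }
    printf("mask-based padding matches the modulo formula\n");
    return 0;
}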

src/llama-context.cpp

Lines changed: 7 additions & 3 deletions
@@ -87,9 +87,13 @@ llama_context::llama_context(
     // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
     // ref: https://github.com/ggerganov/llama.cpp/pull/5021
     // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_context_kv_self
-    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
-        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
-        cparams.n_batch = GGML_KQ_MASK_PAD;
+    bool gpu_backend = model.params.n_gpu_layers > 0;
+    if (gpu_backend && cparams.causal_attn) {
+        uint32_t padded = GGML_PAD(cparams.n_batch, GGML_KQ_MASK_PAD);
+        if (padded != cparams.n_batch) {
+            LLAMA_LOG_WARN("%s: n_batch padded from %u to %u due to GPU requirements\n", __func__, cparams.n_batch, padded);
+            cparams.n_batch = padded;
+        }
     }
 
     cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
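
For context, GGML_PAD(x, n) rounds x up to the next multiple of n, so the new code pads n_batch to a multiple of GGML_KQ_MASK_PAD and leaves it untouched when it already is one, rather than only raising values below the minimum. A quick illustration of that rounding, assuming the usual round-up-to-multiple macro and an illustrative pad of 32 (the real GGML_KQ_MASK_PAD value comes from the llama.cpp sources):

#include <cstdint>
#include <cstdio>

// round x up to the next multiple of n (n must be a power of two);
// this mirrors the usual definition of GGML_PAD in ggml.h
static uint32_t pad_to(uint32_t x, uint32_t n) {
    return (x + n - 1) & ~(n - 1);
}

int main() {
    const uint32_t kq_mask_pad = 32; // illustrative stand-in for GGML_KQ_MASK_PAD
    const uint32_t batches[] = {1, 32, 33, 100, 512};
    for (uint32_t n_batch : batches) {
        uint32_t padded = pad_to(n_batch, kq_mask_pad);
        // prints: 1 -> 32, 32 -> 32, 33 -> 64, 100 -> 128, 512 -> 512
        printf("n_batch %3u -> %3u%s\n", (unsigned) n_batch, (unsigned) padded,
               padded != n_batch ? "  (would trigger the warning)" : "");
    }
    return 0;
}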

src/llama-model.cpp

Lines changed: 37 additions & 4 deletions
@@ -117,6 +117,31 @@ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_
     { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
 };
 
+// RAII helper for temporary buffer assignment
+struct buffer_guard {
+    explicit buffer_guard(ggml_tensor * t, ggml_backend_buffer_type_t buft) : t(t) {
+        t->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
+    }
+    ~buffer_guard() {
+        if (t->buffer) {
+            ggml_backend_buffer_free(t->buffer);
+            t->buffer = nullptr;
+        }
+    }
+    ggml_tensor * t;
+};
+
+// cache for operation support checks
+struct op_support_key {
+    ggml_backend_dev_t dev;
+    ggml_backend_buffer_type_t buft;
+    ggml_op op;
+    bool operator<(const op_support_key & other) const {
+        return std::tie(dev, buft, op) < std::tie(other.dev, other.buft, other.op);
+    }
+};
+static std::map<op_support_key, bool> g_op_support_cache;
+
 std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
     return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
 }
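
The op_support_key above relies on std::tie: it builds tuples of references over the three members, and tuple's operator< compares them lexicographically, which gives std::map the strict weak ordering it needs from a key type. A minimal standalone analogue of that key plus the find-then-store memoization pattern used in the following hunks (int fields and a fake probe stand in for the ggml handle types and the real supports_op check):

#include <cstdio>
#include <map>
#include <tuple>

// hypothetical three-field key ordered lexicographically via std::tie,
// mirroring op_support_key
struct cache_key {
    int dev;
    int buft;
    int op;
    bool operator<(const cache_key & other) const {
        return std::tie(dev, buft, op) < std::tie(other.dev, other.buft, other.op);
    }
};

static std::map<cache_key, bool> cache;

// memoized check in the style of weight_buft_supported:
// look up first, compute and store only on a miss
static bool cached_supported(const cache_key & key) {
    auto it = cache.find(key);
    if (it != cache.end()) {
        return it->second;
    }
    bool supported = (key.op % 2 == 0); // stand-in for the real supports_op probe
    cache[key] = supported;
    return supported;
}

int main() {
    printf("%d %d %d\n", cached_supported({0, 1, 2}), cached_supported({0, 1, 3}), cached_supported({0, 1, 2}));
    return 0;
}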
@@ -135,6 +160,12 @@ static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::st
 static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
     GGML_ASSERT(w != nullptr);
 
+    op_support_key key { dev, buft, op };
+    auto it = g_op_support_cache.find(key);
+    if (it != g_op_support_cache.end()) {
+        return it->second;
+    }
+
     if (op == GGML_OP_NONE) {
         return true;
     }
@@ -245,10 +276,10 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
 
     // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
     GGML_ASSERT(w->buffer == nullptr);
-    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
+    buffer_guard guard(w, buft);
     bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
-    ggml_backend_buffer_free(w->buffer);
-    w->buffer = nullptr;
+
+    g_op_support_cache[key] = op_supported;
 
     return op_supported;
 }
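
buffer_guard replaces the manual alloc/free pair with scope-based cleanup: once the guard is constructed, the dummy buffer is freed and w->buffer reset by the destructor on every path out of the function, so no explicit cleanup is needed before the return. A generic sketch of the same RAII shape, with hypothetical acquire/release functions standing in for the ggml buffer calls:

#include <cstdio>

// hypothetical acquire/release pair standing in for
// ggml_backend_buft_alloc_buffer / ggml_backend_buffer_free
static int * acquire_dummy() { return new int(42); }
static void release_dummy(int * p) { delete p; }

// scope guard in the style of buffer_guard: acquire in the constructor,
// release unconditionally in the destructor
struct scoped_dummy {
    scoped_dummy() : p(acquire_dummy()) {}
    ~scoped_dummy() { release_dummy(p); }
    int * p;
};

static bool probe(bool bail_out_early) {
    scoped_dummy guard;          // lives until the end of this scope
    if (bail_out_early) {
        return false;            // destructor still runs: nothing leaks on the early return
    }
    return *guard.p == 42;       // normal path: destructor runs after the return value is computed
}

int main() {
    printf("%d %d\n", probe(true), probe(false));
    return 0;
}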
@@ -262,7 +293,9 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
     for (const auto & cur : buft_list) {
         ggml_backend_dev_t cur_dev = cur.first;
         ggml_backend_buffer_type_t cur_buft = cur.second;
-        if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
+        bool should_offload = ggml_backend_dev_type(cur_dev) != GGML_BACKEND_DEVICE_TYPE_CPU ?
+            ggml_backend_dev_offload_op(cur_dev, tensor) : true;
+        if (should_offload && weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
            return cur_buft;
        }
    }
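
In the last hunk, CPU devices are always considered, while non-CPU devices are first asked via ggml_backend_dev_offload_op whether offloading this tensor is worthwhile, and only then checked with weight_buft_supported. A compile-only sketch of that decision factored into a helper (the helper name is invented; the two ggml calls are the ones already used in the diff):

#include "ggml-backend.h"

// illustrative helper expressing the same decision as the inline ternary in
// select_weight_buft: CPU devices are always eligible, other devices are
// consulted via ggml_backend_dev_offload_op
static bool device_wants_tensor(ggml_backend_dev_t dev, const struct ggml_tensor * tensor) {
    if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
        return true; // never skip the CPU fallback
    }
    return ggml_backend_dev_offload_op(dev, tensor);
}

With such a helper the loop condition would read if (device_wants_tensor(cur_dev, tensor) && weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)).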