
Commit 2c2decb

Optimize alignment and buffer management
1 parent 088e00b

3 files changed: +58 -9 lines changed


ggml/src/ggml-alloc.c

Lines changed: 14 additions & 2 deletions
@@ -66,10 +66,22 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
 }
 
+// return the next offset aligned to the specified power-of-two boundary
+// optimized to avoid expensive modulo operations for common alignments
 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
     assert(alignment && !(alignment & (alignment - 1))); // power of 2
-    size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
-    return offset + align;
+
+    uintptr_t addr = (uintptr_t) buffer + offset;
+
+    switch (alignment) {
+        case 16: return offset + ((-addr) & 15);
+        case 32: return offset + ((-addr) & 31);
+        case 64: return offset + ((-addr) & 63);
+        default: {
+            size_t mask = alignment - 1;
+            return offset + ((-addr) & mask);
+        }
+    }
 }
 
 // tallocr
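
A note on the hunk above: for a power-of-two alignment a, the padding to the next a-byte boundary is (-addr) & (a - 1); the switch hard-codes that mask for the 16/32/64-byte cases the comment calls common, and the default branch computes it generically, matching the old (a - (addr % a)) % a expression without the two modulo operations. A small standalone check of that equivalence (hypothetical test harness, not part of this commit):

#include <cassert>
#include <cstdint>
#include <cstdio>

// old formula: padding needed to reach the next 'alignment' boundary, two modulo ops
static size_t pad_mod(uintptr_t addr, size_t alignment) {
    return (alignment - (addr % alignment)) % alignment;
}

// new formula: same padding via a mask; valid only when 'alignment' is a power of two
static size_t pad_mask(uintptr_t addr, size_t alignment) {
    return (size_t)((-addr) & (alignment - 1));
}

int main() {
    const size_t alignments[] = {16, 32, 64, 128};
    for (size_t a : alignments) {
        for (uintptr_t addr = 0; addr < 4096; ++addr) {
            assert(pad_mod(addr, a) == pad_mask(addr, a));
        }
    }
    printf("mask-based padding matches the modulo formula\n");
    return 0;
}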

src/llama-context.cpp

Lines changed: 7 additions & 3 deletions
@@ -87,9 +87,13 @@ llama_context::llama_context(
     // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
     // ref: https://github.com/ggerganov/llama.cpp/pull/5021
     // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_context_kv_self
-    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
-        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
-        cparams.n_batch = GGML_KQ_MASK_PAD;
+    bool gpu_backend = model.params.n_gpu_layers > 0;
+    if (gpu_backend && cparams.causal_attn) {
+        uint32_t padded = GGML_PAD(cparams.n_batch, GGML_KQ_MASK_PAD);
+        if (padded != cparams.n_batch) {
+            LLAMA_LOG_WARN("%s: n_batch padded from %u to %u due to GPU requirements\n", __func__, cparams.n_batch, padded);
+            cparams.n_batch = padded;
+        }
     }
 
     cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
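
For context, GGML_PAD(x, n) rounds x up to the next multiple of n, so the new code pads n_batch to a multiple of GGML_KQ_MASK_PAD and leaves it untouched when it already is one, rather than only raising values below the minimum. A quick illustration of that rounding, assuming the usual round-up-to-multiple macro and an illustrative pad of 32 (the real GGML_KQ_MASK_PAD value comes from the llama.cpp sources):

#include <cstdint>
#include <cstdio>

// round x up to the next multiple of n (n must be a power of two);
// this mirrors the usual definition of GGML_PAD in ggml.h
static uint32_t pad_to(uint32_t x, uint32_t n) {
    return (x + n - 1) & ~(n - 1);
}

int main() {
    const uint32_t kq_mask_pad = 32; // illustrative stand-in for GGML_KQ_MASK_PAD
    const uint32_t batches[] = {1, 32, 33, 100, 512};
    for (uint32_t n_batch : batches) {
        uint32_t padded = pad_to(n_batch, kq_mask_pad);
        // prints: 1 -> 32, 32 -> 32, 33 -> 64, 100 -> 128, 512 -> 512
        printf("n_batch %3u -> %3u%s\n", (unsigned) n_batch, (unsigned) padded,
               padded != n_batch ? "  (would trigger the warning)" : "");
    }
    return 0;
}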

src/llama-model.cpp

Lines changed: 37 additions & 4 deletions
@@ -117,6 +117,31 @@ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_
     { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
 };
 
+// RAII helper for temporary buffer assignment
+struct buffer_guard {
+    explicit buffer_guard(ggml_tensor * t, ggml_backend_buffer_type_t buft) : t(t) {
+        t->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
+    }
+    ~buffer_guard() {
+        if (t->buffer) {
+            ggml_backend_buffer_free(t->buffer);
+            t->buffer = nullptr;
+        }
+    }
+    ggml_tensor * t;
+};
+
+// cache for operation support checks
+struct op_support_key {
+    ggml_backend_dev_t dev;
+    ggml_backend_buffer_type_t buft;
+    ggml_op op;
+    bool operator<(const op_support_key & other) const {
+        return std::tie(dev, buft, op) < std::tie(other.dev, other.buft, other.op);
+    }
+};
+static std::map<op_support_key, bool> g_op_support_cache;
+
 std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
     return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
 }
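
The op_support_key above relies on std::tie: it builds tuples of references over the three members, and tuple's operator< compares them lexicographically, which gives std::map the strict weak ordering it needs from a key type. A minimal standalone analogue of that key plus the find-then-store memoization pattern used in the following hunks (int fields and a fake probe stand in for the ggml handle types and the real supports_op check):

#include <cstdio>
#include <map>
#include <tuple>

// hypothetical three-field key ordered lexicographically via std::tie,
// mirroring op_support_key
struct cache_key {
    int dev;
    int buft;
    int op;
    bool operator<(const cache_key & other) const {
        return std::tie(dev, buft, op) < std::tie(other.dev, other.buft, other.op);
    }
};

static std::map<cache_key, bool> cache;

// memoized check in the style of weight_buft_supported:
// look up first, compute and store only on a miss
static bool cached_supported(const cache_key & key) {
    auto it = cache.find(key);
    if (it != cache.end()) {
        return it->second;
    }
    bool supported = (key.op % 2 == 0); // stand-in for the real supports_op probe
    cache[key] = supported;
    return supported;
}

int main() {
    printf("%d %d %d\n", cached_supported({0, 1, 2}), cached_supported({0, 1, 3}), cached_supported({0, 1, 2}));
    return 0;
}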
@@ -135,6 +160,12 @@ static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::st
 static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
     GGML_ASSERT(w != nullptr);
 
+    op_support_key key { dev, buft, op };
+    auto it = g_op_support_cache.find(key);
+    if (it != g_op_support_cache.end()) {
+        return it->second;
+    }
+
     if (op == GGML_OP_NONE) {
         return true;
     }
@@ -245,10 +276,10 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
 
     // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
     GGML_ASSERT(w->buffer == nullptr);
-    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
+    buffer_guard guard(w, buft);
     bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
-    ggml_backend_buffer_free(w->buffer);
-    w->buffer = nullptr;
+
+    g_op_support_cache[key] = op_supported;
 
     return op_supported;
 }
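
buffer_guard replaces the manual alloc/free pair with scope-based cleanup: once the guard is constructed, the dummy buffer is freed and w->buffer reset by the destructor on every path out of the function, so no explicit cleanup is needed before the return. A generic sketch of the same RAII shape, with hypothetical acquire/release functions standing in for the ggml buffer calls:

#include <cstdio>

// hypothetical acquire/release pair standing in for
// ggml_backend_buft_alloc_buffer / ggml_backend_buffer_free
static int * acquire_dummy() { return new int(42); }
static void release_dummy(int * p) { delete p; }

// scope guard in the style of buffer_guard: acquire in the constructor,
// release unconditionally in the destructor
struct scoped_dummy {
    scoped_dummy() : p(acquire_dummy()) {}
    ~scoped_dummy() { release_dummy(p); }
    int * p;
};

static bool probe(bool bail_out_early) {
    scoped_dummy guard;          // lives until the end of this scope
    if (bail_out_early) {
        return false;            // destructor still runs: nothing leaks on the early return
    }
    return *guard.p == 42;       // normal path: destructor runs after the return value is computed
}

int main() {
    printf("%d %d\n", probe(true), probe(false));
    return 0;
}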
@@ -262,7 +293,9 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
     for (const auto & cur : buft_list) {
         ggml_backend_dev_t cur_dev = cur.first;
         ggml_backend_buffer_type_t cur_buft = cur.second;
-        if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
+        bool should_offload = ggml_backend_dev_type(cur_dev) != GGML_BACKEND_DEVICE_TYPE_CPU ?
+            ggml_backend_dev_offload_op(cur_dev, tensor) : true;
+        if (should_offload && weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
            return cur_buft;
        }
    }
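
In the last hunk, CPU devices are always considered, while non-CPU devices are first asked via ggml_backend_dev_offload_op whether offloading this tensor is worthwhile, and only then checked with weight_buft_supported. A compile-only sketch of that decision factored into a helper (the helper name is invented; the two ggml calls are the ones already used in the diff):

#include "ggml-backend.h"

// illustrative helper expressing the same decision as the inline ternary in
// select_weight_buft: CPU devices are always eligible, other devices are
// consulted via ggml_backend_dev_offload_op
static bool device_wants_tensor(ggml_backend_dev_t dev, const struct ggml_tensor * tensor) {
    if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
        return true; // never skip the CPU fallback
    }
    return ggml_backend_dev_offload_op(dev, tensor);
}

With such a helper the loop condition would read if (device_wants_tensor(cur_dev, tensor) && weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)).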