
Commit ab56b24

Merge branch 'ggml-org:master' into master

2 parents: 09b18dc + 606a73f

File tree

13 files changed: 274 additions & 169 deletions

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
@@ -97,7 +97,7 @@ jobs:
           ctest -L 'main|curl' --verbose --timeout 900

   macOS-latest-cmake-x64:
-    runs-on: macos-13
+    runs-on: macos-15-intel

     steps:
       - name: Clone

.github/workflows/release.yml

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@ jobs:
           name: llama-bin-macos-arm64.zip

   macOS-x64:
-    runs-on: macos-13
+    runs-on: macos-15-intel

     steps:
       - name: Clone
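
Note: both macOS x64 jobs move from the macos-13 image to macos-15-intel. Both labels are Intel-architecture runners, so the x64 builds are unchanged apart from the newer OS image, which keeps them on a supported runner as macos-13 heads toward retirement.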

ggml/src/ggml-alloc.c

Lines changed: 16 additions & 14 deletions
@@ -392,12 +392,8 @@ static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
     free(alloc);
 }

-static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
-    size_t max_size = 0;
-    for (int i = 0; i < alloc->n_chunks; i++) {
-        max_size += alloc->chunks[i]->max_size;
-    }
-    return max_size;
+static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc, int chunk) {
+    return chunk < alloc->n_chunks ? alloc->chunks[chunk]->max_size : 0;
 }


@@ -417,10 +413,8 @@ static void ggml_vbuffer_free(struct vbuffer * buf) {
     free(buf);
 }

-static int ggml_vbuffer_n_chunks(struct vbuffer * buf) {
-    int n = 0;
-    while (n < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[n]) n++;
-    return n;
+static size_t ggml_vbuffer_chunk_size(struct vbuffer * buf, int chunk) {
+    return buf->chunks[chunk] ? ggml_backend_buffer_get_size(buf->chunks[chunk]) : 0;
 }

 static size_t ggml_vbuffer_size(struct vbuffer * buf) {
@@ -885,12 +879,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             }
         }

-        size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
-        size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
-
         // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
-        if (new_size > cur_size || galloc->buffers[i] == NULL) {
+        bool realloc = galloc->buffers[i] == NULL;
+        size_t new_size = 0;
+        for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
+            size_t cur_chunk_size = galloc->buffers[i] ? ggml_vbuffer_chunk_size(galloc->buffers[i], c) : 0;
+            size_t new_chunk_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i], c);
+            new_size += new_chunk_size;
+            if (new_chunk_size > cur_chunk_size) {
+                realloc = true;
+            }
+        }
+        if (realloc) {
 #ifndef NDEBUG
+            size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
             GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
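
Note: totals are not enough here because a vbuffer is a sequence of fixed-capacity chunks and a chunk can never grow in place, so a graph whose memory merely shifts between chunks can require reallocation even at an identical total. A standalone sketch of the new rule, with simplified stand-ins for the real galloc structs:

// Sketch of the per-chunk reallocation rule from ggml_gallocr_reserve_n
// (simplified types; the real code walks galloc->buf_tallocs[i]->chunks).
#include <cassert>
#include <cstddef>
#include <vector>

struct chunk_sizes {
    std::vector<size_t> cur; // chunk sizes of the existing buffer (empty if none)
    std::vector<size_t> req; // per-chunk sizes required by the new graph
};

static bool needs_realloc(const chunk_sizes & s) {
    if (s.cur.empty()) {
        return true; // nothing allocated yet: always allocate
    }
    for (size_t c = 0; c < s.req.size(); c++) {
        size_t cur_chunk = c < s.cur.size() ? s.cur[c] : 0;
        if (s.req[c] > cur_chunk) {
            return true; // some chunk must grow, even if the total doesn't
        }
    }
    return false;
}

int main() {
    assert( needs_realloc({{24, 16}, {20, 20}})); // same 40-byte total, chunk 1 grows
    assert(!needs_realloc({{24, 16}, {24, 16}})); // identical layout: keep the buffer
    assert(!needs_realloc({{24, 16}, {20, 16}})); // shrinking chunks never force a realloc
    return 0;
}

The old check compared only the summed sizes (40 > 40 is false for the first case above), which is exactly the situation the new test-alloc.cpp case below exercises.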

ggml/src/ggml-metal/ggml-metal-common.cpp

Lines changed: 2 additions & 2 deletions
@@ -112,7 +112,7 @@ static bool ggml_mem_ranges_add_dst(ggml_mem_ranges_t mrs, const ggml_tensor * t
 }

 bool ggml_mem_ranges_add(ggml_mem_ranges_t mrs, const ggml_tensor * tensor) {
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
         if (tensor->src[i]) {
             ggml_mem_ranges_add_src(mrs, tensor->src[i]);
         }
@@ -173,7 +173,7 @@ static bool ggml_mem_ranges_check_dst(ggml_mem_ranges_t mrs, const ggml_tensor *
 }

 bool ggml_mem_ranges_check(ggml_mem_ranges_t mrs, const ggml_tensor * tensor) {
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
         if (tensor->src[i]) {
             if (!ggml_mem_ranges_check_src(mrs, tensor->src[i])) {
                 return false;
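
Note: GGML_MAX_DIMS and GGML_MAX_SRC are unrelated bounds: the former caps a tensor's rank, the latter caps the tensor->src array these loops walk, so the old bound silently skipped trailing sources. A minimal sketch of the failure mode, assuming the stock values from ggml.h (GGML_MAX_DIMS = 4, GGML_MAX_SRC = 10):

// Standalone sketch (not part of the patch): why the loop bound matters.
// Constant values are assumptions taken from ggml.h at the time of writing.
#include <cassert>

constexpr int GGML_MAX_DIMS = 4;   // max tensor rank
constexpr int GGML_MAX_SRC  = 10;  // max entries in tensor->src

struct tensor_stub {
    const void * src[GGML_MAX_SRC];
};

// Counts non-null sources up to `bound` -- the shape of the loops that
// ggml_mem_ranges_add/check use to register an op's inputs.
static int count_srcs(const tensor_stub & t, int bound) {
    int n = 0;
    for (int i = 0; i < bound; i++) {
        if (t.src[i]) {
            n++;
        }
    }
    return n;
}

int main() {
    tensor_stub t = {};
    for (int i = 0; i < 6; i++) {
        t.src[i] = &t; // an op with six sources
    }
    assert(count_srcs(t, GGML_MAX_SRC)  == 6); // fixed bound sees all sources
    assert(count_srcs(t, GGML_MAX_DIMS) == 4); // old bound drops src[4] and src[5]
    return 0;
}

Missed sources mean missed read ranges, so the concurrency check could declare two overlapping ops independent.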

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 95 additions & 103 deletions
(Large diff not rendered.)

ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp

Lines changed: 2 additions & 1 deletion
@@ -153,12 +153,13 @@ void main() {
     }

     if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) {
+        bool nem1_bounds_check = !(p.gqa_ratio > 1) && (p.nem1 % Br) != 0;

         [[unroll]] for (uint32_t idx = 0; idx < Bc * Br; idx += gl_WorkGroupSize.x) {
             uint32_t c = (idx + tid) % Bc;
             uint32_t r = (idx + tid) / Bc;
             if (idx + tid < Bc * Br) {
-                if (!KV_bounds_check || j * Bc + c < KV) {
+                if ((!KV_bounds_check || j * Bc + c < KV) && (!nem1_bounds_check || i * Br + r < p.nem1)) {
                     masksh[c][r] = float(data_m[m_offset + (i * Br + r) * m_stride + (j * Bc + c)]);
                 } else {
                     masksh[c][r] = float(0);
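
Note: the guard is designed to be free in the common case. A host-side C++ restatement of the predicate logic, with names mirroring the shader (my reading of the diff, not a spec):

// Host-side restatement of the shader's new mask-row guard
// (assumed semantics, inferred from the diff).
#include <cassert>
#include <cstdint>

// Bounds-checking mask rows is only needed when the row-tile height Br does
// not divide the mask's row count nem1, and GQA broadcast (gqa_ratio > 1) is
// not remapping rows anyway.
static bool need_nem1_bounds_check(uint32_t gqa_ratio, uint32_t nem1, uint32_t Br) {
    return !(gqa_ratio > 1) && (nem1 % Br) != 0;
}

// A mask element at row tile i, local row r is in bounds iff the global row
// stays below nem1; out-of-bounds rows load as 0 in the shader.
static bool mask_row_in_bounds(uint32_t i, uint32_t Br, uint32_t r, uint32_t nem1) {
    return i * Br + r < nem1;
}

int main() {
    assert(!need_nem1_bounds_check(/*gqa_ratio*/ 1, /*nem1*/ 64, /*Br*/ 16)); // 16 divides 64
    assert( need_nem1_bounds_check(1, 70, 16));  // ragged last tile
    assert(!need_nem1_bounds_check(4, 70, 16));  // GQA disables the check
    assert(!mask_row_in_bounds(/*i*/ 4, 16, /*r*/ 6, /*nem1*/ 70)); // row 70 >= 70
    return 0;
}

When Br divides nem1 exactly, no tile straddles the mask's last row, so the per-element comparison can be skipped entirely.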

ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp

Lines changed: 5 additions & 3 deletions
@@ -201,11 +201,13 @@ void main() {
     }

     if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) {
+        bool nem1_bounds_check = !(p.gqa_ratio > 1) && (p.nem1 % Br) != 0;
+
         [[unroll]] for (uint32_t idx = 0; idx < Bc * Br; idx += gl_WorkGroupSize.x) {
             uint32_t c = (idx + tid) % Bc;
             uint32_t r = (idx + tid) / Bc;
             if (idx + tid < Bc * Br || idx + gl_WorkGroupSize.x <= Bc * Br) {
-                if (!KV_bounds_check || j * Bc + c < KV) {
+                if ((!KV_bounds_check || j * Bc + c < KV) && (!nem1_bounds_check || i * Br + r < p.nem1)) {
                     sfsh[c * sfshstride + r] += ACC_TYPE(slope[r] * float(data_m[m_offset + (i * Br + r) * m_stride + (j * Bc + c)]));
                 }
             }
@@ -356,8 +358,8 @@ void main() {
     }

     if ((p.mask_n_head_log2 & SINK_ENABLE_BIT) != 0) {
-        [[unroll]] for (uint32_t r = 0; r < Br; ++r) {
-            float sink = perElemOpGetSink(r, 0u, ACC_TYPE(0), iq2);
+        [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
+            float sink = perElemOpGetSink(tile_row(r), 0u, ACC_TYPE(0), iq2);

             float ms = 1.0f;
             float vs = 1.0f;
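
Note: the second hunk narrows the sink loop from all Br tile rows to only the rows the invocation owns. The real row mapping comes from the shader's tiling helpers; this sketch uses an entirely hypothetical strided mapping just to show the ownership invariant:

// Hypothetical illustration of per-thread row ownership; tile_row's real
// definition lives in the shader, and this mapping is made up for the example.
#include <cassert>
#include <cstdint>

constexpr uint32_t Br = 16;               // tile height (assumed value)
constexpr uint32_t threads_per_col = 4;   // hypothetical thread layout
constexpr uint32_t rows_per_thread = Br / threads_per_col;

// Hypothetical mapping: thread t owns rows t, t + 4, t + 8, ...
static uint32_t tile_row(uint32_t t, uint32_t r) {
    return t + r * threads_per_col;
}

int main() {
    // Every tile row is owned by exactly one thread; iterating r < Br in each
    // thread (the old code) touched rows whose accumulators the thread does
    // not hold.
    bool owned[Br] = {};
    for (uint32_t t = 0; t < threads_per_col; t++) {
        for (uint32_t r = 0; r < rows_per_thread; r++) {
            uint32_t row = tile_row(t, r);
            assert(row < Br && !owned[row]);
            owned[row] = true;
        }
    }
    return 0;
}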

ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp

Lines changed: 22 additions & 6 deletions
@@ -154,15 +154,31 @@ void main() {
     }

     if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) {
-        tensorLayoutNV<2, Clamp> tensorLayoutM = createTensorLayoutNV(2, Clamp);
-        tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV);
-        tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, m_stride, 1);
+        bool nem1_bounds_check = !(p.gqa_ratio > 1) && (p.nem1 % Br) != 0;

-        coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> mv;
+        if (nem1_bounds_check) {
+            tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutM = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
+            tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV);
+            tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, m_stride, 1);

-        coopMatLoadTensorNV(mv, data_m, m_offset, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc));
+            coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> mv;

-        S += slopeMat*coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(mv);
+            coopMatLoadTensorNV(mv, data_m, m_offset, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc));
+
+            S += slopeMat*coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(mv);
+        } else {
+            tensorLayoutNV<2, Clamp> tensorLayoutM = createTensorLayoutNV(2, Clamp);
+            // Don't clamp against nem1 when GQA is enabled
+            uint32_t m_height = p.gqa_ratio > 1 ? ~0 : p.nem1;
+            tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, m_height, KV);
+            tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, m_stride, 1);
+
+            coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> mv;
+
+            coopMatLoadTensorNV(mv, data_m, m_offset, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc));
+
+            S += slopeMat*coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(mv);
+        }
     }

     // Clear padding elements to -inf, so they don't contribute to rowmax
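
Note: this path loads the mask through cooperative-matrix tensor layouts, so it cannot test individual elements like the other shaders; it selects a layout instead. A host-side sketch of the selection, with plain structs standing in for the tensorLayoutNV machinery (assumed semantics of the two clamp modes):

// Host-side sketch of the layout choice; clamp_mode and mask_layout are
// stand-ins for the shader's SPIR-V cooperative-matrix layout objects.
#include <cstdint>

enum class clamp_mode { none, constant_zero };

struct mask_layout {
    uint32_t   rows, cols;
    uint32_t   stride;
    clamp_mode clamp;
};

static mask_layout choose_mask_layout(uint32_t gqa_ratio, uint32_t nem1,
                                      uint32_t Br, uint32_t KV, uint32_t m_stride) {
    bool nem1_bounds_check = !(gqa_ratio > 1) && (nem1 % Br) != 0;
    if (nem1_bounds_check) {
        // Ragged edge reachable: clamp out-of-bounds loads to a constant (0).
        return { nem1, KV, m_stride, clamp_mode::constant_zero };
    }
    // Don't clamp against nem1 when GQA is enabled (mirrors the diff comment).
    uint32_t height = gqa_ratio > 1 ? ~0u : nem1;
    return { height, KV, m_stride, clamp_mode::none };
}

int main() {
    mask_layout a = choose_mask_layout(1, 70, 16, 128, 128); // ragged: constant clamp
    mask_layout b = choose_mask_layout(4, 70, 16, 128, 128); // GQA: unbounded height
    return (a.clamp == clamp_mode::constant_zero && b.rows == ~0u) ? 0 : 1;
}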

src/llama-model.cpp

Lines changed: 7 additions & 0 deletions
@@ -7843,6 +7843,8 @@ struct llm_build_bert : public llm_graph_context {
         }

         if (model.layers[il].attn_q_norm) {
+            Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head*n_head, n_tokens);
+
             Qcur = build_norm(Qcur,
                     model.layers[il].attn_q_norm,
                     model.layers[il].attn_q_norm_b,
@@ -7852,6 +7854,8 @@
         }

         if (model.layers[il].attn_k_norm) {
+            Kcur = ggml_reshape_2d(ctx0, Kcur, n_embd_head*n_head_kv, n_tokens);
+
             Kcur = build_norm(Kcur,
                     model.layers[il].attn_k_norm,
                     model.layers[il].attn_k_norm_b,
@@ -8234,6 +8238,9 @@ struct llm_build_mpt : public llm_graph_context {

         // Q/K Layernorm
         if (model.layers[il].attn_q_norm) {
+            Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head*n_head, n_tokens);
+            Kcur = ggml_reshape_2d(ctx0, Kcur, n_embd_head*n_head_kv, n_tokens);
+
             Qcur = build_norm(Qcur,
                     model.layers[il].attn_q_norm,
                     model.layers[il].attn_q_norm_b,
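
Note: ggml's norm ops normalize along the first dimension (ne[0]). My reading of the fix: after the QKV projections, Qcur and Kcur are 3D as [n_embd_head, n_head, n_tokens], so the norm would act per head; flattening back to [n_embd_head*n_head, n_tokens] makes it act across the full hidden width per token, which also lets the 1D norm weights broadcast correctly. A shape-only sketch:

// Shape-only sketch of the reshape's effect; relies on ggml_norm and friends
// normalizing along ne[0] (the first dimension).
#include <cassert>
#include <cstdint>

struct dims3 { int64_t ne0, ne1, ne2; };

// Before: [n_embd_head, n_head, n_tokens] -> norm sees ne0 = n_embd_head and
// normalizes each head independently.
// After:  [n_embd_head*n_head, n_tokens]  -> norm sees the full hidden width.
static dims3 flatten_heads(dims3 t) {
    return { t.ne0 * t.ne1, t.ne2, 1 };
}

int main() {
    dims3 Qcur = { /*n_embd_head*/ 64, /*n_head*/ 12, /*n_tokens*/ 8 };
    dims3 flat = flatten_heads(Qcur);
    assert(flat.ne0 == 768 && flat.ne1 == 8); // 64*12 = 768 = n_embd
    return 0;
}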

tests/test-alloc.cpp

Lines changed: 36 additions & 0 deletions
@@ -548,6 +548,41 @@ static void test_buffer_size_zero() {
     GGML_ASSERT(backend_b.context->allocated_total() == 0);
 }

+// Test re-using gallocr for a different graph. The new graph has the same
+// total size, but one of the chunks is larger, so reallocation is required.
+static void test_reallocation() {
+    dummy_backend backend = dummy_backend_init(32, /*align*/ 4);
+    ggml_gallocr_ptr galloc;
+    {
+        auto [ctx, graph, ctx_ptr] = make_context();
+        ggml_tensor * x[4];
+        x[0] = make_input_with_size(ctx, 24);
+        x[1] = make_input_with_size(ctx, 16);
+        x[2] = ggml_view_1d(ctx, x[0], 4, 0);
+        x[3] = ggml_add(ctx, x[2], x[1]);
+        assign_names(ctx);
+
+        galloc = allocate_graph(graph, x[3], &backend.buffer_type);
+        check_all_allocated(graph);
+        GGML_ASSERT(backend.context->allocated_total() == 40);
+    }
+    {
+        auto [ctx, graph, ctx_ptr] = make_context();
+        ggml_tensor * x[3];
+        x[0] = make_input_with_size(ctx, 20);
+        x[1] = make_input_with_size(ctx, 20);
+        x[2] = ggml_add(ctx, x[0], x[1]);
+        assign_names(ctx);
+        ggml_set_output(x[2]);
+        ggml_build_forward_expand(graph, x[2]);
+
+        bool result = ggml_gallocr_alloc_graph(galloc.get(), graph);
+        GGML_ASSERT(result);
+        check_all_allocated(graph);
+        GGML_ASSERT(backend.context->allocated_total() == 40);
+    }
+}
+
 static void run(const char * name, void (*f)()) {
     printf("%s ", name);
     fflush(stdout);
@@ -568,5 +603,6 @@ int main() {
     run("test_prefer_already_allocated_memory", test_prefer_already_allocated_memory);
     run("test_multiple_buffer_types", test_multiple_buffer_types);
     run("test_buffer_size_zero", test_buffer_size_zero);
+    run("test_reallocation", test_reallocation);
     return 0;
 }
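
Note on the numbers: the dummy backend caps each chunk at 32 bytes, so the first graph appears to land as chunks of 24 and 16 bytes and the second as 20 and 20. Both totals are 40, so the old total-size comparison would have kept the stale buffer; only the per-chunk rule from ggml-alloc.c above triggers the required reallocation, which the final GGML_ASSERT pins down.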
