
Commit cb58451

Merge branch 'ggml-org:master' into master

2 parents: 414e382 + b730706
18 files changed: +352 / -277 lines

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 101 additions & 115 deletions
Large diffs are not rendered by default.

ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp

Lines changed: 4 additions & 0 deletions

@@ -9,6 +9,10 @@ layout (constant_id = 4) const uint32_t HSV = 32;
 layout (constant_id = 5) const uint32_t Clamp = 0;
 layout (constant_id = 6) const uint32_t D_split = 16;
 
+// Round up head sizes to a multiple of 16, for coopmat1/coopmat2 paths
+const uint32_t HSK_pad = (HSK + 15) & ~15;
+const uint32_t HSV_pad = (HSV + 15) & ~15;
+
 layout (push_constant) uniform parameter {
     uint32_t N;
     uint32_t KV;
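
For reference, (HSK + 15) & ~15 is the usual round-up idiom for power-of-two alignment: adding 15 carries past any nonzero low bits and the mask clears them, yielding the next multiple of 16. A minimal standalone check in C++ (illustrative only, not part of the shader):

#include <cassert>
#include <cstdint>

// Round x up to the next multiple of 16. Works because 16 is a power of
// two: (x + 15) overflows past the low four bits, and ~15u clears them.
static constexpr uint32_t round_up_16(uint32_t x) {
    return (x + 15) & ~15u;
}

int main() {
    assert(round_up_16(32) == 32); // already aligned: unchanged
    assert(round_up_16(40) == 48); // e.g. a head size of 40 pads to 48
    assert(round_up_16(72) == 80);
    return 0;
}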

ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp

Lines changed: 19 additions & 4 deletions

@@ -46,14 +46,14 @@ const uint32_t MatBc = 16;
 shared FLOAT_TYPE tmpsh[gl_WorkGroupSize.x];
 shared ACC_TYPEV4 tmpshv4[gl_WorkGroupSize.x];
 
-const uint32_t qstride = HSK / 4 + 2; // in units of f16vec4
+const uint32_t qstride = HSK_pad / 4 + 2; // in units of f16vec4
 shared f16vec4 Qf[Br * qstride];
 
 // Avoid padding for hsk==256 to make it fit in 48KB shmem.
 const uint32_t sfshstride = (HSK <= 128) ? (Br + 8) : Br;
 shared ACC_TYPE sfsh[Bc * sfshstride];
 
-const uint32_t kshstride = HSK / 4 + 2; // in units of f16vec4
+const uint32_t kshstride = HSK_pad / 4 + 2; // in units of f16vec4
 shared f16vec4 ksh[Bc * kshstride];
 
 shared float slope[Br];
@@ -74,6 +74,21 @@ void main() {
 
 #define tile_row(r) (row_tid * rows_per_thread + (r))
 
+    // Zero-initialize shared memory for Q/K when HSK is not a multiple of 16 (HSK_pad > HSK).
+    if ((HSK % 16) != 0) {
+        [[unroll]] for (uint i = 0; i < Br * qstride; i += gl_WorkGroupSize.x) {
+            if (i + tid < Br * qstride) {
+                Qf[i + tid] = f16vec4(0);
+            }
+        }
+        [[unroll]] for (uint i = 0; i < Bc * kshstride; i += gl_WorkGroupSize.x) {
+            if (i + tid < Bc * kshstride) {
+                ksh[i + tid] = f16vec4(0);
+            }
+        }
+        barrier();
+    }
+
     uint32_t q_offset = (iq2*p.nb02+iq3*p.nb03) / 4;
 
     [[unroll]] for (uint32_t idx = 0; idx < Br * HSK / 4; idx += gl_WorkGroupSize.x) {
@@ -151,14 +166,14 @@
     }
     barrier();
 
-    // K * Q^T -> S^T: Bc x HSK * HSK x Br -> Bc x Br
+    // K * Q^T -> S^T: Bc x HSK_pad * HSK_pad x Br -> Bc x Br
     // Bc split across workgroup (four subgroups), loop over HSK in chunks of 16: 16 x 16 * 16 x 16 -> 16 x 16
     // This is written transposed in order to allow for N being 8 if implementations need it
     coopmat<ACC_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator> SfMat = coopmat<ACC_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator>(0);
     coopmat<float16_t, gl_ScopeSubgroup, MatBc, 16, gl_MatrixUseA> KMat;
     coopmat<float16_t, gl_ScopeSubgroup, 16, MatBr, gl_MatrixUseB> QMat;
 
-    for (uint32_t d = 0; d < HSK / 16; ++d) {
+    for (uint32_t d = 0; d < HSK_pad / 16; ++d) {
         coopMatLoad(QMat, Qf, d * 16 / 4, qstride, gl_CooperativeMatrixLayoutColumnMajor);
 
         uint coord = (gl_SubgroupID * MatBc) * kshstride + d * 16 / 4;
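
The zero-fill is what keeps the padded math correct: the K * Q^T loop now runs over HSK_pad columns, so the tail lanes in [HSK, HSK_pad) enter every dot product, and zeros are the only values that leave the results unchanged. A scalar sketch of that invariant in C++ (illustrative only, with a hypothetical head size):

#include <cassert>
#include <cstddef>
#include <vector>

// Dot product over the first n elements of two buffers.
static float dot(const std::vector<float> & a, const std::vector<float> & b, size_t n) {
    float s = 0.0f;
    for (size_t i = 0; i < n; ++i) {
        s += a[i] * b[i];
    }
    return s;
}

int main() {
    const size_t hsk = 40, hsk_pad = 48; // hypothetical head size, padded to a multiple of 16
    std::vector<float> k(hsk_pad, 0.0f), q(hsk_pad, 0.0f);
    for (size_t i = 0; i < hsk; ++i) {
        k[i] = 0.5f;
        q[i] = 2.0f;
    }
    // Because the tail [hsk, hsk_pad) is zero, summing over the padded
    // length gives exactly the same result as summing over the real one.
    assert(dot(k, q, hsk_pad) == dot(k, q, hsk));
    return 0;
}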

ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp

Lines changed: 18 additions & 18 deletions

@@ -104,16 +104,16 @@ void main() {
     tensorLayoutK = setTensorLayoutStrideNV(tensorLayoutK, k_stride, 1);
     tensorLayoutV = setTensorLayoutStrideNV(tensorLayoutV, v_stride, 1);
 
-    coopmat<Q_TYPE, gl_ScopeWorkgroup, Br, HSK, gl_MatrixUseAccumulator> Q;
-    coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK, gl_MatrixUseA> Qf16;
+    coopmat<Q_TYPE, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseAccumulator> Q;
+    coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseA> Qf16;
 
     uint32_t q_offset = iq2*p.nb02+iq3*p.nb03;
-    coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, HSK));
+    coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, HSK_pad));
 
-    Qf16 = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK, gl_MatrixUseA>(Q);
+    Qf16 = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseA>(Q);
     Qf16 *= float16_t(p.scale);
 
-    coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> O = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(0);
+    coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> O = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(0);
 
     coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> L, M;
 
@@ -140,10 +140,10 @@ void main() {
 
         coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> S = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0);
 
-        coopmat<float16_t, gl_ScopeWorkgroup, HSK, Bc, gl_MatrixUseB> K_T;
+        coopmat<float16_t, gl_ScopeWorkgroup, HSK_pad, Bc, gl_MatrixUseB> K_T;
 
         uint32_t k_offset = ik2*p.nb12 + ik3*p.nb13;
-        coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK), tensorViewTranspose DECODEFUNC);
+        coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose DECODEFUNC);
         S = coopMatMulAdd(Qf16, K_T, S);
 
         if (p.logit_softcap != 0.0f) {
@@ -208,31 +208,31 @@ void main() {
         rowsum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0.0);
         rowsum = coopMatMulAdd(P_A, One, rowsum);
 
-        coopmat<float16_t, gl_ScopeWorkgroup, Bc, HSV, gl_MatrixUseB> V;
+        coopmat<float16_t, gl_ScopeWorkgroup, Bc, HSV_pad, gl_MatrixUseB> V;
         uint32_t v_offset = iv2*p.nb22 + iv3*p.nb23;
-        coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV) DECODEFUNC);
+        coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad) DECODEFUNC);
 
         L = eM*L + rowsum;
 
         // This is the "diagonal" matrix in the paper, but since we do componentwise
         // multiply rather than matrix multiply it has the diagonal element smeared
         // across the row
-        coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> eMdiag;
+        coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> eMdiag;
 
         // resize eM by using smear/reduce
         coopMatReduceNV(eMdiag, eM, gl_CooperativeMatrixReduceRowNV, smearReduce);
 
         // multiply with fp16 accumulation, then add to O.
-        coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> PV = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(0);
+        coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> PV = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(0);
         PV = coopMatMulAdd(P_A, V, PV);
 
-        O = eMdiag * O + coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(PV);
+        O = eMdiag * O + coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(PV);
     }
 
     // If there is split_k, then the split_k resolve shader does the final
     // division by L. Store the intermediate O value and per-row m and L values.
     if (p.k_num > 1) {
-        coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(O);
+        coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(O);
 
         uint32_t o_offset = HSV * p.ne1 * (split_k_index + iq3 * p.k_num);
         coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N);
@@ -243,16 +243,16 @@ void main() {
         return;
     }
 
-    coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> Ldiag;
+    coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> Ldiag;
 
     // resize L by using smear/reduce
    coopMatReduceNV(Ldiag, L, gl_CooperativeMatrixReduceRowNV, smearReduce);
 
     if ((p.mask_n_head_log2 & SINK_ENABLE_BIT) != 0) {
-        coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> S;
+        coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> S;
         coopMatPerElementNV(S, S, perElemOpGetSink, iq2);
 
-        coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> Mr;
+        coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> Mr;
 
         // resize M by using smear/reduce
         coopMatReduceNV(Mr, M, gl_CooperativeMatrixReduceRowNV, smearReduce);
@@ -285,7 +285,7 @@ void main() {
 
     uint32_t o_offset = iq3*p.ne2*p.ne1*HSV;
 
-    coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(O);
+    coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(O);
     if (p.gqa_ratio > 1) {
         coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N);
     } else {
@@ -295,6 +295,6 @@ void main() {
         // permute dimensions
         tensorViewNV<3, false, 1, 0, 2> tensorViewPermute = createTensorViewNV(3, false, 1, 0, 2);
 
-        coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, N, 0, HSV), tensorViewPermute);
+        coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, N, 0, HSV_pad), tensorViewPermute);
     }
 }
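
The eM / L / M bookkeeping in this shader is the online-softmax recurrence from FlashAttention: each block of scores updates the running row max, rescales the old accumulators by exp(M_old - M_new), and adds the new contribution; the division by L happens only at the end (or in the split_k resolve). A scalar C++ version of the same recurrence for a single row (a sketch of the algorithm, not the shader's coopmat layout):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    // One attention row: a stream of (score, value) pairs.
    const std::vector<float> s = {1.0f, 3.0f, 2.0f}; // scores
    const std::vector<float> v = {10.f, 20.f, 30.f}; // values

    float M = -INFINITY; // running row max
    float L = 0.0f;      // running sum of exp(s - M)
    float O = 0.0f;      // running unnormalized output

    for (size_t i = 0; i < s.size(); ++i) {
        const float Mnew = std::max(M, s[i]);
        const float eM   = std::exp(M - Mnew);    // rescale factor for old accumulators
        const float p    = std::exp(s[i] - Mnew); // weight of the new element
        L = eM * L + p;        // matches: L = eM*L + rowsum
        O = eM * O + p * v[i]; // matches: O = eMdiag * O + PV
        M = Mnew;
    }
    printf("attention output: %f\n", O / L); // final division by L
    return 0;
}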

ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp

Lines changed: 5 additions & 2 deletions

@@ -23,8 +23,11 @@ layout (push_constant) uniform parameter2
     uint rms_partials;
 } p;
 
-layout (binding = 0) readonly buffer A {A_TYPE data_a[];} a[];
-layout (binding = 0) writeonly buffer D {D_TYPE data_d[];} d[];
+// Workaround for MoltenVK Bug, see https://github.com/ggml-org/llama.cpp/issues/15498
+// layout (binding = 0) readonly buffer A {A_TYPE data_a[];} a[];
+// layout (binding = 0) writeonly buffer D {D_TYPE data_d[];} d[];
+layout (binding = 0) buffer A {A_TYPE data_a[];} a[];
+layout (binding = 0) buffer D {D_TYPE data_d[];} d[];
 
 layout (binding = 0, std430) buffer PartialBuf {float partial_sums[];} partials[];

src/llama-hparams.cpp

Lines changed: 25 additions & 0 deletions

@@ -153,3 +153,28 @@ bool llama_hparams::is_swa(uint32_t il) const {
 
     GGML_ABORT("fatal error");
 }
+
+bool llama_hparams::has_kv(uint32_t il) const {
+    if (n_layer_kv_from_start >= 0) {
+        if (il < (uint32_t) n_layer_kv_from_start) {
+            return true;
+        }
+
+        return false;
+    }
+
+    // by default, all layers have kv
+    return true;
+}
+
+uint32_t llama_hparams::n_layer_kv() const {
+    uint32_t res = 0;
+
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        if (has_kv(il)) {
+            res++;
+        }
+    }
+
+    return res;
+}
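
In effect, a non-negative n_layer_kv_from_start restricts the KV cache to a prefix of the layers, while the default of -1 keeps the old behavior where every layer has KV. A standalone sketch of the same logic (hypothetical values, independent of the actual llama.cpp structs):

#include <cstdint>
#include <cstdio>

// Minimal stand-in mirroring the two new llama_hparams methods.
struct hparams_sketch {
    uint32_t n_layer               = 8;
    int32_t  n_layer_kv_from_start = 4; // hypothetical: only the first 4 layers get KV cache

    bool has_kv(uint32_t il) const {
        if (n_layer_kv_from_start >= 0) {
            return il < (uint32_t) n_layer_kv_from_start;
        }
        return true; // default: every layer has KV
    }

    uint32_t n_layer_kv() const {
        uint32_t res = 0;
        for (uint32_t il = 0; il < n_layer; ++il) {
            res += has_kv(il) ? 1 : 0;
        }
        return res;
    }
};

int main() {
    hparams_sketch hp;
    printf("layers with KV cache: %u of %u\n", hp.n_layer_kv(), hp.n_layer); // prints: 4 of 8
    return 0;
}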

src/llama-hparams.h

Lines changed: 6 additions & 0 deletions

@@ -41,6 +41,7 @@ struct llama_hparams {
     uint32_t n_embd;
     uint32_t n_embd_features = 0;
     uint32_t n_layer;
+    int32_t  n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
     uint32_t n_rot;
     uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
     uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
@@ -221,6 +222,11 @@ struct llama_hparams {
     uint32_t n_pos_per_embd() const;
 
     bool is_swa(uint32_t il) const;
+
+    bool has_kv(uint32_t il) const;
+
+    // number of layers for which has_kv() returns true
+    uint32_t n_layer_kv() const;
 };
 
 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");

src/llama-kv-cache-iswa.cpp

Lines changed: 24 additions & 7 deletions

@@ -22,9 +22,26 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
         uint32_t kv_size,
         uint32_t n_seq_max,
         uint32_t n_ubatch,
-        uint32_t n_pad) : hparams(model.hparams), unified(unified) {
-    llama_kv_cache::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
-    llama_kv_cache::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); };
+        uint32_t n_pad,
+        const layer_filter_cb & filter,
+        const layer_reuse_cb & reuse) : hparams(model.hparams), unified(unified) {
+
+    // chain filters
+    const layer_filter_cb filter_base = [&](int32_t il) {
+        if (filter && !filter(il)) {
+            return false;
+        }
+
+        return !model.hparams.is_swa(il);
+    };
+
+    const layer_filter_cb filter_swa = [&](int32_t il) {
+        if (filter && !filter(il)) {
+            return false;
+        }
+
+        return model.hparams.is_swa(il);
+    };
 
     const uint32_t size_base = kv_size;
 
@@ -41,16 +58,16 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
     LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);
 
     kv_base = std::make_unique<llama_kv_cache>(
-            model, std::move(filter_base), type_k, type_v,
+            model, type_k, type_v,
             v_trans, offload, unified, size_base, n_seq_max, n_pad,
-            0, LLAMA_SWA_TYPE_NONE);
+            0, LLAMA_SWA_TYPE_NONE, filter_base, reuse);
 
     LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
 
     kv_swa = std::make_unique<llama_kv_cache>(
-            model, std::move(filter_swa), type_k, type_v,
+            model, type_k, type_v,
             v_trans, offload, unified, size_swa, n_seq_max, n_pad,
-            hparams.n_swa, hparams.swa_type);
+            hparams.n_swa, hparams.swa_type, filter_swa, reuse);
 }
 
 void llama_kv_cache_iswa::clear(bool data) {
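
The constructor now AND-chains the caller's filter with its own SWA split: a layer lands in a cache only if the external filter accepts it (an empty filter accepts everything) and it falls on the matching side of the SWA partition. A minimal sketch of that composition with std::function (illustrative types, not the actual llama.cpp callback typedefs):

#include <cstdint>
#include <cstdio>
#include <functional>

using layer_filter = std::function<bool(int32_t)>;

// Compose an optional outer filter with an inner predicate: a layer passes
// only if both accept it; an empty outer filter accepts every layer.
static layer_filter chain(layer_filter outer, layer_filter inner) {
    return [outer, inner](int32_t il) {
        if (outer && !outer(il)) {
            return false;
        }
        return inner(il);
    };
}

int main() {
    // Hypothetical model: only even layers have KV, layers < 4 are SWA.
    const layer_filter has_kv = [](int32_t il) { return il % 2 == 0; };
    const layer_filter is_swa = [](int32_t il) { return il < 4; };

    const layer_filter filter_swa  = chain(has_kv, is_swa);
    const layer_filter filter_base = chain(has_kv, [=](int32_t il) { return !is_swa(il); });

    for (int32_t il = 0; il < 8; ++il) {
        printf("layer %d: swa=%d base=%d\n", il, (int) filter_swa(il), (int) filter_base(il));
    }
    return 0;
}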

src/llama-kv-cache-iswa.h

Lines changed: 4 additions & 2 deletions

@@ -20,11 +20,13 @@ class llama_kv_cache_iswa : public llama_memory_i {
             bool v_trans,
             bool offload,
             bool swa_full,
-            bool ,
+            bool unified,
             uint32_t kv_size,
             uint32_t n_seq_max,
             uint32_t n_ubatch,
-            uint32_t n_pad);
+            uint32_t n_pad,
+            const layer_filter_cb & filter,
+            const layer_reuse_cb & reuse);
 
     ~llama_kv_cache_iswa() = default;
