Skip to content

Commit b8a5cfd

Browse files
SavicStefan and Stefan Savic authored
vulkan: Increase BK to 32; use BK/4 for non-CM mul_mm.comp (ggml-org#16636)
Signed-off-by: Stefan Savic <[email protected]>
Co-authored-by: Stefan Savic <[email protected]>
1 parent 08416eb commit b8a5cfd

File tree

1 file changed

+31
-2
lines changed

1 file changed

+31
-2
lines changed

ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp

Lines changed: 31 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -100,7 +100,6 @@ layout (push_constant) uniform parameter
100100
layout (constant_id = 0) const uint BLOCK_SIZE = 64;
101101
layout (constant_id = 1) const uint BM = 64;
102102
layout (constant_id = 2) const uint BN = 64;
103-
layout (constant_id = 3) const uint BK = 16; // Assumed to be 32 if working with a quant
104103
layout (constant_id = 4) const uint WM = 32;
105104
layout (constant_id = 5) const uint WN = 32;
106105
layout (constant_id = 6) const uint WMITER = 2;
@@ -109,6 +108,14 @@ layout (constant_id = 8) const uint TN = 2;
109108
layout (constant_id = 9) const uint TK = 1; // Only needed for coopmat
110109
layout (constant_id = 10) const uint WARP = 32;
111110

111+
#if defined(DATA_A_F32) || defined(DATA_A_F16)
112+
#define BK 32
113+
#define BK_STEP 4
114+
#else
115+
layout (constant_id = 3) const uint BK = 16; // Assumed to be 32 if working with a quant
116+
#define BK_STEP 2
117+
#endif
118+
112119
#ifdef COOPMAT
113120
#define SHMEM_STRIDE (BK / 2 + 4)
114121
#else
@@ -244,8 +251,13 @@ void main() {
244251
}
245252
#else
246253
ACC_TYPE_VEC2 sums[WMITER * TM * WNITER * TN/2];
254+
#if defined(DATA_A_F32) || defined(DATA_A_F16)
255+
FLOAT_TYPE_VEC4 cache_a[WMITER * TM];
256+
FLOAT_TYPE_VEC4 cache_b;
257+
#else
247258
FLOAT_TYPE_VEC2 cache_a[WMITER * TM];
248259
FLOAT_TYPE_VEC2 cache_b;
260+
#endif
249261

250262
[[unroll]] for (uint i = 0; i < WMITER*TM*WNITER*TN/2; i++) {
251263
sums[i] = ACC_TYPE_VEC2(0.0f, 0.0f);
@@ -283,24 +295,41 @@ void main() {
283295
}
284296
}
285297
#else
286-
[[unroll]] for (uint i = 0; i < BK / 2; i++) {
298+
[[unroll]] for (uint i = 0; i < BK / BK_STEP; i++) {
287299
// Load from shared into cache
288300
[[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
289301
[[unroll]] for (uint j = 0; j < TM; j++) {
302+
#if defined(DATA_A_F32) || defined(DATA_A_F16)
303+
cache_a[wsir * TM + j].xy = buf_a[(warp_r * WM + wsir * WSUBM + tiwr * TM + j) * SHMEM_STRIDE + 2 * i ];
304+
cache_a[wsir * TM + j].zw = buf_a[(warp_r * WM + wsir * WSUBM + tiwr * TM + j) * SHMEM_STRIDE + 2 * i + 1];
305+
#else
290306
cache_a[wsir * TM + j] = buf_a[(warp_r * WM + wsir * WSUBM + tiwr * TM + j) * SHMEM_STRIDE + i];
307+
#endif
291308
}
292309
}
293310

294311
[[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
295312
[[unroll]] for (uint cc = 0; cc < TN; cc++) {
313+
#if defined(DATA_A_F32) || defined(DATA_A_F16)
314+
cache_b.xy = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + cc) * SHMEM_STRIDE + 2 * i ];
315+
cache_b.zw = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + cc) * SHMEM_STRIDE + 2 * i + 1];
316+
#else
296317
cache_b = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + cc) * SHMEM_STRIDE + i];
318+
#endif
297319

298320
[[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
299321
[[unroll]] for (uint cr = 0; cr < TM / 2; cr++) {
300322
// [WNITER][TN][WMITER][TM / 2] -> [wsic][cc][wsir][cr]
301323
const uint sums_idx = (wsic * TN + cc) * WMITER * (TM / 2) + wsir * (TM / 2) + cr;
324+
#if defined(DATA_A_F32) || defined(DATA_A_F16)
325+
sums[sums_idx].x = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].x), ACC_TYPE(cache_b.x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].y), ACC_TYPE(cache_b.y),
326+
fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].z), ACC_TYPE(cache_b.z), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].w), ACC_TYPE(cache_b.w), sums[sums_idx].x))));
327+
sums[sums_idx].y = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].x), ACC_TYPE(cache_b.x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].y), ACC_TYPE(cache_b.y),
328+
fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].z), ACC_TYPE(cache_b.z), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].w), ACC_TYPE(cache_b.w), sums[sums_idx].y))));
329+
#else
302330
sums[sums_idx].x = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].x), ACC_TYPE(cache_b.x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].y), ACC_TYPE(cache_b.y), sums[sums_idx].x));
303331
sums[sums_idx].y = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].x), ACC_TYPE(cache_b.x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].y), ACC_TYPE(cache_b.y), sums[sums_idx].y));
332+
#endif
304333
}
305334
}
306335
}

0 commit comments

Comments
 (0)