Commit 8fb910e

Revert "Merge branch 'upstream' into concedo_experimental"
This reverts commit f8ee5d9, reversing changes made to a9f5c2d.
1 parent 841b749 · commit 8fb910e

22 files changed: +324 −721 lines

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 146 additions & 265 deletions
Large diffs are not rendered by default.

ggml/src/ggml-vulkan/vulkan-shaders/add.comp

Lines changed: 1 addition & 41 deletions
@@ -1,69 +1,29 @@
 #version 450
 
 #extension GL_EXT_shader_16bit_storage : require
-#if ADD_RMS
-#extension GL_KHR_shader_subgroup_arithmetic : enable
-#extension GL_KHR_shader_subgroup_basic : enable
-#endif
 
 #include "types.comp"
 #include "generic_binary_head.comp"
 
 const uint num_threads = 256;
 
-layout (binding = 3, std430) buffer PartialBuf {float partial_sums[];};
-
 layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
 
-#if ADD_RMS
-// XXX TODO this could be sized based on number of subgroups, but that's not considered a constant
-shared FLOAT_TYPE sumsh[num_threads];
-#endif
-
 void main() {
     uint idx = get_idx();
-    uint orig_idx = idx;
 
     // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
     const uint num_iter = 2;
 
-    FLOAT_TYPE sum_sq = 0;
-
     [[unroll]] for (uint i = 0; i < num_iter; ++i) {
         if (idx >= p.ne) {
             continue;
         }
         uint i00, i01, i02, i03;
         get_indices(idx, i00, i01, i02, i03);
 
-        FLOAT_TYPE sum = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]);
-        sum_sq += sum*sum;
-
-        data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(sum);
+        data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
 
         idx += num_threads;
     }
-
-#if ADD_RMS
-    if (p.param3 != 0) {
-        // reduce the sum within each subgroup, then across subgroups
-        const uint NumSubgroups = num_threads / gl_SubgroupSize;
-        sum_sq = subgroupAdd(sum_sq);
-        if (gl_SubgroupInvocationID == 0) {
-            sumsh[gl_SubgroupID] = sum_sq;
-        }
-        barrier();
-        [[unroll]] for (uint s = NumSubgroups / 2; s > 0; s >>= 1) {
-            if (gl_SubgroupID < s && gl_SubgroupInvocationID == 0) {
-                sum_sq += sumsh[gl_SubgroupID + s];
-                sumsh[gl_SubgroupID] = sum_sq;
-            }
-            barrier();
-        }
-
-        if (gl_SubgroupID == 0 && gl_SubgroupInvocationID == 0) {
-            partial_sums[orig_idx / (num_iter * num_threads)] = sum_sq;
-        }
-    }
-#endif
 }
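
For context on what the removed ADD_RMS path computed: when the following op was an rms_norm, this shader also accumulated a per-thread sum of squares of its outputs and reduced it in two stages, first a subgroupAdd within each subgroup, then a shared-memory tree across subgroups, publishing one partial sum per workgroup at binding 3 for the fused norm to consume. Below is a minimal, self-contained sketch of that two-stage reduction pattern; the In/Out block names, the bindings, and the one-element-per-thread load are placeholders for this sketch (the real shader accumulates over its strided loop and derives the partial index from orig_idx), and it assumes the dispatch covers the input exactly.

#version 450

#extension GL_KHR_shader_subgroup_arithmetic : enable
#extension GL_KHR_shader_subgroup_basic : enable

const uint num_threads = 256;

layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;

// Placeholder bindings for this sketch only.
layout(binding = 0, std430) readonly buffer In  { float data_in[]; };
layout(binding = 1, std430) buffer Out { float partial_sums[]; };

// One slot per subgroup would suffice, but gl_SubgroupSize is not a
// compile-time constant, so the array is sized for the worst case.
shared float sumsh[num_threads];

void main() {
    float v = data_in[gl_GlobalInvocationID.x];
    float sum_sq = v * v;

    // Stage 1: reduce within each subgroup, entirely in registers.
    sum_sq = subgroupAdd(sum_sq);
    if (gl_SubgroupInvocationID == 0) {
        sumsh[gl_SubgroupID] = sum_sq;
    }
    barrier();

    // Stage 2: tree-reduce the per-subgroup sums through shared memory.
    // num_threads / gl_SubgroupSize is a power of two, so halving works.
    for (uint s = (num_threads / gl_SubgroupSize) / 2; s > 0; s >>= 1) {
        if (gl_SubgroupID < s && gl_SubgroupInvocationID == 0) {
            sumsh[gl_SubgroupID] += sumsh[gl_SubgroupID + s];
        }
        barrier();
    }

    // One partial sum per workgroup; a following pass finishes the norm.
    if (gl_SubgroupID == 0 && gl_SubgroupInvocationID == 0) {
        partial_sums[gl_WorkGroupID.x] = sumsh[0];
    }
}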

ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp

Lines changed: 0 additions & 4 deletions
@@ -9,10 +9,6 @@ layout (constant_id = 4) const uint32_t HSV = 32;
 layout (constant_id = 5) const uint32_t Clamp = 0;
 layout (constant_id = 6) const uint32_t D_split = 16;
 
-// Round up head sizes to a multiple of 16, for coopmat1/coopmat2 paths
-const uint32_t HSK_pad = (HSK + 15) & ~15;
-const uint32_t HSV_pad = (HSV + 15) & ~15;
-
 layout (push_constant) uniform parameter {
     uint32_t N;
     uint32_t KV;
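
An aside on the arithmetic being removed: (x + 15) & ~15 rounds x up to the next multiple of 16 by adding 15 and clearing the four low bits. For example, HSK = 40 gives (40 + 15) & ~15 = 55 & ~15 = 48, while an already-aligned HSK = 64 maps to itself. With the pad constants gone, the coopmat1/coopmat2 paths on this branch work on the raw HSK/HSV and implicitly assume head sizes that are already multiples of 16.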

ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp

Lines changed: 4 additions & 19 deletions
@@ -46,14 +46,14 @@ const uint32_t MatBc = 16;
 shared FLOAT_TYPE tmpsh[gl_WorkGroupSize.x];
 shared ACC_TYPEV4 tmpshv4[gl_WorkGroupSize.x];
 
-const uint32_t qstride = HSK_pad / 4 + 2; // in units of f16vec4
+const uint32_t qstride = HSK / 4 + 2; // in units of f16vec4
 shared f16vec4 Qf[Br * qstride];
 
 // Avoid padding for hsk==256 to make it fit in 48KB shmem.
 const uint32_t sfshstride = (HSK <= 128) ? (Br + 8) : Br;
 shared ACC_TYPE sfsh[Bc * sfshstride];
 
-const uint32_t kshstride = HSK_pad / 4 + 2; // in units of f16vec4
+const uint32_t kshstride = HSK / 4 + 2; // in units of f16vec4
 shared f16vec4 ksh[Bc * kshstride];
 
 shared float slope[Br];
@@ -74,21 +74,6 @@ void main() {
 
 #define tile_row(r) (row_tid * rows_per_thread + (r))
 
-    // Zero-initialize shared memory for Q/K when HSK is not a multiple of 16 (HSK_pad > HSK).
-    if ((HSK % 16) != 0) {
-        [[unroll]] for (uint i = 0; i < Br * qstride; i += gl_WorkGroupSize.x) {
-            if (i + tid < Br * qstride) {
-                Qf[i + tid] = f16vec4(0);
-            }
-        }
-        [[unroll]] for (uint i = 0; i < Bc * kshstride; i += gl_WorkGroupSize.x) {
-            if (i + tid < Bc * kshstride) {
-                ksh[i + tid] = f16vec4(0);
-            }
-        }
-        barrier();
-    }
-
     uint32_t q_offset = (iq2*p.nb02+iq3*p.nb03) / 4;
 
     [[unroll]] for (uint32_t idx = 0; idx < Br * HSK / 4; idx += gl_WorkGroupSize.x) {
@@ -166,14 +151,14 @@
     }
     barrier();
 
-    // K * Q^T -> S^T: Bc x HSK_pad * HSK_pad x Br -> Bc x Br
+    // K * Q^T -> S^T: Bc x HSK * HSK x Br -> Bc x Br
     // Bc split across workgroup (four subgroups), loop over HSK in chunks of 16: 16 x 16 * 16 x 16 -> 16 x 16
     // This is written transposed in order to allow for N being 8 if implementations need it
     coopmat<ACC_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator> SfMat = coopmat<ACC_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator>(0);
     coopmat<float16_t, gl_ScopeSubgroup, MatBc, 16, gl_MatrixUseA> KMat;
     coopmat<float16_t, gl_ScopeSubgroup, 16, MatBr, gl_MatrixUseB> QMat;
 
-    for (uint32_t d = 0; d < HSK_pad / 16; ++d) {
+    for (uint32_t d = 0; d < HSK / 16; ++d) {
         coopMatLoad(QMat, Qf, d * 16 / 4, qstride, gl_CooperativeMatrixLayoutColumnMajor);
 
         uint coord = (gl_SubgroupID * MatBc) * kshstride + d * 16 / 4;
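
Why that zero-fill existed in the first place (an inference from the code, not stated in the commit): the cooperative-matrix loop below walks Qf and ksh in 16-column tiles, so when HSK was padded up to HSK_pad the tail columns had to be zeroed to keep them from adding garbage into the K * Q^T accumulation. With qstride and kshstride defined directly from HSK again, there is no pad region to initialize, and the HSK / 16 loop covers the whole head exactly when HSK is a multiple of 16.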

ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp

Lines changed: 18 additions & 18 deletions
@@ -104,16 +104,16 @@ void main() {
     tensorLayoutK = setTensorLayoutStrideNV(tensorLayoutK, k_stride, 1);
     tensorLayoutV = setTensorLayoutStrideNV(tensorLayoutV, v_stride, 1);
 
-    coopmat<Q_TYPE, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseAccumulator> Q;
-    coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseA> Qf16;
+    coopmat<Q_TYPE, gl_ScopeWorkgroup, Br, HSK, gl_MatrixUseAccumulator> Q;
+    coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK, gl_MatrixUseA> Qf16;
 
     uint32_t q_offset = iq2*p.nb02+iq3*p.nb03;
-    coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, HSK_pad));
+    coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, HSK));
 
-    Qf16 = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseA>(Q);
+    Qf16 = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK, gl_MatrixUseA>(Q);
     Qf16 *= float16_t(p.scale);
 
-    coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> O = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(0);
+    coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> O = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(0);
 
     coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> L, M;
 
@@ -140,10 +140,10 @@
 
         coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> S = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0);
 
-        coopmat<float16_t, gl_ScopeWorkgroup, HSK_pad, Bc, gl_MatrixUseB> K_T;
+        coopmat<float16_t, gl_ScopeWorkgroup, HSK, Bc, gl_MatrixUseB> K_T;
 
         uint32_t k_offset = ik2*p.nb12 + ik3*p.nb13;
-        coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose DECODEFUNC);
+        coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK), tensorViewTranspose DECODEFUNC);
         S = coopMatMulAdd(Qf16, K_T, S);
 
         if (p.logit_softcap != 0.0f) {
@@ -208,31 +208,31 @@
         rowsum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0.0);
         rowsum = coopMatMulAdd(P_A, One, rowsum);
 
-        coopmat<float16_t, gl_ScopeWorkgroup, Bc, HSV_pad, gl_MatrixUseB> V;
+        coopmat<float16_t, gl_ScopeWorkgroup, Bc, HSV, gl_MatrixUseB> V;
         uint32_t v_offset = iv2*p.nb22 + iv3*p.nb23;
-        coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad) DECODEFUNC);
+        coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV) DECODEFUNC);
 
         L = eM*L + rowsum;
 
         // This is the "diagonal" matrix in the paper, but since we do componentwise
         // multiply rather than matrix multiply it has the diagonal element smeared
         // across the row
-        coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> eMdiag;
+        coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> eMdiag;
 
         // resize eM by using smear/reduce
         coopMatReduceNV(eMdiag, eM, gl_CooperativeMatrixReduceRowNV, smearReduce);
 
         // multiply with fp16 accumulation, then add to O.
-        coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> PV = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(0);
+        coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> PV = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(0);
         PV = coopMatMulAdd(P_A, V, PV);
 
-        O = eMdiag * O + coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(PV);
+        O = eMdiag * O + coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(PV);
     }
 
     // If there is split_k, then the split_k resolve shader does the final
     // division by L. Store the intermediate O value and per-row m and L values.
     if (p.k_num > 1) {
-        coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(O);
+        coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(O);
 
         uint32_t o_offset = HSV * p.ne1 * (split_k_index + iq3 * p.k_num);
         coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N);
@@ -243,16 +243,16 @@
         return;
     }
 
-    coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> Ldiag;
+    coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> Ldiag;
 
     // resize L by using smear/reduce
    coopMatReduceNV(Ldiag, L, gl_CooperativeMatrixReduceRowNV, smearReduce);
 
     if ((p.mask_n_head_log2 & SINK_ENABLE_BIT) != 0) {
-        coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> S;
+        coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> S;
         coopMatPerElementNV(S, S, perElemOpGetSink, iq2);
 
-        coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> Mr;
+        coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> Mr;
 
         // resize M by using smear/reduce
         coopMatReduceNV(Mr, M, gl_CooperativeMatrixReduceRowNV, smearReduce);
@@ -285,7 +285,7 @@
 
     uint32_t o_offset = iq3*p.ne2*p.ne1*HSV;
 
-    coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(O);
+    coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(O);
     if (p.gqa_ratio > 1) {
         coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N);
     } else {
@@ -295,6 +295,6 @@
         // permute dimensions
         tensorViewNV<3, false, 1, 0, 2> tensorViewPermute = createTensorViewNV(3, false, 1, 0, 2);
 
-        coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, N, 0, HSV_pad), tensorViewPermute);
+        coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, N, 0, HSV), tensorViewPermute);
     }
 }
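
For orientation, the accumulator updates visible in these hunks (L = eM*L + rowsum and O = eMdiag * O + PV) are the standard online-softmax recurrence used by flash attention; in the shader's variable names, with S_j the score block for KV tile j:

\begin{aligned}
M_{\mathrm{new}} &= \max(M_{\mathrm{old}},\ \mathrm{rowmax}(S_j)) \\
e_M &= \exp(M_{\mathrm{old}} - M_{\mathrm{new}}), \qquad P = \exp(S_j - M_{\mathrm{new}}) \\
L_{\mathrm{new}} &= e_M \, L_{\mathrm{old}} + \mathrm{rowsum}(P) \\
O_{\mathrm{new}} &= \mathrm{diag}(e_M) \, O_{\mathrm{old}} + P \, V_j
\end{aligned}

The division of O by L happens once at the end, or in the split-k resolve shader when p.k_num > 1, as the comment in the hunk notes. The "diagonal" remark in the code refers to diag(e_M): the shader multiplies componentwise instead of materializing a diagonal matrix, so the smear/reduce spreads each row's scale factor across that row.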

ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp

Lines changed: 3 additions & 46 deletions
@@ -3,10 +3,6 @@
 #extension GL_EXT_shader_16bit_storage : require
 #extension GL_EXT_nonuniform_qualifier : enable
 #extension GL_EXT_control_flow_attributes : require
-#if ADD_RMS
-#extension GL_KHR_shader_subgroup_arithmetic : enable
-#extension GL_KHR_shader_subgroup_basic : enable
-#endif
 
 #include "rte.comp"
 #include "types.comp"
@@ -18,18 +14,11 @@ layout (push_constant) uniform parameter2
     uint ne20; uint ne21; uint ne22; uint ne23;
 
     // strides for srcs+dst
-    uint nb[12][4];
-
-    uint rms_partials;
+    uint nb[8][4];
 } p;
 
-// Workaround for MoltenVK Bug, see https://github.com/ggml-org/llama.cpp/issues/15498
-// layout (binding = 0) readonly buffer A {A_TYPE data_a[];} a[];
-// layout (binding = 0) writeonly buffer D {D_TYPE data_d[];} d[];
-layout (binding = 0) buffer A {A_TYPE data_a[];} a[];
-layout (binding = 0) buffer D {D_TYPE data_d[];} d[];
-
-layout (binding = 0, std430) buffer PartialBuf {float partial_sums[];} partials[];
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];} a[];
+layout (binding = 0) writeonly buffer D {D_TYPE data_d[];} d[];
 
 layout(constant_id = 0) const uint num_srcs = 2;
 
@@ -53,22 +42,14 @@ const uint num_threads = 256;
 
 layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
 
-#if ADD_RMS
-// XXX TODO this could be sized based on number of subgroups, but that's not considered a constant
-shared FLOAT_TYPE sumsh[num_threads];
-#endif
-
 void main() {
     uint idx = get_idx();
-    uint orig_idx = idx;
 
     uint ne = p.ne20 * p.ne21 * p.ne22 * p.ne23;
 
     // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
     const uint num_iter = 2;
 
-    FLOAT_TYPE sum_sq = 0;
-
     [[unroll]] for (uint i = 0; i < num_iter; ++i) {
         if (idx >= ne) {
             continue;
@@ -80,32 +61,8 @@ void main() {
         [[unroll]] for (uint s = 0; s < num_srcs; ++s) {
             sum += FLOAT_TYPE(a[s].data_a[src_idx(s, i00, i01, i02, i03)]);
         }
-        sum_sq += sum*sum;
         d[num_srcs].data_d[dst_idx(i00, i01, i02, i03)] = D_TYPE(sum);
 
         idx += num_threads;
     }
-
-#if ADD_RMS
-    if (p.rms_partials != 0) {
-        // reduce the sum within each subgroup, then across subgroups
-        const uint NumSubgroups = num_threads / gl_SubgroupSize;
-        sum_sq = subgroupAdd(sum_sq);
-        if (gl_SubgroupInvocationID == 0) {
-            sumsh[gl_SubgroupID] = sum_sq;
-        }
-        barrier();
-        [[unroll]] for (uint s = NumSubgroups / 2; s > 0; s >>= 1) {
-            if (gl_SubgroupID < s && gl_SubgroupInvocationID == 0) {
-                sum_sq += sumsh[gl_SubgroupID + s];
-                sumsh[gl_SubgroupID] = sum_sq;
-            }
-            barrier();
-        }
-
-        if (gl_SubgroupID == 0 && gl_SubgroupInvocationID == 0) {
-            partials[num_srcs + 1].partial_sums[orig_idx / (num_iter * num_threads)] = sum_sq;
-        }
-    }
-#endif
 }
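
One detail of the restored declarations worth spelling out: a[] and d[] are unsized arrays of buffer views that alias the same binding = 0 descriptor array, indexed per source via GL_EXT_nonuniform_qualifier, with slot num_srcs serving as the destination (hence d[num_srcs].data_d[...]). The reverted upstream code had dropped the readonly/writeonly qualifiers from those views to work around a MoltenVK bug (llama.cpp issue 15498, per the removed comment) and aliased a third view, partials[], onto the same binding so that slot num_srcs + 1 could expose the partial-sums buffer for the fused-RMS path; this branch keeps the plain qualified pair and the 8-entry stride array.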
