Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 15 additions & 15 deletions ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@

#include "mul_mat_vec_base.comp"

layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

shared FLOAT_TYPE tmp[32];
layout (constant_id = 0) const uint SUBGROUP_SIZE = 32;

shared FLOAT_TYPE tmp[SUBGROUP_SIZE];

void main() {
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
Expand All @@ -21,21 +23,19 @@ void main() {
const uint num_blocks_per_row = p.ncols / QUANT_K;
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;

const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
const uint ix = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
uint it_size = gl_WorkGroupSize.x/16;

const uint tid = gl_LocalInvocationID.x;
const uint itid = tid/it_size; // 0...15 (when the workgroup size is a multiple of 16)
const uint ix = tid%it_size;

const uint step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
const uint step = 8;

const uint v_im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const uint v_in = tid - step*v_im; // 0...15 or 0...7
const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const uint v_in = itid - step*v_im; // 0...7 (itid modulo step, with step == 8)

#if K_QUANTS_PER_ITERATION == 1
const uint l0 = v_in; // 0...15
const uint is = 0;
#else
const uint l0 = 4 * v_in; // 0, 4, 8, ..., 28
const uint is = v_in / 4;
#endif

const uint ql_offset = 64*v_im + l0;
const uint qh_offset = 32*v_im + l0;
Expand All @@ -44,7 +44,7 @@ void main() {

FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp

[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
const uint y_idx = i * QUANT_K + y_offset;

const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
Expand Down Expand Up @@ -95,10 +95,10 @@ void main() {
}

tmp[gl_LocalInvocationID.x] = temp;

// sum up partial sums and write back result

barrier();
[[unroll]] for (uint s = 16; s > 0; s >>= 1) {
[[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
Expand Down