Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ VULKAN_SHADER_BINS := $(VULKAN_SHADER_SRCS:.comp=.spv)
DEPS += $(VULKAN_SHADER_BINS)

%.spv: %.comp
$(CGLSLC) -c $< -o $@
$(CGLSLC) -c $< -o $@ --target-env=vulkan1.2
nn-vulkan-test: src/nn/nn-vulkan-test.cpp nn-quants.o nn-core.o nn-executor.o nn-vulkan.o ${DEPS}
$(CXX) $(CXXFLAGS) $(filter-out %.spv, $^) -o $@ $(LIBS)
endif
Expand Down
12 changes: 6 additions & 6 deletions src/nn/nn-vulkan-test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -662,16 +662,16 @@ void testMultiheadAtt_F32_F32() {
int main() {
initQuants();

testRmsNorm_F32_F32_F32<3>();
testRmsNorm_F32_F32_F32<4>();
testRmsNorm_F32_F32_F32<1024>();
testRmsNorm_F32_F32_F32<3191>();
testRmsNorm_F32_F32_F32<3196>();

testSilu_F32_F32<3>();
testSilu_F32_F32<4>();
testSilu_F32_F32<32>();
testSilu_F32_F32<101>();
testSilu_F32_F32<104>();

testMul_F32_F32<32>();
testMul_F32_F32<47>();
testMul_F32_F32<48>();

testMergeAdd_F32_F32();

Expand All @@ -686,7 +686,7 @@ int main() {

testCast_F32_F32<128>();
testCast_F32_F32<32>();
testCast_F32_F32<9>();
testCast_F32_F32<8>();

testCast_F32_Q80<256>();
testCast_F32_Q80<64>();
Expand Down
81 changes: 61 additions & 20 deletions src/nn/nn-vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -522,22 +522,48 @@ static std::vector<NnVulkanBatchInfo> buildBatchInfo(NnOpConfig *opConfig, NnVul
return offset;
}

static void resolveShaderGroups(const NnOpConfig *opConfig, const NnUint batchSize, NnUint *groupCount) {
// Returns the smallest power of two that is >= n, clamped to [min, max].
// For n == 0 the pre-clamp result is 1. Callers are expected to pass
// power-of-two bounds (e.g. 32..256) so the clamped result stays a power of two.
static NnUint roundUpPow2(NnUint n, NnUint min, NnUint max) {
    NnUint pow2 = 1;
    while (pow2 < n)
        pow2 <<= 1; // double until we reach or pass n
    if (pow2 < min)
        return min;
    if (pow2 > max)
        return max;
    return pow2;
}

static uint32_t resolveShaderNThreads(const NnOpConfig *opConfig, const NnSize2D inputSize) {
if (opConfig->code == OP_MATMUL) {
if (opConfig->weightSize.floatType == F_Q40) {
constexpr NnUint maxThreads = 256; // Shader constant
NnUint t = roundUpPow2(inputSize.x / (Q40_BLOCK_SIZE * 2), 32, maxThreads);
return t;
}
}
return 0;
}

static void resolveShaderGroups(const NnOpConfig *opConfig, const NnUint batchSize, NnUint *groupCount, const NnSize2D inputSize, const NnSize2D outputSize) {
groupCount[0] = 1;
groupCount[1] = batchSize;
groupCount[2] = 1;

if (opConfig->code == OP_CAST ||
opConfig->code == OP_MUL ||
opConfig->code == OP_SILU ||
opConfig->code == OP_SHIFT ||
opConfig->code == OP_MERGE_ADD)
groupCount[2] = 32;
else if (opConfig->code == OP_MATMUL) {
if (opConfig->code == OP_CAST) {
if (outputSize.floatType == F_Q80) {
groupCount[2] = outputSize.x / Q80_BLOCK_SIZE;
} else {
constexpr NnUint chunkSize = 4; // Shader constant
groupCount[2] = outputSize.x / chunkSize;
}
} else if (opConfig->code == OP_MERGE_ADD) {
if (inputSize.floatType == F_Q80) {
groupCount[2] = outputSize.x / Q80_BLOCK_SIZE; // Yes, outputSize is used here
} else {
groupCount[2] = 32;
}
} else if (opConfig->code == OP_MATMUL) {
if (opConfig->weightSize.floatType == F_Q40) {
// Must be synced with the shader
constexpr NnUint tileSizeN = 2;
constexpr NnUint tileSizeD = 16;
constexpr NnUint tileSizeN = 2; // Shader constant
constexpr NnUint tileSizeD = 8; // Shader constant
const NnUint blockSize = getBlockSize(opConfig->weightSize.floatType);
assert(opConfig->weightSize.y % (tileSizeN * blockSize) == 0);
assert(opConfig->weightSize.x % tileSizeD == 0);
Expand All @@ -550,8 +576,17 @@ static void resolveShaderGroups(const NnOpConfig *opConfig, const NnUint batchSi
groupCount[2] = ((NnMultiHeadAttOpConfig *)opConfig->config)->nHeads0;
else if (opConfig->code == OP_INV_RMS)
groupCount[2] = ((NnInvRmsOpConfig *)opConfig->config)->nColumns;
else if (opConfig->code == OP_RMS_NORM)
groupCount[2] = ((NnRmsNormOpConfig *)opConfig->config)->nColumns;
else if (
opConfig->code == OP_EMBEDDING ||
opConfig->code == OP_RMS_NORM ||
opConfig->code == OP_MUL ||
opConfig->code == OP_SILU ||
opConfig->code == OP_SHIFT
) {
constexpr NnUint chunkSize = 4; // Shader constant
assert(outputSize.x % chunkSize == 0);
groupCount[2] = outputSize.x / chunkSize;
}
}

static std::vector<uint32_t> readShader(const char *fileName) {
Expand Down Expand Up @@ -647,12 +682,10 @@ NnVulkanDeviceSegment::NnVulkanDeviceSegment(NnVulkanContext *context, NnVulkanD

std::vector<vk::PipelineShaderStageCreateInfo> shaderCreateInfos(segmentConfig->nOps);

constexpr NnUint maxConsts = 3;
std::vector<NnUint> nConsts(segmentConfig->nOps);
std::vector<int> consts(segmentConfig->nOps * maxConsts);
std::vector<vk::SpecializationInfo> specInfos(segmentConfig->nOps);
std::vector<vk::SpecializationMapEntry> specMapEntries(segmentConfig->nOps * maxConsts);

std::vector<vk::SpecializationMapEntry> specEntries(segmentConfig->nOps);
std::vector<uint32_t> nThreads(segmentConfig->nOps);

for (NnUint opIndex = 0; opIndex < segmentConfig->nOps; opIndex++) {
NnOpConfig *opConfig = &segmentConfig->ops[opIndex];
NnSize2D inputSize = data->resolveBufferSize(&opConfig->input);
Expand All @@ -674,12 +707,17 @@ NnVulkanDeviceSegment::NnVulkanDeviceSegment(NnVulkanContext *context, NnVulkanD
code.data()
);

nThreads[opIndex] = resolveShaderNThreads(opConfig, inputSize);
specEntries[opIndex] = vk::SpecializationMapEntry(0, 0, sizeof(uint32_t));
specInfos[opIndex] = vk::SpecializationInfo(1, &specEntries[opIndex], sizeof(uint32_t), &nThreads[opIndex]);

vk::ShaderModule shaderModule = context->device.createShaderModule(shaderModuleCreateInfo);
vk::PipelineShaderStageCreateInfo shaderCreateInfo(
vk::PipelineShaderStageCreateFlags(),
vk::ShaderStageFlagBits::eCompute,
shaderModule,
"main"
"main",
&specInfos[opIndex]
);

shaderModules[opIndex] = shaderModule;
Expand Down Expand Up @@ -837,7 +875,10 @@ void NnVulkanDeviceSegment::forward(NnUint opIndex, NnUint nThreads, NnUint thre

NnUint opGroupCount[3];
for (NnUint opIndex = 0; opIndex < segmentConfig->nOps; opIndex++) {
resolveShaderGroups(&segmentConfig->ops[opIndex], batchSize, opGroupCount);
NnSize2D inputSize = data->resolveBufferSize(&segmentConfig->ops[opIndex].input);
NnSize2D outputSize = data->resolveBufferSize(&segmentConfig->ops[opIndex].output);

resolveShaderGroups(&segmentConfig->ops[opIndex], batchSize, opGroupCount, inputSize, outputSize);

if (opIndex > 0) {
std::vector<NnOpBufferUsage> *usages = &opBufferUsages[opIndex];
Expand Down
18 changes: 7 additions & 11 deletions src/nn/vulkan/cast-forward-f32-f32.comp
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
#version 450

#define N_THREADS 256
#extension GL_EXT_control_flow_attributes : enable

#define CHUNK_SIZE 4
#define N_BATCHES 32

layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;

struct BatchInfo {
uint inputOffset;
Expand All @@ -17,20 +19,14 @@ layout(binding = 1) writeonly buffer outputBuffer { float y[]; };
layout(binding = 2) readonly uniform batchInfosBuffer { BatchInfo infos[N_BATCHES]; };

void main() {
const uint threadIndex = gl_LocalInvocationID.x;
const uint nWorkGroups = gl_NumWorkGroups.z;
const uint batchIndex = gl_WorkGroupID.y;
const uint workGroupIndex = gl_WorkGroupID.z;

const uint chunkIndex = gl_WorkGroupID.z;
const BatchInfo info = infos[batchIndex];
const uint slice = info.inputSizeX / nWorkGroups;
const uint rest = info.inputSizeX % nWorkGroups;
const uint offset = workGroupIndex * slice + min(rest, workGroupIndex);
const uint dim = slice + (workGroupIndex < rest ? 1 : 0);
const uint offset = chunkIndex * CHUNK_SIZE;
const uint xOffset = info.inputOffset + offset;
const uint yOffset = info.outputOffset + offset;

for (uint i = threadIndex; i < dim; i += N_THREADS) {
[[unroll]] for (uint i = 0; i < CHUNK_SIZE; i++) {
y[yOffset + i] = x[xOffset + i];
}
}
49 changes: 18 additions & 31 deletions src/nn/vulkan/cast-forward-f32-q80.comp
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types : enable

#define Q80_BLOCK_SIZE 32
#define N_THREADS 256
#define N_BATCHES 32

layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;

struct BatchInfo {
uint inputOffset;
Expand All @@ -27,37 +26,25 @@ layout(binding = 1) writeonly buffer outputBuffer { BlockQ80 y[]; };
layout(binding = 2) readonly uniform batchInfosBuffer { BatchInfo infos[N_BATCHES]; };

void main() {
const uint threadIndex = gl_LocalInvocationID.x;
const uint nWorkGroups = gl_NumWorkGroups.z;
const uint batchIndex = gl_WorkGroupID.y;
const uint workGroupIndex = gl_WorkGroupID.z;
const uint d = gl_WorkGroupID.z;

const BatchInfo info = infos[batchIndex];
const uint slice = info.outputSizeX / nWorkGroups;
const uint rest = info.outputSizeX % nWorkGroups;
const uint yStart = workGroupIndex * slice + min(rest, workGroupIndex);
const uint yEnd = yStart + slice + (workGroupIndex < rest ? 1 : 0);
const uint xOffset = info.inputOffset;
const uint yOffset = info.outputOffset;

for (uint i = yStart + threadIndex; i < yEnd; i += N_THREADS) {
const uint xiOffset = xOffset + i * Q80_BLOCK_SIZE;
const uint yiOffset = yOffset + i;

float amax = 0.0;
[[unroll]] for (uint j = 0; j < Q80_BLOCK_SIZE; ++j) {
const float v = abs(x[xiOffset + j]);
amax = max(amax, v);
}

const float d = amax / 127.0f;
const float id = d != 0.0f ? 1.0f / d : 0.0f;

y[yiOffset].d = float16_t(d);

[[unroll]] for (uint j = 0; j < Q80_BLOCK_SIZE; ++j) {
const float v = x[xiOffset + j];
y[yiOffset].qs[j] = int8_t(clamp(round(v * id), -127.0f, 127.0f));
}
const uint xiOffset = info.inputOffset + d * Q80_BLOCK_SIZE;
const uint yiOffset = info.outputOffset + d;

float amax = 0.0;
[[unroll]] for (uint j = 0; j < Q80_BLOCK_SIZE; ++j) {
amax = max(amax, abs(x[xiOffset + j]));
}

const float dd = amax / 127.0f;
const float id = dd != 0.0f ? 1.0f / dd : 0.0f;

y[yiOffset].d = float16_t(dd);

[[unroll]] for (uint j = 0; j < Q80_BLOCK_SIZE; ++j) {
const float v = x[xiOffset + j];
y[yiOffset].qs[j] = int8_t(clamp(round(v * id), -127.0f, 127.0f));
}
}
27 changes: 11 additions & 16 deletions src/nn/vulkan/embedding-forward-f32-f32.comp
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
#version 450

#define N_THREADS 256
#extension GL_EXT_control_flow_attributes : enable

#define CHUNK_SIZE 4
#define N_BATCHES 32

layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;

struct BatchInfo {
uint inputOffset;
Expand All @@ -20,23 +22,16 @@ layout(binding = 3) readonly buffer weightBuffer { float weight[]; };
shared uint sharedPosition;

void main() {
const uint threadIndex = gl_LocalInvocationID.x;
const uint batchIndex = gl_GlobalInvocationID.y;

if (threadIndex == 0) {
sharedPosition = uint(x[batchIndex]);
}
const uint batchIndex = gl_WorkGroupID.y;
const uint chunkIndex = gl_WorkGroupID.z;
const uint position = uint(x[batchIndex]);

barrier();

const uint position = sharedPosition;
const BatchInfo info = infos[batchIndex];
const uint offset = chunkIndex * CHUNK_SIZE;
const uint yOffset = info.outputOffset + offset;
const uint wOffset = position * info.outputSizeX + offset;

const uint outputSizeX = info.outputSizeX;
const uint yOffset = info.outputOffset;
const uint wOffset = position * outputSizeX;

for (uint i = threadIndex; i < outputSizeX; i += N_THREADS) {
[[unroll]] for (uint i = 0; i < CHUNK_SIZE; i++) {
y[yOffset + i] = weight[wOffset + i];
}
}
17 changes: 9 additions & 8 deletions src/nn/vulkan/matmul-forward-q80-q40-f32.comp
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
#extension GL_EXT_shader_16bit_storage : enable
#extension GL_EXT_shader_explicit_arithmetic_types : enable

#define N_THREADS 64
#define MAX_THREADS 256
#define N_BATCHES 32
#define TILE_SIZE_X 2
#define TILE_SIZE_D 16
#define TILE_SIZE_D 8

#define Q80_Q40_BLOCK_SIZE 32

layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

struct BatchInfo {
uint inputOffset;
Expand All @@ -35,17 +35,18 @@ layout(binding = 1) writeonly buffer outputBuffer { float y[]; };
layout(binding = 2) readonly uniform batchInfosBuffer { BatchInfo infos[N_BATCHES]; };
layout(binding = 3) readonly buffer weightBuffer { BlockQ40 weight[]; };

shared float16_t sums[N_THREADS * TILE_SIZE_D];
shared float16_t sums[MAX_THREADS * TILE_SIZE_D];

void main() {
const uint nThreads = gl_WorkGroupSize.x;
const uint threadIndex = gl_LocalInvocationID.x;
const uint batchIndex = gl_WorkGroupID.y;
const uint workGroupIndex = gl_WorkGroupID.z;
const BatchInfo info = infos[batchIndex];

const uint xTiles = info.inputSizeX / TILE_SIZE_X;
const uint xSlice = xTiles / N_THREADS;
const uint xRest = xTiles % N_THREADS;
const uint xSlice = xTiles / nThreads;
const uint xRest = xTiles % nThreads;

const uint inputOffset = info.inputOffset;
const uint inputSizeX = info.inputSizeX;
Expand Down Expand Up @@ -97,15 +98,15 @@ void main() {

barrier();

[[unroll]] for (uint i = N_THREADS / 2; i > 0; i >>= 1) {
for (uint i = nThreads / 2; i > 0; i >>= 1) {
for (uint dt = 0; dt < TILE_SIZE_D; dt++) {
if (threadIndex < i) {
sums[threadIndex * TILE_SIZE_D + dt] += sums[(threadIndex + i) * TILE_SIZE_D + dt];
}
}
barrier();
}
for (uint dt = threadIndex; dt < TILE_SIZE_D; dt += N_THREADS) {
for (uint dt = threadIndex; dt < TILE_SIZE_D; dt += nThreads) {
y[outputOffset + d + dt] = float(sums[dt]);
}
}
Loading
Loading