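feat: optimized shaders.

The embedding, mul, rms-norm, shift and silu shaders now process fixed 4-element chunks, one chunk per workgroup; the output size must be a multiple of 4.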

b4rtaz · b4rtaz · commit 2f1cdc2a2bd4 · 2025-08-15T00:03:16.000+02:00
diff --git a/src/nn/nn-vulkan-test.cpp b/src/nn/nn-vulkan-test.cpp
@@ -662,16 +662,16 @@ void testMultiheadAtt_F32_F32() {
 int main() {
     initQuants();
 
-    testRmsNorm_F32_F32_F32<3>();
+    testRmsNorm_F32_F32_F32<4>();
     testRmsNorm_F32_F32_F32<1024>();
-    testRmsNorm_F32_F32_F32<3191>();
+    testRmsNorm_F32_F32_F32<3196>();
 
-    testSilu_F32_F32<3>();
+    testSilu_F32_F32<4>();
     testSilu_F32_F32<32>();
-    testSilu_F32_F32<101>();
+    testSilu_F32_F32<104>();
 
     testMul_F32_F32<32>();
-    testMul_F32_F32<47>();
+    testMul_F32_F32<48>();
 
     testMergeAdd_F32_F32();
 
@@ -686,7 +686,7 @@ int main() {
 
     testCast_F32_F32<128>();
     testCast_F32_F32<32>();
-    testCast_F32_F32<9>();
+    testCast_F32_F32<8>();
 
     testCast_F32_Q80<256>();
     testCast_F32_Q80<64>();
diff --git a/src/nn/nn-vulkan.cpp b/src/nn/nn-vulkan.cpp
@@ -539,13 +539,7 @@ static void resolveShaderGroups(const NnOpConfig *opConfig, const NnUint batchSi
         } else {
             groupCount[2] = 32;
         }
-    } else if (
-        opConfig->code == OP_MUL ||
-        opConfig->code == OP_SILU ||
-        opConfig->code == OP_SHIFT
-    )
-        groupCount[2] = 32;
-    else if (opConfig->code == OP_MATMUL) {
+    } else if (opConfig->code == OP_MATMUL) {
         if (opConfig->weightSize.floatType == F_Q40) {
             // Must be synced with the shader
             constexpr NnUint tileSizeN = 2;
@@ -562,8 +556,17 @@ static void resolveShaderGroups(const NnOpConfig *opConfig, const NnUint batchSi
         groupCount[2] = ((NnMultiHeadAttOpConfig *)opConfig->config)->nHeads0;
     else if (opConfig->code == OP_INV_RMS)
         groupCount[2] = ((NnInvRmsOpConfig *)opConfig->config)->nColumns;
-    else if (opConfig->code == OP_RMS_NORM)
-        groupCount[2] = ((NnRmsNormOpConfig *)opConfig->config)->nColumns;
+    else if (
+        opConfig->code == OP_EMBEDDING ||
+        opConfig->code == OP_RMS_NORM ||
+        opConfig->code == OP_MUL ||
+        opConfig->code == OP_SILU ||
+        opConfig->code == OP_SHIFT
+    ) {
+        constexpr NnUint chunkSize = 4;
+        assert(outputSize.x % chunkSize == 0);
+        groupCount[2] = outputSize.x / chunkSize;
+    }
 }
 
 static std::vector<uint32_t> readShader(const char *fileName) {
diff --git a/src/nn/vulkan/embedding-forward-f32-f32.comp b/src/nn/vulkan/embedding-forward-f32-f32.comp
@@ -1,9 +1,11 @@
 #version 450
 
-#define N_THREADS 256
+#extension GL_EXT_control_flow_attributes : enable
+
+#define CHUNK_SIZE 4
 #define N_BATCHES 32
 
-layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
+layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
 
 struct BatchInfo {
     uint inputOffset;
@@ -20,23 +22,16 @@ layout(binding = 3) readonly buffer weightBuffer { float weight[]; };
 shared uint sharedPosition;
 
 void main() {
-    const uint threadIndex = gl_LocalInvocationID.x;
-    const uint batchIndex = gl_GlobalInvocationID.y;
-
-    if (threadIndex == 0) {
-        sharedPosition = uint(x[batchIndex]);
-    }
+    const uint batchIndex = gl_WorkGroupID.y;
+    const uint chunkIndex = gl_WorkGroupID.z;
+    const uint position = uint(x[batchIndex]);
 
-    barrier();
-
-    const uint position = sharedPosition;
     const BatchInfo info = infos[batchIndex];
+    const uint offset = chunkIndex * CHUNK_SIZE;
+    const uint yOffset = info.outputOffset + offset;
+    const uint wOffset = position * info.outputSizeX + offset;
 
-    const uint outputSizeX = info.outputSizeX;
-    const uint yOffset = info.outputOffset;
-    const uint wOffset = position * outputSizeX;
-
-    for (uint i = threadIndex; i < outputSizeX; i += N_THREADS) {
+    [[unroll]] for (uint i = 0; i < CHUNK_SIZE; i++) {
         y[yOffset + i] = weight[wOffset + i];
     }
 }
diff --git a/src/nn/vulkan/mul-forward-f32-f32.comp b/src/nn/vulkan/mul-forward-f32-f32.comp
@@ -1,9 +1,11 @@
 #version 450
 
-#define N_THREADS 256
+#extension GL_EXT_control_flow_attributes : enable
+
+#define CHUNK_SIZE 4
 #define N_BATCHES 32
 
-layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
+layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
 
 struct BatchInfo {
     uint inputOffset;
@@ -21,21 +23,15 @@ layout(binding = 3) readonly uniform configBuffer {
 layout(binding = 4) readonly buffer multiplierBuffer { float m[]; };
 
 void main() {
-    const uint threadIndex = gl_LocalInvocationID.x;
-    const uint nWorkGroups = gl_NumWorkGroups.z;
     const uint batchIndex = gl_WorkGroupID.y;
-    const uint workGroupIndex = gl_WorkGroupID.z;
+    const uint chunkIndex = gl_WorkGroupID.z;
 
     const BatchInfo info = infos[batchIndex];
-    const uint slice = info.inputSizeX / nWorkGroups;
-    const uint rest = info.inputSizeX % nWorkGroups;
-    const uint offset = workGroupIndex * slice + min(rest, workGroupIndex);
-
-    const uint dim = slice + (workGroupIndex < rest ? 1 : 0);
+    const uint offset = chunkIndex * CHUNK_SIZE;
     const uint xyOffset = info.inputOffset + offset;
     const uint mOffset = info.inputSizeX * batchIndex + offset;
 
-    for (uint i = threadIndex; i < dim; i += N_THREADS) {
+    [[unroll]] for (uint i = 0; i < CHUNK_SIZE; i++) {
         y[xyOffset + i] = x[xyOffset + i] * m[mOffset + i];
     }
 }
diff --git a/src/nn/vulkan/rms-norm-forward-f32-f32-f32.comp b/src/nn/vulkan/rms-norm-forward-f32-f32-f32.comp
@@ -1,9 +1,11 @@
 #version 450
 
-#define N_THREADS 256
+#extension GL_EXT_control_flow_attributes : enable
+
 #define N_BATCHES 32
+#define CHUNK_SIZE 4
 
-layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
+layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
 
 struct BatchInfo {
     uint inputOffset;
@@ -22,27 +24,20 @@ layout(binding = 4) readonly uniform configBuffer {
 };
 layout(binding = 5) readonly buffer invRmsBuffer { float invRms[]; };
 
-shared float sharedS;
-
 void main() {
-    const uint threadIndex = uint(gl_LocalInvocationID.x);
     const uint batchIndex = gl_WorkGroupID.y;
-    const uint colIndex = gl_WorkGroupID.z;
-
-    if (threadIndex == 0) {
-        sharedS = invRms[batchIndex * nColumns + colIndex];
-    }
-
-    barrier();
+    const uint chunkIndex = gl_WorkGroupID.z;
 
     const BatchInfo info = infos[batchIndex];
     const uint dim = info.inputSizeX / nColumns;
-    const uint offset = dim * colIndex;
+    const uint offset = chunkIndex * CHUNK_SIZE;
+    const uint colIndex = offset / dim;
+    const float s = invRms[batchIndex * nColumns + colIndex];
+
     const uint xOffset = info.inputOffset + offset;
     const uint yOffset = info.outputOffset + offset;
-    const float s = sharedS;
 
-    for (uint i = threadIndex; i < dim; i += N_THREADS) {
-        y[yOffset + i] = (x[xOffset + i] * s) * weight[i];
+    [[unroll]] for (uint i = 0; i < CHUNK_SIZE; i++) {
+        y[yOffset + i] = (x[xOffset + i] * s) * weight[(offset + i) % dim];
     }
 }
diff --git a/src/nn/vulkan/shift-forward-f32-f32.comp b/src/nn/vulkan/shift-forward-f32-f32.comp
@@ -1,9 +1,11 @@
 #version 450
 
-#define N_THREADS 256
+#extension GL_EXT_control_flow_attributes : enable
+
+#define CHUNK_SIZE 4
 #define N_BATCHES 32
 
-layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
+layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
 
 struct BatchInfo {
     uint inputOffset;
@@ -23,28 +25,17 @@ layout(binding = 4) readonly buffer indexBuffer { float indexes[]; };
 shared uint sharedIndex;
 
 void main() {
-    const uint threadIndex = gl_LocalInvocationID.x;
-    const uint nWorkGroups = gl_NumWorkGroups.z;
     const uint batchIndex = gl_WorkGroupID.y;
-    const uint workGroupIndex = gl_WorkGroupID.z;
-
-    if (threadIndex == 0) {
-        sharedIndex = uint(indexes[batchIndex]);
-    }
-
-    barrier();
+    const uint chunkIndex = gl_WorkGroupID.z;
 
-    BatchInfo info = infos[batchIndex];
-    const uint slice = info.inputSizeX / nWorkGroups;
-    const uint rest = info.inputSizeX % nWorkGroups;
-    const uint offset = workGroupIndex * slice + min(rest, workGroupIndex);
+    const uint index = uint(indexes[batchIndex]);
 
-    const uint index = sharedIndex;
-    const uint dim = slice + (workGroupIndex < rest ? 1 : 0);;
+    const BatchInfo info = infos[batchIndex];
+    const uint offset = chunkIndex * CHUNK_SIZE;
     const uint xOffset = info.inputOffset + offset;;
     const uint yOffset = index * info.inputSizeX + offset;
 
-    for (uint i = threadIndex; i < dim; i += N_THREADS) {
+    [[unroll]] for (uint i = 0; i < CHUNK_SIZE; i++) {
         y[yOffset + i] = x[xOffset + i];
     }
 }
diff --git a/src/nn/vulkan/silu-forward-f32-f32.comp b/src/nn/vulkan/silu-forward-f32-f32.comp
@@ -1,9 +1,11 @@
 #version 450
 
-#define N_THREADS 256
+#extension GL_EXT_control_flow_attributes : enable
+
 #define N_BATCHES 32
+#define CHUNK_SIZE 4
 
-layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
+layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
 
 struct BatchInfo {
     uint inputOffset;
@@ -17,21 +19,15 @@ layout(binding = 1) writeonly buffer outputBuffer { float y[]; };
 layout(binding = 2) readonly uniform batchInfosBuffer { BatchInfo infos[N_BATCHES]; };
 
 void main() {
-    const uint threadIndex = gl_LocalInvocationID.x;
-    const uint nWorkGroups = gl_NumWorkGroups.z;
     const uint batchIndex = gl_WorkGroupID.y;
-    const uint workGroupIndex = gl_WorkGroupID.z;
+    const uint chunkIndex = gl_WorkGroupID.z;
 
     const BatchInfo info = infos[batchIndex];
-    const uint slice = info.inputSizeX / nWorkGroups;
-    const uint rest = info.inputSizeX % nWorkGroups;
-    const uint offset = workGroupIndex * slice + min(rest, workGroupIndex);
-
-    const uint dim = slice + (workGroupIndex < rest ? 1 : 0);
+    const uint offset = chunkIndex * CHUNK_SIZE;
     const uint xOffset = info.inputOffset + offset;
     const uint yOffset = info.outputOffset + offset;
 
-    for (uint i = threadIndex; i < dim; i += N_THREADS) {
+    [[unroll]] for (uint i = 0; i < CHUNK_SIZE; i++) {
         float v = x[xOffset + i];
         y[yOffset + i] = v / (1.0f + exp(-v));
     }