Skip to content

Commit 50dfb13

Browse files
authored
feat: vulkan optimization. (#196)
1 parent afa6297 commit 50dfb13

9 files changed

+72
-59
lines changed

src/nn/nn-vulkan-test.cpp

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -529,8 +529,8 @@ void testMatmul_F32_F32_F32() {
529529
}
530530

531531
void testMatmul_Q80_Q40_F32() {
532-
#define MATMUL_Q80_Q40_N 64
533-
#define MATMUL_Q80_Q40_D 96
532+
#define MATMUL_Q80_Q40_N 512
533+
#define MATMUL_Q80_Q40_D 512
534534
execute(
535535
[](NnNetConfigBuilder *netBuilder, NnNodeConfigBuilder *nodeBuilder, NnSegmentConfigBuilder *segmentBuilder) {
536536
NnUint xPipeIndex = netBuilder->addPipe("X", size2D(F_Q80, N_BATCHES, MATMUL_Q80_Q40_N));
@@ -552,19 +552,19 @@ void testMatmul_Q80_Q40_F32() {
552552
constexpr NnUint weightSize = MATMUL_Q80_Q40_N * MATMUL_Q80_Q40_D;
553553
constexpr NnUint weightBlocks = weightSize / Q40_BLOCK_SIZE;
554554

555-
float x[xSize];
556-
float weight[weightSize];
557-
NnBlockQ40 weightQ40[weightBlocks];
555+
std::unique_ptr<float[]> x(new float[xSize]);
556+
std::unique_ptr<float[]> weight(new float[weightSize]);
557+
std::unique_ptr<NnBlockQ40[]> weightQ40(new NnBlockQ40[weightBlocks]);
558558

559559
for (NnUint i = 0; i < xSize; i++)
560-
x[i] = i * 0.01f;
560+
x[i] = i * 0.001f;
561561
for (NnUint i = 0; i < weightSize; i++)
562-
weight[i] = i * 0.001f;
562+
weight[i] = i * 0.0001f;
563563

564-
quantizeF32toQ80(x, xPipe, xSize, 1, 0);
565-
quantizeF32toQ40(weight, weightQ40, weightSize, 1, 0);
564+
quantizeF32toQ80(x.get(), xPipe, xSize, 1, 0);
565+
quantizeF32toQ40(weight.get(), weightQ40.get(), weightSize, 1, 0);
566566

567-
executor->loadWeight("matmul", 0, weightBlocks * sizeof(NnBlockQ40), (NnByte *)weightQ40);
567+
executor->loadWeight("matmul", 0, weightBlocks * sizeof(NnBlockQ40), (NnByte *)weightQ40.get());
568568

569569
// act
570570
executor->forward();
@@ -576,8 +576,8 @@ void testMatmul_Q80_Q40_F32() {
576576
for (NnUint n = 0; n < MATMUL_Q80_Q40_N; n++)
577577
sum += x[b * MATMUL_Q80_Q40_N + n] * weight[d * MATMUL_Q80_Q40_N + n];
578578
const NnUint p = b * MATMUL_Q80_Q40_D + d;
579-
const float change = (yPipe[p] - sum) / sum;
580-
assertFloat(p, change, 0.0, 0.04f);
579+
const float tolerance = sum * 0.025f;
580+
assertFloat(p, yPipe[p], sum, tolerance);
581581
}
582582
}
583583
printOk("testMatmul_Q80_Q40_F32");

src/nn/vulkan/cast-forward-f32-q80.comp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
#extension GL_EXT_shader_explicit_arithmetic_types : enable
66

77
#define Q80_BLOCK_SIZE 32
8-
#define N_THREADS 64
8+
#define N_THREADS 256
99

1010
layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
1111

@@ -61,7 +61,7 @@ void main() {
6161
const uint yiOffset = yOffset + i;
6262

6363
float amax = 0.0;
64-
for (uint j = 0; j < Q80_BLOCK_SIZE; ++j) {
64+
[[unroll]] for (uint j = 0; j < Q80_BLOCK_SIZE; ++j) {
6565
const float v = abs(x[xiOffset + j]);
6666
amax = max(amax, v);
6767
}

src/nn/vulkan/matmul-forward-q80-q40-f32.comp

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77
#define Q80_Q40_BLOCK_SIZE 32
88
#define N_THREADS 256
99

10-
#define GROUP_SIZE 64
11-
#define N_THREADS_PER_GROUP (N_THREADS / GROUP_SIZE)
10+
#define N_OUTPUTS_PER_ITER 64
11+
#define N_THREADS_PER_OUTPUT (N_THREADS / N_OUTPUTS_PER_ITER)
1212

1313
layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
1414

@@ -50,13 +50,14 @@ void main() {
5050
const uint batchIndex = gl_WorkGroupID.y;
5151
const uint workGroupIndex = gl_WorkGroupID.z;
5252

53-
sharedInputOffset = infos[batchIndex].inputOffset;
54-
sharedInputSizeX = infos[batchIndex].inputSizeX;
55-
sharedOutputOffset = infos[batchIndex].outputOffset;
56-
sharedInputSizeXPerGroup = (sharedInputSizeX + N_THREADS_PER_GROUP - 1) / N_THREADS_PER_GROUP;
53+
const BatchInfo info = infos[batchIndex];
54+
sharedInputOffset = info.inputOffset;
55+
sharedInputSizeX = info.inputSizeX;
56+
sharedOutputOffset = info.outputOffset;
57+
sharedInputSizeXPerGroup = (sharedInputSizeX + N_THREADS_PER_OUTPUT - 1) / N_THREADS_PER_OUTPUT;
5758

58-
const uint ySlice = infos[batchIndex].outputSizeX / nWorkGroups;
59-
const uint yRest = infos[batchIndex].outputSizeX % nWorkGroups;
59+
const uint ySlice = info.outputSizeX / nWorkGroups;
60+
const uint yRest = info.outputSizeX % nWorkGroups;
6061
sharedStart = workGroupIndex * ySlice + (workGroupIndex < yRest ? workGroupIndex : yRest);
6162
sharedEnd = sharedStart + ySlice + (workGroupIndex < yRest ? 1 : 0);
6263
}
@@ -70,12 +71,12 @@ void main() {
7071
const uint outputOffset = sharedOutputOffset;
7172
const uint inputSizeXPerGroup = sharedInputSizeXPerGroup;
7273

73-
const uint dGroup = threadIndex / N_THREADS_PER_GROUP;
74-
const uint iGroup = threadIndex % N_THREADS_PER_GROUP;
74+
const uint dGroup = threadIndex / N_THREADS_PER_OUTPUT;
75+
const uint iGroup = threadIndex % N_THREADS_PER_OUTPUT;
7576
const uint iStart = inputSizeXPerGroup * iGroup;
7677
const uint iEnd = min(iStart + inputSizeXPerGroup, inputSizeX);
7778

78-
for (uint dBatch = sharedStart; dBatch < dEnd; dBatch += GROUP_SIZE) {
79+
for (uint dBatch = sharedStart; dBatch < dEnd; dBatch += N_OUTPUTS_PER_ITER) {
7980
const uint d = dBatch + dGroup;
8081
if (d >= dEnd) {
8182
break;
@@ -85,19 +86,20 @@ void main() {
8586
for (uint i = iStart; i < iEnd; i++) {
8687
const uint xi = inputOffset + i;
8788
const uint wi = d * inputSizeX + i;
89+
const float16_t scale = x[xi].d * weight[wi].d;
8890
[[unroll]] for (uint j = 0; j < Q80_Q40_BLOCK_SIZE / 2; j++) {
8991
sum += (
9092
float16_t(x[xi].qs[j]) * (float16_t(weight[wi].qs[j] & 0xF) - float16_t(8.0f)) +
9193
float16_t(x[xi].qs[j + Q80_Q40_BLOCK_SIZE / 2]) * (float16_t(weight[wi].qs[j] >> 4) - float16_t(8.0f))
92-
) * x[xi].d * weight[wi].d;
94+
) * scale;
9395
}
9496
}
9597
sums[threadIndex] = sum;
9698

9799
barrier();
98100
memoryBarrierShared();
99101

100-
[[unroll]] for (uint i = N_THREADS_PER_GROUP / 2; i > 0; i >>= 1) {
102+
[[unroll]] for (uint i = N_THREADS_PER_OUTPUT / 2; i > 0; i >>= 1) {
101103
if (iGroup < i)
102104
sums[threadIndex] += sums[threadIndex + i];
103105
barrier();

src/nn/vulkan/merge-add-forward-f32-f32.comp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#version 450
22

3-
#define N_THREADS 64
3+
#define N_THREADS 256
44

55
layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
66

src/nn/vulkan/merge-add-forward-q80-f32.comp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
#extension GL_EXT_shader_explicit_arithmetic_types : enable
66

77
#define Q80_BLOCK_SIZE 32
8-
#define N_THREADS 64
8+
#define N_THREADS 256
99

1010
layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
1111

src/nn/vulkan/mul-forward-f32-f32.comp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#version 450
22

3-
#define N_THREADS 64
3+
#define N_THREADS 256
44

55
layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
66

src/nn/vulkan/multi-head-att-forward-f32-f32.comp

Lines changed: 39 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#version 450
22

3-
#define N_THREADS 64
3+
#extension GL_EXT_control_flow_attributes : enable
4+
5+
#define N_THREADS 256
46

57
layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
68

@@ -36,7 +38,8 @@ layout(binding = 8) buffer attBufferBuffer { float att[]; };
3638

3739
shared BatchInfo sharedInfo;
3840
shared uint position;
39-
shared float sharedSum;
41+
shared float sharedMaxScore;
42+
shared float temp[N_THREADS];
4043

4144
void main() {
4245
const uint threadIndex = gl_LocalInvocationID.x;
@@ -45,7 +48,7 @@ void main() {
4548

4649
const uint kvMul = nHeads / nKvHeads;
4750
const uint headIndex = h / kvMul;
48-
const float headSizeRoot = sqrt(float(headSize));
51+
const float invHeadSizeRoot = 1.0 / sqrt(float(headSize));
4952

5053

5154
if (threadIndex == 0) {
@@ -61,56 +64,65 @@ void main() {
6164
const uint kvOffset = headIndex * headSize;
6265
const uint yOffset = sharedInfo.outputOffset + h * headSize;
6366

67+
float ms = -1e10f;
6468
for (uint p = threadIndex; p <= position; p += N_THREADS) {
65-
float score = 0.0;
6669
const uint kOffset = kvOffset + p * kvDim0;
70+
71+
float score = 0.0;
6772
for (uint i = 0; i < headSize; i++) {
6873
score += query[qOffset + i] * keyCache[kOffset + i];
6974
}
70-
att[attOffset + p] = score / headSizeRoot;
75+
score *= invHeadSizeRoot;
76+
ms = max(ms, score);
77+
att[attOffset + p] = score;
7178
}
7279

80+
temp[threadIndex] = ms;
81+
7382
barrier();
83+
memoryBarrierShared();
7484

75-
// softmax
76-
if (threadIndex == 0) {
77-
// TODO: split into multiple threads
78-
float maxScore = att[attOffset];
79-
for (uint p = 1; p <= position; p++) {
80-
maxScore = max(maxScore, att[attOffset + p]);
81-
}
85+
[[unroll]] for (uint i = N_THREADS / 2; i > 0; i >>= 1) {
86+
if (threadIndex < i)
87+
temp[threadIndex] = max(temp[threadIndex], temp[threadIndex + i]);
88+
barrier();
89+
}
8290

83-
float sum = 0.0;
84-
for (uint p = 0; p <= position; p++) {
85-
float v = exp(att[attOffset + p] - maxScore);
86-
att[attOffset + p] = v;
87-
sum += v;
88-
}
89-
sharedSum = sum;
91+
if (threadIndex == 0) {
92+
sharedMaxScore = temp[0];
9093
}
9194

9295
barrier();
96+
memoryBarrierShared();
9397

94-
const float sum = sharedSum;
98+
const float maxScore = sharedMaxScore;
9599

100+
float s = 0.0;
96101
for (uint p = threadIndex; p <= position; p += N_THREADS) {
97-
att[attOffset + p] /= sum;
102+
float v = exp(att[attOffset + p] - maxScore);
103+
att[attOffset + p] = v;
104+
s += v;
98105
}
99106

100-
// return
101-
for (uint i = threadIndex; i < headSize; i += N_THREADS) {
102-
y[yOffset + i] = 0.0;
107+
temp[threadIndex] = s;
108+
barrier();
109+
110+
[[unroll]] for (uint i = N_THREADS / 2; i > 0; i >>= 1) {
111+
if (threadIndex < i)
112+
temp[threadIndex] += temp[threadIndex + i];
113+
barrier();
103114
}
104115

105-
barrier();
116+
const float yScale = 1.0 / temp[0];
106117

107118
for (uint i = threadIndex; i < headSize; i += N_THREADS) {
108119
float sum = 0.0;
120+
const uint vOffset = kvOffset + i;
109121
for (uint p = 0; p <= position; p += 1) {
110122
const float a = att[attOffset + p];
111-
const float v = valueCache[kvOffset + p * kvDim0 + i];
123+
const float v = valueCache[vOffset + p * kvDim0];
112124
sum += v * a;
113125
}
114-
y[yOffset + i] = sum;
126+
y[yOffset + i] = sum * yScale;
115127
}
116128
}

src/nn/vulkan/rms-norm-forward-f32-f32-f32.comp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#version 450
22

3-
#define N_THREADS 64
3+
#define N_THREADS 256
44

55
layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
66

src/nn/vulkan/silu-forward-f32-f32.comp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ shared uint sharedYOffset;
2121

2222
void main() {
2323
const uint threadIndex = gl_LocalInvocationID.x;
24-
const uint batchIndex = gl_GlobalInvocationID.y;
2524

2625
if (threadIndex == 0) {
2726
const uint nWorkGroups = gl_NumWorkGroups.z;

0 commit comments

Comments (0)