feat: tweaks (Q40 matmul workgroup size set via a specialization constant; shaders compiled for Vulkan 1.2).

b4rtaz · b4rtaz · commit 7cf3ee9049ab · 2025-08-16T23:41:24.000+02:00
diff --git a/Makefile b/Makefile
@@ -70,7 +70,7 @@ VULKAN_SHADER_BINS := $(VULKAN_SHADER_SRCS:.comp=.spv)
 DEPS += $(VULKAN_SHADER_BINS)
 
 %.spv: %.comp
-	$(CGLSLC) -c $< -o $@ --target-env=vulkan1.1
+	$(CGLSLC) -c $< -o $@ --target-env=vulkan1.2
 nn-vulkan-test: src/nn/nn-vulkan-test.cpp nn-quants.o nn-core.o nn-executor.o nn-vulkan.o ${DEPS}
 	$(CXX) $(CXXFLAGS) $(filter-out %.spv, $^) -o $@ $(LIBS)
 endif
diff --git a/src/nn/nn-vulkan.cpp b/src/nn/nn-vulkan.cpp
@@ -522,6 +522,26 @@ static std::vector<NnVulkanBatchInfo> buildBatchInfo(NnOpConfig *opConfig, NnVul
     return offset;
 }
 
+// Rounds n up to a power of two, then clamps to [min, max]; doubling stops at max / 2 so p << 1 never wraps.
+static NnUint roundUpPow2(NnUint n, NnUint min, NnUint max) {
+    NnUint p = 1;
+    while (p < n && p <= max / 2) p <<= 1;
+    if (p < n) p = max; // the next power of two would exceed max
+    if (p < min) p = min;
+    return p > max ? max : p;
+}
+
+// Thread count for an op's shader: for a matmul with Q40 weights it is
+// inputSize.x / (Q40_BLOCK_SIZE * 2) rounded up to a power of two and clamped to [32, 256].
+// Every other op yields 0.
+static uint32_t resolveShaderNThreads(const NnOpConfig *opConfig, const NnSize2D inputSize) {
+    const bool isQ40Matmul = opConfig->code == OP_MATMUL && opConfig->weightSize.floatType == F_Q40;
+    if (!isQ40Matmul)
+        return 0;
+    constexpr NnUint maxThreads = 256; // Shader constant
+    return roundUpPow2(inputSize.x / (Q40_BLOCK_SIZE * 2), 32, maxThreads);
+}
+
 static void resolveShaderGroups(const NnOpConfig *opConfig, const NnUint batchSize, NnUint *groupCount, const NnSize2D inputSize, const NnSize2D outputSize) {
     groupCount[0] = 1;
     groupCount[1] = batchSize;
@@ -531,7 +551,7 @@ static void resolveShaderGroups(const NnOpConfig *opConfig, const NnUint batchSi
         if (outputSize.floatType == F_Q80) {
             groupCount[2] = outputSize.x / Q80_BLOCK_SIZE;
         } else {
-            constexpr NnUint chunkSize = 4;
+            constexpr NnUint chunkSize = 4; // Shader constant
             groupCount[2] = outputSize.x / chunkSize;
         }
     } else if (opConfig->code == OP_MERGE_ADD) {
@@ -542,9 +562,8 @@ static void resolveShaderGroups(const NnOpConfig *opConfig, const NnUint batchSi
         }
     } else if (opConfig->code == OP_MATMUL) {
         if (opConfig->weightSize.floatType == F_Q40) {
-            // Must be synced with the shader
-            constexpr NnUint tileSizeN = 2;
-            constexpr NnUint tileSizeD = 8;
+            constexpr NnUint tileSizeN = 2; // Shader constant
+            constexpr NnUint tileSizeD = 8; // Shader constant
             const NnUint blockSize = getBlockSize(opConfig->weightSize.floatType);
             assert(opConfig->weightSize.y % (tileSizeN * blockSize) == 0);
             assert(opConfig->weightSize.x % tileSizeD == 0);
@@ -564,7 +583,7 @@ static void resolveShaderGroups(const NnOpConfig *opConfig, const NnUint batchSi
         opConfig->code == OP_SILU ||
         opConfig->code == OP_SHIFT
     ) {
-        constexpr NnUint chunkSize = 4;
+        constexpr NnUint chunkSize = 4; // Shader constant
         assert(outputSize.x % chunkSize == 0);
         groupCount[2] = outputSize.x / chunkSize;
     }
@@ -663,12 +682,10 @@ NnVulkanDeviceSegment::NnVulkanDeviceSegment(NnVulkanContext *context, NnVulkanD
 
     std::vector<vk::PipelineShaderStageCreateInfo> shaderCreateInfos(segmentConfig->nOps);
 
-    constexpr NnUint maxConsts = 3;
-    std::vector<NnUint> nConsts(segmentConfig->nOps);
-    std::vector<int> consts(segmentConfig->nOps * maxConsts);
     std::vector<vk::SpecializationInfo> specInfos(segmentConfig->nOps);
-    std::vector<vk::SpecializationMapEntry> specMapEntries(segmentConfig->nOps * maxConsts);
-    
+    std::vector<vk::SpecializationMapEntry> specEntries(segmentConfig->nOps);
+    std::vector<uint32_t> nThreads(segmentConfig->nOps);
+
     for (NnUint opIndex = 0; opIndex < segmentConfig->nOps; opIndex++) {
         NnOpConfig *opConfig = &segmentConfig->ops[opIndex];
         NnSize2D inputSize = data->resolveBufferSize(&opConfig->input);
@@ -690,12 +707,17 @@ NnVulkanDeviceSegment::NnVulkanDeviceSegment(NnVulkanContext *context, NnVulkanD
             code.data()
         );
 
+        nThreads[opIndex] = resolveShaderNThreads(opConfig, inputSize);
+        specEntries[opIndex] = vk::SpecializationMapEntry(0, 0, sizeof(uint32_t));
+        specInfos[opIndex] = vk::SpecializationInfo(1, &specEntries[opIndex], sizeof(uint32_t), &nThreads[opIndex]);
+
         vk::ShaderModule shaderModule = context->device.createShaderModule(shaderModuleCreateInfo);
         vk::PipelineShaderStageCreateInfo shaderCreateInfo(
             vk::PipelineShaderStageCreateFlags(),
             vk::ShaderStageFlagBits::eCompute,
             shaderModule,
-            "main"
+            "main",
+            &specInfos[opIndex]
         );
 
         shaderModules[opIndex] = shaderModule;
diff --git a/src/nn/vulkan/matmul-forward-q80-q40-f32.comp b/src/nn/vulkan/matmul-forward-q80-q40-f32.comp
@@ -4,14 +4,14 @@
 #extension GL_EXT_shader_16bit_storage : enable
 #extension GL_EXT_shader_explicit_arithmetic_types : enable
 
-#define N_THREADS 64
+#define MAX_THREADS 256
 #define N_BATCHES 32
 #define TILE_SIZE_X 2
 #define TILE_SIZE_D 8
 
 #define Q80_Q40_BLOCK_SIZE 32
 
-layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
 
 struct BatchInfo {
     uint inputOffset;
@@ -35,17 +35,18 @@ layout(binding = 1) writeonly buffer outputBuffer { float y[]; };
 layout(binding = 2) readonly uniform batchInfosBuffer { BatchInfo infos[N_BATCHES]; };
 layout(binding = 3) readonly buffer weightBuffer { BlockQ40 weight[]; };
 
-shared float16_t sums[N_THREADS * TILE_SIZE_D];
+shared float16_t sums[MAX_THREADS * TILE_SIZE_D];
 
 void main() {
+    const uint nThreads = gl_WorkGroupSize.x;
     const uint threadIndex = gl_LocalInvocationID.x;
     const uint batchIndex = gl_WorkGroupID.y;
     const uint workGroupIndex = gl_WorkGroupID.z;
     const BatchInfo info = infos[batchIndex];
 
     const uint xTiles = info.inputSizeX / TILE_SIZE_X;
-    const uint xSlice = xTiles / N_THREADS;
-    const uint xRest = xTiles % N_THREADS;
+    const uint xSlice = xTiles / nThreads;
+    const uint xRest = xTiles % nThreads;
 
     const uint inputOffset = info.inputOffset;
     const uint inputSizeX = info.inputSizeX;
@@ -97,15 +98,15 @@ void main() {
 
     barrier();
 
-    [[unroll]] for (uint i = N_THREADS / 2; i > 0; i >>= 1) {
+    for (uint i = nThreads / 2; i > 0; i >>= 1) {
         for (uint dt = 0; dt < TILE_SIZE_D; dt++) {
             if (threadIndex < i) {
                 sums[threadIndex * TILE_SIZE_D + dt] += sums[(threadIndex + i) * TILE_SIZE_D + dt];
             }
         }
         barrier();
     }
-    for (uint dt = threadIndex; dt < TILE_SIZE_D; dt += N_THREADS) {
+    for (uint dt = threadIndex; dt < TILE_SIZE_D; dt += nThreads) {
         y[outputOffset + d + dt] = float(sums[dt]);
     }
 }