
Commit a8aeaf8

Improve attention performance for qwen2.5 & deepseek
1 parent 1403b4f commit a8aeaf8

File tree

2 files changed: +146 -6 lines changed
src/main/java/org/beehive/gpullama3/tornadovm/Qwen2Kernels.java (new file)

Lines changed: 131 additions & 0 deletions

@@ -0,0 +1,131 @@
package org.beehive.gpullama3.tornadovm;

import uk.ac.manchester.tornado.api.KernelContext;
import uk.ac.manchester.tornado.api.math.TornadoMath;
import uk.ac.manchester.tornado.api.types.arrays.FloatArray;
import uk.ac.manchester.tornado.api.types.arrays.IntArray;

public class Qwen2Kernels {

    public static void processHeadsFlashAttention(KernelContext context, FloatArray q, FloatArray key_cache, FloatArray value_cache, FloatArray xb, int nHeads, int headSize, int kvDim, int kvMul,
            IntArray positionHolder, int layer, int contextLength) {

        // Thread and workgroup information
        int globalTid = context.globalIdx;
        int localTid = context.localIdx;
        int localSize = context.localGroupSizeX;
        int workgroupId = context.groupIdx;

        // Calculate which head this workgroup processes
        int h = workgroupId;

        // Early exit if beyond head count
        if (h >= nHeads) {
            return;
        }

        int pos = positionHolder.get(0);
        int loff = layer * contextLength * kvDim;
        int kvHeadIdx = h / kvMul;
        int BLOCK_SIZE_C = 8;

        // Allocate shared memory for tiled computation
        float[] q_shared = context.allocateFloatLocalArray(headSize);
        float[] k_tile = context.allocateFloatLocalArray(BLOCK_SIZE_C * headSize);
        float[] v_tile = context.allocateFloatLocalArray(BLOCK_SIZE_C * headSize);
        float[] s_tile = context.allocateFloatLocalArray(BLOCK_SIZE_C);
        float[] shared_max = context.allocateFloatLocalArray(1);

        // Per-thread output accumulation
        float[] output = new float[headSize];
        for (int i = 0; i < headSize; i++) {
            output[i] = 0.0f;
        }

        // Thread-local accumulators for online softmax
        float maxScore = Float.NEGATIVE_INFINITY;
        float sumExp = 0.0f;

        // Cooperatively load query vector into shared memory
        for (int i = localTid; i < headSize; i += localSize) {
            q_shared[i] = q.get(h * headSize + i);
        }
        context.localBarrier();

        // Process sequence in tiles
        for (int tileC = 0; tileC <= pos; tileC += BLOCK_SIZE_C) {
            int tileEnd = Math.min(tileC + BLOCK_SIZE_C - 1, pos);

            // Cooperatively load key and value vectors for this tile
            for (int tIdxInSeq = tileC + localTid; tIdxInSeq <= tileEnd; tIdxInSeq += localSize) {
                int k_v_idx_in_tile = tIdxInSeq - tileC;
                int tileMemOffset = k_v_idx_in_tile * headSize;

                for (int d = 0; d < headSize; d++) {
                    int kvCacheAbsolutePos = tIdxInSeq;
                    int kvOffset = loff + kvCacheAbsolutePos * kvDim + kvHeadIdx * headSize + d;
                    k_tile[tileMemOffset + d] = key_cache.get(kvOffset);
                    v_tile[tileMemOffset + d] = value_cache.get(kvOffset);
                }
            }
            context.localBarrier();

            // Cooperatively compute attention scores for this tile
            for (int tIdxInSeq = tileC + localTid; tIdxInSeq <= tileEnd; tIdxInSeq += localSize) {
                int score_idx_in_tile = tIdxInSeq - tileC;

                float score = 0.0f;
                for (int d = 0; d < headSize; d++) {
                    score += q_shared[d] * k_tile[score_idx_in_tile * headSize + d];
                }
                score /= TornadoMath.sqrt(headSize);
                s_tile[score_idx_in_tile] = score;
            }
            context.localBarrier();

            // Find max score in this tile using reduction
            float tileLocalMax = Float.NEGATIVE_INFINITY;
            for (int i = 0; i <= tileEnd - tileC; i++) {
                if (s_tile[i] > tileLocalMax) {
                    tileLocalMax = s_tile[i];
                }
            }

            // Thread 0 broadcasts the max
            if (localTid == 0) {
                shared_max[0] = tileLocalMax;
            }
            context.localBarrier();
            float currentTileMax = shared_max[0];

            // Update global max and rescale if needed
            float newMax = Math.max(maxScore, currentTileMax);
            if (newMax != maxScore && maxScore != Float.NEGATIVE_INFINITY) {
                float scale = TornadoMath.exp(maxScore - newMax);
                sumExp *= scale;
                for (int d = 0; d < headSize; d++) {
                    output[d] *= scale;
                }
            }
            maxScore = newMax;

            // Process each key-value pair in the tile
            for (int t_idx_in_s_tile = 0; t_idx_in_s_tile <= tileEnd - tileC; t_idx_in_s_tile++) {
                float expScore = TornadoMath.exp(s_tile[t_idx_in_s_tile] - maxScore);
                sumExp += expScore;

                // Accumulate weighted values
                for (int d = 0; d < headSize; d++) {
                    output[d] += expScore * v_tile[t_idx_in_s_tile * headSize + d];
                }
            }
            context.localBarrier();
        }

        // Normalize and cooperatively write final results
        float normFactor = (sumExp > 0.0f) ? (1.0f / sumExp) : 0.0f;
        for (int d = localTid; d < headSize; d += localSize) {
            xb.set(h * headSize + d, output[d] * normFactor);
        }
    }
}
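
For reference, the kernel above implements per-head flash attention with an online softmax: each workgroup handles one head, keys and values are loaded in tiles of BLOCK_SIZE_C positions, a running maximum and exp-sum are rescaled whenever a new tile raises the maximum, and the accumulated value sum is normalized once at the end. A minimal single-threaded CPU sketch of the same math, useful only as a reference when validating the kernel (hypothetical helper, plain float[] arrays in place of TornadoVM's FloatArray, same loff/kvDim/kvMul cache layout):

// Hypothetical CPU reference for one head; mirrors the online-softmax accumulation
// in Qwen2Kernels.processHeadsFlashAttention. Not part of this commit.
static void referenceHeadAttention(float[] q, float[] keyCache, float[] valueCache, float[] xb,
        int h, int pos, int headSize, int kvDim, int kvMul, int loff) {
    int kvHeadIdx = h / kvMul;
    float maxScore = Float.NEGATIVE_INFINITY;
    float sumExp = 0.0f;
    float[] output = new float[headSize];

    for (int t = 0; t <= pos; t++) {
        // Attention score: dot(q_h, k_t) / sqrt(headSize)
        int kvOffset = loff + t * kvDim + kvHeadIdx * headSize;
        float score = 0.0f;
        for (int d = 0; d < headSize; d++) {
            score += q[h * headSize + d] * keyCache[kvOffset + d];
        }
        score /= (float) Math.sqrt(headSize);

        // Online softmax: when the running max grows, rescale what was accumulated so far
        float newMax = Math.max(maxScore, score);
        if (newMax != maxScore && maxScore != Float.NEGATIVE_INFINITY) {
            float scale = (float) Math.exp(maxScore - newMax);
            sumExp *= scale;
            for (int d = 0; d < headSize; d++) {
                output[d] *= scale;
            }
        }
        maxScore = newMax;

        // Accumulate exp-weighted value vector
        float expScore = (float) Math.exp(score - maxScore);
        sumExp += expScore;
        for (int d = 0; d < headSize; d++) {
            output[d] += expScore * valueCache[kvOffset + d];
        }
    }

    // Final normalization, matching the kernel's single division at the end
    float normFactor = (sumExp > 0.0f) ? (1.0f / sumExp) : 0.0f;
    for (int d = 0; d < headSize; d++) {
        xb[h * headSize + d] = output[d] * normFactor;
    }
}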

src/main/java/org/beehive/gpullama3/tornadovm/Qwen2TornadoVMLayerPlanner.java

Lines changed: 15 additions & 6 deletions
@@ -83,7 +83,7 @@ public Tuple2<List<ImmutableTaskGraph>, GridScheduler> setupTornadoForwardPlanLa
                 config.headSize())
             .task("copyToCaches", TransformerComputeKernelsLayered::copyToCache,
                 state.wrapKeyCache, state.wrapK, state.wrapValueCache, state.wrapV, state.positionHolder, config.kvDim(), layerIndex, config.contextLength())
-            .task("parallel-attention", TransformerComputeKernelsLayered::processHeadsFlashAttention, context,
+            .task("parallel-attention", Qwen2Kernels::processHeadsFlashAttention, context,
                 state.wrapQ, state.wrapKeyCache, state.wrapValueCache, state.wrapXb,
                 config.numberOfHeads(), config.headSize(), config.kvDim(), config.kvMul(),
                 state.positionHolder, layerIndex, config.contextLength())
@@ -190,12 +190,21 @@ private GridScheduler setupQwen2GridSchedulersLayeredNonNvidia() {
         rmsNormWorker.setLocalWork(32, 1, 1); // Set local work size to 32

         // Parallel attention worker configuration
-        // OpenCL equivalent: clEnqueueNDRangeKernel(globalWorkSize=[config.numberOfHeads,1,1], localWorkSize=[4,1,1])
-        // CUDA equivalent: kernel<<<dim3((config.numberOfHeads+3)/4,1,1), dim3(4,1,1)>>>
+        // Calculate optimal local work size based on head dimension
+        int optimalLocalSize = Math.min(config.headSize(), 64); // Start with 64 threads per head
+        if (config.headSize() % optimalLocalSize != 0) {
+            // Find largest divisor of headSize <= 64
+            for (int size = 64; size >= 1; size--) {
+                if (config.headSize() % size == 0) {
+                    optimalLocalSize = size;
+                    break;
+                }
+            }
+        }
+
         WorkerGrid parallelAttentionWorker = new WorkerGrid1D(config.numberOfHeads());
-        // the global group work size is numberOfHeads * localWorkGroupSize, where the localWorkGroupSize is currently 4
-        parallelAttentionWorker.setGlobalWork(config.numberOfHeads(), 1, 1);
-        parallelAttentionWorker.setLocalWork(1, 1, 1); // Set local work size to 4 (for parallel attention)
+        parallelAttentionWorker.setGlobalWork(config.numberOfHeads() * optimalLocalSize, 1, 1);
+        parallelAttentionWorker.setLocalWork(optimalLocalSize, 1, 1);

         // Copy to caches worker configuration
         // OpenCL equivalent: clEnqueueNDRangeKernel(globalWorkSize=[config.dim,1,1], localWorkSize=[128,1,1])
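
The scheduler change above launches one workgroup per attention head: the local work size becomes the largest divisor of headSize that does not exceed 64 (so the kernel's cooperative loops over headSize divide evenly), and the global work size is numberOfHeads * optimalLocalSize. A small self-contained sketch of that selection (hypothetical helper name, not part of the commit):

// Pick the largest local work size <= 64 that divides headSize evenly,
// mirroring the divisor search in setupQwen2GridSchedulersLayeredNonNvidia.
static int pickLocalWorkSize(int headSize) {
    int optimalLocalSize = Math.min(headSize, 64);
    if (headSize % optimalLocalSize != 0) {
        for (int size = 64; size >= 1; size--) {
            if (headSize % size == 0) {
                optimalLocalSize = size;
                break;
            }
        }
    }
    return optimalLocalSize;
}

// Examples: headSize = 64 or 128 gives 64; headSize = 80 gives 40 (its largest divisor <= 64).
// With localWork = optimalLocalSize and globalWork = numberOfHeads * optimalLocalSize,
// exactly one workgroup is dispatched per head.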
