Commit c4562ad

Add forward method for Qwen2

1 parent f96ddc4
File tree

1 file changed: +143 -0 lines

src/main/java/org/beehive/gpullama3/inference/InferenceCore.java

Lines changed: 143 additions & 0 deletions
@@ -4,15 +4,18 @@
 import org.beehive.gpullama3.core.model.tensor.FloatTensor;
 import org.beehive.gpullama3.inference.state.Phi3State;
 import org.beehive.gpullama3.inference.state.State;
+import org.beehive.gpullama3.inference.weights.standard.Qwen2StandardWeights;
 import org.beehive.gpullama3.inference.weights.standard.Phi3StandardWeights;
 import org.beehive.gpullama3.inference.weights.standard.Qwen3StandardWeights;
 import org.beehive.gpullama3.inference.weights.standard.StandardWeights;
 import org.beehive.gpullama3.inference.weights.tornado.TornadoWeights;
 import org.beehive.gpullama3.model.Configuration;
 import org.beehive.gpullama3.model.Model;
+import org.beehive.gpullama3.model.qwen2.Qwen2Configuration;
 import org.beehive.gpullama3.model.phi3.Phi3Configuration;
 import org.beehive.gpullama3.model.qwen3.Qwen3Configuration;
 import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlan;
+
 import uk.ac.manchester.tornado.api.types.arrays.FloatArray;
 
 import java.lang.foreign.MemorySegment;
@@ -176,6 +179,146 @@ public static FloatTensor forwardJava(Model model, State state, int token, int position)
         return state.logits;
     }
 
+    public static FloatTensor forwardJavaQwen2(Model model, State state, int token, int position) {
+        final Qwen2Configuration config = (Qwen2Configuration) model.configuration();
+        final Qwen2StandardWeights weights = (Qwen2StandardWeights) model.weights();
+        int dim = config.dim();
+        int headSize = config.headSize();
+        int kvDim = (config.dim() * config.numberOfKeyValueHeads()) / config.numberOfHeads();
+        int kvMul = config.numberOfHeads() / config.numberOfKeyValueHeads(); // integer multiplier of the kv sharing in multiquery
+        float sqrtHeadSize = (float) Math.sqrt(headSize);
+
+        weights.token_embedding_table.copyTo(token * dim, state.x, 0, dim);
+
+        // forward all the layers
+        for (int l = 0; l < config.numberOfLayers(); l++) {
+            // attention rmsnorm
+            final int curLayer = l;
+            rmsnorm(state.xb, state.x, weights.rms_att_weight[curLayer], 0, dim, config.rmsNormEps());
+
+            // qkv matmuls for this position
+            weights.wq[l].matmul(state.xb, state.q, dim, dim);
+            weights.wk[l].matmul(state.xb, state.k, kvDim, dim);
+            weights.wv[l].matmul(state.xb, state.v, kvDim, dim);
+
+            if ((weights.q_bias != null && weights.q_bias[curLayer] != null)
+                    || (weights.k_bias != null && weights.k_bias[curLayer] != null)
+                    || (weights.v_bias != null && weights.v_bias[curLayer] != null)) {
+                if (weights.q_bias != null && weights.q_bias[curLayer] != null) {
+                    state.q.addInPlace(weights.q_bias[curLayer]);
+                }
+                if (weights.k_bias != null && weights.k_bias[curLayer] != null) {
+                    state.k.addInPlace(weights.k_bias[curLayer]);
+                }
+                if (weights.v_bias != null && weights.v_bias[curLayer] != null) {
+                    state.v.addInPlace(weights.v_bias[curLayer]);
+                }
+            }
+
+            // RoPE relative positional encoding: complex-valued rotate q and k in each head
+            // GPT-NeoX style RoPE, real/imaginary components are stored with a headSize/2 offset per head, instead of consecutive.
+            for (int h = 0; h < config.numberOfHeads(); ++h) {
+                int rotn = h < config.numberOfKeyValueHeads() ? 2 : 1; // how many vectors? 2 = q & k, 1 = q only
+                int poffset = h * headSize;
+                for (int i0 = 0; i0 < headSize; i0 += 2) {
+                    int ic = i0 / 2;
+                    float fcr = weights.freq_cis_real.getFloat(position * (headSize / 2) + ic);
+                    float fci = weights.freq_cis_imag.getFloat(position * (headSize / 2) + ic);
+                    for (int vi = 0; vi < rotn; vi++) {
+                        FloatTensor vec = (vi == 0) ? state.q : state.k; // the vector to rotate (query or key)
+                        float v0 = vec.getFloat(poffset + ic);
+                        float v1 = vec.getFloat(poffset + ic + headSize / 2);
+                        vec.setFloat(poffset + ic, v0 * fcr - v1 * fci);
+                        vec.setFloat(poffset + ic + headSize / 2, v0 * fci + v1 * fcr);
+                    }
+                }
+            }
+
+            // save key,value at this time step (position) to our kv cache
+            //int loff = l * config.seq_len * kvDim; // kv cache layer offset for convenience
+            state.k.copyTo(0, state.keyCache[curLayer], position * kvDim, kvDim);
+            state.v.copyTo(0, state.valueCache[curLayer], position * kvDim, kvDim);
+
+            // multihead attention. iterate over all heads
+            Parallel.parallelFor(0, config.numberOfHeads(), h -> {
+                // get the query vector for this head
+                // float* q = s.q + h * headSize;
+                int qOffset = h * headSize;
+
+                // attention scores for this head
+                // float* att = s.att + h * config.seq_len;
+                int attOffset = h * config.contextLength();
+
+                // iterate over all timesteps, including the current one
+                for (int t = 0; t <= position; t++) {
+                    // get the key vector for this head and at this timestep
+                    // float* k = s.key_cache + loff + t * dim + h * headSize;
+                    int keyCacheOffset = /* loff + */ t * kvDim + (h / kvMul) * headSize;
+                    // calculate the attention score as the dot product of q and k
+                    float score = state.q.dot(qOffset, state.keyCache[curLayer], keyCacheOffset, headSize);
+                    score /= sqrtHeadSize;
+                    // save the score to the attention buffer
+                    state.att.setFloat(attOffset + t, score);
+                }
+
+                // softmax the scores to get attention weights, from 0..position inclusively
+                state.att.softmaxInPlace(attOffset, position + 1);
+
+                // weighted sum of the values, store back into xb
+                // float* xb = s.xb + h * headSize;
+                int xbOffset = h * headSize;
+                // memset(xb, 0, headSize * sizeof(float));
+                state.xb.fillInPlace(xbOffset, headSize, 0f);
+
+                for (int t = 0; t <= position; t++) {
+                    // get the value vector for this head and at this timestep
+                    // float* v = s.value_cache + loff + t * dim + h * headSize;
+                    int vOffset = /* loff + */ t * kvDim + (h / kvMul) * headSize;
+                    // get the attention weight for this timestep
+                    float a = state.att.getFloat(attOffset + t);
+                    // accumulate the weighted value into xb
+                    state.xb.saxpyInPlace(xbOffset, state.valueCache[curLayer], vOffset, headSize, a);
+                }
+            });
+
+            // final matmul to get the output of the attention
+            weights.wo[l].matmul(state.xb, state.xb2, dim, dim);
+
+            // residual connection back into x
+            state.x.addInPlace(state.xb2);
+
+            // ffn rmsnorm
+            rmsnorm(state.xb, state.x, weights.rms_ffn_weight[curLayer], 0, dim, config.rmsNormEps());
+
+            // Now for FFN in PyTorch we have: self.w2(F.silu(self.w1(x)) * self.w3(x))
+            // first calculate self.w1(x) and self.w3(x)
+            weights.w1[l].matmul(state.xb, state.hb, config.hiddenDim(), dim);
+            weights.w3[l].matmul(state.xb, state.hb2, config.hiddenDim(), dim);
+
+            // SwiGLU non-linearity
+            // silu(x)=x*σ(x), where σ(x) is the logistic sigmoid
+            state.hb.mapInPlace(value -> value / (float) (1.0 + Math.exp(-value)));
+
+            // elementwise multiply with w3(x)
+            state.hb.multiplyInPlace(state.hb2);
+
+            // final matmul to get the output of the ffn
+            weights.w2[l].matmul(state.hb, state.xb, dim, config.hiddenDim());
+
+            // residual connection
+            state.x.addInPlace(state.xb);
+
+        }
+
+        // final rmsnorm
+        rmsnorm(state.x, state.x, weights.rms_final_weight, 0, dim, config.rmsNormEps());
+
+        // classifier into logits
+        weights.wcls.matmul(state.x, state.logits, config.vocabularySize(), dim);
+
+        return state.logits;
+    }
+
     public static FloatTensor forwardJavaQwen3(Model model, State state, int token, int position) {
         // a few convenience variables
         final Qwen3Configuration config = (Qwen3Configuration) model.configuration();
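Note on reaching the new path: this commit only adds forwardJavaQwen2 next to the existing forwardJava and forwardJavaQwen3, so a caller presumably selects it by the concrete Configuration type, mirroring the casts each method performs internally. A minimal dispatch sketch under that assumption; the wrapper class and the method name `forward` are hypothetical, only the three forwardJava* methods are confirmed by this file:

// Hypothetical dispatch sketch -- not part of this commit.
import org.beehive.gpullama3.core.model.tensor.FloatTensor;
import org.beehive.gpullama3.inference.InferenceCore;
import org.beehive.gpullama3.inference.state.State;
import org.beehive.gpullama3.model.Model;
import org.beehive.gpullama3.model.qwen2.Qwen2Configuration;
import org.beehive.gpullama3.model.qwen3.Qwen3Configuration;

public final class ForwardDispatch {
    public static FloatTensor forward(Model model, State state, int token, int position) {
        // Route on the concrete Configuration subtype, the same type each
        // forwardJava* method casts to internally.
        if (model.configuration() instanceof Qwen2Configuration) {
            return InferenceCore.forwardJavaQwen2(model, state, token, position);
        }
        if (model.configuration() instanceof Qwen3Configuration) {
            return InferenceCore.forwardJavaQwen3(model, state, token, position);
        }
        return InferenceCore.forwardJava(model, state, token, position);
    }
}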

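The GPT-NeoX-style RoPE loop in the diff rotates element i against element i + headSize/2 rather than against its immediate neighbour. A standalone sketch of that rotation on a plain float[], recomputing cos/sin from a base frequency; the base value is an assumption here, since the committed code reads precomputed freq_cis_real/freq_cis_imag tables instead:

// Self-contained illustration of the half-offset RoPE layout; all names local.
public final class RopeNeoxDemo {
    static void rotateHead(float[] head, int position, double base) {
        int half = head.length / 2;
        for (int ic = 0; ic < half; ic++) {
            // theta_i = position * base^(-2i / headSize), standard RoPE frequency
            double theta = position * Math.pow(base, -2.0 * ic / head.length);
            float fcr = (float) Math.cos(theta);
            float fci = (float) Math.sin(theta);
            float v0 = head[ic];         // "real" component
            float v1 = head[ic + half];  // "imaginary" component, headSize/2 away
            head[ic] = v0 * fcr - v1 * fci;
            head[ic + half] = v0 * fci + v1 * fcr;
        }
    }

    public static void main(String[] args) {
        float[] head = {1f, 0f, 0f, 0f}; // headSize = 4, so pairs are (0,2) and (1,3)
        rotateHead(head, 1, 10_000.0);
        // prints roughly [0.5403, 0.0, 0.8415, 0.0]: (1,0) rotated by 1 radian
        System.out.println(java.util.Arrays.toString(head));
    }
}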

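The `(h / kvMul)` term in the key/value cache offsets implements grouped-query attention: each block of kvMul consecutive query heads shares one key/value head, which is why the cache rows have size kvDim rather than dim. A tiny worked example with illustrative head counts (not taken from any particular Qwen2 checkpoint):

// Worked example of the query-head -> kv-head mapping used in the offsets above.
public final class GqaMappingDemo {
    public static void main(String[] args) {
        int numberOfHeads = 8;
        int numberOfKeyValueHeads = 2;
        int kvMul = numberOfHeads / numberOfKeyValueHeads; // 4 query heads per kv head
        for (int h = 0; h < numberOfHeads; h++) {
            System.out.printf("query head %d -> kv head %d%n", h, h / kvMul);
        }
        // prints: heads 0-3 -> kv head 0, heads 4-7 -> kv head 1
    }
}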