
Commit be93182

Merge pull request #38 from mikepapadim/phi-3
[models][phi-3] Support for Microsoft's Phi-3 models
2 parents d053e9c + 3cc7ee6 commit be93182

21 files changed: +1821 -319 lines changed

src/main/java/com/example/inference/InferenceCore.java

Lines changed: 115 additions & 2 deletions
@@ -2,12 +2,15 @@
 
 import com.example.auxiliary.Parallel;
 import com.example.core.model.tensor.FloatTensor;
+import com.example.inference.state.Phi3State;
 import com.example.inference.state.State;
+import com.example.inference.weights.standard.Phi3StandardWeights;
 import com.example.inference.weights.standard.Qwen3StandardWeights;
 import com.example.inference.weights.standard.StandardWeights;
 import com.example.inference.weights.tornado.TornadoWeights;
 import com.example.model.Configuration;
 import com.example.model.Model;
+import com.example.model.phi3.Phi3Configuration;
 import com.example.model.qwen3.Qwen3Configuration;
 import com.example.tornadovm.TornadoVMMasterPlan;
 import uk.ac.manchester.tornado.api.types.arrays.FloatArray;
@@ -18,8 +21,7 @@
  * Low-level operations for model inference.
  *
  * <p>
- * This class provides core computational operations such as RMS normalization and
- * forward passes through model layers. It supports both CPU and GPU implementations.
+ * This class provides core computational operations such as RMS normalization and forward passes through model layers. It supports both CPU and GPU implementations.
  * </p>
  *
  * <p>
@@ -308,6 +310,117 @@ public static FloatTensor forwardJavaQwen3(Model model, State state, int token,
         return state.logits;
     }
 
+    public static FloatTensor forwardJavaPhi3(Model model, Phi3State state, int token, int position) {
+        Phi3Configuration config = (Phi3Configuration) model.configuration();
+        Phi3StandardWeights weights = (Phi3StandardWeights) model.weights();
+        int dim = config.dim();
+        int headSize = config.headSize();
+        int kvDim = (config.dim() * config.numberOfKeyValueHeads()) / config.numberOfHeads();
+        int kvMul = config.numberOfHeads() / config.numberOfKeyValueHeads(); // integer multiplier of the kv sharing in multiquery
+        float sqrtHeadSize = (float) Math.sqrt(headSize);
+
+        // copy the token embedding into x
+        weights.token_embedding_table.copyTo(token * dim, state.x, 0, dim);
+
+        // Phi3: op_size = num_heads * head_dim + 2 * (num_key_value_heads * head_dim)
+        final int opSize = dim + 2 * (config.numberOfKeyValueHeads() * headSize);
+
+        // forward all the layers
+        for (int l = 0; l < config.numberOfLayers(); l++) {
+            rmsnorm(state.xb, state.x, weights.rms_att_weight[l], 0, dim, config.rmsNormEps());
+
+            weights.wqkv[l].matmul(state.xb, state.qkv, opSize, dim);
+            state.qkv.copyTo(0, state.q, 0, dim);
+            // key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
+            state.qkv.copyTo(dim, state.k, 0, config.numberOfKeyValueHeads() * headSize);
+            // value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
+            state.qkv.copyTo(dim + config.numberOfKeyValueHeads() * headSize, state.v, 0, config.numberOfKeyValueHeads() * headSize);
+
+            int dimHalf = headSize / 2;
+            for (int i = 0; i < dim; i += 2) {
+                int head_dim = i % headSize;
+                int base = i - head_dim;
+                int ic = base + head_dim / 2;
+                float fcr = weights.freq_cis_real.getFloat(position * (headSize / 2) + (head_dim / 2));
+                float fci = weights.freq_cis_imag.getFloat(position * (headSize / 2) + (head_dim / 2));
+                int rotn = i < kvDim ? 2 : 1; // how many vectors? 2 = q & k, 1 = q only
+                for (int v = 0; v < rotn; v++) {
+                    FloatTensor vec = v == 0 ? state.q : state.k; // the vector to rotate (query or key)
+                    float v0 = vec.getFloat(ic);
+                    float v1 = vec.getFloat(ic + dimHalf);
+                    vec.setFloat(ic, v0 * fcr - v1 * fci);
+                    vec.setFloat(ic + dimHalf, v0 * fci + v1 * fcr);
+                }
+            }
+
+            // save key,value at this time step (position) to our kv cache
+            state.k.copyTo(0, state.keyCache[l], position * kvDim, kvDim);
+            state.v.copyTo(0, state.valueCache[l], position * kvDim, kvDim);
+
+            int curLayer = l;
+
+            Parallel.parallelFor(0, config.numberOfHeads(), h -> {
+                int qOffset = h * headSize;
+
+                int attOffset = h * config.contextLength();
+
+                for (int t = 0; t <= position; t++) {
+                    int keyCacheOffset = /* loff + */ t * kvDim + (h / kvMul) * headSize;
+                    float score = state.q.dot(qOffset, state.keyCache[curLayer], keyCacheOffset, headSize);
+                    score /= sqrtHeadSize;
+                    state.att.setFloat(attOffset + t, score);
+                }
+
+                state.att.softmaxInPlace(attOffset, position + 1);
+
+                int xbOffset = h * headSize;
+                state.xb.fillInPlace(xbOffset, headSize, 0f);
+
+                for (int t = 0; t <= position; t++) {
+                    int vOffset = /* loff + */ t * kvDim + (h / kvMul) * headSize;
+                    float a = state.att.getFloat(attOffset + t);
+                    state.xb.saxpyInPlace(xbOffset, state.valueCache[curLayer], vOffset, headSize, a);
+                }
+            });
+
+            // final matmul to get the output of the attention
+            weights.wo[l].matmul(state.xb, state.xb2, dim, dim);
+
+            // residual connection back into x
+            state.x.addInPlace(state.xb2);
+
+            rmsnorm(state.xb, state.x, weights.rms_ffn_weight[l], 0, dim, config.rmsNormEps());
+
+            weights.wGateUp[l].matmul(state.xb, state.hb, 2 * config.hiddenDim(), dim);
+            copyChunk(state.hb, state.hbG, 2 * config.hiddenDim(), config.hiddenDim(), 2, 0);
+            copyChunk(state.hb, state.hbU, 2 * config.hiddenDim(), config.hiddenDim(), 2, 1);
+
+            state.hbG.mapInPlace(value -> value / (float) (1.0 + Math.exp(-value)));
+
+            state.hbU.multiplyInPlace(state.hbG);
+
+            weights.wDown[l].matmul(state.hbU, state.xb, dim, config.hiddenDim());
+
+            state.x.addInPlace(state.xb);
+        }
+
+        // final rmsnorm
+        rmsnorm(state.x, state.x, weights.rms_final_weight, 0, dim, config.rmsNormEps());
+
+        // classifier into logits
+        weights.wcls.matmul(state.x, state.logits, config.vocabularySize(), dim);
+
+        return state.logits;
+    }
+
+    static void copyChunk(FloatTensor in, FloatTensor out, int dim1In, int dim1Out, int nChunks, int chunkNo) {
+        assert (dim1In == dim1Out * nChunks);
+        final int startOffsetInDim1 = chunkNo * dim1Out;
+        Parallel.parallelFor(0, dim1Out, i -> {
+            out.setFloat(i, in.getFloat(startOffsetInDim1 + i));
+        });
+    }
+
     /**
      * Performs the initial embedding lookup and triggers the TornadoVM accelerated forward pass for an LLM token.
      *
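Note on the FFN step above: Phi-3 uses a fused gate/up projection (wGateUp), which copyChunk splits into two halves before the SiLU activation and element-wise product. Below is a minimal, self-contained sketch of that split-and-activate step on plain float arrays; the class name, array names, and sizes are illustrative assumptions, not code from this commit.

import java.util.Arrays;

public final class SwigluSketch {

    // Mirrors copyChunk(in, out, 2 * hiddenDim, hiddenDim, 2, chunkNo): copies the
    // chunkNo-th hiddenDim-sized slice of the fused buffer into out.
    static void copyChunk(float[] in, float[] out, int dim1Out, int chunkNo) {
        System.arraycopy(in, chunkNo * dim1Out, out, 0, dim1Out);
    }

    public static void main(String[] args) {
        int hiddenDim = 4;                         // illustrative size
        float[] hb = new float[2 * hiddenDim];     // stands in for the fused gate|up output of wGateUp
        for (int i = 0; i < hb.length; i++) {
            hb[i] = i - 3.5f;                      // dummy activations
        }

        float[] gate = new float[hiddenDim];       // stands in for state.hbG
        float[] up = new float[hiddenDim];         // stands in for state.hbU
        copyChunk(hb, gate, hiddenDim, 0);         // first half  -> gate
        copyChunk(hb, up, hiddenDim, 1);           // second half -> up

        // SwiGLU: silu(gate) * up, matching hbG.mapInPlace(...) followed by hbU.multiplyInPlace(hbG)
        for (int i = 0; i < hiddenDim; i++) {
            float silu = gate[i] / (float) (1.0 + Math.exp(-gate[i]));
            up[i] = silu * up[i];
        }
        System.out.println(Arrays.toString(up));   // 'up' now holds the FFN activation fed to wDown
    }
}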

src/main/java/com/example/inference/InferenceEngine.java

Lines changed: 151 additions & 15 deletions
@@ -9,6 +9,7 @@
 import com.example.tornadovm.TornadoVMMasterPlan;
 import uk.ac.manchester.tornado.api.types.arrays.FloatArray;
 
+import java.io.ByteArrayOutputStream;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Set;
@@ -18,8 +19,7 @@
  * Main entry point for LLM token generation.
  *
  * <p>
- * Orchestrates the complete inference process: ingests prompt tokens, then generates
- * new tokens until a stop condition is met. Supports both CPU and GPU execution.
+ * Orchestrates the complete inference process: ingests prompt tokens, then generates new tokens until a stop condition is met. Supports both CPU and GPU execution.
  * </p>
  *
  * <p>
@@ -42,19 +42,26 @@ private InferenceEngine() {
      * LLM generation entry point, ingest prompt tokens and generates new tokens.
      *
      * <p>
-     * All prompt tokens are ingested first, then inference starts, until a stop token is found.
-     * The returned tokens only include generated/inferred tokens.
+     * All prompt tokens are ingested first, then inference starts, until a stop token is found. The returned tokens only include generated/inferred tokens.
      *
-     * @param model model to run inference (including weights, configuration, tokenizer ...)
-     * @param state state of the model e.g. key/value caches ... this is mutated by this call
-     * @param startPosition start prompt ingestion + inference at this position in the context e.g. useful if state was kept across calls (chained generation). 0 implies run with no previous context.
-     * @param promptTokens prompt tokens to ingest, all the prompt tokens will be ingested, given there's enough capacity left in the context
-     * @param stopTokens set of tokens that abort generation during inference, stop tokens do not affect prompt ingestion
-     * @param maxTokens maximum number of tokens (can go up to {@link Configuration#contextLength context length}
-     *            if this value is negative or greater than {@link Configuration#contextLength context length}
-     * @param sampler {@link Sampler strategy} used to select tokens
-     * @param echo debugging flag, prints ALL, prompt and inferred tokens, to {@link System#err stderr}
-     * @param onTokenGenerated callback, if non-null, it's called every time a token is inferred e.g. it's not called when ingesting prompt tokens
+     * @param model
+     *            model to run inference (including weights, configuration, tokenizer ...)
+     * @param state
+     *            state of the model e.g. key/value caches ... this is mutated by this call
+     * @param startPosition
+     *            start prompt ingestion + inference at this position in the context e.g. useful if state was kept across calls (chained generation). 0 implies run with no previous context.
+     * @param promptTokens
+     *            prompt tokens to ingest, all the prompt tokens will be ingested, given there's enough capacity left in the context
+     * @param stopTokens
+     *            set of tokens that abort generation during inference, stop tokens do not affect prompt ingestion
+     * @param maxTokens
+     *            maximum number of tokens (can go up to {@link Configuration#contextLength context length} if this value is negative or greater than {@link Configuration#contextLength context length}
+     * @param sampler
+     *            {@link Sampler strategy} used to select tokens
+     * @param echo
+     *            debugging flag, prints ALL, prompt and inferred tokens, to {@link System#err stderr}
+     * @param onTokenGenerated
+     *            callback, if non-null, it's called every time a token is inferred e.g. it's not called when ingesting prompt tokens
      * @return list of generated/inferred tokens, including the stop token, if any e.g. does not include any token from the prompt
      */
     public static List<Integer> generateTokensLlama(Model model, State state, int startPosition, List<Integer> promptTokens, Set<Integer> stopTokens, int maxTokens, Sampler sampler, boolean echo,
@@ -214,6 +221,60 @@ public static List<Integer> generateTokensQwen3(Model model, State state, int st
         return generatedTokens;
     }
 
+    public static List<Integer> generateTokensPhi3(Model model, State state, int startPosition, List<Integer> promptTokens, Set<Integer> stopTokens, int maxTokens, Sampler sampler, boolean echo,
+            IntConsumer onTokenGenerated) {
+
+        long startNanos = System.nanoTime();
+        if (maxTokens < 0 || model.configuration().contextLength() < maxTokens) {
+            maxTokens = model.configuration().contextLength();
+        }
+        List<Integer> generatedTokens = new ArrayList<>(maxTokens);
+        int token = state.latestToken; // BOS?
+        int nextToken;
+        int promptIndex = 0;
+        ByteArrayOutputStream baos = new ByteArrayOutputStream(5);
+        for (int position = startPosition; position < maxTokens; ++position) {
+
+            model.forward(state, token, position);
+            if (promptIndex < promptTokens.size()) {
+                // Force-pick token from prompt.
+                nextToken = promptTokens.get(promptIndex++);
+                if (echo) {
+                    System.out.println("NextToken: " + nextToken);
+                    String decoded = model.tokenizer().decode(List.of(nextToken));
+                    System.err.print(Tokenizer.replaceControlCharacters(model.tokenizer().decode(List.of(nextToken))));
+                }
+            } else {
+                nextToken = sampler.sampleToken(state.logits);
+                if (echo) {
+                    // log inferred token
+                    System.err.print(Tokenizer.replaceControlCharacters(model.tokenizer().decode(List.of(nextToken))));
+                }
+                generatedTokens.add(nextToken);
+                if (onTokenGenerated != null) {
+                    onTokenGenerated.accept(nextToken);
+                }
+                if (stopTokens.contains(nextToken)) {
+                    break;
+                }
+            }
+            state.latestToken = token = nextToken;
+            if (position == 2000) {
+                break;
+            }
+        }
+
+        // Calculate and print performance metrics
+        long endNanos = System.nanoTime();
+        double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
+        int totalTokens = promptIndex + generatedTokens.size();
+
+        LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
+
+        return generatedTokens;
+
+    }
+
     public static List<Integer> generateTokensGPULlama(Model model, State state, int startPosition, List<Integer> promptTokens, Set<Integer> stopTokens, int maxTokens, Sampler sampler, boolean echo,
             IntConsumer onTokenGenerated, TornadoVMMasterPlan tornadoVMPlan) {
         // === Setup and Initialization ===
@@ -395,4 +456,79 @@ public static List<Integer> generateTokensGPUQwen3(Model model, State state, int
 
         return generatedTokens;
     }
-}
+
+    public static List<Integer> generateTokensGPUPhi3(Model model, State state, int startPosition, List<Integer> promptTokens, Set<Integer> stopTokens, int maxTokens, Sampler sampler, boolean echo,
+            IntConsumer onTokenGenerated, TornadoVMMasterPlan tornadoVMPlan) {
+        // Start timing the whole process
+        long startNanos = System.nanoTime();
+        long inferenceStartNanos = 0;
+
+        // Validate and adjust maxTokens if necessary
+        if (maxTokens < 0 || model.configuration().contextLength() < maxTokens) {
+            maxTokens = model.configuration().contextLength();
+        }
+
+        // Storage for generated tokens
+        List<Integer> generatedTokens = new ArrayList<>();
+
+        // Initialize token variables
+        int currentToken = state.latestToken;
+        int nextToken;
+        int promptIndex = 0;
+        int pos = startPosition;
+
+        while (pos < maxTokens) {
+            // GPU Forward Pass
+            FloatArray logits = InferenceCore.forwardTornadoVM(model, state, currentToken, pos, tornadoVMPlan);
+
+            // Handle token processing
+            if (promptIndex < promptTokens.size()) {
+                // We're still processing the prompt tokens
+                nextToken = promptTokens.get(promptIndex++);
+                if (echo) {
+                    System.err.print(Tokenizer.replaceControlCharacters(model.tokenizer().decode(List.of(nextToken))));
+                }
+            } else {
+                // Mark the start of actual generation (after prompt processing)
+                if (inferenceStartNanos == 0) {
+                    inferenceStartNanos = System.nanoTime();
+                }
+
+                // Sample the next token
+                nextToken = sampler.sampleToken(logits);
+
+                // Output the token if echo is enabled
+                if (echo) {
+                    System.err.print(Tokenizer.replaceControlCharacters(model.tokenizer().decode(List.of(nextToken))));
+                }
+
+                // Track the generated token
+                generatedTokens.add(nextToken);
+
+                // Notify via callback if provided
+                if (onTokenGenerated != null) {
+                    onTokenGenerated.accept(nextToken);
+                }
+
+                // Check for stop condition
+                if (stopTokens.contains(nextToken)) {
+                    break;
+                }
+            }
+
+            // Update for next iteration
+            currentToken = nextToken;
+            state.latestToken = currentToken;
+            pos++;
+        }
+
+        // Calculate and print performance metrics
+        long endNanos = System.nanoTime();
+        double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
+        int totalTokens = promptIndex + generatedTokens.size();
+
+        LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
+
+        return generatedTokens;
+    }
+}
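As a rough usage note, generateTokensPhi3 and generateTokensGPUPhi3 above follow the same shape as the other generators: force-feed the prompt tokens first, then keep sampled tokens until a stop token or the token budget is reached. A stripped-down, self-contained sketch of that control flow with a stubbed forward-and-sample step follows; the stub, the assumed BOS id, and the token values are illustrative assumptions, not APIs from this repository.

import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.function.IntUnaryOperator;

public final class GenerationLoopSketch {

    // Mirrors the control flow of generateTokensPhi3: ingest prompt tokens first,
    // then collect sampled tokens until a stop token or the token budget is hit.
    static List<Integer> generate(List<Integer> promptTokens, Set<Integer> stopTokens,
                                  int maxTokens, IntUnaryOperator forwardAndSample) {
        List<Integer> generated = new ArrayList<>();
        int token = 1;                                        // assumed BOS id, illustrative only
        int promptIndex = 0;
        for (int position = 0; position < maxTokens; position++) {
            int sampled = forwardAndSample.applyAsInt(token); // stands in for model.forward + sampler.sampleToken
            int next;
            if (promptIndex < promptTokens.size()) {
                next = promptTokens.get(promptIndex++);       // force-pick token from prompt
            } else {
                next = sampled;
                generated.add(next);
                if (stopTokens.contains(next)) {
                    break;                                    // stop token ends generation
                }
            }
            token = next;                                     // feed the chosen token back in
        }
        return generated;                                     // only inferred tokens, as in the real method
    }

    public static void main(String[] args) {
        // Dummy "model": proposes 20 after seeing 12, then the (assumed) stop token 7, otherwise 42.
        IntUnaryOperator dummy = t -> t == 12 ? 20 : (t == 20 ? 7 : 42);
        System.out.println(generate(List.of(10, 11, 12), Set.of(7), 16, dummy)); // prints [20, 7]
    }
}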
