Skip to content

Commit 89d5aa3

Browse files
mikepapadim and orionpapadakis
authored and committed
Refactor TornadoVM integration and extend Mistral configuration.
Introduced `kvDim` and `kvMul` methods in `Configuration` and `MistralConfiguration` to enhance model configuration flexibility. Refactored TornadoVM classes to generalize handling of different models by replacing `Llama`-specific types with `Model` interface. Streamlined token generation logic to support conditional GPU execution with TornadoVM.
1 parent 9b68bf7 commit 89d5aa3

File tree

5 files changed

+55
-29
lines changed

5 files changed

+55
-29
lines changed

src/main/java/com/example/inference/engine/impl/Configuration.java

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -32,4 +32,7 @@ public interface Configuration {
3232
/** Size of each attention head (derived from dim / numberOfHeads) */
3333
int headSize();
3434

35+
int kvDim();
36+
37+
int kvMul();
3538
}

src/main/java/com/example/inference/engine/impl/mistral/Mistral.java

Lines changed: 33 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -20,14 +20,13 @@
2020
import java.util.Set;
2121
import java.util.function.IntConsumer;
2222

23+
import static com.example.LlamaApp.USE_TORNADOVM;
24+
2325
/**
2426
* Llama class in mistral.java
2527
*/
2628
public record Mistral(MistralConfiguration configuration, Tokenizer tokenizer, Weights weights) implements Model {
2729

28-
/* For explicit use */
29-
private MistralTokenizer getAsMistralTokenizer() { return (MistralTokenizer) tokenizer; }
30-
3130
static void rmsnorm(FloatTensor out, FloatTensor x, FloatBuffer weight, int size, float rmsNormEps) {
3231
// calculate sum of squares
3332
float ss = x.reduce(0, size, 0f, (acc, xi) -> acc + xi * xi);
@@ -163,15 +162,20 @@ static FloatTensor forward(Mistral model, State state, int token, int position)
163162
return state.logits;
164163
}
165164

165+
/* For explicit use */
166+
private MistralTokenizer getAsMistralTokenizer() {
167+
return (MistralTokenizer) tokenizer;
168+
}
169+
166170
@Override
167-
public List<Integer> generateTokensGPU(State state, int startPosition, List<Integer> promptTokens, Set<Integer> stopTokens,
168-
int maxTokens, Sampler sampler, boolean echo, IntConsumer onTokenGenerated, TornadoVMMasterPlan tornadoVMPlan) {
171+
public List<Integer> generateTokensGPU(State state, int startPosition, List<Integer> promptTokens, Set<Integer> stopTokens, int maxTokens, Sampler sampler, boolean echo,
172+
IntConsumer onTokenGenerated, TornadoVMMasterPlan tornadoVMPlan) {
169173
throw new UnsupportedOperationException("Mistral.generateTokensGPU is not implemented yet");
170174
}
171175

172176
@Override
173-
public List<Integer> generateTokens(State state, int startPosition, List<Integer> promptTokens, Set<Integer> stopTokens,
174-
int maxTokens, Sampler sampler, boolean echo, IntConsumer onTokenGenerated) {
177+
public List<Integer> generateTokens(State state, int startPosition, List<Integer> promptTokens, Set<Integer> stopTokens, int maxTokens, Sampler sampler, boolean echo,
178+
IntConsumer onTokenGenerated) {
175179
long startNanos = System.nanoTime();
176180
if (maxTokens < 0 || configuration.contextLength() < maxTokens) {
177181
maxTokens = configuration.contextLength();
@@ -248,14 +252,15 @@ public void runInteractive(Sampler sampler, Options options) {
248252
}
249253
conversationTokens.addAll(chatFormat.encodeMessage(userText, true, true));
250254
Set<Integer> stopTokens = chatFormat.getStopTokens();
251-
List<Integer> responseTokens = generateTokens(state, startPosition, conversationTokens.subList(startPosition, conversationTokens.size()), stopTokens, options.maxTokens(), sampler, options.echo(), token -> {
252-
if (options.stream()) {
253-
int tokenType = mistralTokenizer.getTokenType(token);
254-
if (tokenType == 1 || tokenType == 6) {
255-
System.out.print(mistralTokenizer.decode(List.of(token)));
256-
}
257-
}
258-
});
255+
List<Integer> responseTokens = generateTokens(state, startPosition, conversationTokens.subList(startPosition, conversationTokens.size()), stopTokens, options.maxTokens(), sampler,
256+
options.echo(), token -> {
257+
if (options.stream()) {
258+
int tokenType = mistralTokenizer.getTokenType(token);
259+
if (tokenType == 1 || tokenType == 6) {
260+
System.out.print(mistralTokenizer.decode(List.of(token)));
261+
}
262+
}
263+
});
259264
// Include stop token in the prompt history, but not in the response displayed to the user.
260265
conversationTokens.addAll(responseTokens);
261266
startPosition = conversationTokens.size();
@@ -288,15 +293,26 @@ public void runInstructOnce(Sampler sampler, Options options) {
288293
promptTokens.addAll(chatFormat.encodeMessage(options.prompt(), true, true));
289294
}
290295

296+
List<Integer> responseTokens;
291297
Set<Integer> stopTokens = chatFormat.getStopTokens();
292-
List<Integer> responseTokens = generateTokens(state, 0, promptTokens, stopTokens, options.maxTokens(), sampler, options.echo(), token -> {
298+
IntConsumer tokenConsumer = token -> {
293299
if (options.stream()) {
294300
int tokenType = mistralTokenizer.getTokenType(token);
295301
if (tokenType == 1 || tokenType == 6) {
296302
System.out.print(mistralTokenizer.decode(List.of(token)));
297303
}
298304
}
299-
});
305+
};
306+
307+
TornadoVMMasterPlan tornadoVMPlan = null;
308+
if (USE_TORNADOVM) {
309+
tornadoVMPlan = TornadoVMMasterPlan.initializeTornadoVMPlan(state, this);
310+
// Call generateTokensGPU without the token consumer parameter
311+
responseTokens = generateTokensGPU(state, 0, promptTokens, stopTokens, options.maxTokens(), sampler, options.echo(), options.stream() ? tokenConsumer : null, tornadoVMPlan);
312+
} else {
313+
responseTokens = generateTokens(state, 0, promptTokens, stopTokens, options.maxTokens(), sampler, options.echo(), tokenConsumer);
314+
}
315+
300316
if (!responseTokens.isEmpty() && stopTokens.contains(responseTokens.getLast())) {
301317
responseTokens.removeLast();
302318
}

src/main/java/com/example/inference/engine/impl/mistral/MistralConfiguration.java

Lines changed: 10 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2,11 +2,16 @@
22

33
import com.example.inference.engine.impl.Configuration;
44

5-
public record MistralConfiguration(
6-
int dim, int hiddenDim, int numberOfLayers, int numberOfHeads,
7-
int numberOfKeyValueHeads, int vocabularySize, int contextLength,
8-
boolean sharedWeights, float rmsNormEps, float ropeTheta
9-
) implements Configuration {
5+
public record MistralConfiguration(int dim, int hiddenDim, int numberOfLayers, int numberOfHeads, int numberOfKeyValueHeads, int vocabularySize, int contextLength, boolean sharedWeights,
6+
float rmsNormEps, float ropeTheta) implements Configuration {
7+
8+
public int kvDim() {
9+
return dim * numberOfKeyValueHeads / numberOfHeads;
10+
}
11+
12+
public int kvMul() {
13+
return numberOfHeads / numberOfKeyValueHeads;
14+
}
1015

1116
public int headSize() {
1217
return dim / numberOfHeads;

src/main/java/com/example/tornadovm/TornadoVMLayerPlanner.java

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,8 @@
11
package com.example.tornadovm;
22

33
import com.example.auxiliary.Tuple2;
4+
import com.example.inference.engine.impl.Configuration;
5+
import com.example.inference.engine.impl.Model;
46
import com.example.inference.engine.impl.llama.LlamaConfiguration;
57
import com.example.inference.engine.impl.llama.Llama;
68
import com.example.loader.weights.State;
@@ -49,7 +51,7 @@ public class TornadoVMLayerPlanner {
4951
private static final int THREAD_SCALE_FOR_LOGITS = 8;
5052

5153
private final State state;
52-
private final LlamaConfiguration config;
54+
private final Configuration config;
5355
private final Weights weights;
5456
private final KernelContext context;
5557

@@ -61,7 +63,7 @@ public class TornadoVMLayerPlanner {
6163
* @param model
6264
* The Llama model instance containing configuration and weights
6365
*/
64-
public TornadoVMLayerPlanner(State state, Llama model) {
66+
public TornadoVMLayerPlanner(State state, Model model) {
6567
this.state = state;
6668
this.config = model.configuration();
6769
this.weights = model.weights();

src/main/java/com/example/tornadovm/TornadoVMMasterPlan.java

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
11
package com.example.tornadovm;
22

33
import com.example.auxiliary.Tuple2;
4-
import com.example.inference.engine.impl.llama.LlamaConfiguration;
5-
import com.example.inference.engine.impl.llama.Llama;
4+
import com.example.inference.engine.impl.Configuration;
5+
import com.example.inference.engine.impl.Model;
66
import com.example.loader.weights.State;
77
import uk.ac.manchester.tornado.api.GridScheduler;
88
import uk.ac.manchester.tornado.api.ImmutableTaskGraph;
@@ -17,12 +17,12 @@ public class TornadoVMMasterPlan {
1717
private static final boolean ENABLE_TORNADOVM_INIT_TIME = Boolean.parseBoolean(System.getProperty("llama.EnableTimingForTornadoVMInit", "False"));
1818

1919
private final State state;
20-
private final LlamaConfiguration config;
20+
private final Configuration config;
2121
public GridScheduler scheduler;
2222
public TornadoExecutionPlan executionPlan;
2323
List<ImmutableTaskGraph> taskGraphs;
2424

25-
public TornadoVMMasterPlan(State state, Llama model, boolean isNvidia) {
25+
public TornadoVMMasterPlan(State state, Model model, boolean isNvidia) {
2626
TornadoVMLayerPlanner tornadoVMLayerPlanner = new TornadoVMLayerPlanner(state, model);
2727
Tuple2<List<ImmutableTaskGraph>, GridScheduler> tornadoVMPlan = isNvidia ? tornadoVMLayerPlanner.setupTornadoForwardPlanLayered() : tornadoVMLayerPlanner.setupTornadoForwardPlanLayeredNonNvidia();
2828
this.taskGraphs = tornadoVMPlan.getFirst();
@@ -43,7 +43,7 @@ public TornadoVMMasterPlan(State state, Llama model, boolean isNvidia) {
4343
* @param model The Llama model instance
4444
* @return The initialized TornadoVMMasterPlan ready for inference
4545
*/
46-
public static TornadoVMMasterPlan initializeTornadoVMPlan(State state, Llama model) {
46+
public static TornadoVMMasterPlan initializeTornadoVMPlan(State state, Model model) {
4747
// Initialize timing variables outside conditional blocks to avoid scope issues
4848
long startTime = System.nanoTime();
4949
long planCreationTime = 0;

0 commit comments

Comments (0)