
Commit 613062c

Generalize instruct mode implementation for Llama and Mistral
1 parent 420a119 commit 613062c

File tree: 6 files changed (+61, -134 lines)


src/main/java/com/example/inference/InferenceCore.java

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 package com.example.inference;
 
-import com.example.aux.Parallel;
+import com.example.auxiliary.Parallel;
 import com.example.core.model.tensor.FloatTensor;
 import com.example.loader.weights.State;
 import com.example.loader.weights.Weights;

src/main/java/com/example/inference/InferenceEngine.java

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 package com.example.inference;
 
-import com.example.aux.LastRunMetrics;
+import com.example.auxiliary.LastRunMetrics;
 import com.example.inference.sampler.Sampler;
 import com.example.loader.weights.State;
 import com.example.model.Configuration;

src/main/java/com/example/loader/weights/ModelLoader.java

Lines changed: 1 addition & 1 deletion
@@ -168,7 +168,7 @@ public static Mistral loadMistralModel(FileChannel fileChannel, GGUF gguf, int c
 
         Weights weights = null;
         if (loadWeights) {
-            Map<String, GGMLTensorEntry> tensorEntries = GGUF.loadTensorsWithMapping(fileChannel, gguf.getTensorDataOffset(), gguf.getTensorInfos());
+            Map<String, GGMLTensorEntry> tensorEntries = GGUF.loadTensors(fileChannel, gguf.getTensorDataOffset(), gguf.getTensorInfos());
             weights = loadWeights(tensorEntries, config);
         }
         return new Mistral(config, tokenizer, weights);
src/main/java/com/example/model/Model.java

Lines changed: 58 additions & 3 deletions
@@ -1,7 +1,7 @@
 package com.example.model;
 
-import com.example.aux.LastRunMetrics;
-import com.example.aux.format.ChatFormat;
+import com.example.auxiliary.LastRunMetrics;
+import com.example.auxiliary.format.ChatFormat;
 import com.example.inference.InferenceEngine;
 import com.example.inference.sampler.Sampler;
 import com.example.Options;
@@ -130,5 +130,60 @@ default void runInteractive(Sampler sampler, Options options) {
             }
         }
     }
-    void runInstructOnce(Sampler sampler, Options options);
+
+    /**
+     * Model-agnostic default implementation of instruct mode.
+     * @param sampler the sampler used to pick each next token
+     * @param options run options (prompt, system prompt, streaming, max tokens, echo)
+     */
+    default void runInstructOnce(Sampler sampler, Options options) {
+        State state = createNewState();
+        ChatFormat chatFormat = ChatFormat.create(tokenizer());
+        TornadoVMMasterPlan tornadoVMPlan = null;
+
+        List<Integer> promptTokens = new ArrayList<>();
+        promptTokens.add(chatFormat.getBeginOfText());
+
+        if (options.systemPrompt() != null) {
+            promptTokens.addAll(chatFormat.encodeMessage(new ChatFormat.Message(ChatFormat.Role.SYSTEM, options.systemPrompt())));
+        }
+        promptTokens.addAll(chatFormat.encodeMessage(new ChatFormat.Message(ChatFormat.Role.USER, options.prompt())));
+        promptTokens.addAll(chatFormat.encodeHeader(new ChatFormat.Message(ChatFormat.Role.ASSISTANT, "")));
+
+        List<Integer> responseTokens;
+
+        IntConsumer tokenConsumer = token -> {
+            if (options.stream()) {
+                if (tokenizer().shouldDisplayToken(token)) {
+                    System.out.print(tokenizer().decode(List.of(token)));
+                }
+            }
+        };
+
+        Set<Integer> stopTokens = chatFormat.getStopTokens();
+
+        if (USE_TORNADOVM) {
+            tornadoVMPlan = TornadoVMMasterPlan.initializeTornadoVMPlan(state, this);
+            // GPU path: the token consumer is passed only when streaming
+            responseTokens = InferenceEngine.generateTokensGPU(this, state, 0, promptTokens, stopTokens,
+                    options.maxTokens(), sampler, options.echo(), options.stream() ? tokenConsumer : null, tornadoVMPlan);
+        } else {
+            responseTokens = InferenceEngine.generateTokens(this, state, 0, promptTokens, stopTokens,
+                    options.maxTokens(), sampler, options.echo(), tokenConsumer);
+        }
+
+        if (!responseTokens.isEmpty() && stopTokens.contains(responseTokens.getLast())) {
+            responseTokens.removeLast();
+        }
+        if (!options.stream()) {
+            String responseText = tokenizer().decode(responseTokens);
+            System.out.println(responseText);
+        }
+
+        LastRunMetrics.printMetrics();
+
+        if (tornadoVMPlan != null) {
+            tornadoVMPlan.freeTornadoExecutionPlan();
+        }
+    }
 }
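
The generalized default above hinges on two abstractions: ChatFormat.create(tokenizer()) picks the per-model prompt format, and tokenizer().shouldDisplayToken(token) filters streamed output. Neither implementation is part of this diff. A minimal sketch of what the factory could look like, assuming it dispatches on the concrete tokenizer types behind the per-model formats seen in the deletions below (hypothetical code, not the commit's actual implementation):

    // Hypothetical sketch -- the real ChatFormat.create is not shown in this commit.
    static ChatFormat create(Tokenizer tokenizer) {
        if (tokenizer instanceof LlamaTokenizer llamaTokenizer) {
            return new LlamaChatFormat(llamaTokenizer);       // Llama-style headers and stop tokens
        } else if (tokenizer instanceof MistralTokenizer mistralTokenizer) {
            return new MistralChatFormat(mistralTokenizer);   // Mistral [INST] framing
        }
        throw new IllegalArgumentException("Unsupported tokenizer: " + tokenizer.getClass().getSimpleName());
    }

This sketch also assumes LlamaChatFormat and MistralChatFormat now implement a shared ChatFormat interface exposing getBeginOfText(), encodeMessage(...), encodeHeader(...), and getStopTokens(), since the default method calls all four through that type. With those two hooks in place, the Llama and Mistral classes below can drop their overrides entirely.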
src/main/java/com/example/model/llama/Llama.java

Lines changed: 0 additions & 65 deletions
@@ -1,24 +1,11 @@
 package com.example.model.llama;
 
-import com.example.auxiliary.LastRunMetrics;
-import com.example.auxiliary.format.LlamaChatFormat;
-import com.example.inference.InferenceEngine;
-import com.example.inference.sampler.Sampler;
 import com.example.model.Model;
-import com.example.Options;
 import com.example.loader.weights.ModelLoader;
 import com.example.loader.weights.State;
 import com.example.loader.weights.Weights;
 import com.example.tokenizer.impl.LlamaTokenizer;
 import com.example.tokenizer.impl.Tokenizer;
-import com.example.tornadovm.TornadoVMMasterPlan;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Set;
-import java.util.function.IntConsumer;
-
-import static com.example.LlamaApp.USE_TORNADOVM;
 
 public record Llama(LlamaConfiguration configuration, Tokenizer tokenizer, Weights weights) implements Model {
     private static final int BATCH_SIZE = Integer.getInteger("llama.BatchSize", 16);
@@ -45,57 +32,5 @@ public State createNewState(int batchsize) {
         return state;
     }
 
-    @Override
-    public void runInstructOnce(Sampler sampler, Options options) {
-        State state = createNewState();
-        LlamaChatFormat chatFormat = new LlamaChatFormat(getAsLlamaTokenizer());
-        TornadoVMMasterPlan tornadoVMPlan = null;
-
-        List<Integer> promptTokens = new ArrayList<>();
-        promptTokens.add(chatFormat.getBeginOfText());
-
-        if (options.systemPrompt() != null) {
-            promptTokens.addAll(chatFormat.encodeMessage(new LlamaChatFormat.Message(LlamaChatFormat.Role.SYSTEM, options.systemPrompt())));
-        }
-        promptTokens.addAll(chatFormat.encodeMessage(new LlamaChatFormat.Message(LlamaChatFormat.Role.USER, options.prompt())));
-        promptTokens.addAll(chatFormat.encodeHeader(new LlamaChatFormat.Message(LlamaChatFormat.Role.ASSISTANT, "")));
-        List<Integer> responseTokens;
-
-        // Define the token consumer
-        IntConsumer tokenConsumer = token -> {
-            if (options.stream()) {
-                if (!tokenizer.isSpecialToken(token)) {
-                    System.out.print(tokenizer.decode(List.of(token)));
-                }
-            }
-        };
-
-        Set<Integer> stopTokens = chatFormat.getStopTokens();
-        if (USE_TORNADOVM) {
-            tornadoVMPlan = TornadoVMMasterPlan.initializeTornadoVMPlan(state, this);
-            // Call generateTokensGPU without the token consumer parameter
-            responseTokens = InferenceEngine.generateTokensGPU(this, state, 0, promptTokens, stopTokens,
-                    options.maxTokens(), sampler, options.echo(), options.stream() ? tokenConsumer : null, tornadoVMPlan);
-        } else {
-            // CPU path still uses the token consumer
-            responseTokens = InferenceEngine.generateTokens(this, state, 0, promptTokens, stopTokens,
-                    options.maxTokens(), sampler, options.echo(), tokenConsumer);
-        }
-
-        if (!responseTokens.isEmpty() && stopTokens.contains(responseTokens.getLast())) {
-            responseTokens.removeLast();
-        }
-        if (!options.stream()) {
-            String responseText = tokenizer.decode(responseTokens);
-            System.out.println(responseText);
-        }
-
-        LastRunMetrics.printMetrics();
-
-        if (tornadoVMPlan != null) {
-            tornadoVMPlan.freeTornadoExecutionPlan();
-        }
-    }
-
 }
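
The deleted Llama override filtered streamed tokens inline with !tokenizer.isSpecialToken(token); the shared default instead calls tokenizer().shouldDisplayToken(token). A plausible sketch of that hook on LlamaTokenizer, assuming it simply wraps the same check (the method itself is not shown in this commit):

    // Hypothetical sketch -- not part of this diff.
    @Override
    public boolean shouldDisplayToken(int token) {
        // Same filter the deleted token consumer applied inline
        return !isSpecialToken(token);
    }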

src/main/java/com/example/model/mistral/Mistral.java

Lines changed: 0 additions & 63 deletions
@@ -1,24 +1,11 @@
 package com.example.model.mistral;
 
-import com.example.auxiliary.LastRunMetrics;
-import com.example.auxiliary.format.MistralChatFormat;
-import com.example.inference.InferenceEngine;
-import com.example.inference.sampler.Sampler;
 import com.example.model.Model;
-import com.example.Options;
 import com.example.loader.weights.ModelLoader;
 import com.example.loader.weights.State;
 import com.example.loader.weights.Weights;
 import com.example.tokenizer.impl.MistralTokenizer;
 import com.example.tokenizer.impl.Tokenizer;
-import com.example.tornadovm.TornadoVMMasterPlan;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Set;
-import java.util.function.IntConsumer;
-
-import static com.example.LlamaApp.USE_TORNADOVM;
 
 /**
  * Llama class in mistral.java
@@ -45,54 +32,4 @@ public State createNewState(int batchsize) {
         return state;
     }
 
-    @Override
-    public void runInstructOnce(Sampler sampler, Options options) {
-        State state = createNewState();
-        MistralChatFormat chatFormat = new MistralChatFormat(getAsMistralTokenizer());
-        TornadoVMMasterPlan tornadoVMPlan = null;
-
-        List<Integer> promptTokens = new ArrayList<>();
-        promptTokens.add(chatFormat.getBeginOfText());
-
-        if (options.suffix() != null) {
-            promptTokens.addAll(chatFormat.encodeFillInTheMiddle(options.prompt(), options.suffix()));
-        } else {
-            promptTokens.addAll(chatFormat.encodeMessage(options.prompt(), true, true));
-        }
-
-        List<Integer> responseTokens;
-        Set<Integer> stopTokens = chatFormat.getStopTokens();
-        IntConsumer tokenConsumer = token -> {
-            if (options.stream()) {
-                int tokenType = getAsMistralTokenizer().getTokenType(token);
-                if (tokenType == 1 || tokenType == 6) {
-                    System.out.print(tokenizer.decode(List.of(token)));
-                }
-            }
-        };
-
-        if (USE_TORNADOVM) {
-            tornadoVMPlan = TornadoVMMasterPlan.initializeTornadoVMPlan(state, this);
-            // Call generateTokensGPU without the token consumer parameter
-            responseTokens = InferenceEngine.generateTokensGPU(this, state, 0, promptTokens, stopTokens,
-                    options.maxTokens(), sampler, options.echo(), options.stream() ? tokenConsumer : null, tornadoVMPlan);
-        } else {
-            responseTokens = InferenceEngine.generateTokens(this, state, 0, promptTokens, stopTokens,
-                    options.maxTokens(), sampler, options.echo(), tokenConsumer);
-        }
-
-        if (!responseTokens.isEmpty() && stopTokens.contains(responseTokens.getLast())) {
-            responseTokens.removeLast();
-        }
-        if (!options.stream()) {
-            String responseText = tokenizer.decode(responseTokens);
-            System.out.println(responseText);
-        }
-
-        LastRunMetrics.printMetrics();
-
-        if (tornadoVMPlan != null) {
-            tornadoVMPlan.freeTornadoExecutionPlan();
-        }
-    }
 }
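
Likewise, the deleted Mistral override printed only tokens whose type was 1 or 6; under the generalized default, that check would move behind MistralTokenizer.shouldDisplayToken. A plausible sketch, assuming it reuses the getTokenType call from the deleted consumer (the method itself is not shown in this commit):

    // Hypothetical sketch -- not part of this diff.
    @Override
    public boolean shouldDisplayToken(int token) {
        int tokenType = getTokenType(token);
        return tokenType == 1 || tokenType == 6;   // the types the deleted consumer displayed
    }

Note that the deleted override also routed options.suffix() through encodeFillInTheMiddle(...), a branch the generalized default does not carry.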
