beehive-lab
diff --git a/‎src/main/java/com/example/LlamaApp.java‎
Lines changed: 10 additions & 147 deletions b/‎src/main/java/com/example/LlamaApp.java‎
Lines changed: 10 additions & 147 deletions
diff --git a/‎src/main/java/com/example/aot/AOT.java‎
Lines changed: 4 additions & 3 deletions b/‎src/main/java/com/example/aot/AOT.java‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎src/main/java/com/example/auxiliary/format/ChatFormat.java‎
Lines changed: 7 additions & 0 deletions b/‎src/main/java/com/example/auxiliary/format/ChatFormat.java‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎src/main/java/com/example/auxiliary/ChatFormat.java‎ renamed to ‎src/main/java/com/example/auxiliary/format/LlamaChatFormat.java‎
Lines changed: 17 additions & 15 deletions b/‎src/main/java/com/example/auxiliary/ChatFormat.java‎ renamed to ‎src/main/java/com/example/auxiliary/format/LlamaChatFormat.java‎
Lines changed: 17 additions & 15 deletions
diff --git a/‎src/main/java/com/example/auxiliary/format/MistralChatFormat.java‎
Lines changed: 73 additions & 0 deletions b/‎src/main/java/com/example/auxiliary/format/MistralChatFormat.java‎
Lines changed: 73 additions & 0 deletions
@@ -1,25 +1,18 @@
 package com.example;
 
 import com.example.aot.AOT;
-import com.example.auxiliary.ChatFormat;
 import com.example.core.model.tensor.FloatTensor;
 import com.example.inference.CategoricalSampler;
 import com.example.inference.Sampler;
 import com.example.inference.ToppSampler;
-import com.example.inference.engine.impl.Llama;
+import com.example.inference.engine.impl.Model;
 import com.example.inference.engine.impl.Options;
 import com.example.loader.weights.ModelLoader;
 import com.example.loader.weights.State;
 import com.example.tornadovm.FloatArrayUtils;
-import com.example.tornadovm.TornadoVMMasterPlan;
 import uk.ac.manchester.tornado.api.types.arrays.FloatArray;
 
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Scanner;
-import java.util.Set;
-import java.util.function.IntConsumer;
 import java.util.random.RandomGenerator;
 import java.util.random.RandomGeneratorFactory;
 
@@ -115,156 +108,26 @@ static Sampler selectSampler(int vocabularySize, float temperature, float topp,
         return sampler;
     }
 
-    static void runInteractive(Llama model, Sampler sampler, Options options) {
-        State state = null;
-        List<Integer> conversationTokens = new ArrayList<>();
-        ChatFormat chatFormat = new ChatFormat(model.tokenizer());
-        conversationTokens.add(chatFormat.beginOfText);
-        if (options.systemPrompt() != null) {
-            conversationTokens.addAll(chatFormat.encodeMessage(new ChatFormat.Message(ChatFormat.Role.SYSTEM, options.systemPrompt())));
-        }
-        int startPosition = 0;
-        Scanner in = new Scanner(System.in);
-
-        // Initialize TornadoVM plan once at the beginning if GPU path is enabled
-        TornadoVMMasterPlan tornadoVMPlan = null;
-
-        try {
-            while (true) {
-                System.out.print("> ");
-                System.out.flush();
-                String userText = in.nextLine();
-                if (List.of("quit", "exit").contains(userText)) {
-                    break;
-                }
-                if (state == null) {
-                    state = model.createNewState();
-                }
-
-                if (USE_TORNADOVM && tornadoVMPlan == null) {
-                    tornadoVMPlan = TornadoVMMasterPlan.initializeTornadoVMPlan(state, model);
-                }
-
-                conversationTokens.addAll(chatFormat.encodeMessage(new ChatFormat.Message(ChatFormat.Role.USER, userText)));
-                conversationTokens.addAll(chatFormat.encodeHeader(new ChatFormat.Message(ChatFormat.Role.ASSISTANT, "")));
-                Set<Integer> stopTokens = chatFormat.getStopTokens();
-
-                List<Integer> responseTokens;
-                IntConsumer tokenConsumer = token -> {
-                    if (options.stream()) {
-                        if (!model.tokenizer().isSpecialToken(token)) {
-                            System.out.print(model.tokenizer().decode(List.of(token)));
-                        }
-                    }
-                };
-
-                // Choose between GPU and CPU path based on configuration
-                if (USE_TORNADOVM) {
-                    // GPU path using TornadoVM
-                    responseTokens = Llama.generateTokensGPU(model, state, startPosition, conversationTokens.subList(startPosition, conversationTokens.size()), stopTokens, options.maxTokens(),
-                            sampler, options.echo(), options.stream() ? tokenConsumer : null, tornadoVMPlan);
-                } else {
-                    // CPU path
-                    responseTokens = Llama.generateTokens(model, state, startPosition, conversationTokens.subList(startPosition, conversationTokens.size()), stopTokens, options.maxTokens(), sampler,
-                            options.echo(), tokenConsumer);
-                }
-
-                // Include stop token in the prompt history, but not in the response displayed to the user.
-                conversationTokens.addAll(responseTokens);
-                startPosition = conversationTokens.size();
-                Integer stopToken = null;
-                if (!responseTokens.isEmpty() && stopTokens.contains(responseTokens.getLast())) {
-                    stopToken = responseTokens.getLast();
-                    responseTokens.removeLast();
-                }
-                if (!options.stream()) {
-                    String responseText = model.tokenizer().decode(responseTokens);
-                    System.out.println(responseText);
-                }
-                if (stopToken == null) {
-                    System.err.println("\n Ran out of context length...\n Increase context length with by passing to llama-tornado --max-tokens XXX");
-                    break;
-                }
-                System.out.print("\n");
-
-                // Optionally print performance metrics after each response
-                if (SHOW_PERF_INTERACTIVE) {
-                    Llama.LastRunMetrics.printMetrics();
-                }
-            }
-        } finally {
-            // Clean up TornadoVM resources when exiting the chat loop
-            if (USE_TORNADOVM && tornadoVMPlan != null) {
-                try {
-                    tornadoVMPlan.freeTornadoExecutionPlan();
-                } catch (Exception e) {
-                    System.err.println("Error while cleaning up TornadoVM resources: " + e.getMessage());
-                }
-            }
-        }
-    }
-
-    static void runInstructOnce(Llama model, Sampler sampler, Options options) {
-        State state = model.createNewState();
-        ChatFormat chatFormat = new ChatFormat(model.tokenizer());
-        TornadoVMMasterPlan tornadoVMPlan = null;
-
-        List<Integer> promptTokens = new ArrayList<>();
-        promptTokens.add(chatFormat.beginOfText);
-        if (options.systemPrompt() != null) {
-            promptTokens.addAll(chatFormat.encodeMessage(new ChatFormat.Message(ChatFormat.Role.SYSTEM, options.systemPrompt())));
-        }
-        promptTokens.addAll(chatFormat.encodeMessage(new ChatFormat.Message(ChatFormat.Role.USER, options.prompt())));
-        promptTokens.addAll(chatFormat.encodeHeader(new ChatFormat.Message(ChatFormat.Role.ASSISTANT, "")));
-        List<Integer> responseTokens;
-
-        // Define the token consumer
-        IntConsumer tokenConsumer = token -> {
-            if (options.stream()) {
-                if (!model.tokenizer().isSpecialToken(token)) {
-                    System.out.print(model.tokenizer().decode(List.of(token)));
-                }
-            }
-        };
+    // runInteractive is now an instance method on Model (no longer static here):
+    // call model.runInteractive(sampler, options).
 
-        Set<Integer> stopTokens = chatFormat.getStopTokens();
-        if (USE_TORNADOVM) {
-            tornadoVMPlan = TornadoVMMasterPlan.initializeTornadoVMPlan(state, model);
-            // Call generateTokensGPU without the token consumer parameter
-            responseTokens = Llama.generateTokensGPU(model, state, 0, promptTokens, stopTokens, options.maxTokens(), sampler, options.echo(), options.stream() ? tokenConsumer : null, tornadoVMPlan);
-        } else {
-            // CPU path still uses the token consumer
-            responseTokens = Llama.generateTokens(model, state, 0, promptTokens, stopTokens, options.maxTokens(), sampler, options.echo(), tokenConsumer);
-        }
-
-        if (!responseTokens.isEmpty() && stopTokens.contains(responseTokens.getLast())) {
-            responseTokens.removeLast();
-        }
-        if (!options.stream()) {
-            String responseText = model.tokenizer().decode(responseTokens);
-            System.out.println(responseText);
-        }
-
-        Llama.LastRunMetrics.printMetrics();
-
-        if (tornadoVMPlan != null) {
-            tornadoVMPlan.freeTornadoExecutionPlan();
-        }
-    }
+    // runInstructOnce is now an instance method on Model (no longer static here):
+    // call model.runInstructOnce(sampler, options).
 
     public static void main(String[] args) throws IOException {
+        // Entry point: parse options, obtain a model (pre-loaded via AOT or read from the model path),
+        // choose a sampler from the options, then run in interactive or single-prompt mode.
         Options options = Options.parseOptions(args);
-        Llama model;
+        Model model;
         if (USE_AOT) {
             model = AOT.tryUsePreLoaded(options.modelPath(), options.maxTokens());
         } else {
             model = ModelLoader.loadModel(options.modelPath(), options.maxTokens(), true);
         }
-        Sampler sampler = selectSampler(model.configuration().vocabularySize, options.temperature(), options.topp(), options.seed());
+        if (model == null) {
+            // AOT.tryUsePreLoaded returns null when no pre-loaded model is stored. An `assert`
+            // is skipped unless the JVM runs with -ea, so fall back to a full load explicitly
+            // instead of hitting a NullPointerException on the next line.
+            model = ModelLoader.loadModel(options.modelPath(), options.maxTokens(), true);
+        }
+        Sampler sampler = selectSampler(model.configuration().vocabularySize(), options.temperature(), options.topp(), options.seed());
         if (options.interactive()) {
-            runInteractive(model, sampler, options);
+            model.runInteractive(sampler, options);
         } else {
-            runInstructOnce(model, sampler, options);
+            model.runInstructOnce(sampler, options);
         }
     }
 }
 
@@ -3,8 +3,9 @@
 import com.example.auxiliary.Timer;
 import com.example.core.model.GGUF;
 import com.example.core.model.tensor.GGMLTensorEntry;
-import com.example.inference.engine.impl.Llama;
+import com.example.inference.engine.impl.Model;
 import com.example.inference.engine.impl.Options;
+import com.example.inference.engine.impl.llama.Llama;
 import com.example.loader.weights.ModelLoader;
 import com.example.loader.weights.Weights;
 
@@ -45,7 +46,7 @@ private static PartialModel preLoadGGUF(String modelPath) {
             try (FileChannel fileChannel = FileChannel.open(path, StandardOpenOption.READ)) {
                 return new PartialModel(
                         path.getFileName().toString(),
-                        ModelLoader.loadModel(fileChannel, gguf, Options.DEFAULT_MAX_TOKENS, false),
+                        ModelLoader.loadLlamaModel(fileChannel, gguf, Options.DEFAULT_MAX_TOKENS, false),
                         gguf.getTensorDataOffset(),
                         gguf.getTensorInfos()
                 );
@@ -60,7 +61,7 @@ private static PartialModel preLoadGGUF(String modelPath) {
      * The file name (base name) must match with the preloaded file name.
      * No checksum/hash is checked for performance reasons.
      */
-    public static com.example.inference.engine.impl.Llama tryUsePreLoaded(Path modelPath, int contextLength) throws IOException {
+    public static Model tryUsePreLoaded(Path modelPath, int contextLength) throws IOException {
         AOT.PartialModel preLoaded = AOT.PRELOADED_GGUF;
         if (preLoaded == null) {
             return null; // no pre-loaded model stored
 
@@ -0,0 +1,7 @@
+package com.example.auxiliary.format;
+
+import com.example.tokenizer.impl.Tokenizer;
+
+/**
+ * Common supertype for model-specific chat prompt formats.
+ *
+ * <p>Declares no members; it only gives the concrete formats a shared type.
+ */
+public interface ChatFormat {
+
+}
@@ -1,15 +1,16 @@
-package com.example.auxiliary;
+package com.example.auxiliary.format;
 
+import com.example.tokenizer.impl.LlamaTokenizer;
 import com.example.tokenizer.impl.Tokenizer;
 
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
-public class ChatFormat {
+public class LlamaChatFormat implements ChatFormat {
 
-    final Tokenizer tokenizer;
+    final LlamaTokenizer tokenizer;
     public final int beginOfText;
     final int endHeader;
     final int startHeader;
@@ -18,7 +19,7 @@ public class ChatFormat {
     final int endOfMessage;
     final Set<Integer> stopTokens;
 
-    public ChatFormat(Tokenizer tokenizer) {
+    public LlamaChatFormat(LlamaTokenizer tokenizer) {
         this.tokenizer = tokenizer;
         Map<String, Integer> specialTokens = this.tokenizer.getSpecialTokens();
         this.beginOfText = specialTokens.get("<|begin_of_text|>");
@@ -38,42 +39,43 @@ public Set<Integer> getStopTokens() {
         return stopTokens;
     }
 
-    public List<Integer> encodeHeader(ChatFormat.Message message) {
+    public List<Integer> encodeHeader(LlamaChatFormat.Message message) {
+        // Produces: startHeader id, the role name's tokens, endHeader id, then the tokens for "\n".
+        // The tokenizer field is already declared as LlamaTokenizer, so it is used directly;
+        // no cast or local alias is needed.
         List<Integer> tokens = new ArrayList<>();
         tokens.add(startHeader);
         tokens.addAll(this.tokenizer.encodeAsList(message.role().name()));
         tokens.add(endHeader);
         tokens.addAll(this.tokenizer.encodeAsList("\n"));
         return tokens;
     }
 
-    public List<Integer> encodeMessage(ChatFormat.Message message) {
+    public List<Integer> encodeMessage(LlamaChatFormat.Message message) {
+        // Encodes one full turn: the role header, the message content with surrounding
+        // whitespace stripped, and finally the endOfTurn token id.
         List<Integer> tokens = this.encodeHeader(message);
         tokens.addAll(this.tokenizer.encodeAsList(message.content().strip()));
         tokens.add(endOfTurn);
         return tokens;
     }
 
-    public List<Integer> encodeDialogPrompt(boolean appendAssistantTurn, List<ChatFormat.Message> dialog) {
+    public List<Integer> encodeDialogPrompt(boolean appendAssistantTurn, List<LlamaChatFormat.Message> dialog) {
+        // Encodes a whole dialog: the beginOfText id, then every message in order via encodeMessage.
+        // When appendAssistantTurn is true, an assistant header with no content is appended last.
         List<Integer> tokens = new ArrayList<>();
         tokens.add(beginOfText);
-        for (ChatFormat.Message message : dialog) {
+        for (LlamaChatFormat.Message message : dialog) {
             tokens.addAll(this.encodeMessage(message));
         }
         if (appendAssistantTurn) {
             // Add the start of an assistant message for the model to complete.
-            tokens.addAll(this.encodeHeader(new ChatFormat.Message(ChatFormat.Role.ASSISTANT, "")));
+            tokens.addAll(this.encodeHeader(new LlamaChatFormat.Message(LlamaChatFormat.Role.ASSISTANT, "")));
         }
         return tokens;
     }
 
-    public record Message(ChatFormat.Role role, String content) {
+    public record Message(LlamaChatFormat.Role role, String content) {
+        // One chat turn: the speaker's role and the text of the turn. No extra members.
     }
 
     public record Role(String name) {
-        public static ChatFormat.Role SYSTEM = new ChatFormat.Role("system");
-        public static ChatFormat.Role USER = new ChatFormat.Role("user");
-        public static ChatFormat.Role ASSISTANT = new ChatFormat.Role("assistant");
+        public static LlamaChatFormat.Role SYSTEM = new LlamaChatFormat.Role("system");
+        public static LlamaChatFormat.Role USER = new LlamaChatFormat.Role("user");
+        public static LlamaChatFormat.Role ASSISTANT = new LlamaChatFormat.Role("assistant");
 
         @Override
         public String toString() {
 
@@ -0,0 +1,73 @@
+package com.example.auxiliary.format;
+
+import com.example.tokenizer.impl.MistralTokenizer;
+
+import java.util.*;
+
+/**
+ * Chat prompt format for Mistral models.
+ *
+ * <p>Resolves the ids of the special tokens it uses from the tokenizer once, at construction,
+ * and uses them to wrap user text in instruction markers or to build a fill-in-the-middle prompt.
+ */
+public class MistralChatFormat implements ChatFormat {
+
+    protected final MistralTokenizer tokenizer;
+    protected final int unknownToken;
+    protected final int beginOfText;
+    protected final int endOfText;
+    protected final int beginOfInstruction;
+    protected final int endOfInstruction;
+    protected final int toolCalls;
+    protected final int beginOfAvailableTools;
+    protected final int endOfAvailableTools;
+    protected final int beginOfToolResults;
+    protected final int endOfToolResults;
+    protected final int prefix;
+    protected final int middle;
+    protected final int suffix;
+
+    /**
+     * Looks up every special token id used by this format.
+     *
+     * @param tokenizer tokenizer whose special-token table supplies the ids
+     * @throws NullPointerException if a required special token is absent from the table;
+     *         the message names the missing token
+     */
+    public MistralChatFormat(MistralTokenizer tokenizer) {
+        this.tokenizer = tokenizer;
+        Map<String, Integer> specialTokens = this.tokenizer.getSpecialTokens();
+        this.unknownToken = requireToken(specialTokens, "<unk>");
+        this.beginOfText = requireToken(specialTokens, "<s>");
+        this.endOfText = requireToken(specialTokens, "</s>");
+        this.beginOfInstruction = requireToken(specialTokens, "[INST]");
+        this.endOfInstruction = requireToken(specialTokens, "[/INST]");
+        this.toolCalls = requireToken(specialTokens, "[TOOL_CALLS]");
+        this.beginOfAvailableTools = requireToken(specialTokens, "[AVAILABLE_TOOLS]");
+        this.endOfAvailableTools = requireToken(specialTokens, "[/AVAILABLE_TOOLS]");
+        this.beginOfToolResults = requireToken(specialTokens, "[TOOL_RESULTS]");
+        this.endOfToolResults = requireToken(specialTokens, "[/TOOL_RESULTS]");
+        // Only Codestral supports FIM tokens; other vocabularies fall back to the unknown-token id.
+        this.prefix = specialTokens.getOrDefault("[PREFIX]", unknownToken);
+        this.suffix = specialTokens.getOrDefault("[SUFFIX]", unknownToken);
+        this.middle = specialTokens.getOrDefault("[MIDDLE]", unknownToken);
+    }
+
+    /**
+     * Returns the id mapped to {@code name}.
+     *
+     * <p>Unboxing a missing entry directly would throw a NullPointerException with no hint of
+     * which token was absent; this keeps the exception type but names the token.
+     */
+    private static int requireToken(Map<String, Integer> specialTokens, String name) {
+        return Objects.requireNonNull(specialTokens.get(name), () -> "Missing special token: " + name);
+    }
+
+    /** Returns the id of the {@code <s>} token. */
+    public int getBeginOfText() {
+        return beginOfText;
+    }
+
+    /** Returns the token ids that end generation: only the {@code </s>} id. */
+    public Set<Integer> getStopTokens() {
+        return Set.of(endOfText);
+    }
+
+    /**
+     * Encodes a user message, optionally wrapped in instruction markers.
+     *
+     * @param userMessage text to encode after stripping surrounding whitespace; {@code null}
+     *                    contributes no tokens
+     * @param addHeader   when true, the {@code [INST]} id is emitted first
+     * @param addFooter   when true, the {@code [/INST]} id is emitted last
+     * @return a new mutable list of token ids
+     */
+    public List<Integer> encodeMessage(String userMessage, boolean addHeader, boolean addFooter) {
+        List<Integer> tokens = new ArrayList<>();
+        if (addHeader) {
+            tokens.add(this.beginOfInstruction);
+        }
+        if (userMessage != null) {
+            tokens.addAll(this.tokenizer.encodeAsList(userMessage.strip()));
+        }
+        if (addFooter) {
+            tokens.add(endOfInstruction);
+        }
+        return tokens;
+    }
+
+    /**
+     * Builds a fill-in-the-middle prompt: suffix marker, suffix tokens, prefix marker, prefix tokens.
+     *
+     * <p>If the vocabulary has no FIM tokens, both markers are the unknown-token id
+     * (see the constructor).
+     *
+     * @param prefix text that precedes the gap
+     * @param suffix text that follows the gap
+     * @return a new mutable list of token ids
+     */
+    public List<Integer> encodeFillInTheMiddle(String prefix, String suffix) {
+        List<Integer> tokens = new ArrayList<>();
+        // Empty set passed only to satisfy the signature of tokenizer.encode.
+        final Set<String> noSpecialTokens = Collections.emptySet();
+        tokens.add(this.suffix);
+        tokens.addAll(tokenizer.encode(suffix, noSpecialTokens));
+        tokens.add(this.prefix);
+        tokens.addAll(tokenizer.encode(prefix, noSpecialTokens));
+        return tokens;
+    }
+}