@@ -1,5 +1,8 @@
 package com.example.model;
 
+import com.example.aux.LastRunMetrics;
+import com.example.aux.format.ChatFormat;
+import com.example.inference.InferenceEngine;
 import com.example.inference.sampler.Sampler;
 import com.example.Options;
 import com.example.loader.weights.ModelLoader.ModelType;
@@ -8,10 +11,15 @@
 import com.example.tokenizer.impl.Tokenizer;
 import com.example.tornadovm.TornadoVMMasterPlan;
 
+import java.util.ArrayList;
 import java.util.List;
+import java.util.Scanner;
 import java.util.Set;
 import java.util.function.IntConsumer;
 
+import static com.example.LlamaApp.SHOW_PERF_INTERACTIVE;
+import static com.example.LlamaApp.USE_TORNADOVM;
+
 public interface Model {
     Configuration configuration();
     Tokenizer tokenizer();
@@ -22,6 +30,105 @@ public interface Model {
     State createNewState();
     State createNewState(int batchsize);
 
-    void runInteractive(Sampler sampler, Options options);
+    /**
+     * Model-agnostic default implementation for interactive mode.
+     * @param sampler strategy used to select each generated token
+     * @param options runtime options (system prompt, streaming, max tokens, echo)
+     */
+    default void runInteractive(Sampler sampler, Options options) {
+        State state = null;
+        List<Integer> conversationTokens = new ArrayList<>();
+
+        ChatFormat chatFormat = ChatFormat.create(tokenizer());
+        conversationTokens.add(chatFormat.getBeginOfText());
+
+        if (options.systemPrompt() != null) {
+            conversationTokens.addAll(chatFormat.encodeMessage(new ChatFormat.Message(ChatFormat.Role.SYSTEM, options.systemPrompt())));
+        }
+
+        int startPosition = 0;
+        Scanner in = new Scanner(System.in);
+
+        // Initialize TornadoVM plan once at the beginning if GPU path is enabled
+        TornadoVMMasterPlan tornadoVMPlan = null;
+
+        try {
+            while (true) {
+                System.out.print("> ");
+                System.out.flush();
+                String userText = in.nextLine();
+                if (List.of("quit", "exit").contains(userText)) {
+                    break;
+                }
+                if (state == null) {
+                    // State allocation can take some time for large context sizes,
+                    // allocate the model state only after printing the user '>' prompt.
+                    state = createNewState();
+                }
+
+                if (USE_TORNADOVM && tornadoVMPlan == null) {
+                    tornadoVMPlan = TornadoVMMasterPlan.initializeTornadoVMPlan(state, this);
+                }
+
+                conversationTokens.addAll(chatFormat.encodeMessage(new ChatFormat.Message(ChatFormat.Role.USER, userText)));
+                conversationTokens.addAll(chatFormat.encodeHeader(new ChatFormat.Message(ChatFormat.Role.ASSISTANT, "")));
+                Set<Integer> stopTokens = chatFormat.getStopTokens();
+
+                List<Integer> responseTokens;
+                IntConsumer tokenConsumer = token -> {
+                    if (options.stream()) {
+                        if (tokenizer().shouldDisplayToken(token)) {
+                            System.out.print(tokenizer().decode(List.of(token)));
+                        }
+                    }
+                };
+
+                // Choose between GPU and CPU path based on configuration
+                if (USE_TORNADOVM) {
+                    // GPU path using TornadoVM
+                    responseTokens = InferenceEngine.generateTokensGPU(this, state, startPosition,
+                            conversationTokens.subList(startPosition, conversationTokens.size()), stopTokens,
+                            options.maxTokens(), sampler, options.echo(), options.stream() ? tokenConsumer : null, tornadoVMPlan);
+                } else {
+                    // CPU path
+                    responseTokens = InferenceEngine.generateTokens(this, state, startPosition,
+                            conversationTokens.subList(startPosition, conversationTokens.size()), stopTokens,
+                            options.maxTokens(), sampler, options.echo(), tokenConsumer);
+                }
+
+                // Include stop token in the prompt history, but not in the response displayed to the user.
+                conversationTokens.addAll(responseTokens);
+                startPosition = conversationTokens.size();
+                Integer stopToken = null;
+                if (!responseTokens.isEmpty() && stopTokens.contains(responseTokens.getLast())) {
+                    stopToken = responseTokens.getLast();
+                    responseTokens.removeLast();
+                }
+                if (!options.stream()) {
+                    String responseText = tokenizer().decode(responseTokens);
+                    System.out.println(responseText);
+                }
+                if (stopToken == null) {
+                    System.err.println("\nRan out of context length...\nIncrease the context length by passing --max-tokens XXX to llama-tornado");
+                    break;
+                }
+                System.out.print("\n");
+
+                // Optionally print performance metrics after each response
+                if (SHOW_PERF_INTERACTIVE) {
+                    LastRunMetrics.printMetrics();
+                }
+            }
+        } finally {
+            // Clean up TornadoVM resources when exiting the chat loop
+            if (USE_TORNADOVM && tornadoVMPlan != null) {
+                try {
+                    tornadoVMPlan.freeTornadoExecutionPlan();
+                } catch (Exception e) {
+                    System.err.println("Error while cleaning up TornadoVM resources: " + e.getMessage());
+                }
+            }
+        }
+    }
     void runInstructOnce(Sampler sampler, Options options);
 }
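
Usage sketch (editor's note, not part of the commit): once a concrete Model, a Sampler, and parsed Options exist, the new default method is the entire interactive entry point. The helper below is hypothetical; the real wiring, including how the Model and Options are constructed, lives in LlamaApp and is not shown in this diff.

    // Hypothetical helper, for illustration only; uses the project's own
    // Model, Sampler, and Options types from the diff above.
    static void chat(Model model, Sampler sampler, Options options) {
        // Streaming output, the GPU/CPU path choice, and TornadoVM cleanup
        // are all handled inside the default runInteractive() implementation.
        model.runInteractive(sampler, options); // type "quit" or "exit" at the '>' prompt to leave
    }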