Commit d7b237d

Merge pull request #45 from mikepapadim/feat/api

Refactor to work with Langchain4J as API

2 parents efbe261 + 03591c5

File tree

10 files changed (+191 -66 lines)

src/main/java/org/beehive/gpullama3/LlamaApp.java

Lines changed: 46 additions & 33 deletions
@@ -1,12 +1,13 @@
 package org.beehive.gpullama3;
 
 import org.beehive.gpullama3.aot.AOT;
+import org.beehive.gpullama3.auxiliary.LastRunMetrics;
 import org.beehive.gpullama3.core.model.tensor.FloatTensor;
 import org.beehive.gpullama3.inference.sampler.CategoricalSampler;
 import org.beehive.gpullama3.inference.sampler.Sampler;
 import org.beehive.gpullama3.inference.sampler.ToppSampler;
-import org.beehive.gpullama3.model.loader.ModelLoader;
 import org.beehive.gpullama3.model.Model;
+import org.beehive.gpullama3.model.loader.ModelLoader;
 import org.beehive.gpullama3.tornadovm.FloatArrayUtils;
 import uk.ac.manchester.tornado.api.types.arrays.FloatArray;
 
@@ -18,7 +19,6 @@ public class LlamaApp {
     // Configuration flags for hardware acceleration and optimizations
     public static final boolean USE_VECTOR_API = Boolean.parseBoolean(System.getProperty("llama.VectorAPI", "true")); // Enable Java Vector API for CPU acceleration
     public static final boolean USE_AOT = Boolean.parseBoolean(System.getProperty("llama.AOT", "false")); // Use Ahead-of-Time compilation
-    public static final boolean USE_TORNADOVM = Boolean.parseBoolean(System.getProperty("use.tornadovm", "false")); // Use TornadoVM for GPU acceleration
     public static final boolean SHOW_PERF_INTERACTIVE = Boolean.parseBoolean(System.getProperty("llama.ShowPerfInteractive", "true")); // Show performance metrics in interactive mode
 
     /**
@@ -36,27 +36,29 @@ public class LlamaApp {
      * <p>The method handles both {@link FloatTensor} and {@link FloatArray} logits types
      * to support both CPU and GPU execution paths.</p>
      *
-     * @param vocabularySize The size of the model's vocabulary
-     * @param temperature A value controlling randomness in sampling:
-     *                    <ul>
-     *                    <li>0.0f: No randomness (greedy sampling)</li>
-     *                    <li>1.0f: Standard sampling from unmodified distribution</li>
-     *                    <li>&lt;1.0f: More deterministic (sharper distribution)</li>
-     *                    <li>&gt;1.0f: More random (flatter distribution)</li>
-     *                    </ul>
-     * @param topp The cumulative probability threshold for nucleus sampling (0.0-1.0).
-     *             <ul>
-     *             <li>Values ≤0 or ≥1: Disables top-p sampling</li>
-     *             <li>Values in (0,1): Restricts sampling to tokens comprising the top p probability mass</li>
-     *             </ul>
-     * @param rngSeed Seed value for the random number generator to ensure reproducibility
-     *
-     * @return A configured {@link Sampler} that implements the selected sampling strategy
-     *         and handles both tensor and array-based logits
-     *
-     * @throws IllegalArgumentException if logits are of an unsupported type
+     * @param vocabularySize
+     *            The size of the model's vocabulary
+     * @param temperature
+     *            A value controlling randomness in sampling:
+     *            <ul>
+     *            <li>0.0f: No randomness (greedy sampling)</li>
+     *            <li>1.0f: Standard sampling from unmodified distribution</li>
+     *            <li>&lt;1.0f: More deterministic (sharper distribution)</li>
+     *            <li>&gt;1.0f: More random (flatter distribution)</li>
+     *            </ul>
+     * @param topp
+     *            The cumulative probability threshold for nucleus sampling (0.0-1.0).
+     *            <ul>
+     *            <li>Values ≤0 or ≥1: Disables top-p sampling</li>
+     *            <li>Values in (0,1): Restricts sampling to tokens comprising the top p probability mass</li>
+     *            </ul>
+     * @param rngSeed
+     *            Seed value for the random number generator to ensure reproducibility
+     * @return A configured {@link Sampler} that implements the selected sampling strategy and handles both tensor and array-based logits
+     * @throws IllegalArgumentException
+     *             if logits are of an unsupported type
      */
-    static Sampler selectSampler(int vocabularySize, float temperature, float topp, long rngSeed) {
+    public static Sampler selectSampler(int vocabularySize, float temperature, float topp, long rngSeed) {
         Sampler sampler;
         if (temperature == 0.0f) {
             // greedy argmax sampling: take the token with the highest probability
@@ -109,14 +111,16 @@ static Sampler selectSampler(int vocabularySize, float temperature, float topp,
     /**
      * Loads the language model based on the given options.
      * <p>
-     * If Ahead-of-Time (AOT) mode is enabled, attempts to use a pre-loaded compiled model.
-     * Otherwise, loads the model from the specified path using the model loader.
+     * If Ahead-of-Time (AOT) mode is enabled, attempts to use a pre-loaded compiled model. Otherwise, loads the model from the specified path using the model loader.
      * </p>
      *
-     * @param options the parsed CLI options containing model path and max token limit
+     * @param options
+     *            the parsed CLI options containing model path and max token limit
     * @return the loaded {@link Model} instance
-     * @throws IOException if the model fails to load
-     * @throws IllegalStateException if AOT loading is enabled but the preloaded model is unavailable
+     * @throws IOException
+     *             if the model fails to load
+     * @throws IllegalStateException
+     *             if AOT loading is enabled but the preloaded model is unavailable
      */
     private static Model loadModel(Options options) throws IOException {
         if (USE_AOT) {
@@ -133,25 +137,34 @@ private static Sampler createSampler(Model model, Options options) {
         return selectSampler(model.configuration().vocabularySize(), options.temperature(), options.topp(), options.seed());
     }
 
+    private static void runSingleInstruction(Model model, Sampler sampler, Options options) {
+        String response = model.runInstructOnce(sampler, options);
+        System.out.println(response);
+        if (SHOW_PERF_INTERACTIVE) {
+            LastRunMetrics.printMetrics();
+        }
+    }
+
     /**
      * Entry point for running the LLaMA-based model with provided command-line arguments.
      *
     * <p>Initializes model options, loads the appropriate model (either AOT or on-demand),
-     * configures the sampler, and runs either in interactive or single-instruction mode
-     * based on the input options.</p>
+     * configures the sampler, and runs either in interactive or single-instruction mode based on the input options.</p>
      *
-     * @param args command-line arguments used to configure model path, temperature, seed, etc.
-     * @throws IOException if model loading or file operations fail.
+     * @param args
+     *            command-line arguments used to configure model path, temperature, seed, etc.
+     * @throws IOException
+     *             if model loading or file operations fail.
      */
-    public static void main(String[] args) throws IOException {
+    static void main(String[] args) throws IOException {
         Options options = Options.parseOptions(args);
         Model model = loadModel(options);
         Sampler sampler = createSampler(model, options);
 
         if (options.interactive()) {
             model.runInteractive(sampler, options);
         } else {
-            model.runInstructOnce(sampler, options);
+            runSingleInstruction(model, sampler, options);
         }
     }
 }
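
Since selectSampler is now public, embedders can build a sampler directly instead of going through main. A minimal sketch of the three sampling regimes described in the javadoc above (the vocabulary size and seed are illustrative, not project defaults):

import org.beehive.gpullama3.LlamaApp;
import org.beehive.gpullama3.inference.sampler.Sampler;

class SamplerSelectionSketch {
    public static void main(String[] args) {
        int vocabularySize = 32_000; // illustrative; real callers use model.configuration().vocabularySize()
        long seed = 42L;

        // temperature == 0.0f -> greedy argmax sampling (topp is irrelevant)
        Sampler greedy = LlamaApp.selectSampler(vocabularySize, 0.0f, 0.95f, seed);

        // topp in (0, 1) with non-zero temperature -> nucleus (top-p) sampling
        Sampler nucleus = LlamaApp.selectSampler(vocabularySize, 0.7f, 0.9f, seed);

        // topp <= 0 or >= 1 disables top-p -> categorical sampling over the full distribution
        Sampler categorical = LlamaApp.selectSampler(vocabularySize, 1.0f, 1.0f, seed);
    }
}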

src/main/java/org/beehive/gpullama3/Options.java

Lines changed: 34 additions & 6 deletions
@@ -4,13 +4,12 @@
 import java.nio.file.Path;
 import java.nio.file.Paths;
 
-public record Options(Path modelPath, String prompt, String systemPrompt, String suffix, boolean interactive,
-                      float temperature, float topp, long seed, int maxTokens, boolean stream, boolean echo) {
+public record Options(Path modelPath, String prompt, String systemPrompt, String suffix, boolean interactive, float temperature, float topp, long seed, int maxTokens, boolean stream, boolean echo,
+                      boolean useTornadovm) {
 
     public static final int DEFAULT_MAX_TOKENS = 1024;
 
     public Options {
-        require(modelPath != null, "Missing argument: --model <path> is required");
         require(interactive || prompt != null, "Missing argument: --prompt is required in --instruct mode e.g. --prompt \"Why is the sky blue?\"");
         require(0 <= temperature, "Invalid argument: --temperature must be non-negative");
         require(0 <= topp && topp <= 1, "Invalid argument: --top-p must be within [0, 1]");
@@ -25,6 +24,10 @@ static void require(boolean condition, String messageFormat, Object... args) {
         }
     }
 
+    private static boolean getDefaultTornadoVM() {
+        return Boolean.parseBoolean(System.getProperty("use.tornadovm", "false"));
+    }
+
     static void printUsage(PrintStream out) {
         out.println("Usage: jbang Llama3.java [options]");
         out.println();
@@ -44,19 +47,36 @@ static void printUsage(PrintStream out) {
         out.println();
     }
 
-    public static Options parseOptions(String[] args) {
+    public static Options getDefaultOptions() {
         String prompt = "Tell me a story with Java"; // Hardcoded for testing
         String systemPrompt = null;
         String suffix = null;
         float temperature = 0.1f;
         float topp = 0.95f;
         Path modelPath = null;
         long seed = System.nanoTime();
-        // Keep max context length small for low-memory devices.
         int maxTokens = DEFAULT_MAX_TOKENS;
         boolean interactive = false;
         boolean stream = true;
         boolean echo = false;
+        boolean useTornadoVM = getDefaultTornadoVM();
+
+        return new Options(modelPath, prompt, systemPrompt, suffix, interactive, temperature, topp, seed, maxTokens, stream, echo, useTornadoVM);
+    }
+
+    public static Options parseOptions(String[] args) {
+        String prompt = "Tell me a story with Java"; // Hardcoded for testing
+        String systemPrompt = null;
+        String suffix = null;
+        float temperature = 0.1f;
+        float topp = 0.95f;
+        Path modelPath = null;
+        long seed = System.nanoTime();
+        int maxTokens = DEFAULT_MAX_TOKENS;
+        boolean interactive = false;
+        boolean stream = false;
+        boolean echo = false;
+        Boolean useTornadovm = null; // null means not specified via command line
 
         for (int i = 0; i < args.length; i++) {
             String optionName = args[i];
@@ -90,11 +110,19 @@ public static Options parseOptions(String[] args) {
                         case "--max-tokens", "-n" -> maxTokens = Integer.parseInt(nextArg);
                         case "--stream" -> stream = Boolean.parseBoolean(nextArg);
                         case "--echo" -> echo = Boolean.parseBoolean(nextArg);
+                        case "--use-tornadovm" -> useTornadovm = Boolean.parseBoolean(nextArg);
                         default -> require(false, "Unknown option: %s", optionName);
                     }
                 }
             }
         }
-        return new Options(modelPath, prompt, systemPrompt, suffix, interactive, temperature, topp, seed, maxTokens, stream, echo);
+
+        require(modelPath != null, "Missing argument: --model <path> is required");
+
+        if (useTornadovm == null) {
+            useTornadovm = getDefaultTornadoVM();
+        }
+
+        return new Options(modelPath, prompt, systemPrompt, suffix, interactive, temperature, topp, seed, maxTokens, stream, echo, useTornadovm);
     }
 }
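
The net effect on configuration: an explicit --use-tornadovm flag now wins, and only when it is absent does parseOptions fall back to the use.tornadovm system property (default false), which previously drove the global USE_TORNADOVM constant. A sketch of both resolution paths (the model path is hypothetical):

import org.beehive.gpullama3.Options;

class TornadoFlagSketch {
    public static void main(String[] args) {
        // Explicit CLI flag wins, regardless of the system property.
        Options cli = Options.parseOptions(new String[] {
                "--model", "models/demo.gguf", // hypothetical path
                "--prompt", "Why is the sky blue?",
                "--use-tornadovm", "true"
        });
        System.out.println(cli.useTornadovm()); // true, from the flag

        // Without the flag, the use.tornadovm system property (default false) decides.
        System.setProperty("use.tornadovm", "true");
        Options prop = Options.parseOptions(new String[] {
                "--model", "models/demo.gguf",
                "--prompt", "Why is the sky blue?"
        });
        System.out.println(prop.useTornadovm()); // true, from the property
    }
}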

src/main/java/org/beehive/gpullama3/core/model/GGUF.java

Lines changed: 2 additions & 1 deletion
@@ -45,7 +45,8 @@ public static GGUF loadModel(Path modelPath) throws IOException {
     }
 
     // second check to make sure that nothing goes wrong during model loading
-    try (FileChannel fileChannel = FileChannel.open(modelPath); var ignored = Timer.log("Parse " + modelPath)) {
+    try (FileChannel fileChannel = FileChannel.open(modelPath);
+    ) {
         GGUF gguf = new GGUF();
         gguf.loadModelImpl(fileChannel);
         return gguf;

src/main/java/org/beehive/gpullama3/model/Model.java

Lines changed: 86 additions & 12 deletions
@@ -13,10 +13,10 @@
 import java.util.List;
 import java.util.Scanner;
 import java.util.Set;
+import java.util.function.Consumer;
 import java.util.function.IntConsumer;
 
 import static org.beehive.gpullama3.LlamaApp.SHOW_PERF_INTERACTIVE;
-import static org.beehive.gpullama3.LlamaApp.USE_TORNADOVM;
 
 public interface Model {
 
@@ -92,7 +92,7 @@ default void runInteractive(Sampler sampler, Options options) {
         Scanner in = new Scanner(System.in);
 
         // Initialize TornadoVM plan once at the beginning if GPU path is enabled
-        if (USE_TORNADOVM && tornadoVMPlan == null) {
+        if (Options.getDefaultOptions().useTornadovm() && tornadoVMPlan == null) {
             tornadoVMPlan = TornadoVMMasterPlan.initializeTornadoVMPlan(state, this);
         }
 
@@ -131,7 +131,7 @@ default void runInteractive(Sampler sampler, Options options) {
         };
 
         // Choose between GPU and CPU path based on configuration
-        if (USE_TORNADOVM) {
+        if (Options.getDefaultOptions().useTornadovm()) {
             // GPU path using TornadoVM
             responseTokens = generateTokensGPU(state, startPosition, conversationTokens.subList(startPosition, conversationTokens.size()), stopTokens, options.maxTokens(), sampler,
                     options.echo(), options.stream() ? tokenConsumer : null, tornadoVMPlan);
@@ -170,7 +170,7 @@ default void runInteractive(Sampler sampler, Options options) {
             }
         } finally {
             // Clean up TornadoVM resources when exiting the chat loop
-            if (USE_TORNADOVM && tornadoVMPlan != null) {
+            if (Options.getDefaultOptions().useTornadovm() && tornadoVMPlan != null) {
                 try {
                     tornadoVMPlan.freeTornadoExecutionPlan();
                 } catch (Exception e) {
@@ -185,7 +185,7 @@ default void runInteractive(Sampler sampler, Options options) {
      * @param sampler
      * @param options
      */
-    default void runInstructOnce(Sampler sampler, Options options) {
+    default String runInstructOnce(Sampler sampler, Options options) {
         State state = createNewState();
         ChatFormat chatFormat = chatFormat();
         TornadoVMMasterPlan tornadoVMPlan = null;
@@ -201,7 +201,7 @@ default void runInstructOnce(Sampler sampler, Options options) {
         }
 
         // Initialize TornadoVM plan once at the beginning if GPU path is enabled
-        if (USE_TORNADOVM && tornadoVMPlan == null) {
+        if (Options.getDefaultOptions().useTornadovm() && tornadoVMPlan == null) {
             tornadoVMPlan = TornadoVMMasterPlan.initializeTornadoVMPlan(state, this);
         }
 
@@ -231,9 +231,8 @@ default void runInstructOnce(Sampler sampler, Options options) {
 
         Set<Integer> stopTokens = chatFormat.getStopTokens();
 
-        if (USE_TORNADOVM) {
-            // GPU path using TornadoVM
-            // Call generateTokensGPU without the token consumer parameter
+        if (Options.getDefaultOptions().useTornadovm()) {
+            // GPU path using TornadoVM - Call generateTokensGPU without the token consumer parameter
             responseTokens = generateTokensGPU(state, 0, promptTokens, stopTokens, options.maxTokens(), sampler, options.echo(), options.stream() ? tokenConsumer : null, tornadoVMPlan);
         } else {
             // CPU path
@@ -243,19 +242,94 @@ default void runInstructOnce(Sampler sampler, Options options) {
         if (!responseTokens.isEmpty() && stopTokens.contains(responseTokens.getLast())) {
             responseTokens.removeLast();
         }
+
+        String responseText = "";
         if (!options.stream()) {
-            String responseText = tokenizer().decode(responseTokens);
+            responseText = tokenizer().decode(responseTokens);
             // Add the forced <think>\n prefix for non-streaming output
             if (shouldIncludeReasoning()) {
                 responseText = "<think>\n" + responseText;
             }
-            System.out.println(responseText);
         }
 
-        LastRunMetrics.printMetrics();
+        if (tornadoVMPlan != null) {
+            tornadoVMPlan.freeTornadoExecutionPlan();
+        }
+
+        return responseText;
+    }
+
+    default String runInstructOnceLangChain4J(Sampler sampler, Options options, Consumer<String> tokenCallback) {
+        State state = createNewState();
+        ChatFormat chatFormat = chatFormat();
+        TornadoVMMasterPlan tornadoVMPlan = null;
+
+        List<Integer> promptTokens = new ArrayList<>();
+
+        if (shouldAddBeginOfText()) {
+            promptTokens.add(chatFormat.getBeginOfText());
+        }
+
+        if (shouldAddSystemPrompt() && options.systemPrompt() != null) {
+            promptTokens.addAll(chatFormat.encodeMessage(new ChatFormat.Message(ChatFormat.Role.SYSTEM, options.systemPrompt())));
+        }
+
+        // Initialize TornadoVM plan once at the beginning if GPU path is enabled
+        if (Options.getDefaultOptions().useTornadovm() && tornadoVMPlan == null) {
+            tornadoVMPlan = TornadoVMMasterPlan.initializeTornadoVMPlan(state, this);
+        }
+
+        promptTokens.addAll(chatFormat.encodeMessage(new ChatFormat.Message(ChatFormat.Role.USER, options.prompt())));
+        promptTokens.addAll(chatFormat.encodeHeader(new ChatFormat.Message(ChatFormat.Role.ASSISTANT, "")));
+
+        if (shouldIncludeReasoning()) {
+            List<Integer> thinkStartTokens = tokenizer().encode("<think>\n", tokenizer().getSpecialTokens().keySet());
+            promptTokens.addAll(thinkStartTokens);
+
+            // If streaming, immediately output the think start
+            if (options.stream()) {
+                System.out.print("<think>\n");
+            }
+        }
+
+        List<Integer> responseTokens;
+
+        IntConsumer tokenConsumer = token -> {
+            if (tokenizer().shouldDisplayToken(token)) {
+                String piece = tokenizer().decode(List.of(token));
+                if (options.stream() && tokenCallback != null) {
+                    tokenCallback.accept(piece); // ✅ send to LangChain4j handler
+                }
+            }
+        };
+
+        Set<Integer> stopTokens = chatFormat.getStopTokens();
+
+        if (Options.getDefaultOptions().useTornadovm()) {
+            // GPU path using TornadoVM - Call generateTokensGPU without the token consumer parameter
+            responseTokens = generateTokensGPU(state, 0, promptTokens, stopTokens, options.maxTokens(), sampler, options.echo(), options.stream() ? tokenConsumer : null, tornadoVMPlan);
+        } else {
+            // CPU path
+            responseTokens = generateTokens(state, 0, promptTokens, stopTokens, options.maxTokens(), sampler, options.echo(), tokenConsumer);
+        }
+
+        if (!responseTokens.isEmpty() && stopTokens.contains(responseTokens.getLast())) {
+            responseTokens.removeLast();
+        }
+
+        String responseText = tokenizer().decode(responseTokens);
+
+        if (!options.stream()) {
+            responseText = tokenizer().decode(responseTokens);
+            if (shouldIncludeReasoning()) {
+                responseText = "<think>\n" + responseText;
+            }
+        }
 
         if (tornadoVMPlan != null) {
             tornadoVMPlan.freeTornadoExecutionPlan();
         }
+
+        return responseText;
     }
 }
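
runInstructOnce now returns the response text instead of printing it, and the new runInstructOnceLangChain4J variant additionally forwards each decoded token piece to a Consumer<String> while streaming — the hook a LangChain4j adapter would attach to. A minimal bridge sketch, assuming a Model has already been loaded (the LangChain4j-side handler types are outside this diff; the Consumer<String> stands in for one):

import java.util.function.Consumer;

import org.beehive.gpullama3.LlamaApp;
import org.beehive.gpullama3.Options;
import org.beehive.gpullama3.inference.sampler.Sampler;
import org.beehive.gpullama3.model.Model;

class LangChain4jBridgeSketch {

    // Streams one instruct-mode completion into the handler and returns the full text.
    static String streamOnce(Model model, Options options, Consumer<String> handler) {
        // selectSampler is public after this PR, so callers outside the package can build one.
        Sampler sampler = LlamaApp.selectSampler(model.configuration().vocabularySize(), options.temperature(), options.topp(), options.seed());
        // With --stream true, each decoded piece is pushed into 'handler' as it is generated;
        // the complete response text is returned either way.
        return model.runInstructOnceLangChain4J(sampler, options, handler);
    }
}

A real adapter would pass whatever LangChain4j streaming handler it wraps as the Consumer<String>; for a quick smoke test, streamOnce(model, options, System.out::print) prints tokens as they arrive while still returning the full response, so blocking and streaming adapters can share this one entry point.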
