
Commit 04ba434

Refactor and improve code formatting across multiple files
Applied consistent formatting using @formatter:off/@formatter:on directives to enhance readability. Improved class documentation with detailed JavaDoc comments for methods and constructors, clarifying their purpose and parameters. Adjusted code style for multiline constructs and added missing comments where necessary.
1 parent: dabbdfb · commit: 04ba434

26 files changed: +366, -124 lines
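
A note on the formatter directives this commit relies on: //@formatter:off and //@formatter:on are marker comments honored by common Java formatters (e.g., IntelliJ IDEA and Eclipse, when formatter markers are enabled); the region between them keeps its hand-written alignment. A minimal illustration (the LOOKUP table is invented for the example):

    // @formatter:off  -- the formatter leaves this region exactly as written
    static final int[][] LOOKUP = {
            { 0b11100000, 0b11000000, 2 },
            { 0b11110000, 0b11100000, 3 },
            { 0b11111000, 0b11110000, 4 },
    };
    // @formatter:on   -- automatic formatting resumes below this marker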

src/main/java/com/example/aot/AOT.java

Lines changed: 5 additions & 10 deletions
@@ -32,8 +32,8 @@ public final class AOT {
 
     static LlamaModelLoader modelLoader;
 
-
-    record PartialModel(String modelFileName, Llama model, long tensorDataOffset, Map<String, GGUF.GGUFTensorInfo> tensorInfos) {}
+    record PartialModel(String modelFileName, Llama model, long tensorDataOffset, Map<String, GGUF.GGUFTensorInfo> tensorInfos) {
+    }
 
     private static final PartialModel PRELOADED_GGUF = preLoadGGUF(System.getProperty("llama.PreloadGGUF"));
 
@@ -49,12 +49,8 @@ private static PartialModel preLoadGGUF(String modelPath) {
         GGUF gguf = GGUF.loadModel(path);
         try (FileChannel fileChannel = FileChannel.open(path, StandardOpenOption.READ)) {
             modelLoader = new LlamaModelLoader(fileChannel, gguf, Options.DEFAULT_MAX_TOKENS, false);
-            return new PartialModel(
-                    path.getFileName().toString(),
-                    modelLoader.loadModel(), // TODO: needs proper handling for AOT
-                    gguf.getTensorDataOffset(),
-                    gguf.getTensorInfos()
-            );
+            return new PartialModel(path.getFileName().toString(), modelLoader.loadModel(), // TODO: needs proper handling for AOT
+                    gguf.getTensorDataOffset(), gguf.getTensorInfos());
         }
     } catch (IOException e) {
         throw new RuntimeException(e);
@@ -78,8 +74,7 @@ public static Model tryUsePreLoaded(Path modelPath, int contextLength) throws IO
            return null;
        }
        Llama baseModel = preLoaded.model();
-       try (var timer = Timer.log("Load tensors from pre-loaded model");
-               var fileChannel = FileChannel.open(modelPath, StandardOpenOption.READ)) {
+       try (var timer = Timer.log("Load tensors from pre-loaded model"); var fileChannel = FileChannel.open(modelPath, StandardOpenOption.READ)) {
            // Load only the tensors (mmap slices).
            Map<String, GGMLTensorEntry> tensorEntries = GGUF.loadTensors(fileChannel, preLoaded.tensorDataOffset(), preLoaded.tensorInfos());
            Weights weights = modelLoader.loadWeights(tensorEntries, baseModel.configuration());
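
For context on how this preload path is used: PRELOADED_GGUF is populated during class initialization from the llama.PreloadGGUF system property, and tryUsePreLoaded returns the cached model when the requested path matches. A minimal sketch of a caller (AOT and Model come from this diff; the demo class, the omitted import for Model, and the context length are illustrative assumptions):

    import java.nio.file.Path;
    import com.example.aot.AOT;

    public class PreloadDemo {
        // Launch with: java -Dllama.PreloadGGUF=/models/model.gguf ...
        public static void main(String[] args) throws Exception {
            Path modelPath = Path.of(System.getProperty("llama.PreloadGGUF"));
            // Returns the pre-loaded model when the path matches; null means
            // the caller must fall back to a regular load.
            Model model = AOT.tryUsePreLoaded(modelPath, 4096 /* assumed context length */);
            System.out.println(model != null ? "reused pre-loaded model" : "fallback load needed");
        }
    }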

src/main/java/com/example/auxiliary/Utf8Mask.java

Lines changed: 2 additions & 0 deletions
@@ -2,9 +2,11 @@
 
 /** mask of a byte-sequence in UTF-8 encoding */
 public record Utf8Mask(int mask, int pattern, int len) {
+    //@formatter:off
     public static final Utf8Mask[] MASKS = {
             new Utf8Mask(0b11100000, 0b11000000, 2),
             new Utf8Mask(0b11110000, 0b11100000, 3),
             new Utf8Mask(0b11111000, 0b11110000, 4)
     };
+    //@formatter:on
 }
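
These masks classify UTF-8 lead bytes: a byte b opens an n-byte sequence when (b & mask) == pattern. A small sketch of how they might be applied (the helper below is illustrative, not code from this repository):

    // Illustrative helper: length of the UTF-8 sequence started by a lead byte.
    static int sequenceLength(byte b) {
        int ub = b & 0xFF; // widen without sign extension
        if ((ub & 0b1000_0000) == 0) {
            return 1; // single-byte (ASCII) character
        }
        for (Utf8Mask m : Utf8Mask.MASKS) {
            if ((ub & m.mask()) == m.pattern()) {
                return m.len(); // 2-, 3-, or 4-byte sequence
            }
        }
        return -1; // continuation byte or invalid lead byte
    }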

src/main/java/com/example/inference/state/LlamaState.java

Lines changed: 13 additions & 3 deletions
@@ -8,6 +8,16 @@
 
 import java.util.stream.Stream;
 
+/**
+ * Represents the state of the Llama model during inference.
+ * This class extends {@link State} to include model-specific functionalities
+ * and configurations tailored for the Llama model.
+ *
+ * <p><b>Note 1:</b> LlamaState contains additional fields for TornadoVM wrappers
+ * to enable GPU-accelerated processing of the model.</p>
+ *
+ * <p><b>Note 2:</b> This state implementation is also used for the Mistral model.</p>
+ */
 public final class LlamaState extends State {
 
     public LlamaState(Configuration config, int batchsize) {
@@ -56,9 +66,9 @@ protected StateFields createStateFields(Configuration config) {
         fields.positionHolder = new IntArray(1);
 
         // Temporary arrays
-        fields.temp = new FloatArray(1 + ((config.dim() + localSize-1) / localSize));
-        fields.tempFFN = new FloatArray(1 + ((config.dim() + localSize-1) / localSize));
-        fields.tempLogits = new FloatArray(1 + ((config.dim() + localSize-1) / localSize));
+        fields.temp = new FloatArray(1 + ((config.dim() + localSize - 1) / localSize));
+        fields.tempFFN = new FloatArray(1 + ((config.dim() + localSize - 1) / localSize));
+        fields.tempLogits = new FloatArray(1 + ((config.dim() + localSize - 1) / localSize));
 
         return fields;
     }
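
The added spacing also highlights the sizing idiom: for positive integers, (n + d - 1) / d is integer ceiling division, so each temporary array holds one slot per work-group of size localSize plus one leading slot. A worked check with illustrative numbers:

    // Ceiling division: (n + d - 1) / d == ceil(n / d) for positive ints.
    int dim = 2050, localSize = 256;                 // illustrative values
    int groups = (dim + localSize - 1) / localSize;  // = 9, since 8 * 256 = 2048 < 2050
    int slots = 1 + groups;                          // = 10: one extra leading slot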

src/main/java/com/example/inference/state/Qwen3State.java

Lines changed: 14 additions & 7 deletions
@@ -9,6 +9,15 @@
 
 import java.util.stream.Stream;
 
+/**
+ * Represents the state of the Qwen3 model during inference.
+ * This class extends {@link State} to include model-specific functionalities
+ * and configurations tailored for the Qwen3 model.
+ *
+ * <p><b>Note 1:</b> Qwen3State contains additional fields for TornadoVM wrappers
+ * to enable GPU-accelerated processing of the model.</p>
+ *
+ */
 public final class Qwen3State extends State {
 
     // Qwen3 specific fields
@@ -52,10 +61,8 @@ protected StateFields createStateFields(Configuration configuration) {
         fields.logits = ArrayFloatTensor.allocate(config.vocabularySize());
 
         // Key-value cache with Qwen3 dimensions
-        fields.keyCache = Stream.generate(() -> ArrayFloatTensor.allocate(config.contextLength(), nEmbdGqa))
-                .limit(config.numberOfLayers()).toArray(FloatTensor[]::new);
-        fields.valueCache = Stream.generate(() -> ArrayFloatTensor.allocate(config.contextLength(), nEmbdGqa))
-                .limit(config.numberOfLayers()).toArray(FloatTensor[]::new);
+        fields.keyCache = Stream.generate(() -> ArrayFloatTensor.allocate(config.contextLength(), nEmbdGqa)).limit(config.numberOfLayers()).toArray(FloatTensor[]::new);
+        fields.valueCache = Stream.generate(() -> ArrayFloatTensor.allocate(config.contextLength(), nEmbdGqa)).limit(config.numberOfLayers()).toArray(FloatTensor[]::new);
 
         // TornadoVM wrappers with Qwen3-specific sizes
         fields.wrapX = new FloatArray(config.dim());
@@ -76,9 +83,9 @@ protected StateFields createStateFields(Configuration configuration) {
         fields.positionHolder = new IntArray(1);
 
         // Temporary arrays
-        fields.temp = new FloatArray(1 + ((config.dim() + localSize-1) / localSize));
-        fields.tempFFN = new FloatArray(1 + ((config.dim() + localSize-1) / localSize));
-        fields.tempLogits = new FloatArray(1 + ((config.dim() + localSize-1) / localSize));
+        fields.temp = new FloatArray(1 + ((config.dim() + localSize - 1) / localSize));
+        fields.tempFFN = new FloatArray(1 + ((config.dim() + localSize - 1) / localSize));
+        fields.tempLogits = new FloatArray(1 + ((config.dim() + localSize - 1) / localSize));
 
         return fields;
     }
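
The collapsed Stream.generate(...).limit(n).toArray(...) calls allocate one independent tensor per transformer layer. An equivalent loop form makes the resulting shape explicit (FloatTensor and ArrayFloatTensor are the repo's types shown above; the dimensions are illustrative):

    // Loop equivalent of the Stream-based per-layer KV-cache allocation above.
    int layers = 28, contextLength = 8192, nEmbdGqa = 512; // illustrative values
    FloatTensor[] keyCache = new FloatTensor[layers];
    FloatTensor[] valueCache = new FloatTensor[layers];
    for (int l = 0; l < layers; l++) {
        // one (contextLength x nEmbdGqa) buffer per layer, never shared
        keyCache[l] = ArrayFloatTensor.allocate(contextLength, nEmbdGqa);
        valueCache[l] = ArrayFloatTensor.allocate(contextLength, nEmbdGqa);
    }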

src/main/java/com/example/inference/state/State.java

Lines changed: 16 additions & 2 deletions
@@ -6,9 +6,23 @@
 import uk.ac.manchester.tornado.api.types.arrays.IntArray;
 
 /**
- * Base class for State
+ * Represents the base state structure used during LLM inference.
+ * This class provides a common foundation for handling state-related data and functionalities
+ * that can be extended by model-specific implementations.
+ *
+ * <p><b>Key Responsibilities:</b></p>
+ * <ul>
+ * <li>Defines core structures to store and access model state data required for computation.</li>
+ * <li>Can be extended by model-specific state classes (e.g., {@link LlamaState}, {@link Qwen3State}).</li>
+ * </ul>
+ *
+ * <p><b>Usage:</b> Extend `State` to implement model-specific state configurations
+ * while reusing the common structure and functionality provided by this class.</p>
+ *
+ * <p><b>Note:</b> This class is designed to be generic and does not include any
+ * model-specific behavior or fields. Those should be implemented in subclasses.</p>
 */
-public abstract class State{
+public abstract class State {
 
     // current wave of activations
     public final FloatTensor x; // activation at current time stamp (dim,)
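
The new JavaDoc spells out the extension contract; a minimal sketch of a conforming subclass, with the constructor and createStateFields shapes inferred from the LlamaState and Qwen3State diffs above (the super-constructor signature and StateFields default constructor are assumptions):

    // Sketch, not part of the commit: the extension pattern State's JavaDoc describes.
    public final class MyModelState extends State {

        public MyModelState(Configuration config, int batchsize) {
            super(config, batchsize); // assumed super-constructor shape
        }

        @Override
        protected StateFields createStateFields(Configuration config) {
            StateFields fields = new StateFields(); // assumed default constructor
            // allocate model-specific buffers here, as LlamaState/Qwen3State do
            return fields;
        }
    }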

src/main/java/com/example/inference/weights/Weights.java

Lines changed: 10 additions & 0 deletions
@@ -2,6 +2,16 @@
 
 import com.example.core.model.GGMLType;
 
+/**
+ * The GPULlama3.java utilizes two distinct weight types:
+ * <ul>
+ * <li><b>StandardWeights:</b> Designed for standard Java-based inference on the CPU.</li>
+ * <li><b>TornadoWeights:</b> Optimized for GPU-accelerated inference using TornadoVM.</li>
+ * </ul>
+ *
+ * The packages <code>weights.standard</code> and <code>weights.tornado</code> define
+ * base classes and model-specific implementations for weights in their respective formats.
+ */
 public interface Weights {
 
     GGMLType getWeightType();
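
The CPU/GPU split documented here implies a selection point in the model-loading path. A hedged sketch of that dispatch (only the Weights interface and getWeightType() appear in this diff; the two load helpers and the useTornadoVM flag are hypothetical):

    // Hypothetical dispatch between the two weight families.
    Weights weights = useTornadoVM
            ? loadTornadoWeights(tensorEntries, config)   // weights.tornado, GPU path
            : loadStandardWeights(tensorEntries, config); // weights.standard, CPU path
    GGMLType type = weights.getWeightType(); // the one method both families share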

src/main/java/com/example/inference/weights/standard/LlamaStandardWeights.java

Lines changed: 60 additions & 3 deletions
@@ -3,12 +3,69 @@
 import com.example.core.model.GGMLType;
 import com.example.core.model.tensor.FloatTensor;
 
+/**
+ * A model-specific implementation of {@link StandardWeights} for the Llama model.
+ * This class encapsulates the weights required for performing inference
+ * using the Llama model in the standard CPU-based format.
+ *
+ * <p><b>Note:</b> This weight format is also used for the Mistral model.</p>
+ */
 public class LlamaStandardWeights extends StandardWeights {
 
-    public LlamaStandardWeights(FloatTensor token_embedding_table, FloatTensor[] rms_att_weight, FloatTensor[] wq, FloatTensor[] wk, FloatTensor[] wv, FloatTensor[] wo, FloatTensor[] rms_ffn_weight,
-            FloatTensor[] w1, FloatTensor[] w2, FloatTensor[] w3, FloatTensor rms_final_weight, FloatTensor freq_cis_real, FloatTensor freq_cis_imag, FloatTensor wcls, GGMLType weightType) {
-        super(token_embedding_table, rms_att_weight, wq, wk, wv, wo, rms_ffn_weight, w1, w2, w3, rms_final_weight, freq_cis_real, freq_cis_imag, wcls, weightType);
+    // @formatter:off
+    /**
+     * Constructor for LlamaStandardWeights.
+     *
+     * @param token_embedding_table The token embedding table tensor.
+     * @param rms_att_weight Array of RMS attention weights tensors.
+     * @param wq Array of query weight tensors.
+     * @param wk Array of key weight tensors.
+     * @param wv Array of value weight tensors.
+     * @param wo Array of output weight tensors.
+     * @param rms_ffn_weight Array of RMS feed-forward network weights.
+     * @param w1 Array of first feed-forward layer weights.
+     * @param w2 Array of second feed-forward layer weights.
+     * @param w3 Array of third feed-forward layer weights.
+     * @param rms_final_weight Final RMS weight tensor.
+     * @param freq_cis_real Real part of frequency cis tensor.
+     * @param freq_cis_imag Imaginary part of frequency cis tensor.
+     * @param wcls Class token weight tensor.
+     * @param weightType The GGML weight type.
+     */
+    public LlamaStandardWeights(
+            FloatTensor token_embedding_table,
+            FloatTensor[] rms_att_weight,
+            FloatTensor[] wq,
+            FloatTensor[] wk,
+            FloatTensor[] wv,
+            FloatTensor[] wo,
+            FloatTensor[] rms_ffn_weight,
+            FloatTensor[] w1,
+            FloatTensor[] w2,
+            FloatTensor[] w3,
+            FloatTensor rms_final_weight,
+            FloatTensor freq_cis_real,
+            FloatTensor freq_cis_imag,
+            FloatTensor wcls,
+            GGMLType weightType) {
+        // call to StandardWeights constructor
+        super(token_embedding_table,
+                rms_att_weight,
+                wq,
+                wk,
+                wv,
+                wo,
+                rms_ffn_weight,
+                w1,
+                w2,
+                w3,
+                rms_final_weight,
+                freq_cis_real,
+                freq_cis_imag,
+                wcls,
+                weightType);
     }
+    // @formatter:on
 
     @Override
     public GGMLType getWeightType() {
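
For orientation, these parameter names follow the llama2.c convention, and the tensors map onto the transformer forward pass roughly as sketched below (conceptual pseudocode, not code from this repository):

    // Per layer l (conceptual; the actual tensor ops live elsewhere in the repo):
    //   xb  = rmsnorm(x, rms_att_weight[l])                 // pre-attention norm
    //   q = wq[l]*xb;  k = wk[l]*xb;  v = wv[l]*xb          // QKV projections
    //   q, k rotated via the RoPE tables freq_cis_real / freq_cis_imag
    //   x  += wo[l] * attention(q, k, v)                    // output projection
    //   hb  = rmsnorm(x, rms_ffn_weight[l])                 // pre-FFN norm
    //   x  += w2[l] * (silu(w1[l]*hb) * (w3[l]*hb))         // SwiGLU FFN
    // Finally: logits = wcls * rmsnorm(x, rms_final_weight) // classifier head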

src/main/java/com/example/inference/weights/standard/Qwen3StandardWeights.java

Lines changed: 61 additions & 6 deletions
@@ -3,20 +3,75 @@
 import com.example.core.model.GGMLType;
 import com.example.core.model.tensor.FloatTensor;
 
+/**
+ * A model-specific implementation of {@link StandardWeights} for the Qwen-3 model.
+ * This class defines the weights required for performing inference
+ * using the Qwen-3 model in the standard CPU-based format.
+ */
 public class Qwen3StandardWeights extends StandardWeights {
     public final FloatTensor[] attnKNorm, attnQNorm;
 
-    public Qwen3StandardWeights(FloatTensor token_embedding_table, FloatTensor[] rms_att_weight,
-            FloatTensor[] wq, FloatTensor[] wk, FloatTensor[] wv, FloatTensor[] wo,
-            FloatTensor[] attnKNorm, FloatTensor[] attnQNorm,
+    // @formatter:off
+    /**
+     * Constructor for {@code Qwen3StandardWeights}.
+     *
+     * @param token_embedding_table The token embedding table, used to map tokens to embeddings.
+     * @param rms_att_weight The array of Root Mean Square (RMS) attention weights.
+     * @param wq The array of query weight tensors for attention layers.
+     * @param wk The array of key weight tensors for attention layers.
+     * @param wv The array of value weight tensors for attention layers.
+     * @param wo The array of output weight tensors for attention layers.
+     * @param attnKNorm The array of normalization tensors for attention keys.
+     * @param attnQNorm The array of normalization tensors for attention queries.
+     * @param rms_ffn_weight The array of RMS weights for feed-forward neural network layers.
+     * @param w1 The array of first weight tensors for feed-forward layers.
+     * @param w2 The array of second weight tensors for feed-forward layers.
+     * @param w3 The array of third weight tensors for feed-forward layers.
+     * @param rms_final_weight The RMS weight used for final output normalization.
+     * @param freq_cis_real The real part of the frequency position encodings.
+     * @param freq_cis_imag The imaginary part of the frequency position encodings.
+     * @param wcls The weight tensor for the classification head.
+     * @param weightType The type of the weights, defined as {@link GGMLType}.
+     */
+    public Qwen3StandardWeights(
+            FloatTensor token_embedding_table,
+            FloatTensor[] rms_att_weight,
+            FloatTensor[] wq,
+            FloatTensor[] wk,
+            FloatTensor[] wv,
+            FloatTensor[] wo,
+            FloatTensor[] attnKNorm,
+            FloatTensor[] attnQNorm,
             FloatTensor[] rms_ffn_weight,
-            FloatTensor[] w1, FloatTensor[] w2, FloatTensor[] w3,
-            FloatTensor rms_final_weight, FloatTensor freq_cis_real, FloatTensor freq_cis_imag, FloatTensor wcls, GGMLType weightType) {
+            FloatTensor[] w1,
+            FloatTensor[] w2,
+            FloatTensor[] w3,
+            FloatTensor rms_final_weight,
+            FloatTensor freq_cis_real,
+            FloatTensor freq_cis_imag,
+            FloatTensor wcls,
+            GGMLType weightType) {
         // call to StandardWeights constructor
-        super(token_embedding_table, rms_att_weight, wq, wk, wv, wo, rms_ffn_weight, w1, w2, w3, rms_final_weight, freq_cis_real, freq_cis_imag, wcls, weightType);
+        super(token_embedding_table,
+                rms_att_weight,
+                wq,
+                wk,
+                wv,
+                wo,
+                rms_ffn_weight,
+                w1,
+                w2,
+                w3,
+                rms_final_weight,
+                freq_cis_real,
+                freq_cis_imag,
+                wcls,
+                weightType);
+        // init Qwen3-specific fields
         this.attnKNorm = attnKNorm;
         this.attnQNorm = attnQNorm;
     }
+    // @formatter:on
 
     @Override
     public GGMLType getWeightType() {