beehive-lab
diff --git a/‎src/main/java/com/example/inference/InferenceCore.java‎
Lines changed: 199 additions & 0 deletions b/‎src/main/java/com/example/inference/InferenceCore.java‎
Lines changed: 199 additions & 0 deletions
@@ -0,0 +1,199 @@
+package com.example.inference;
+
+import com.example.aux.Parallel;
+import com.example.core.model.tensor.FloatTensor;
+import com.example.loader.weights.State;
+import com.example.loader.weights.Weights;
+import com.example.model.Configuration;
+import com.example.model.Model;
+import com.example.tornadovm.TornadoVMMasterPlan;
+import uk.ac.manchester.tornado.api.types.arrays.FloatArray;
+
+import java.lang.foreign.MemorySegment;
+import java.nio.FloatBuffer;
+
+/**
+ * Low-level operations for model inference.
+ *
+ * <p>
+ * Provides core computational operations: RMS normalization and forward passes
+ * through model layers. Supports both CPU and GPU implementations.
+ */
+
+public final class InferenceCore {
+
+    private InferenceCore() {
+        // prevent instantiation
+    }
+
+    public static void rmsnorm(FloatTensor out, FloatTensor x, FloatBuffer weight, int size, float rmsNormEps) {
+        // calculate sum of squares
+        float ss = x.reduce(0, size, 0f, (acc, xi) -> acc + xi * xi);
+        ss /= size;
+        ss += rmsNormEps;
+        ss = (float) (1.0 / Math.sqrt(ss));
+        // normalize and scale
+        final float finalss = ss; // for the lambda
+        out.mapWithIndexInPlace(0, size, (value, index) -> weight.get(index) * (finalss * x.getFloat(index)));
+    }
+
+    public static FloatTensor forwardJava(Model model, State state, int token, int position) {
+        // a few convenience variables
+        final Configuration config = model.configuration();
+        final Weights weights = model.weights();
+        int dim = config.dim();
+        int headSize = config.headSize();
+        int kvDim = (config.dim() * config.numberOfKeyValueHeads()) / config.numberOfHeads();
+        int kvMul = config.numberOfHeads() / config.numberOfKeyValueHeads(); // integer multiplier of the kv sharing in multiquery
+        float sqrtHeadSize = (float) Math.sqrt(headSize);
+
+        // copy the token embedding into x
+        weights.token_embedding_table.copyTo(token * dim, state.x, 0, dim);
+
+        // forward all the layers
+        for (int l = 0; l < config.numberOfLayers(); l++) {
+            // attention rmsnorm
+            rmsnorm(state.xb, state.x, weights.rms_att_weight[l], dim, config.rmsNormEps());
+
+            // qkv matmuls for this position
+
+            weights.wq[l].matmul(state.xb, state.q, dim, dim);
+            weights.wk[l].matmul(state.xb, state.k, kvDim, dim);
+            weights.wv[l].matmul(state.xb, state.v, kvDim, dim);
+
+            // RoPE relative positional encoding: complex-valued rotate q and k in each head
+            for (int i = 0; i < dim; i += 2) {
+                int head_dim = i % headSize;
+                float fcr = weights.freq_cis_real.get(position * (headSize / 2) + (head_dim / 2));
+                float fci = weights.freq_cis_imag.get(position * (headSize / 2) + (head_dim / 2));
+                int rotn = i < kvDim ? 2 : 1; // how many vectors? 2 = q & k, 1 = q only
+                for (int v = 0; v < rotn; v++) {
+                    FloatTensor vec = v == 0 ? state.q : state.k; // the vector to rotate (query or key)
+                    float v0 = vec.getFloat(i);
+                    float v1 = vec.getFloat(i + 1);
+                    vec.setFloat(i, v0 * fcr - v1 * fci);
+                    vec.setFloat(i + 1, v0 * fci + v1 * fcr);
+                }
+            }
+
+            // save key,value at this time step (position) to our kv cache
+            //int loff = l * config.seq_len * kvDim;
+            // kv cache layer offset for convenience
+            state.k.copyTo(0, state.keyCache[l], position * kvDim, kvDim);
+            state.v.copyTo(0, state.valueCache[l], position * kvDim, kvDim);
+
+            int curLayer = l;
+
+            // multihead attention. iterate over all heads
+            Parallel.parallelFor(0, config.numberOfHeads(), h -> {
+                // get the query vector for this head
+                // float* q = s.q + h * headSize;
+                int qOffset = h * headSize;
+
+                // attention scores for this head
+                // float* att = s.att + h * config.seq_len;
+                int attOffset = h * config.contextLength();
+
+                // iterate over all timesteps, including the current one
+                for (int t = 0; t <= position; t++) {
+                    // get the key vector for this head and at this timestep
+                    // float* k = s.key_cache + loff + t * dim + h * headSize;
+                    int keyCacheOffset = /* loff + */ t * kvDim + (h / kvMul) * headSize;
+                    // calculate the attention score as the dot product of q and k
+                    float score = state.q.dot(qOffset, state.keyCache[curLayer], keyCacheOffset, headSize);
+                    score /= sqrtHeadSize;
+                    // save the score to the attention buffer
+                    state.att.setFloat(attOffset + t, score);
+                }
+
+                // softmax the scores to get attention weights, from 0..position inclusively
+                state.att.softmaxInPlace(attOffset, position + 1);
+
+                // weighted sum of the values, store back into xb
+                // float* xb = s.xb + h * headSize;
+                int xbOffset = h * headSize;
+                // memset(xb, 0, headSize * sizeof(float));
+                state.xb.fillInPlace(xbOffset, headSize, 0f);
+
+                for (int t = 0; t <= position; t++) {
+                    // get the value vector for this head and at this timestep
+                    // float* v = s.value_cache + loff + t * dim + h * headSize;
+                    int vOffset = /* loff + */ t * kvDim + (h / kvMul) * headSize;
+                    // get the attention weight for this timestep
+                    float a = state.att.getFloat(attOffset + t);
+                    // accumulate the weighted value into xb
+                    state.xb.saxpyInPlace(xbOffset, state.valueCache[curLayer], vOffset, headSize, a);
+                }
+            });
+
+            // final matmul to get the output of the attention
+            weights.wo[l].matmul(state.xb, state.xb2, dim, dim);
+
+            // residual connection back into x
+            state.x.addInPlace(state.xb2);
+
+            // ffn rmsnorm
+            rmsnorm(state.xb, state.x, weights.rms_ffn_weight[l], dim, config.rmsNormEps());
+
+            // Now for FFN in PyTorch we have: self.w2(F.silu(self.w1(x)) * self.w3(x))
+            // first calculate self.w1(x) and self.w3(x)
+            weights.w1[l].matmul(state.xb, state.hb, config.hiddenDim(), dim);
+            weights.w3[l].matmul(state.xb, state.hb2, config.hiddenDim(), dim);
+
+            // SwiGLU non-linearity
+            // silu(x)=x*σ(x), where σ(x) is the logistic sigmoid
+            state.hb.mapInPlace(value -> value / (float) (1.0 + Math.exp(-value)));
+
+            // elementwise multiply with w3(x)
+            state.hb.multiplyInPlace(state.hb2);
+
+            // final matmul to get the output of the ffn
+            weights.w2[l].matmul(state.hb, state.xb, dim, config.hiddenDim());
+
+            // residual connection
+            state.x.addInPlace(state.xb);
+        }
+
+        rmsnorm(state.x, state.x, weights.rms_final_weight, dim, config.rmsNormEps());
+
+        weights.wcls.matmul(state.x, state.logits, config.vocabularySize(), dim);
+
+        return state.logits;
+    }
+
+    /**
+     * Performs the initial embedding lookup and triggers the TornadoVM accelerated forward pass for an LLM token.
+     *
+     * <p>This method handles the first phase of processing a token through the transformer model:
+     * <ol>
+     *   <li>Copies the token embedding from the model's embedding table to the state's buffer</li>
+     *   <li>Delegates the transformer layer processing to TornadoVM through the master plan</li>
+     * </ol>
+     *
+     * <p>The token embedding lookup happens on the CPU using {@link MemorySegment} operations,
+     * while the subsequent transformer layers processing is offloaded to the accelerator through
+     * TornadoVM for improved performance.
+     *
+     * @param model
+     *         The Llama model containing weights and configuration parameters
+     * @param state
+     *         The current execution state holding input/output tensors and temporary buffers
+     * @param token
+     *         The input token ID to process
+     * @param position
+     *         The position of this token in the sequence context window
+     * @param tornadoVMMasterPlan
+     *         The execution plan for TornadoVM acceleration
+     * @return FloatTensor containing the output logits for token prediction
+     */
+    public static FloatArray forwardTornadoVM(Model model, State state, int token, int position, TornadoVMMasterPlan tornadoVMMasterPlan) {
+        final Configuration configuration = model.configuration();
+        final Weights weights = model.weights();
+
+        MemorySegment.copy(weights.tokenEmbeddingTable.getSegment(), token * configuration.dim() * Float.BYTES, state.wrapX.getSegment(), 0, configuration.dim() * Float.BYTES);
+
+        return tornadoVMMasterPlan.tornadoVMForwardExecuteLayered(position);
+    }
+
+
+}