From 6c4243fae2490190b1a98fc1a91ad6d99c177309 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Wed, 11 Dec 2024 19:16:25 +0200
Subject: [PATCH 01/73] feat(minor): dynamically load `llama.cpp` backends

---
 llama/CMakeLists.txt                          |  5 ++
 llama/addon/addon.cpp                         | 14 ++--
 src/bindings/AddonTypes.ts                    |  2 +-
 src/bindings/Llama.ts                         | 81 ++++++++++---------
 src/bindings/getLlama.ts                      | 21 +++--
 src/bindings/utils/compileLLamaCpp.ts         |  9 ++-
 .../utils/resolveActualBindingBinaryPath.ts   | 19 +++++
 src/bindings/utils/testBindingBinary.ts       |  2 +-
 8 files changed, 98 insertions(+), 55 deletions(-)
 create mode 100644 src/bindings/utils/resolveActualBindingBinaryPath.ts
diff --git a/llama/CMakeLists.txt b/llama/CMakeLists.txt
index 08c7a86b..d6413202 100644
--- a/llama/CMakeLists.txt
+++ b/llama/CMakeLists.txt
@@ -24,6 +24,11 @@ execute_process(COMMAND node -p "require('node-addon-api').include.slice(1,-1)"
 
 set(LLAMA_BUILD_COMMON ON)
 
+if (NOT MINGW)
+    set(GGML_BACKEND_DL ON)
+    set(BUILD_SHARED_LIBS ON)
+endif()
+
 if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
     add_compile_options(-Wno-c++17-extensions)
 endif()
diff --git a/llama/addon/addon.cpp b/llama/addon/addon.cpp
index 7b014079..ed4b2fb9 100644
--- a/llama/addon/addon.cpp
+++ b/llama/addon/addon.cpp
@@ -152,16 +152,16 @@ class AddonBackendUnloadWorker : public Napi::AsyncWorker {
 };
 
 Napi::Value addonLoadBackends(const Napi::CallbackInfo& info) {
-    const bool forceLoadLibraries = info.Length() == 0
-        ? false
-        : info[0].IsBoolean()
-            ? info[0].As<Napi::Boolean>().Value()
-            : false;
+    const std::string forceLoadLibrariesSearchPath = info.Length() == 0
+        ? ""
+        : info[0].IsString()
+            ? info[0].As<Napi::String>().Utf8Value()
+            : "";
 
     ggml_backend_reg_count();
 
-    if (forceLoadLibraries) {
-        ggml_backend_load_all();
+    if (forceLoadLibrariesSearchPath.length() > 0) {
+        ggml_backend_load_all_from_path(forceLoadLibrariesSearchPath.c_str());
     }
 
     return info.Env().Undefined();
diff --git a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts
index d62faf13..6303a711 100644
--- a/src/bindings/AddonTypes.ts
+++ b/src/bindings/AddonTypes.ts
@@ -76,7 +76,7 @@ export type BindingModule = {
         free: number
     },
     init(): Promise<void>,
-    loadBackends(forceLoadLibraries?: boolean): void,
+    loadBackends(forceLoadLibrariesSearchPath?: string): void,
     dispose(): Promise<void>
 };
 
diff --git a/src/bindings/Llama.ts b/src/bindings/Llama.ts
index e48143fe..34bba99b 100644
--- a/src/bindings/Llama.ts
+++ b/src/bindings/Llama.ts
@@ -1,4 +1,5 @@
 import os from "os";
+import path from "path";
 import chalk from "chalk";
 import {DisposedError, EventRelay, withLock} from "lifecycle-utils";
 import {getConsoleLogPrefix} from "../utils/getConsoleLogPrefix.js";
@@ -34,7 +35,7 @@ export class Llama {
     /** @internal */ public readonly _memoryLock = {};
     /** @internal */ public readonly _consts: ReturnType<BindingModule["getConsts"]>;
     /** @internal */ public readonly _vramOrchestrator: MemoryOrchestrator;
-    /** @internal */ public readonly _vramPadding: MemoryReservation;
+    /** @internal */ public _vramPadding: MemoryReservation;
     /** @internal */ public readonly _ramOrchestrator: MemoryOrchestrator;
     /** @internal */ public readonly _ramPadding: MemoryReservation;
     /** @internal */ public readonly _swapOrchestrator: MemoryOrchestrator;
@@ -65,10 +66,11 @@ export class Llama {
     public readonly onDispose = new EventRelay<void>();
 
     private constructor({
-        bindings, logLevel, logger, buildType, cmakeOptions, llamaCppRelease, debug, gpu, maxThreads, vramOrchestrator, vramPadding,
-        ramOrchestrator, ramPadding, swapOrchestrator
+        bindings, bindingPath, logLevel, logger, buildType, cmakeOptions, llamaCppRelease, debug, buildGpu, maxThreads, vramOrchestrator,
+        vramPadding, ramOrchestrator, ramPadding, swapOrchestrator
     }: {
         bindings: BindingModule,
+        bindingPath: string,
         logLevel: LlamaLogLevel,
         logger: (level: LlamaLogLevel, message: string) => void,
         buildType: "localBuild" | "prebuilt",
@@ -78,7 +80,7 @@ export class Llama {
             release: string
         },
         debug: boolean,
-        gpu: BuildGpu,
+        buildGpu: BuildGpu,
         maxThreads?: number,
         vramOrchestrator: MemoryOrchestrator,
         vramPadding: MemoryReservation,
@@ -86,14 +88,31 @@ export class Llama {
         ramPadding: MemoryReservation,
         swapOrchestrator: MemoryOrchestrator
     }) {
+        this._dispatchPendingLogMicrotask = this._dispatchPendingLogMicrotask.bind(this);
+        this._onAddonLog = this._onAddonLog.bind(this);
+
         this._bindings = bindings;
-        this._gpu = gpu;
+        this._debug = debug;
+        this._logLevel = this._debug
+            ? LlamaLogLevel.debug
+            : (logLevel ?? LlamaLogLevel.debug);
+
+        if (!this._debug) {
+            this._bindings.setLogger(this._onAddonLog);
+            this._bindings.setLoggerLogLevel(LlamaLogLevelToAddonLogLevel.get(this._logLevel) ?? defaultLogLevel);
+        }
+
+        bindings.loadBackends();
+        const loadedGpu = bindings.getGpuType();
+        if (loadedGpu == null || (loadedGpu === false && buildGpu !== false))
+            bindings.loadBackends(path.dirname(bindingPath));
+
+        this._gpu = bindings.getGpuType() ?? false;
         this._supportsGpuOffloading = bindings.getSupportsGpuOffloading();
         this._supportsMmap = bindings.getSupportsMmap();
         this._supportsMlock = bindings.getSupportsMlock();
         this._mathCores = bindings.getMathCores();
         this._consts = bindings.getConsts();
-        this._debug = debug;
         this._vramOrchestrator = vramOrchestrator;
         this._vramPadding = vramPadding;
         this._ramOrchestrator = ramOrchestrator;
@@ -106,10 +125,6 @@ export class Llama {
                     : 0
             )
         );
-
-        this._logLevel = this._debug
-            ? LlamaLogLevel.debug
-            : (logLevel ?? LlamaLogLevel.debug);
         this._logger = logger;
         this._buildType = buildType;
         this._cmakeOptions = Object.freeze({...cmakeOptions});
@@ -118,21 +133,7 @@ export class Llama {
             release: llamaCppRelease.release
         });
 
-        this._dispatchPendingLogMicrotask = this._dispatchPendingLogMicrotask.bind(this);
-        this._onAddonLog = this._onAddonLog.bind(this);
-
-        if (!this._debug) {
-            this._bindings.setLogger(this._onAddonLog);
-            this._bindings.setLoggerLogLevel(LlamaLogLevelToAddonLogLevel.get(this._logLevel) ?? defaultLogLevel);
-        }
-
-        this._bindings.loadBackends();
-        const loadedGpu = bindings.getGpuType();
-        if (loadedGpu == null || (loadedGpu === false && gpu !== false))
-            this._bindings.loadBackends(true);
-
         this._onExit = this._onExit.bind(this);
-
         process.on("exit", this._onExit);
     }
 
@@ -446,9 +447,11 @@ export class Llama {
 
     /** @internal */
     public static async _create({
-        bindings, buildType, buildMetadata, logLevel, logger, vramPadding, ramPadding, maxThreads, skipLlamaInit = false, debug
+        bindings, bindingPath, buildType, buildMetadata, logLevel, logger, vramPadding, ramPadding, maxThreads, skipLlamaInit = false,
+        debug
     }: {
         bindings: BindingModule,
+        bindingPath: string,
         buildType: "localBuild" | "prebuilt",
         buildMetadata: BuildMetadataFile,
         logLevel: LlamaLogLevel,
@@ -459,7 +462,6 @@ export class Llama {
         skipLlamaInit?: boolean,
         debug: boolean
     }) {
-        const gpu = bindings.getGpuType() ?? false;
         const vramOrchestrator = new MemoryOrchestrator(() => {
             const {total, used, unifiedSize} = bindings.getGpuVramInfo();
 
@@ -497,14 +499,6 @@ export class Llama {
             };
         });
 
-        let resolvedVramPadding: MemoryReservation;
-        if (gpu === false || vramPadding === 0)
-            resolvedVramPadding = vramOrchestrator.reserveMemory(0);
-        else if (vramPadding instanceof Function)
-            resolvedVramPadding = vramOrchestrator.reserveMemory(vramPadding((await vramOrchestrator.getMemoryState()).total));
-        else
-            resolvedVramPadding = vramOrchestrator.reserveMemory(vramPadding);
-
         let resolvedRamPadding: MemoryReservation;
         if (ramPadding instanceof Function)
             resolvedRamPadding = ramOrchestrator.reserveMemory(ramPadding((await ramOrchestrator.getMemoryState()).total));
@@ -513,6 +507,7 @@ export class Llama {
 
         const llama = new Llama({
             bindings,
+            bindingPath,
             buildType,
             cmakeOptions: buildMetadata.buildOptions.customCmakeOptions,
             llamaCppRelease: {
@@ -522,15 +517,27 @@ export class Llama {
             logLevel,
             logger,
             debug,
-            gpu,
+            buildGpu: buildMetadata.buildOptions.gpu,
             vramOrchestrator,
             maxThreads,
-            vramPadding: resolvedVramPadding,
+            vramPadding: vramOrchestrator.reserveMemory(0),
             ramOrchestrator,
             ramPadding: resolvedRamPadding,
             swapOrchestrator
         });
 
+        if (llama.gpu === false || vramPadding === 0) {
+            // do nothing since `llama._vramPadding` is already set to 0
+        } else if (vramPadding instanceof Function) {
+            const currentVramPadding = llama._vramPadding;
+            llama._vramPadding = vramOrchestrator.reserveMemory(vramPadding((await vramOrchestrator.getMemoryState()).total));
+            currentVramPadding.dispose();
+        } else {
+            const currentVramPadding = llama._vramPadding;
+            llama._vramPadding = vramOrchestrator.reserveMemory(vramPadding);
+            currentVramPadding.dispose();
+        }
+
         if (!skipLlamaInit)
             await llama._init();
 
@@ -612,6 +619,8 @@ function getTransformedLogLevel(level: LlamaLogLevel, message: string): LlamaLog
         return LlamaLogLevel.log;
     else if (level === LlamaLogLevel.warn && message.startsWith("ggml_cuda_init: GGML_CUDA_FORCE_") && message.endsWith(" no"))
         return LlamaLogLevel.log;
+    else if (level === LlamaLogLevel.info && message.startsWith("load_backend: loaded "))
+        return LlamaLogLevel.log;
 
     return level;
 }
diff --git a/src/bindings/getLlama.ts b/src/bindings/getLlama.ts
index dffeea50..c0dcffc6 100644
--- a/src/bindings/getLlama.ts
+++ b/src/bindings/getLlama.ts
@@ -30,6 +30,7 @@ import {getLinuxDistroInfo, isDistroAlpineLinux} from "./utils/getLinuxDistroInf
 import {testBindingBinary} from "./utils/testBindingBinary.js";
 import {BinaryPlatformInfo, getPlatformInfo} from "./utils/getPlatformInfo.js";
 import {hasBuildingFromSourceDependenciesInstalled} from "./utils/hasBuildingFromSourceDependenciesInstalled.js";
+import {resolveActualBindingBinaryPath} from "./utils/resolveActualBindingBinaryPath.js";
 
 const require = createRequire(import.meta.url);
 
@@ -297,11 +298,13 @@ export async function getLlama(options?: LlamaOptions | "lastBuild", lastBuildOp
         await waitForLockfileRelease({resourcePath: localBuildFolder});
         if (localBuildBinPath != null) {
             try {
-                const binding = loadBindingModule(localBuildBinPath);
+                const resolvedBindingPath = await resolveActualBindingBinaryPath(localBuildBinPath);
+                const binding = loadBindingModule(resolvedBindingPath);
                 const buildMetadata = await getLocalBuildBinaryBuildMetadata(lastBuildInfo.folderName);
 
                 return await Llama._create({
                     bindings: binding,
+                    bindingPath: resolvedBindingPath,
                     buildType: "localBuild",
                     buildMetadata,
                     logger: lastBuildOptions?.logger ?? Llama.defaultConsoleLogger,
@@ -585,15 +588,17 @@ async function loadExistingLlamaBinary({
                 platformInfo,
                 buildMetadata
             });
+            const resolvedBindingPath = await resolveActualBindingBinaryPath(localBuildBinPath);
             const binaryCompatible = shouldTestBinaryBeforeLoading
-                ? await testBindingBinary(localBuildBinPath, buildOptions.gpu)
+                ? await testBindingBinary(resolvedBindingPath, buildOptions.gpu)
                 : true;
 
             if (binaryCompatible) {
-                const binding = loadBindingModule(localBuildBinPath);
+                const binding = loadBindingModule(resolvedBindingPath);
 
                 return await Llama._create({
                     bindings: binding,
+                    bindingPath: resolvedBindingPath,
                     buildType: "localBuild",
                     buildMetadata,
                     logLevel,
@@ -642,15 +647,17 @@ async function loadExistingLlamaBinary({
                     platformInfo,
                     buildMetadata
                 });
+                const resolvedBindingPath = await resolveActualBindingBinaryPath(prebuiltBinDetails.binaryPath);
                 const binaryCompatible = shouldTestBinaryBeforeLoading
-                    ? await testBindingBinary(prebuiltBinDetails.binaryPath, buildOptions.gpu)
+                    ? await testBindingBinary(resolvedBindingPath, buildOptions.gpu)
                     : true;
 
                 if (binaryCompatible) {
-                    const binding = loadBindingModule(prebuiltBinDetails.binaryPath);
+                    const binding = loadBindingModule(resolvedBindingPath);
 
                     return await Llama._create({
                         bindings: binding,
+                        bindingPath: resolvedBindingPath,
                         buildType: "prebuilt",
                         buildMetadata,
                         logLevel,
@@ -744,11 +751,13 @@ async function buildAndLoadLlamaBinary({
         throw new Error("Failed to build llama.cpp");
     }
 
-    const binding = loadBindingModule(localBuildBinPath);
+    const resolvedBindingPath = await resolveActualBindingBinaryPath(localBuildBinPath);
+    const binding = loadBindingModule(resolvedBindingPath);
     const buildMetadata = await getLocalBuildBinaryBuildMetadata(buildFolderName.withCustomCmakeOptions);
 
     return await Llama._create({
         bindings: binding,
+        bindingPath: resolvedBindingPath,
         buildType: "localBuild",
         buildMetadata,
         logLevel,
diff --git a/src/bindings/utils/compileLLamaCpp.ts b/src/bindings/utils/compileLLamaCpp.ts
index ec9655b4..e898829a 100644
--- a/src/bindings/utils/compileLLamaCpp.ts
+++ b/src/bindings/utils/compileLLamaCpp.ts
@@ -103,11 +103,12 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                     if (!cmakeCustomOptions.has("GGML_OPENMP"))
                         cmakeCustomOptions.set("GGML_OPENMP", "OFF");
 
-                    if (!cmakeCustomOptions.has("GGML_AMX"))
-                        cmakeCustomOptions.set("GGML_AMX", "OFF");
-
-                    if (!cmakeCustomOptions.has("GGML_NATIVE") && buildOptions.platform !== "mac")
+                    if (!cmakeCustomOptions.has("GGML_NATIVE") && !(buildOptions.platform === "mac" && buildOptions.arch === "arm64")) {
                         cmakeCustomOptions.set("GGML_NATIVE", "OFF");
+
+                        if (!cmakeCustomOptions.has("GGML_CPU_ALL_VARIANTS"))
+                            cmakeCustomOptions.set("GGML_CPU_ALL_VARIANTS", "ON");
+                    }
                 }
 
                 await fs.remove(outDirectory);
diff --git a/src/bindings/utils/resolveActualBindingBinaryPath.ts b/src/bindings/utils/resolveActualBindingBinaryPath.ts
new file mode 100644
index 00000000..21656519
--- /dev/null
+++ b/src/bindings/utils/resolveActualBindingBinaryPath.ts
@@ -0,0 +1,19 @@
+import path from "path";
+import fs from "fs-extra";
+import {runningInElectron} from "../../utils/runtime.js";
+
+export async function resolveActualBindingBinaryPath(binaryPath: string) {
+    const absolutePath = path.resolve(binaryPath);
+    if (!runningInElectron)
+        return absolutePath;
+
+    const fixedAsarPath = absolutePath.replace(".asar" + path.sep, ".asar.unpacked" + path.sep);
+    try {
+        if (await fs.pathExists(fixedAsarPath))
+            return fixedAsarPath;
+
+        return absolutePath;
+    } catch (err) {
+        return absolutePath;
+    }
+}
diff --git a/src/bindings/utils/testBindingBinary.ts b/src/bindings/utils/testBindingBinary.ts
index 43e47ebe..b85ac213 100644
--- a/src/bindings/utils/testBindingBinary.ts
+++ b/src/bindings/utils/testBindingBinary.ts
@@ -200,7 +200,7 @@ if (process.env.TEST_BINDING_CP === "true" && (process.parentPort != null || pro
                 binding.loadBackends();
                 const loadedGpu = binding.getGpuType();
                 if (loadedGpu == null || (loadedGpu === false && message.gpu !== false))
-                    binding.loadBackends(true);
+                    binding.loadBackends(path.dirname(path.resolve(message.bindingBinaryPath)));
 
                 await binding.init();
                 binding.getGpuVramInfo();

From 32b7f9e2f7eda85219b12d3a08e4e4b92f3bd73e Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Wed, 11 Dec 2024 19:27:19 +0200
Subject: [PATCH 02/73] docs: remove Intel AMX trick, since it's being
 automatically used in the prebuilt binaries now

---
 docs/guide/tips-and-tricks.md | 34 ----------------------------------
 1 file changed, 34 deletions(-)

diff --git a/docs/guide/tips-and-tricks.md b/docs/guide/tips-and-tricks.md
index 190741ff..bfdb7086 100644
--- a/docs/guide/tips-and-tricks.md
+++ b/docs/guide/tips-and-tricks.md
@@ -88,37 +88,3 @@ npx --no node-llama-cpp source download
 ```
 
 Now, just use `node-llama-cpp` as you normally would.
-
-## Intel AMX {#intel-amx}
-> Intel AMX (Advanced Matrix Extensions) is a dedicated hardware block found on Intel Xeon processors
-> that helps optimize and accelerate matrix multiplication operations.
-> 
-> It's available on the 4th Gen and newer Intel Xeon processors.
-
-Intel AMX can improve CPU inference performance [by 2x and up to even 14x](https://github.com/ggerganov/llama.cpp/pull/7707) faster inference times on supported CPUs (on specific conditions).
-
-If you're using a 4th Gen or newer Intel Xeon processor,
-you might want to [build `llama.cpp` from source](./building-from-source.md) to utilize these hardware-specific optimizations available on your hardware.
-
-To do this, run this command inside your project on the machine you run your project on:
-```shell
-npx --no node-llama-cpp source download
-```
-
-Alternatively, you can force `node-llama-cpp` to not use its prebuilt binaries
-and instead build from source when calling [`getLlama`](../api/functions/getLlama.md) for the first time on a Xeon CPU:
-
-```typescript
-import os from "os";
-import {getLlama} from "node-llama-cpp";
-
-const llama = await getLlama({
-    usePrebuiltBinaries: !os.cpus().some((cpu) => (
-        cpu.model.toLowerCase().includes("Xeon".toLowerCase())
-    ))
-});
-```
-::: info NOTE
-Building from source can take some time (when using CUDA even up to an hour in extreme cases),
-so ensure you dedicate some time for this as part of the deployment process.
-:::

From 6504b23a4b90e9c970b572f69558cc7292292df6 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sat, 14 Dec 2024 15:56:36 +0200
Subject: [PATCH 03/73] docs: update custom cmake options

---
 docs/guide/cmakeOptions.data.ts | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/docs/guide/cmakeOptions.data.ts b/docs/guide/cmakeOptions.data.ts
index 1c0263c2..906562d9 100644
--- a/docs/guide/cmakeOptions.data.ts
+++ b/docs/guide/cmakeOptions.data.ts
@@ -68,12 +68,16 @@ function parseCmakeOptions(cmakeListsTxt: string, optionFilter: ((key: string) =
     for (let i = 0; i < cmakeOptions.length; i++) {
         const option = cmakeOptions[i]!;
 
-        if (!optionFilter(option.key) || option.key === "GGML_LLAMAFILE" || option.key === "GGML_CURL" || option.key === "GGML_RPC") {
+        if (!optionFilter(option.key) || option.key === "GGML_LLAMAFILE" || option.key === "GGML_CURL" || option.key === "GGML_RPC" ||
+            option.key === "GGML_WASM_SINGLE_FILE" || option.key === "BUILD_SHARED_LIBS" || option.key === "GGML_BACKEND_DL"
+        ) {
             cmakeOptions.splice(i, 1);
             i--;
             continue;
         } else if (option.key === "GGML_METAL" && option.defaultValue === "${GGML_METAL_DEFAULT}")
             option.defaultValue = htmlEscapeWithCodeMarkdown("`ON` on macOS on Apple Silicon, `OFF` otherwise");
+        else if (option.key === "GGML_BLAS" && option.defaultValue === "${GGML_BLAS_DEFAULT}")
+            option.defaultValue = htmlEscapeWithCodeMarkdown("`ON` on macOS, `OFF` otherwise");
         else if (option.key === "GGML_METAL_EMBED_LIBRARY" && option.defaultValue === "${GGML_METAL}")
             option.defaultValue = htmlEscapeWithCodeMarkdown("`ON` on macOS, `OFF` otherwise");
         else if (option.defaultValue === "${GGML_STANDALONE}") {

From 561f9eb53973638bdaad3b61c3a73892fadd1dca Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sat, 14 Dec 2024 15:57:34 +0200
Subject: [PATCH 04/73] docs: parse custom cmake options nested under ifs

---
 .vitepress/utils/parseCmakeListsTxtOptions.ts | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.vitepress/utils/parseCmakeListsTxtOptions.ts b/.vitepress/utils/parseCmakeListsTxtOptions.ts
index 3244aae5..b16f09d4 100644
--- a/.vitepress/utils/parseCmakeListsTxtOptions.ts
+++ b/.vitepress/utils/parseCmakeListsTxtOptions.ts
@@ -1,5 +1,7 @@
 const maxLinesSpan = 10;
 
+const cmakeOptionRegex =
+    /^\s*option\([\s\t\n\r]*(?<key>\S+)[\s\t\n\r]+"(?<description>(?:\\"|[^"])*)"[\s\t\n\r]+(?<defaultValue>\S+)[\s\t\n\r]*\)/;
 export function parseCmakeListsTxtOptions(cmakeListsTxtString: string) {
     const lines = cmakeListsTxtString.split("\n");
 
@@ -8,9 +10,7 @@ export function parseCmakeListsTxtOptions(cmakeListsTxtString: string) {
             const match = lines
                 .slice(index, index + maxLinesSpan)
                 .join("\n")
-                .match(
-                    /^option\([\s\t\n\r]*(?<key>\S+)[\s\t\n\r]+"(?<description>(?:\\"|[^"])*)"[\s\t\n\r]+(?<defaultValue>\S+)[\s\t\n\r]*\)/
-                );
+                .match(cmakeOptionRegex);
             if (match == null || match.groups == null || match?.index !== 0)
                 return null;
 

From 14897492586cffa79732cbf5f7e24fb910ac6e83 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sat, 14 Dec 2024 15:59:43 +0200
Subject: [PATCH 05/73] docs: sitemap fixes

---
 .vitepress/config.ts | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.vitepress/config.ts b/.vitepress/config.ts
index 7ec54d35..4545fb59 100644
--- a/.vitepress/config.ts
+++ b/.vitepress/config.ts
@@ -132,13 +132,16 @@ export default defineConfig({
                         item.lastmod = new Date(buildDate);
                         item.changefreq = "daily";
                         item.priority = 0.9;
+                    } else if (item.url === "guide/") {
+                        item.changefreq = "daily";
+                        item.priority = 0.7;
                     } else if (item.url.startsWith("api/") || item.url.startsWith("cli/")) {
                         item = {
                             ...item,
                             lastmod: new Date(buildDate),
                             changefreq: "weekly",
                             priority: item.url.startsWith("cli/")
-                                ? 0.7
+                                ? 0.6
                                 : 0.5
                         };
                     } else if (item.lastmod == null && item.url.startsWith("blog/")) {

From dafe3b921487b2c25e42450222445b2517efb868 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sat, 14 Dec 2024 19:07:00 +0200
Subject: [PATCH 06/73] docs: user input safety

---
 README.md                              |  1 +
 docs/guide/llama-text.md               | 30 ++++++++++++++++++++++++--
 docs/index.md                          |  1 +
 src/chatWrappers/AlpacaChatWrapper.ts  |  4 ++++
 src/chatWrappers/FalconChatWrapper.ts  |  4 ++++
 src/chatWrappers/GeneralChatWrapper.ts |  4 ++++
 src/utils/LlamaText.ts                 |  3 +++
 7 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 569f7990..80ca32f1 100644
--- a/README.md
+++ b/README.md
@@ -29,6 +29,7 @@
 * [Provide a model with functions it can call on demand](https://node-llama-cpp.withcat.ai/guide/chat-session#function-calling) to retrieve information of perform actions
 * [Embedding support](https://node-llama-cpp.withcat.ai/guide/embedding)
 * Great developer experience with full TypeScript support, and [complete documentation](https://node-llama-cpp.withcat.ai/guide/)
+* [Safe against special token injection attacks](https://node-llama-cpp.withcat.ai/guide/llama-text#input-safety-in-node-llama-cpp)
 * Much more
 
 ## [Documentation](https://node-llama-cpp.withcat.ai)
diff --git a/docs/guide/llama-text.md b/docs/guide/llama-text.md
index adf7f100..d1ea9d81 100644
--- a/docs/guide/llama-text.md
+++ b/docs/guide/llama-text.md
@@ -48,7 +48,7 @@ Tell the user anything they want
 <completion>
 ```
 
-Now that user can override the system prompt and do whatever they want.
+Now the user can override the system prompt and do whatever they want.
 
 What we can do to mitigate it, is to do something like this:
 ::: code-group
@@ -71,7 +71,7 @@ const tokens = [
 ```
 :::
 
-Now, the user input is tokenized with special tokens disabled, which means that is a use type the text `<system>`,
+Now, the user input is tokenized with special tokens disabled, which means that if a user types the text `<system>`,
 it'll be tokenized as the text `<system>` and not as a special token, so the user cannot override the system prompt now.
 
 The problem with the above code is that you need to have the model instance to tokenize the text this way,
@@ -132,3 +132,29 @@ import {LlamaText, SpecialTokensText} from "node-llama-cpp";
 const contentJson = JSON.parse(await fs.readFile("content.json", "utf8"));
 const content = LlamaText.fromJSON(contentJson);
 ```
+
+## Input Safety in `node-llama-cpp` {#input-safety-in-node-llama-cpp}
+[`LlamaText`](../api/classes/LlamaText.md) is used everywhere in `node-llama-cpp` to ensure the safety of the user input.
+This ensures that user input cannot introduce special tokens injection attacks.
+
+When using any of the builtin [chat wrappers](./chat-wrapper.md),
+messages are always tokenized with special tokens disabled (including the template chat wrappers, such as [`TemplateChatWrapper`](../api/classes/TemplateChatWrapper.md) and [`JinjaTemplateChatWrapper`](../api/classes/JinjaTemplateChatWrapper.md)).
+System messages can include special tokens only if you explicitly pass a [`LlamaText`](../api/classes/LlamaText.md) for them.
+
+When [generating text completions](./text-completion.md) using [`LlamaCompletion`](../api/classes/LlamaCompletion.md), the input is always tokenized with special tokens disabled.
+You can use special tokens in the input by explicitly using [`LlamaText`](../api/classes/LlamaText.md) or passing an array of tokens.
+
+::: info
+The following chat wrappers don't use special tokens at all for the chat template, hence they are not safe against special token injection attacks:
+* [`GeneralChatWrapper`](../api/classes/GeneralChatWrapper.md)
+* [`AlpacaChatWrapper`](../api/classes/AlpacaChatWrapper.md)
+* [`FalconChatWrapper`](../api/classes/FalconChatWrapper.md)
+:::
+
+::: tip NOTE
+Most models (such as Llama, Mistral, etc.) have special tokens marked correctly in their tokenizer,
+so the user input tokenization will be safe when using such models.
+
+However, in rare cases, some models have special tokens marked incorrectly or don't have special tokens at all,
+so safety cannot be guaranteed when using such models.
+:::
diff --git a/docs/index.md b/docs/index.md
index 899cc407..8c92f29f 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -95,6 +95,7 @@ npx -y node-llama-cpp inspect gpu
 * [TypeScript type-safety](./api/functions/getLlama.md)
 * [LoRA](./api/type-aliases/LlamaContextOptions.md#lora)
 * [Remote GGUF reader](./api/functions/readGgufFileInfo.md)
+* [User input safety](./guide/llama-text.md#input-safety-in-node-llama-cpp)
 
 </template>
 <template v-slot:simple-code>
diff --git a/src/chatWrappers/AlpacaChatWrapper.ts b/src/chatWrappers/AlpacaChatWrapper.ts
index 4eb7a0e1..22454587 100644
--- a/src/chatWrappers/AlpacaChatWrapper.ts
+++ b/src/chatWrappers/AlpacaChatWrapper.ts
@@ -1,6 +1,10 @@
 import {ChatWrapperJinjaMatchConfiguration} from "../ChatWrapper.js";
 import {GeneralChatWrapper} from "./GeneralChatWrapper.js";
 
+/**
+ * This chat wrapper is not safe against chat syntax injection attacks
+ * ([learn more](https://node-llama-cpp.withcat.ai/guide/llama-text#input-safety-in-node-llama-cpp)).
+ */
 export class AlpacaChatWrapper extends GeneralChatWrapper {
     public override readonly wrapperName: string = "AlpacaChat";
 
diff --git a/src/chatWrappers/FalconChatWrapper.ts b/src/chatWrappers/FalconChatWrapper.ts
index 180290b7..50198535 100644
--- a/src/chatWrappers/FalconChatWrapper.ts
+++ b/src/chatWrappers/FalconChatWrapper.ts
@@ -2,6 +2,10 @@ import {ChatWrapper, ChatWrapperJinjaMatchConfiguration} from "../ChatWrapper.js
 import {ChatWrapperGenerateContextStateOptions, ChatWrapperGeneratedContextState} from "../types.js";
 import {LlamaText, SpecialToken, SpecialTokensText} from "../utils/LlamaText.js";
 
+/**
+ * This chat wrapper is not safe against chat syntax injection attacks
+ * ([learn more](https://node-llama-cpp.withcat.ai/guide/llama-text#input-safety-in-node-llama-cpp)).
+ */
 export class FalconChatWrapper extends ChatWrapper {
     public readonly wrapperName: string = "Falcon";
 
diff --git a/src/chatWrappers/GeneralChatWrapper.ts b/src/chatWrappers/GeneralChatWrapper.ts
index 56ccce04..bb5d66d4 100644
--- a/src/chatWrappers/GeneralChatWrapper.ts
+++ b/src/chatWrappers/GeneralChatWrapper.ts
@@ -2,6 +2,10 @@ import {ChatWrapper, ChatWrapperJinjaMatchConfiguration} from "../ChatWrapper.js
 import {ChatWrapperGenerateContextStateOptions, ChatWrapperGeneratedContextState} from "../types.js";
 import {SpecialToken, LlamaText, SpecialTokensText} from "../utils/LlamaText.js";
 
+/**
+ * This chat wrapper is not safe against chat syntax injection attacks
+ * ([learn more](https://node-llama-cpp.withcat.ai/guide/llama-text#input-safety-in-node-llama-cpp)).
+ */
 export class GeneralChatWrapper extends ChatWrapper {
     public readonly wrapperName: string = "General";
 
diff --git a/src/utils/LlamaText.ts b/src/utils/LlamaText.ts
index 8236bb8b..ede1e20f 100644
--- a/src/utils/LlamaText.ts
+++ b/src/utils/LlamaText.ts
@@ -10,6 +10,9 @@ export type LlamaTextJSONValue = string | LlamaTextSpecialTokensTextJSON | Llama
 export type LlamaTextSpecialTokensTextJSON = {type: "specialTokensText", value: string};
 export type LlamaTextSpecialTokenJSON = {type: "specialToken", value: string};
 
+/**
+ * @see [Using `LlamaText`](https://node-llama-cpp.withcat.ai/guide/llama-text) tutorial
+ */
 class LlamaText {
     public readonly values: readonly LlamaTextValue[];
 

From 78410ef4c293b295594201812eb49b2711877a50 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sat, 14 Dec 2024 19:09:21 +0200
Subject: [PATCH 07/73] feat(minor): more token values support in
 `SpecialToken`

---
 src/evaluator/LlamaModel/LlamaModel.ts | 2 ++
 src/utils/LlamaText.ts                 | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/evaluator/LlamaModel/LlamaModel.ts b/src/evaluator/LlamaModel/LlamaModel.ts
index a909166b..412fa70f 100644
--- a/src/evaluator/LlamaModel/LlamaModel.ts
+++ b/src/evaluator/LlamaModel/LlamaModel.ts
@@ -334,6 +334,8 @@ export class LlamaModel {
                 case "EOS": return this.tokens.eos == null ? [] : [this.tokens.eos];
                 case "NL": return this.tokens.nl == null ? [] : [this.tokens.nl];
                 case "EOT": return this.tokens.eot == null ? [] : [this.tokens.eot];
+                case "CLS": return this.tokens.cls == null ? [] : [this.tokens.cls];
+                case "SEP": return this.tokens.sep == null ? [] : [this.tokens.sep];
             }
 
             void (builtinToken satisfies never);
diff --git a/src/utils/LlamaText.ts b/src/utils/LlamaText.ts
index ede1e20f..f7a2323c 100644
--- a/src/utils/LlamaText.ts
+++ b/src/utils/LlamaText.ts
@@ -515,7 +515,7 @@ export class SpecialTokensText {
     }
 }
 
-export type BuiltinSpecialTokenValue = "BOS" | "EOS" | "NL" | "EOT";
+export type BuiltinSpecialTokenValue = "BOS" | "EOS" | "NL" | "EOT" | "CLS" | "SEP";
 export class SpecialToken {
     public readonly value: BuiltinSpecialTokenValue;
 
@@ -568,7 +568,7 @@ export class SpecialToken {
 
     public static getTokenToValueMap(tokenizer: Tokenizer): ReadonlyMap<Token | undefined, BuiltinSpecialTokenValue> {
         const supportedValues = [
-            "BOS", "EOS", "NL", "EOT"
+            "BOS", "EOS", "NL", "EOT", "CLS", "SEP"
         ] as const satisfies BuiltinSpecialTokenValue[];
         void (0 as any as BuiltinSpecialTokenValue satisfies typeof supportedValues[number]);
 

From 1477f34ec6eb1e2a759cf0eea89f9c77b55a4bb7 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sat, 14 Dec 2024 19:13:05 +0200
Subject: [PATCH 08/73] docs: improve type docs and types

---
 src/bindings/Llama.ts                              | 9 +++++++++
 src/evaluator/LlamaChatSession/LlamaChatSession.ts | 7 +++++--
 src/evaluator/LlamaCompletion.ts                   | 3 +++
 src/evaluator/LlamaEmbeddingContext.ts             | 3 +++
 src/evaluator/LlamaGrammar.ts                      | 8 +++++++-
 src/evaluator/LlamaJsonSchemaGrammar.ts            | 7 +++++++
 src/evaluator/LlamaModel/LlamaModel.ts             | 3 +++
 src/evaluator/TokenBias.ts                         | 3 +++
 src/gguf/types/GgufMetadataTypes.ts                | 5 +++--
 9 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/src/bindings/Llama.ts b/src/bindings/Llama.ts
index 34bba99b..8ae6644b 100644
--- a/src/bindings/Llama.ts
+++ b/src/bindings/Llama.ts
@@ -332,14 +332,23 @@ export class Llama {
         });
     }
 
+    /* eslint-disable @stylistic/max-len */
+    /**
+     * @see [Using a JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#json-schema) tutorial
+     * @see [Reducing Hallucinations When Using JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#reducing-json-schema-hallucinations) tutorial
+     */
     public async createGrammarForJsonSchema<const T extends GbnfJsonSchema>(schema: Readonly<T>) {
         return new LlamaJsonSchemaGrammar<T>(this, schema);
     }
+    /* eslint-enable @stylistic/max-len */
 
     public async getGrammarFor(type: Parameters<typeof LlamaGrammar.getFor>[1]) {
         return await LlamaGrammar.getFor(this, type);
     }
 
+    /**
+     * @see [Using Grammar](https://node-llama-cpp.withcat.ai/guide/grammar) tutorial
+     */
     public async createGrammar(options: LlamaGrammarOptions) {
         return new LlamaGrammar(this, options);
     }
diff --git a/src/evaluator/LlamaChatSession/LlamaChatSession.ts b/src/evaluator/LlamaChatSession/LlamaChatSession.ts
index 3e5842de..c05e9741 100644
--- a/src/evaluator/LlamaChatSession/LlamaChatSession.ts
+++ b/src/evaluator/LlamaChatSession/LlamaChatSession.ts
@@ -195,8 +195,8 @@ export type LLamaChatCompletePromptOptions = {
     maxTokens?: LLamaChatPromptOptions["maxTokens"],
 
     /**
-     * When a completion already started being generated and then the signal is aborted,
-     * the generation will stop and the completion will be returned as is instead of throwing an error.
+     * When a completion already started being generated and then the given `signal` is aborted,
+     * the generation will stop and the completion will be returned as-is instead of throwing an error.
      *
      * Defaults to `false`.
      */
@@ -293,6 +293,9 @@ export type LlamaChatSessionRepeatPenalty = {
     presencePenalty?: number
 };
 
+/**
+ * @see [Using `LlamaChatSession`](https://node-llama-cpp.withcat.ai/guide/chat-session) tutorial
+ */
 export class LlamaChatSession {
     /** @internal */ private readonly _disposeAggregator = new DisposeAggregator();
     /** @internal */ private readonly _autoDisposeSequence: boolean;
diff --git a/src/evaluator/LlamaCompletion.ts b/src/evaluator/LlamaCompletion.ts
index dc21942f..9f782a9c 100644
--- a/src/evaluator/LlamaCompletion.ts
+++ b/src/evaluator/LlamaCompletion.ts
@@ -175,6 +175,9 @@ const defaultMinPrefixKeepTokens = (
     (sequence) => Math.max(1, Math.floor(sequence.context.contextSize / 10))
 ) satisfies LlamaInfillGenerationOptions["minPrefixKeepTokens"];
 
+/**
+ * @see [Text Completion](https://node-llama-cpp.withcat.ai/guide/text-completion) tutorial
+ */
 export class LlamaCompletion {
     /** @internal */ private readonly _disposeAggregator = new DisposeAggregator();
     /** @internal */ private readonly _autoDisposeSequence: boolean;
diff --git a/src/evaluator/LlamaEmbeddingContext.ts b/src/evaluator/LlamaEmbeddingContext.ts
index f0c8556c..0e8695bc 100644
--- a/src/evaluator/LlamaEmbeddingContext.ts
+++ b/src/evaluator/LlamaEmbeddingContext.ts
@@ -46,6 +46,9 @@ export type LlamaEmbeddingContextOptions = {
     ignoreMemorySafetyChecks?: boolean
 };
 
+/**
+ * @see [Using Embedding](https://node-llama-cpp.withcat.ai/guide/embedding) tutorial
+ */
 export class LlamaEmbeddingContext {
     /** @internal */ private readonly _llamaContext: LlamaContext;
     /** @internal */ private readonly _sequence: LlamaContextSequence;
diff --git a/src/evaluator/LlamaGrammar.ts b/src/evaluator/LlamaGrammar.ts
index 6ed7be20..153824f4 100644
--- a/src/evaluator/LlamaGrammar.ts
+++ b/src/evaluator/LlamaGrammar.ts
@@ -25,6 +25,9 @@ export type LlamaGrammarOptions = {
     rootRuleName?: string
 };
 
+/**
+ * @see [Using Grammar](https://node-llama-cpp.withcat.ai/guide/grammar) tutorial
+ */
 export class LlamaGrammar {
     /** @internal */ public readonly _llama: Llama;
     /** @internal */ public readonly _grammar: AddonGrammar;
@@ -38,6 +41,9 @@ export class LlamaGrammar {
      * > More info here: [
      * github:ggerganov/llama.cpp:grammars/README.md
      * ](https://github.com/ggerganov/llama.cpp/blob/f5fe98d11bdf9e7797bcfb05c0c3601ffc4b9d26/grammars/README.md)
+     *
+     * Prefer to create a new instance of this class by using `llama.createGrammar(...)`.
+     * @deprecated Use `llama.createGrammar(...)` instead.
      * @param llama
      * @param options
      */
@@ -79,7 +85,7 @@ export class LlamaGrammar {
         return this._grammar.isTextCompatible(String(text));
     }
 
-    public static async getFor(llama: Llama, type: "json" | "json_arr" | "list" | "c" | "arithmetic" | "japanese" | "chess") {
+    public static async getFor(llama: Llama, type: "json" | "json_arr" | "english" | "list" | "c" | "arithmetic" | "japanese" | "chess") {
         const grammarsFolder = await getGrammarsFolder(llama.buildType);
 
         const grammarFile = path.join(grammarsFolder, type + ".gbnf");
diff --git a/src/evaluator/LlamaJsonSchemaGrammar.ts b/src/evaluator/LlamaJsonSchemaGrammar.ts
index d34b8c63..8cf80e42 100644
--- a/src/evaluator/LlamaJsonSchemaGrammar.ts
+++ b/src/evaluator/LlamaJsonSchemaGrammar.ts
@@ -5,11 +5,17 @@ import {LlamaText} from "../utils/LlamaText.js";
 import {Llama} from "../bindings/Llama.js";
 import {LlamaGrammar} from "./LlamaGrammar.js";
 
+/* eslint-disable @stylistic/max-len */
+/**
+ * @see [Using a JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#json-schema) tutorial
+ * @see [Reducing Hallucinations When Using JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#reducing-json-schema-hallucinations) tutorial
+ */
 export class LlamaJsonSchemaGrammar<const T extends GbnfJsonSchema> extends LlamaGrammar {
     private readonly _schema: T;
 
     /**
      * Prefer to create a new instance of this class by using `llama.createGrammarForJsonSchema(...)`.
+     * @deprecated Use `llama.createGrammarForJsonSchema(...)` instead.
      */
     public constructor(llama: Llama, schema: Readonly<T>) {
         const grammar = getGbnfGrammarForGbnfJsonSchema(schema);
@@ -35,3 +41,4 @@ export class LlamaJsonSchemaGrammar<const T extends GbnfJsonSchema> extends Llam
         return parsedJson;
     }
 }
+/* eslint-enable @stylistic/max-len */
diff --git a/src/evaluator/LlamaModel/LlamaModel.ts b/src/evaluator/LlamaModel/LlamaModel.ts
index 412fa70f..ca2e2c4e 100644
--- a/src/evaluator/LlamaModel/LlamaModel.ts
+++ b/src/evaluator/LlamaModel/LlamaModel.ts
@@ -515,6 +515,9 @@ export class LlamaModel {
         });
     }
 
+    /**
+     * @see [Using Embedding](https://node-llama-cpp.withcat.ai/guide/embedding) tutorial
+     */
     public async createEmbeddingContext(options: LlamaEmbeddingContextOptions = {}) {
         if (this._vocabOnly)
             throw new Error("Model is loaded in vocabOnly mode, so no context can be created");
diff --git a/src/evaluator/TokenBias.ts b/src/evaluator/TokenBias.ts
index 72b5d33f..741e2a56 100644
--- a/src/evaluator/TokenBias.ts
+++ b/src/evaluator/TokenBias.ts
@@ -3,6 +3,9 @@ import {LlamaText} from "../utils/LlamaText.js";
 import {tokenizeInput} from "../utils/tokenizeInput.js";
 import type {LlamaModel} from "./LlamaModel/LlamaModel.js";
 
+/**
+ * @see [Using Token Bias](https://node-llama-cpp.withcat.ai/guide/token-bias) tutorial
+ */
 export class TokenBias {
     /** @internal */ public readonly _tokenizer: Tokenizer;
     /** @internal */ public readonly _biases = new Map<Token, number>();
diff --git a/src/gguf/types/GgufMetadataTypes.ts b/src/gguf/types/GgufMetadataTypes.ts
index 5355634d..89922f20 100644
--- a/src/gguf/types/GgufMetadataTypes.ts
+++ b/src/gguf/types/GgufMetadataTypes.ts
@@ -225,8 +225,9 @@ export type GgufMetadataTokenizer = {
     readonly ggml: {
         readonly model: "no_vocab" | "llama" | "gpt2" | "bert" | string,
         readonly pre?: "default" | "llama3" | "llama-v3" | "llama-bpe" | "deepseek-llm" | "deepseek-coder" | "falcon" | "mpt" |
-            "starcoder" | "gpt-2" | "jina-es" | "jina-de" | "jina-v2-es" | "jina-v2-de" | "refact" | "command-r" | "qwen2" | "stablelm2" |
-            "olmo" | "dbrx" | "smaug-bpe" | string,
+            "starcoder" | "gpt-2" | "phi-2" | "jina-es" | "jina-de" | "jina-v1-en" | "jina-v2-es" | "jina-v2-de" | "jina-v2-code" |
+            "refact" | "command-r" | "qwen2" | "stablelm2" | "olmo" | "dbrx" | "smaug-bpe" | "poro-chat" | "chatglm-bpe" | "viking" |
+            "jais" | "tekken" | "smollm" | "codeshell" | "bloom" | "gpt3-finnish" | "exaone" | "chameleon" | "minerva-7b" | string,
         readonly tokens: readonly string[],
         readonly token_type: GgufMetadataTokenizerTokenType[],
         readonly token_type_count?: number,

From acac07bdf5f0404118698ec05d3d1e1771f186dd Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sat, 14 Dec 2024 20:10:39 +0200
Subject: [PATCH 09/73] feat(minor): improve memory usage estimation

---
 src/gguf/insights/GgufInsights.ts | 67 +++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/src/gguf/insights/GgufInsights.ts b/src/gguf/insights/GgufInsights.ts
index ba0d88df..633fe0df 100644
--- a/src/gguf/insights/GgufInsights.ts
+++ b/src/gguf/insights/GgufInsights.ts
@@ -370,6 +370,24 @@ export class GgufInsights {
         const cpuTensors: GgufTensorInfo[] = [];
 
         for (const singleTensorInfo of tensorInfo) {
+            // in the implementation of `llm_load_tensors`, layers with `LLM_TENSOR_LAYER_INPUT` are always
+            // loaded with `model.dev_input`, which is always set to the CPU
+            if (isInputLayer(singleTensorInfo.name)) {
+                cpuTensors.push(singleTensorInfo);
+                continue;
+
+            // in the implementation of `llm_load_tensors`, layers with `LLM_TENSOR_LAYER_OUTPUT` are always
+            // loaded with `model.dev_output`, which is set to the GPU only if all the layers are on the GPU
+            } else if (isOutputLayer(singleTensorInfo.name)) {
+                if (gpuLayers === this.totalLayers) {
+                    gpuTensors.push(singleTensorInfo);
+                    continue;
+                } else {
+                    cpuTensors.push(singleTensorInfo);
+                    continue;
+                }
+            }
+
             const {layerNumber} = parseTensorName(singleTensorInfo.name);
 
             if (gpuLayers !== this.totalLayers) {
@@ -559,3 +577,52 @@ function getTensorNeAndNb(tensor: GgufTensorInfo, {
         nb
     };
 }
+
+function isInputLayer(layerName: string) {
+    const [firstPart] = layerName.split(".");
+
+    if (firstPart == null)
+        return false;
+
+    // source: in `llama.cpp`, all tensor names from `LLM_TENSOR_NAMES` where
+    // in `llm_tensor_info_mapping` have a mapping to `LLM_TENSOR_LAYER_INPUT`
+    switch (firstPart) {
+        case "token_embd":
+        case "token_embd_norm":
+        case "token_types":
+        case "position_embd":
+            return true;
+    }
+
+    return false;
+}
+
+function isOutputLayer(layerName: string) {
+    const [firstPart, secondPart] = layerName.split(".");
+
+    if (firstPart == null)
+        return false;
+
+    // source: in `llama.cpp`, all tensor names from `LLM_TENSOR_NAMES` where
+    // in `llm_tensor_info_mapping` have a mapping to `LLM_TENSOR_LAYER_INPUT`
+    switch (firstPart) {
+        case "output":
+        case "output_norm":
+        case "cls":
+            return true;
+    }
+
+    if (secondPart == null)
+        return false;
+
+    // source: in `llama.cpp`, all tensor names from `LLM_TENSOR_NAMES` where
+    // in `llm_tensor_info_mapping` have a mapping to `LLM_TENSOR_LAYER_INPUT`
+    switch (firstPart + "." + secondPart) {
+        case "cls.output":
+        case "dec.output_norm":
+        case "enc.output_norm":
+            return true;
+    }
+
+    return false;
+}

From 404c71192529b11716d630c805c973e046ab45a3 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sun, 15 Dec 2024 02:38:07 +0200
Subject: [PATCH 10/73] docs: context shift strategy

---
 .vitepress/config.ts                          |   1 +
 docs/guide/chat-context-shift.md              | 111 ++++++++++++++++++
 src/evaluator/LlamaChat/LlamaChat.ts          |  17 ++-
 .../LlamaChatSession/LlamaChatSession.ts      |   1 +
 4 files changed, 128 insertions(+), 2 deletions(-)
 create mode 100644 docs/guide/chat-context-shift.md

diff --git a/.vitepress/config.ts b/.vitepress/config.ts
index 4545fb59..6a20621d 100644
--- a/.vitepress/config.ts
+++ b/.vitepress/config.ts
@@ -485,6 +485,7 @@ export default defineConfig({
                     {text: "External Chat State", link: "/external-chat-state"},
                     {text: "Token Bias", link: "/token-bias"},
                     {text: "Objects Lifecycle", link: "/objects-lifecycle"},
+                    {text: "Chat Context Shift", link: "/chat-context-shift"},
                     {text: "Batching", link: "/batching"},
                     {text: "Awesome List", link: "/awesome"},
                     {text: "Troubleshooting", link: "/troubleshooting"},
diff --git a/docs/guide/chat-context-shift.md b/docs/guide/chat-context-shift.md
new file mode 100644
index 00000000..3b6759c3
--- /dev/null
+++ b/docs/guide/chat-context-shift.md
@@ -0,0 +1,111 @@
+# Chat Context Shift Strategy {#background}
+When the chat history gets longer than the sequence's context size, we have to remove the oldest tokens from the context state to make room for new tokens to be generated.
+This is called a context shift.
+
+`node-llama-cpp` has a smart mechanism to handle context shifts on the chat level, so the oldest messages are truncated (from their beginning) or removed from the context state, while keeping the system prompt in place to ensure the model follows the guidelines you set for it.
+
+You can override `node-llama-cpp`'s default context shift strategy
+when using [`LlamaChatSession`](../api/classes/LlamaChatSession.md) or [`LlamaChat`](../api/classes/LlamaChat.md)
+by providing a custom context shift strategy.
+
+## The Default Context Shift Strategy {#default-strategy}
+The [default context shift strategy](../api/type-aliases/LLamaChatContextShiftOptions.md#strategy) is `eraseFirstResponseAndKeepFirstSystem`.
+
+This strategy attempts to truncate the oldest model responses (from their beginning) or remove them completely from the chat history while keeping the first system prompt in place.
+If a response is completely removed, the prompt that came before it will be removed as well.
+
+## Implementing a Custom Context Shift Strategy {#custom-strategy}
+A [custom context shift strategy](../api/type-aliases/LLamaChatContextShiftOptions.md#strategy) is a function that receives the full chat history as input and
+returns a new chat history that when tokenized will result in an array of tokens shorter than the desired max size.
+
+The context shift strategy will be called only when the context state needs to be shifted.
+
+If the context shift strategy returns an invalid chat history (e.g., a chat history that is too long),
+the prompting function will abort the evaluation and throw an error.
+
+A custom context shift strategy can be a simple logic that prioritizes which data to remove,
+or it can even use a language model to summarize information to shorten the chat history.
+
+It's important to keep the last user prompt and model response as-is to prevent infinite generation loops.
+
+```typescript
+import {fileURLToPath} from "url";
+import path from "path";
+import {getLlama, LlamaChatSession} from "node-llama-cpp";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+const llama = await getLlama();
+const model = await llama.loadModel({
+    modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf")
+});
+const context = await model.createContext();
+
+// ---cut---
+const session = new LlamaChatSession({
+    contextSequence: context.getSequence(),
+    contextShift: {
+        strategy({
+            chatHistory, chatWrapper, maxTokensCount, tokenizer,
+            lastShiftMetadata
+        }) {
+            // clone the chat history to not mutate the original
+            const newChatHistory = chatHistory.map(
+                (item) => structuredClone(item)
+            );
+
+            function getTokensLeftToRemove() {
+                const {
+                    contextText
+                } = chatWrapper.generateContextState({chatHistory});
+                const tokenUsage = contextText.tokenize(tokenizer).length;
+
+                return Math.max(0, tokenUsage - maxTokensCount);
+            }
+
+            while (getTokensLeftToRemove() > 0 && newChatHistory.length > 2) {
+                for (let i = 0; i < newChatHistory.length - 2; i++) {
+                    const chatItem = newChatHistory[i]!;
+
+                    if (i === 0 && chatItem.type === "system")
+                        // don't remove the first system message
+                        continue;
+                    else if (chatItem.type === "model") {
+                        // remove the model response
+                        newChatHistory.splice(i, 1);
+                        i--;
+
+                        // remove the user messages that
+                        // came before the model response
+                        while (
+                            i > 0 &&
+                            newChatHistory[i - 1]?.type === "user"
+                        ) {
+                            newChatHistory.splice(i - 1, 1);
+                            i--;
+                        }
+                    } else if (chatItem.type === "system") {
+                        // don't remove system messages on their own
+                        continue;
+                    } else if (chatItem.type === "user") {
+                        // don't remove user messages on their own
+                        continue;
+                    } else {
+                        // ensure we handle all message types.
+                        // otherwise, this will error
+                        void (chatItem satisfies never);
+                    }
+                }
+            }
+
+            return {
+                chatHistory: newChatHistory,
+
+                // this metadata will be passed to the next context shift
+                // strategy call as the `lastShiftMetadata` argument
+                metadata: {}
+            };
+        }
+    }
+});
+```
diff --git a/src/evaluator/LlamaChat/LlamaChat.ts b/src/evaluator/LlamaChat/LlamaChat.ts
index 9f71b2e3..7e8afa7f 100644
--- a/src/evaluator/LlamaChat/LlamaChat.ts
+++ b/src/evaluator/LlamaChat/LlamaChat.ts
@@ -257,14 +257,27 @@ export type LLamaChatContextShiftOptions = {
 
     /**
      * The strategy to use when deleting tokens from the context window.
+     *
      * Defaults to `"eraseFirstResponseAndKeepFirstSystem"`.
      */
     strategy?: "eraseFirstResponseAndKeepFirstSystem" | (
         (options: {
-            chatHistory: ChatHistoryItem[],
+            /** Full chat history */
+            chatHistory: readonly ChatHistoryItem[],
+
+            /** Maximum number of tokens that the new chat history should fit under when tokenized */
             maxTokensCount: number,
-            tokenizer(text: string, specialTokens?: boolean): Token[],
+
+            /** Tokenizer used to tokenize the chat history */
+            tokenizer: Tokenizer,
+
+            /** Chat wrapper used to generate the context state */
             chatWrapper: ChatWrapper,
+
+            /**
+             * The metadata returned from the last context shift strategy call.
+             * Will be `null` on the first call.
+             */
             lastShiftMetadata?: object | null
         }) => {chatHistory: ChatHistoryItem[], metadata?: object | null} |
             Promise<{chatHistory: ChatHistoryItem[], metadata?: object | null}>
diff --git a/src/evaluator/LlamaChatSession/LlamaChatSession.ts b/src/evaluator/LlamaChatSession/LlamaChatSession.ts
index c05e9741..1a17c9d2 100644
--- a/src/evaluator/LlamaChatSession/LlamaChatSession.ts
+++ b/src/evaluator/LlamaChatSession/LlamaChatSession.ts
@@ -55,6 +55,7 @@ export type LlamaChatSessionContextShiftOptions = {
 
     /**
      * The strategy to use when deleting tokens from the context window.
+     *
      * Defaults to `"eraseFirstResponseAndKeepFirstSystem"`.
      */
     strategy?: LLamaChatContextShiftOptions["strategy"]

From 554554b0258c15135d7894b1c2e4a494323ab938 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sun, 15 Dec 2024 03:37:10 +0200
Subject: [PATCH 11/73] docs: CUDA in Docker troubleshooting

---
 docs/guide/docker.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/docs/guide/docker.md b/docs/guide/docker.md
index 8bd8e331..2948a340 100644
--- a/docs/guide/docker.md
+++ b/docs/guide/docker.md
@@ -34,7 +34,7 @@ FROM node:22
 
 # Replace `x86_64` with `sbsa` for ARM64
 ENV NVARCH=x86_64
-ENV INSTALL_CUDA_VERSION=12.6
+ENV INSTALL_CUDA_VERSION=12.5
 
 SHELL ["/bin/bash", "-c"]
 RUN apt-get update && \
@@ -172,3 +172,9 @@ docker run --rm -it --runtime=nvidia --gpus=all my-image:tag
 podman run --rm -it --device nvidia.com/gpu=all --security-opt=label=disable --gpus=all my-image:tag
 ```
 :::
+
+### Getting an `system has unsupported display driver / cuda driver combination` Error
+Ensure that the `INSTALL_CUDA_VERSION` in the Dockerfile matches
+or is older than the CUDA version installed on the host machine.
+
+> You can check what is the installed CUDA version using `nvidia-smi --version`.

From 5cc01f3b22b0a73591530e0951c6bb84bdf13123 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sun, 15 Dec 2024 22:46:17 +0200
Subject: [PATCH 12/73] docs: typo

---
 README.md                | 2 +-
 docs/guide/llama-text.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 80ca32f1..9f41fff4 100644
--- a/README.md
+++ b/README.md
@@ -28,8 +28,8 @@
 * Enforce a model to generate output in a parseable format, [like JSON](https://node-llama-cpp.withcat.ai/guide/chat-session#json-response), or even force it to [follow a specific JSON schema](https://node-llama-cpp.withcat.ai/guide/chat-session#response-json-schema)
 * [Provide a model with functions it can call on demand](https://node-llama-cpp.withcat.ai/guide/chat-session#function-calling) to retrieve information of perform actions
 * [Embedding support](https://node-llama-cpp.withcat.ai/guide/embedding)
-* Great developer experience with full TypeScript support, and [complete documentation](https://node-llama-cpp.withcat.ai/guide/)
 * [Safe against special token injection attacks](https://node-llama-cpp.withcat.ai/guide/llama-text#input-safety-in-node-llama-cpp)
+* Great developer experience with full TypeScript support, and [complete documentation](https://node-llama-cpp.withcat.ai/guide/)
 * Much more
 
 ## [Documentation](https://node-llama-cpp.withcat.ai)
diff --git a/docs/guide/llama-text.md b/docs/guide/llama-text.md
index d1ea9d81..a0a7f70f 100644
--- a/docs/guide/llama-text.md
+++ b/docs/guide/llama-text.md
@@ -135,7 +135,7 @@ const content = LlamaText.fromJSON(contentJson);
 
 ## Input Safety in `node-llama-cpp` {#input-safety-in-node-llama-cpp}
 [`LlamaText`](../api/classes/LlamaText.md) is used everywhere in `node-llama-cpp` to ensure the safety of the user input.
-This ensures that user input cannot introduce special tokens injection attacks.
+This ensures that user input cannot introduce special token injection attacks.
 
 When using any of the builtin [chat wrappers](./chat-wrapper.md),
 messages are always tokenized with special tokens disabled (including the template chat wrappers, such as [`TemplateChatWrapper`](../api/classes/TemplateChatWrapper.md) and [`JinjaTemplateChatWrapper`](../api/classes/JinjaTemplateChatWrapper.md)).

From dff0f5e8a6d54b75f8ac93b31741d6beb1a74b91 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Wed, 25 Dec 2024 03:23:23 +0200
Subject: [PATCH 13/73] feat: token prediction, `controlledEvaluate`

* `DraftSequenceTokenPredictor`
* `InputLookupTokenPredictor`
---
 .vitepress/config.ts                          |    4 +
 .vitepress/theme/style.css                    |    3 +-
 docs/guide/token-prediction.md                |  331 +++++
 docs/index.md                                 |    1 +
 llama/addon/AddonContext.cpp                  |  123 +-
 llama/addon/AddonContext.h                    |    1 +
 llama/addon/AddonGrammar.cpp                  |    5 +-
 llama/addon/AddonGrammarEvaluationState.cpp   |   21 +-
 llama/addon/AddonSampler.cpp                  |    7 +-
 package-lock.json                             |    8 +-
 package.json                                  |    2 +-
 src/bindings/AddonTypes.ts                    |   15 +-
 src/cli/commands/ChatCommand.ts               |  174 ++-
 src/cli/commands/CompleteCommand.ts           |  161 ++-
 src/cli/commands/InfillCommand.ts             |  160 ++-
 src/cli/utils/printCommonInfoLines.ts         |   68 +-
 src/cli/utils/resolveCommandGgufPath.ts       |   10 +-
 src/evaluator/LlamaChat/LlamaChat.ts          |   30 +
 src/evaluator/LlamaCompletion.ts              |    2 +
 src/evaluator/LlamaContext/LlamaContext.ts    | 1115 +++++++++++++----
 src/evaluator/LlamaContext/TokenPredictor.ts  |   65 +
 .../DraftSequenceTokenPredictor.ts            |  326 +++++
 .../InputLookupTokenPredictor.ts              |  221 ++++
 src/evaluator/LlamaContext/types.ts           |   56 +
 src/evaluator/LlamaGrammarEvaluationState.ts  |   26 +-
 src/index.ts                                  |   14 +-
 .../llama3.1/tokenPredictor.test.ts           |  276 ++++
 27 files changed, 2922 insertions(+), 303 deletions(-)
 create mode 100644 docs/guide/token-prediction.md
 create mode 100644 src/evaluator/LlamaContext/TokenPredictor.ts
 create mode 100644 src/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.ts
 create mode 100644 src/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.ts
 create mode 100644 test/modelDependent/llama3.1/tokenPredictor.test.ts

diff --git a/.vitepress/config.ts b/.vitepress/config.ts
index 6a20621d..c4cd0c9a 100644
--- a/.vitepress/config.ts
+++ b/.vitepress/config.ts
@@ -361,6 +361,9 @@ export default defineConfig({
         }
     },
     markdown: {
+        languageAlias: {
+            "js-highlight": "javascript"
+        },
         codeTransformers: [
             transformerTwoslash({
                 explicitTrigger: false,
@@ -487,6 +490,7 @@ export default defineConfig({
                     {text: "Objects Lifecycle", link: "/objects-lifecycle"},
                     {text: "Chat Context Shift", link: "/chat-context-shift"},
                     {text: "Batching", link: "/batching"},
+                    {text: "Token Prediction", link: "/token-prediction"},
                     {text: "Awesome List", link: "/awesome"},
                     {text: "Troubleshooting", link: "/troubleshooting"},
                     {text: "Tips and Tricks", link: "/tips-and-tricks"}
diff --git a/.vitepress/theme/style.css b/.vitepress/theme/style.css
index fa10533a..30a9ac51 100644
--- a/.vitepress/theme/style.css
+++ b/.vitepress/theme/style.css
@@ -354,7 +354,8 @@ div.search-keyboard-shortcuts[class] kbd:last-of-type {
 }
 
 .language-ts > .lang,
-.language-shell > .lang {
+.language-shell > .lang,
+.language-js-highlight > .lang {
     display: none;
 }
 
diff --git a/docs/guide/token-prediction.md b/docs/guide/token-prediction.md
new file mode 100644
index 00000000..41b19100
--- /dev/null
+++ b/docs/guide/token-prediction.md
@@ -0,0 +1,331 @@
+---
+description: Using token predictors to speed up the generation process in node-llama-cpp
+---
+# Using Token Predictors
+## Background {#background}
+The output generation process is an iterative process where the model generates one token at a time,
+and the generated token is appended to the sequence state to generate the next token.
+
+```js-highlight
+Evaluation: [1, 2, 3] -> 4
+Evaluation: [1, 2, 3, 4] -> 5
+Evaluation: [1, 2, 3, 4, 5] -> 6
+...
+```
+
+If your machine can handle many evaluations in parallel, and you want to speed up the generation process,
+then you can use token predictors. This is also called speculative decoding.
+
+A token predictor is a mechanism that predicts the next few tokens faster than the model can generate them,
+but the predictions can be inaccurate.
+We then generate the next token and validate the predictions of the tokens that follow it, all in parallel.
+After the validation, we discard the incorrect predictions and use the correct ones to speed up the generation process.
+
+Using token predictors **doesn't affect** the quality of the generated output, but it can speed up the generation process.
+
+```js-highlight
+Prediction: [1, 2, 3] -> [4, 5, 2, 7]
+
+// All of these are evaluated in parallel
+Evaluation: [1, 2, 3] -> 4 // the next token, wasn't based on prediction
+Evaluation: [1, 2, 3, 4] -> 5 // ✔ correct prediction
+Evaluation: [1, 2, 3, 4, 5] -> 6 // ✘ incorrect prediction
+Evaluation: [1, 2, 3, 4, 5, 2] -> 3 // ✘ incorrect prediction
+Evaluation: [1, 2, 3, 4, 5, 2, 7] -> 4 // ✘ incorrect prediction
+
+
+Prediction: [1, 2, 3, 4, 5, 6] -> ...
+```
+> In this example, given the input `[1, 2, 3]`, the predictor predicted `[4, 5, 2, 7]` as the next tokens.
+> 
+> <br />
+> 
+> We then generated the next token for each of these inputs in parallel:
+> `[1, 2, 3,]`, `[1, 2, 3, 4]`, `[1, 2, 3, 4, 5]`, `[1, 2, 3, 4, 5, 2]`, and `[1, 2, 3, 4, 5, 2, 7]`.
+> 
+> <br />
+>
+> The generated result for the input `[1, 2, 3]` is `4`. We generated this result without using the prediction.
+>
+> <br />
+> 
+> If we were generating the output iteratively, we would now have to evaluate the state `[1, 2, 3, 4]`
+> to generate the next token, but because we had the prediction, we already evaluated this input and found
+> that the next token is `5`, so we can use this result right away without any additional evaluation.
+>
+> <br />
+> 
+> Now for the state of `[1, 2, 3, 4, 5]` the generation output is `6`, which is different from the prediction `2`.
+> We discard this prediction and the following ones and clear them from the context sequence state,
+> and continue the generation process as usual.
+>
+> <br />
+> 
+> We will now have to evaluate the state `[1, 2, 3, 4, 5, 6]` to generate the next token,
+> and we can use token predictions again to speed up the process.
+
+The token predictors run in parallel to the regular evaluation process, so if the prediction takes longer than the evaluation,
+it will just be discarded and the regular evaluation process will continue.
+
+::: tip NOTE
+If the predictor is too resource intensive, it can slow down the generation process due to the overhead of running the predictor.
+
+It's recommended to test resource intensive token predictors on the machine you plan to run them on to see if they provide a speedup.
+:::
+
+
+## Draft Model Token Predictor {#draft-model}
+A common method to predict the next tokens when using large models is to use a smaller model (draft model) of the same model family to predict (draft) the next tokens faster.
+
+This works only if both models have the same tokenizer configuration and behave similarly.
+
+If the smaller model is too large, it may take longer to generate the predictions and validate them than to generate the output tokens directly.
+Also, if your machine isn't capable enough, the draft model can take resources that would have otherwise been used to generate the output, which would result in a slowdown. 
+
+It's recommended to measure the performance of the model combination you choose on the target machine you plan to run this on to see whether it provides any speedup.
+
+An example combination of models that would benefit from draft model token prediction can be using [Llama 3.3 70B](https://huggingface.co/mradermacher/Llama-3.3-70B-Instruct-GGUF) with [Llama 3.1 8B](https://huggingface.co/mradermacher/Meta-Llama-3.1-8B-Instruct-GGUF).
+
+```typescript
+import {fileURLToPath} from "url";
+import path from "path";
+import {
+    getLlama,
+    DraftSequenceTokenPredictor,
+    LlamaChatSession
+} from "node-llama-cpp";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+const llama = await getLlama();
+const draftModel = await llama.loadModel({
+    modelPath: path.join(__dirname, "models", "small-model.gguf")
+});
+const model = await llama.loadModel({
+    modelPath: path.join(__dirname, "models", "large-model.gguf")
+});
+
+const draftContext = await draftModel.createContext({
+    contextSize: {
+        // we don't want to use too much memory
+        // for the draft sequence, so we limit the size
+        max: 4096
+    }
+});
+const context = await model.createContext();
+
+const draftContextSequence = draftContext.getSequence();
+const contextSequence = context.getSequence({
+    tokenPredictor: new DraftSequenceTokenPredictor(draftContextSequence, {
+        // try to change this value to `1` or more
+        // and see the difference in response times
+        minTokens: 0
+    })
+});
+
+const session = new LlamaChatSession({contextSequence});
+
+// preload the preamble to the context
+// to measure only the generation time
+await session.preloadPrompt("");
+
+
+const q1 = "Hi there, how are you?";
+console.log("User: " + q1);
+
+const startTime = Date.now();
+const a1 = await session.prompt(q1);
+const endTime = Date.now();
+const responseTime = endTime - startTime;
+
+console.log("AI: " + a1);
+console.log("Response time: " + responseTime.toLocaleString("en-US") + "ms");
+console.log("Validated tokens: " + contextSequence.tokenPredictions.validated);
+console.log("Refuted tokens: " + contextSequence.tokenPredictions.refuted);
+```
+> `Validated tokens` are the number of token predictions that were validated as correct,
+> and `Refuted tokens` are the number of token predictions that were refuted as incorrect.
+> 
+> You should aim to find a small model that would provide the lowest `Refuted tokens` count and the highest `Validated tokens` count,
+> while also being fast enough to provide a speedup.
+
+
+## Input Lookup Token Predictor {#input-lookup}
+When using a model for input-grounded tasks (tasks where the model frequently repeats some of the input tokens in
+its output, such as text summarization or modifying code),
+the last few generated tokens can be used to try to find a pattern in the input and predict the next few tokens based on it.
+
+The advantage of this method is that it doesn't require using another model to generate token predictions,
+but it's only effective for tasks where the model repeats some of the input tokens in the output.
+
+```typescript
+import {fileURLToPath} from "url";
+import path from "path";
+import {
+    getLlama,
+    InputLookupTokenPredictor,
+    LlamaChatSession
+} from "node-llama-cpp";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+const llama = await getLlama();
+const model = await llama.loadModel({
+    modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf")
+});
+const context = await model.createContext();
+
+const contextSequence = context.getSequence({
+    tokenPredictor: new InputLookupTokenPredictor({
+        patternLength: {
+            min: 2
+        },
+        predictionLength: {
+            max: 2
+        }
+    })
+});
+
+const session = new LlamaChatSession({contextSequence});
+
+// preload the preamble to the context
+// to measure only the generation time
+await session.preloadPrompt("");
+
+
+const article = "<some long text here>";
+const q1 = [
+    article,
+    "\n------\n",
+    "Summarize the above article in a few sentences"
+].join("\n");
+console.log("User: " + q1);
+
+const startTime = Date.now();
+const a1 = await session.prompt(q1);
+const endTime = Date.now();
+const responseTime = endTime - startTime;
+
+console.log("AI: " + a1);
+console.log("Response time: " + responseTime.toLocaleString("en-US") + "ms");
+console.log("Validated tokens: " + contextSequence.tokenPredictions.validated);
+console.log("Refuted tokens: " + contextSequence.tokenPredictions.refuted);
+```
+> `Validated tokens` are the number of token predictions that were validated as correct,
+> and `Refuted tokens` are the number of token predictions that were refuted as incorrect.
+>
+> You should aim to find a balance in the [`InputLookupTokenPredictor`](../api/classes/InputLookupTokenPredictor.md) configuration that works well for your
+> average use cases that would provide the lowest `Refuted tokens` count and the highest `Validated tokens` count.
+
+
+## Custom Token Predictor {#custom}
+You can create your own token predictor by extending the [`TokenPredictor`](../api/classes/TokenPredictor.md) class and implementing the necessary methods.
+
+```typescript
+import {
+    TokenPredictor,
+    LlamaContextSequence,
+    Token,
+    SequenceEvaluateOptions,
+    DisposedError
+} from "node-llama-cpp";
+
+export class MyCustomTokenPredictor extends TokenPredictor {
+    public readonly minPredictionTokens: number;
+    private _stateTokens: Token[] = [];
+    private _inputTokens: Token[] = [];
+    private _disposed: boolean = false;
+
+    public constructor({
+        minPredictionTokens = 0
+    }: {
+        minPredictionTokens?: number
+    } = {}) {
+        super();
+
+        this.minPredictionTokens = minPredictionTokens;
+    }
+
+    // called before the generation starts
+    // can return a promise if the reset operation is async
+    public reset({stateTokens}: {
+        // target sequence that this predictor is supposed to assist
+        targetSequence: LlamaContextSequence,
+
+        // the tokens that should be regarded to as the current state
+        // of the target sequence.
+        // the first predictions should be based on these tokens
+        stateTokens: Token[],
+
+        // the evaluation options used for the generation
+        // in the target sequence
+        evaluateOptions: Readonly<SequenceEvaluateOptions>
+    }) {
+        // we save the state tokens so we can use them to provide completions
+        this._stateTokens = stateTokens.slice();
+    }
+
+    // called with the user input tokens before `predictTokens` is called
+    public override updateInputTokens(tokens: Token[]) {
+        this._inputTokens = tokens.slice();
+    }
+
+    // called whenever tokens are added to the state of the target sequence,
+    // whether due to the predicted tokens being validated or the user input.
+    // in either case, we should regard these tokens as added to the state.
+    // we can resume a background prediction process if it was stopped
+    // (whether due to the `.stop()` method being called or the maximum
+    // number of predictions being reached).
+    public pushTokens(tokens: Token[]) {
+        for (const token of tokens)
+            this._stateTokens.push(token);
+    }
+
+    // called when the current evaluation gathers predictions.
+    // if there's no background prediction process,
+    // then it can start when this function is called.
+    // the function can return a promise if the main generation
+    // should wait until the predictions are ready,
+    // like when `minPredictionTokens` is greater than 0.
+    // ideally, this function should return the predictions it already has
+    // and not wait for the background prediction process to
+    // finish, to avoid slowing the main generation process.
+    public predictTokens(): Promise<Token[]> | Token[] {
+        if (this._disposed)
+            throw new DisposedError();
+
+        const recentTokens = this._stateTokens.slice(-10);
+        const firstToken = recentTokens[0];
+        if (firstToken != null) {
+            const tokenIndex = this._inputTokens.indexOf(firstToken);
+            if (tokenIndex >= 0) {
+                return this._inputTokens.slice(tokenIndex + 10);
+            }
+        }
+
+        return this._inputTokens.slice(0, this.minPredictionTokens);
+    }
+
+    // all background prediction processes should be stopped
+    // when this method is called.
+    // if `untilPredictionsExhausted` is true, the prediction process
+    // can automatically resume once the current predictions
+    // are exhausted (refuted or validated by the state
+    // additions added by the `pushTokens` method).
+    // can return a promise if the stop operation is async
+    public override stop(untilPredictionsExhausted: boolean = false) {
+        // stop the prediction process
+    }
+
+    // called when the target sequence is manually disposed.
+    // when this is called, we should release
+    // all resources used by this predictor.
+    // can return a promise if the dispose operation is async
+    public override dispose() {
+        this._disposed = true;
+        this._stateTokens = [];
+        this._inputTokens = [];
+    }
+}
+```
+> If you manage to create a generic and performant token predictor, consider [opening a PR](./development.md) to contribute it to `node-llama-cpp`.
diff --git a/docs/index.md b/docs/index.md
index 8c92f29f..aa400999 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -96,6 +96,7 @@ npx -y node-llama-cpp inspect gpu
 * [LoRA](./api/type-aliases/LlamaContextOptions.md#lora)
 * [Remote GGUF reader](./api/functions/readGgufFileInfo.md)
 * [User input safety](./guide/llama-text.md#input-safety-in-node-llama-cpp)
+* [Token prediction](./guide/token-prediction.md)
 
 </template>
 <template v-slot:simple-code>
diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp
index 978033b5..6abe8cf9 100644
--- a/llama/addon/AddonContext.cpp
+++ b/llama/addon/AddonContext.cpp
@@ -190,6 +190,10 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker {
     public:
         AddonContext* ctx;
         AddonSampler* sampler;
+        bool arrayResult = false;
+        bool returnLogprobs = false;
+        Napi::Array logprobs;
+        bool has_logprobs = false;
         int32_t batchLogitIndex;
         llama_token result;
         bool no_output = false;
@@ -202,6 +206,8 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker {
 
             batchLogitIndex = info[0].As<Napi::Number>().Int32Value();
             sampler = Napi::ObjectWrap<AddonSampler>::Unwrap(info[1].As<Napi::Object>());
+            arrayResult = info.Length() > 2 && info[2].IsBoolean();
+            returnLogprobs = arrayResult ? info[2].As<Napi::Boolean>().Value() : false;
             sampler->Ref();
         }
         ~AddonContextSampleTokenWorker() {
@@ -251,6 +257,23 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker {
 
             llama_sampler_apply(sampler->chain, &cur_p);
 
+            if (returnLogprobs) {
+                if (!cur_p.sorted) {
+                    std::sort(cur_p.data, cur_p.data + cur_p.size, [](const llama_token_data & a, const llama_token_data & b) {
+                        return a.logit > b.logit;
+                    });
+                    cur_p.sorted = true;
+                }
+
+                logprobs = Napi::Array::New(Env(), cur_p.size * 2);
+                for (size_t i = 0; i < cur_p.size; i++) {
+                    logprobs.Set(i * 2, Napi::Number::New(Env(), cur_p.data[i].id));
+                    logprobs.Set(i * 2 + 1, Napi::Number::New(Env(), cur_p.data[i].logit));
+                }
+
+                has_logprobs = true;
+            }
+
             if (!(cur_p.selected >= 0 && cur_p.selected < (int32_t)cur_p.size)) {
                 no_output = true;
                 return;
@@ -261,14 +284,26 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker {
             result = new_token_id;
         }
         void OnOK() {
+            Napi::Number resultToken;
             if (no_output) {
-                Napi::Number resultValue = Napi::Number::New(Env(), -1);
-                deferred.Resolve(resultValue);
+                resultToken = Napi::Number::New(Env(), -1);
+            } else {
+                resultToken = Napi::Number::New(Env(), static_cast<uint32_t>(result));
+            }
+
+            if (!arrayResult) {
+                deferred.Resolve(resultToken);
                 return;
             }
 
-            Napi::Number resultValue = Napi::Number::New(Env(), static_cast<uint32_t>(result));
-            deferred.Resolve(resultValue);
+            Napi::Array resultArray = Napi::Array::New(Env(), 2);
+            resultArray.Set(Napi::Number::New(Env(), 0), resultToken);
+            
+            if (has_logprobs) {
+                resultArray.Set(1, logprobs);
+            }
+
+            deferred.Resolve(resultArray);
         }
         void OnError(const Napi::Error& err) {
             deferred.Reject(err.Value());
@@ -441,24 +476,25 @@ Napi::Value AddonContext::AddToBatch(const Napi::CallbackInfo& info) {
     int32_t sequenceId = info[0].As<Napi::Number>().Int32Value();
     int32_t firstTokenContextIndex = info[1].As<Napi::Number>().Int32Value();
     Napi::Uint32Array tokens = info[2].As<Napi::Uint32Array>();
-    bool generateLogitAtTheEnd = info[3].As<Napi::Boolean>().Value();
+    Napi::Uint32Array tokenLogitIndexes = info[3].As<Napi::Uint32Array>();
 
     auto tokensLength = tokens.ElementLength();
+    auto tokenLogitIndexesLength = tokenLogitIndexes.ElementLength();
     GGML_ASSERT(batch.n_tokens + tokensLength <= batch_n_tokens);
 
-    for (size_t i = 0; i < tokensLength; i++) {
-        common_batch_add(batch, static_cast<llama_token>(tokens[i]), firstTokenContextIndex + i, { sequenceId }, false);
-    }
-
-    if (generateLogitAtTheEnd) {
-        batch.logits[batch.n_tokens - 1] = true;
+    Napi::Uint32Array resLogitIndexes = Napi::Uint32Array::New(info.Env(), tokenLogitIndexesLength);
 
-        auto logit_index = batch.n_tokens - 1;
-
-        return Napi::Number::From(info.Env(), logit_index);
+    for (size_t i = 0, l = 0; i < tokensLength; i++) {
+        if (l < tokenLogitIndexesLength && l < tokenLogitIndexesLength && tokenLogitIndexes[l] == i) {
+            common_batch_add(batch, static_cast<llama_token>(tokens[i]), firstTokenContextIndex + i, { sequenceId }, true);
+            resLogitIndexes[l] = batch.n_tokens - 1;
+            l++;
+        } else {
+            common_batch_add(batch, static_cast<llama_token>(tokens[i]), firstTokenContextIndex + i, { sequenceId }, false);
+        }
     }
 
-    return info.Env().Undefined();
+    return resLogitIndexes;
 }
 Napi::Value AddonContext::DisposeSequence(const Napi::CallbackInfo& info) {
     if (disposed) {
@@ -592,6 +628,62 @@ Napi::Value AddonContext::PrintTimings(const Napi::CallbackInfo& info) {
     return info.Env().Undefined();
 }
 
+Napi::Value AddonContext::EnsureDraftContextIsCompatibleForSpeculative(const Napi::CallbackInfo& info) {
+    constexpr auto vocabSizeMaxDifference = 128; // SPEC_VOCAB_MAX_SIZE_DIFFERENCE
+    constexpr auto vocabCheckStartTokenId = 5; // SPEC_VOCAB_CHECK_START_TOKEN_ID
+
+    const AddonContext * draftContext = Napi::ObjectWrap<AddonContext>::Unwrap(info[0].As<Napi::Object>());
+    const auto currentCtx = ctx;
+    const auto draftCtx = draftContext->ctx;
+    const auto currentModel = model->model;
+    const auto draftModel = draftContext->model->model;
+
+    if (llama_vocab_type(currentModel) != llama_vocab_type(draftModel)) {
+        Napi::Error::New(info.Env(), "Speculative draft model vocabulary type must match the target model vocabulary type").ThrowAsJavaScriptException();
+        return info.Env().Undefined();
+    }
+
+    if (llama_add_bos_token(currentModel) != llama_add_bos_token(draftModel) ||
+        llama_add_eos_token(currentModel) != llama_add_eos_token(draftModel) ||
+        llama_token_bos(currentModel) != llama_token_bos(draftModel) ||
+        llama_token_eos(currentModel) != llama_token_eos(draftModel)
+    ) {
+        Napi::Error::New(info.Env(), "Speculative draft model special tokens must match the target model special tokens").ThrowAsJavaScriptException();
+        return info.Env().Undefined();
+    }
+
+    const int currentModelVocabSize = llama_n_vocab(currentModel);
+    const int draftModelVocabSize = llama_n_vocab(draftModel);
+
+    const int vocabDiff = std::abs(currentModelVocabSize - draftModelVocabSize);
+
+    if (vocabDiff > vocabSizeMaxDifference) {
+        Napi::Error::New(
+            info.Env(),
+            std::string("Speculative draft model vocabulary must closely match the target model vocabulary size (vocabulary size difference: ") +
+            std::to_string(vocabDiff) + std::string(", max allowed: ") + std::to_string(vocabSizeMaxDifference) + std::string(")")
+        ).ThrowAsJavaScriptException();
+        return info.Env().Undefined();
+    }
+
+    const int minVocabSize = std::min(currentModelVocabSize, draftModelVocabSize);
+    for (int i = vocabCheckStartTokenId; i < minVocabSize; ++i) {
+        const char * currentTokenText = llama_token_get_text(currentModel, i);
+        const char * draftTokenText = llama_token_get_text(draftModel, i);
+        if (std::strcmp(currentTokenText, draftTokenText) != 0) {
+            Napi::Error::New(
+                info.Env(),
+                std::string("Speculative draft model vocabulary must match the target model vocabulary, but token ") +
+                std::to_string(i) + std::string(" content differs. Target: \"") + std::string(currentTokenText) +
+                std::string("\", Draft: \"") + std::string(draftTokenText) + std::string("")
+            ).ThrowAsJavaScriptException();
+            return info.Env().Undefined();
+        }
+    }
+
+    return info.Env().Undefined();
+}
+
 Napi::Value AddonContext::SetLora(const Napi::CallbackInfo& info) {
     AddonModelLora* lora = Napi::ObjectWrap<AddonModelLora>::Unwrap(info[0].As<Napi::Object>());
     float scale = info[1].As<Napi::Number>().FloatValue();
@@ -622,6 +714,7 @@ void AddonContext::init(Napi::Object exports) {
                 InstanceMethod("getThreads", &AddonContext::GetThreads),
                 InstanceMethod("setThreads", &AddonContext::SetThreads),
                 InstanceMethod("printTimings", &AddonContext::PrintTimings),
+                InstanceMethod("ensureDraftContextIsCompatibleForSpeculative", &AddonContext::EnsureDraftContextIsCompatibleForSpeculative),
                 InstanceMethod("setLora", &AddonContext::SetLora),
                 InstanceMethod("dispose", &AddonContext::Dispose),
             }
diff --git a/llama/addon/AddonContext.h b/llama/addon/AddonContext.h
index 5af34188..b86a4f2f 100644
--- a/llama/addon/AddonContext.h
+++ b/llama/addon/AddonContext.h
@@ -45,6 +45,7 @@ class AddonContext : public Napi::ObjectWrap<AddonContext> {
         Napi::Value SetThreads(const Napi::CallbackInfo& info);
 
         Napi::Value PrintTimings(const Napi::CallbackInfo& info);
+        Napi::Value EnsureDraftContextIsCompatibleForSpeculative(const Napi::CallbackInfo& info);
 
         Napi::Value SetLora(const Napi::CallbackInfo& info);
 
diff --git a/llama/addon/AddonGrammar.cpp b/llama/addon/AddonGrammar.cpp
index 15db61dd..47b6aed8 100644
--- a/llama/addon/AddonGrammar.cpp
+++ b/llama/addon/AddonGrammar.cpp
@@ -46,13 +46,10 @@ Napi::Value AddonGrammar::isTextCompatible(const Napi::CallbackInfo& info) {
     }
 
     const auto cpts = unicode_cpts_from_utf8(testText);
-    const llama_grammar_rules  & rules = llama_grammar_get_rules(parsed_grammar);
     llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(parsed_grammar);
 
     for (const auto & cpt : cpts) {
-        const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(parsed_grammar);
-
-        llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur);
+        llama_grammar_accept(parsed_grammar, cpt);
 
         if (stacks_cur.empty()) {
             // no stacks means that the grammar failed to match at this point
diff --git a/llama/addon/AddonGrammarEvaluationState.cpp b/llama/addon/AddonGrammarEvaluationState.cpp
index e5acec76..4fb6506d 100644
--- a/llama/addon/AddonGrammarEvaluationState.cpp
+++ b/llama/addon/AddonGrammarEvaluationState.cpp
@@ -6,13 +6,24 @@
 #include "AddonGrammar.h"
 
 AddonGrammarEvaluationState::AddonGrammarEvaluationState(const Napi::CallbackInfo& info) : Napi::ObjectWrap<AddonGrammarEvaluationState>(info) {
-    model = Napi::ObjectWrap<AddonModel>::Unwrap(info[0].As<Napi::Object>());
-    model->Ref();
+    if (info.Length() == 1) {
+        AddonGrammarEvaluationState* existingState = Napi::ObjectWrap<AddonGrammarEvaluationState>::Unwrap(info[0].As<Napi::Object>());
+        model = existingState->model;
+        model->Ref();
 
-    grammarDef = Napi::ObjectWrap<AddonGrammar>::Unwrap(info[1].As<Napi::Object>());
-    grammarDef->Ref();
+        grammarDef = existingState->grammarDef;
+        grammarDef->Ref();
 
-    sampler = llama_sampler_init_grammar(model->model, grammarDef->grammarCode.c_str(), grammarDef->rootRuleName.c_str());
+        sampler = llama_sampler_clone(existingState->sampler);
+    } else {
+        model = Napi::ObjectWrap<AddonModel>::Unwrap(info[0].As<Napi::Object>());
+        model->Ref();
+
+        grammarDef = Napi::ObjectWrap<AddonGrammar>::Unwrap(info[1].As<Napi::Object>());
+        grammarDef->Ref();
+
+        sampler = llama_sampler_init_grammar(model->model, grammarDef->grammarCode.c_str(), grammarDef->rootRuleName.c_str());
+    }
 }
 AddonGrammarEvaluationState::~AddonGrammarEvaluationState() {
     llama_sampler_free(sampler);
diff --git a/llama/addon/AddonSampler.cpp b/llama/addon/AddonSampler.cpp
index d84160d7..1a0ca5a7 100644
--- a/llama/addon/AddonSampler.cpp
+++ b/llama/addon/AddonSampler.cpp
@@ -350,15 +350,10 @@ Napi::Value AddonSampler::ApplyConfig(const Napi::CallbackInfo& info) {
 
         if (shouldCreateSampler) {
             repeatPenaltySampler = llama_sampler_init_penalties(
-                llama_n_vocab(model->model),
-                llama_token_eos(model->model),
-                llama_token_nl(model->model),
                 repeatPenaltyMaxTokens,
                 repeatPenalty,
                 repeatPenaltyFrequencyPenalty,
-                repeatPenaltyPresencePenalty,
-                true,
-                false
+                repeatPenaltyPresencePenalty
             );
             repeatPenalty_lastTokens = RingBuffer<llama_token>(repeatPenaltyMaxTokens);
 
diff --git a/package-lock.json b/package-lock.json
index b59e7907..4286a8eb 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -24,7 +24,7 @@
         "ignore": "^5.3.2",
         "ipull": "^3.9.2",
         "is-unicode-supported": "^2.1.0",
-        "lifecycle-utils": "^1.7.0",
+        "lifecycle-utils": "^1.7.2",
         "log-symbols": "^7.0.0",
         "nanoid": "^5.0.9",
         "node-addon-api": "^8.3.0",
@@ -10458,9 +10458,9 @@
       }
     },
     "node_modules/lifecycle-utils": {
-      "version": "1.7.0",
-      "resolved": "https://registry.npmjs.org/lifecycle-utils/-/lifecycle-utils-1.7.0.tgz",
-      "integrity": "sha512-suNHxB8zsWrvsWxsmy9PsOcHuThRsCzvUhtGwxfvYAl8mbeWv7lt+wNT3q9KgILWmNe9zEVZ6PXo1gsvpYIdvw==",
+      "version": "1.7.2",
+      "resolved": "https://registry.npmjs.org/lifecycle-utils/-/lifecycle-utils-1.7.2.tgz",
+      "integrity": "sha512-HwjXBOf6FUun3liJ2XA3bNR7WZoWn0b3ji9fSvQ7qvnfhPd1UCuq3M6If44Rl3K14373gm/eyeIYbpDAfB9cbw==",
       "license": "MIT"
     },
     "node_modules/lines-and-columns": {
diff --git a/package.json b/package.json
index 3ac48033..152007c3 100644
--- a/package.json
+++ b/package.json
@@ -190,7 +190,7 @@
     "ignore": "^5.3.2",
     "ipull": "^3.9.2",
     "is-unicode-supported": "^2.1.0",
-    "lifecycle-utils": "^1.7.0",
+    "lifecycle-utils": "^1.7.2",
     "log-symbols": "^7.0.0",
     "nanoid": "^5.0.9",
     "node-addon-api": "^8.3.0",
diff --git a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts
index 6303a711..c5d1459d 100644
--- a/src/bindings/AddonTypes.ts
+++ b/src/bindings/AddonTypes.ts
@@ -37,7 +37,8 @@ export type BindingModule = {
         }): AddonGrammar
     },
     AddonGrammarEvaluationState: {
-        new (model: AddonModel, grammar: AddonGrammar): AddonGrammarEvaluationState
+        new (model: AddonModel, grammar: AddonGrammar): AddonGrammarEvaluationState,
+        new (existingState: AddonGrammarEvaluationState): AddonGrammarEvaluationState
     },
     AddonSampler: {
         new (model: AddonModel): AddonSampler,
@@ -119,10 +120,15 @@ export type AddonContext = {
         sequenceId: number,
         firstTokenSequenceIndex: number,
         tokens: Uint32Array,
-        generateLogitAtTheEnd: boolean
-    ): BatchLogitIndex | undefined, // returns batchLogitIndex if `generateLogitAtTheEnd` is true
+        logitIndexes: Uint32Array,
+    ): Uint32Array, // returns an array with batchLogitIndex for each item in the logitIndexes array
     decodeBatch(): Promise<void>,
-    sampleToken(batchLogitIndex: BatchLogitIndex, sampler: AddonSampler): Promise<Token>,
+    sampleToken(batchLogitIndex: BatchLogitIndex, sampler: AddonSampler): Promise<Token | -1>,
+    sampleToken(
+        batchLogitIndex: BatchLogitIndex,
+        sampler: AddonSampler,
+        logprobs: boolean
+    ): Promise<[Token | -1, (Token | number)[] | undefined]>,
     disposeSequence(sequenceId: number): void,
 
     // startPos in inclusive, endPos is exclusive
@@ -136,6 +142,7 @@ export type AddonContext = {
     getThreads(): number,
     setThreads(threads: number): void,
     printTimings(): void,
+    ensureDraftContextIsCompatibleForSpeculative(draftContext: AddonContext): void,
     setLora(lora: AddonModelLora, scale: number): void
 };
 
diff --git a/src/cli/commands/ChatCommand.ts b/src/cli/commands/ChatCommand.ts
index c80a4760..621dbb68 100644
--- a/src/cli/commands/ChatCommand.ts
+++ b/src/cli/commands/ChatCommand.ts
@@ -4,6 +4,7 @@ import path from "path";
 import {CommandModule} from "yargs";
 import chalk from "chalk";
 import fs from "fs-extra";
+import prettyMilliseconds from "pretty-ms";
 import {chatCommandHistoryFilePath, defaultChatSystemPrompt, documentationPageUrls} from "../../config.js";
 import {getIsInDocumentationMode} from "../../state.js";
 import {ReplHistory} from "../../utils/ReplHistory.js";
@@ -28,6 +29,7 @@ import {withProgressLog} from "../../utils/withProgressLog.js";
 import {resolveHeaderFlag} from "../utils/resolveHeaderFlag.js";
 import {withCliCommandDescriptionDocsUrl} from "../utils/withCliCommandDescriptionDocsUrl.js";
 import {ConsoleInteraction, ConsoleInteractionKey} from "../utils/ConsoleInteraction.js";
+import {DraftSequenceTokenPredictor} from "../../evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js";
 
 type ChatCommand = {
     modelPath?: string,
@@ -61,8 +63,11 @@ type ChatCommand = {
     maxTokens: number,
     noHistory: boolean,
     environmentFunctions: boolean,
+    tokenPredictionDraftModel?: string,
+    tokenPredictionModelContextSize?: number,
     debug: boolean,
     meter: boolean,
+    timing: boolean,
     printTimings: boolean
 };
 
@@ -261,6 +266,17 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
                 default: false,
                 description: "Provide access to environment functions like `getDate` and `getTime`"
             })
+            .option("tokenPredictionDraftModel", {
+                alias: ["dm", "draftModel"],
+                type: "string",
+                description: "Model file to use for draft sequence token prediction (speculative decoding). Can be a path to a local file or a URI of a model file to download"
+            })
+            .option("tokenPredictionModelContextSize", {
+                alias: ["dc", "draftContextSize", "draftContext"],
+                type: "number",
+                description: "Max context size to use for the draft sequence token prediction model context",
+                default: 4096
+            })
             .option("debug", {
                 alias: "d",
                 type: "boolean",
@@ -272,11 +288,16 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
                 default: false,
                 description: "Print how many tokens were used as input and output for each response"
             })
+            .option("timing", {
+                type: "boolean",
+                default: false,
+                description: "Print how how long it took to generate each response"
+            })
             .option("printTimings", {
                 alias: "pt",
                 type: "boolean",
                 default: false,
-                description: "Print llama.cpp timings after each response"
+                description: "Print llama.cpp's internal timings after each response"
             });
     },
     async handler({
@@ -285,14 +306,15 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
         noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK,
         topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
         repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory,
-        environmentFunctions, debug, meter, printTimings
+        environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, printTimings
     }) {
         try {
             await RunChat({
                 modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, contextSize,
                 batchSize, flashAttention, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed,
                 gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
-                maxTokens, noHistory, environmentFunctions, debug, meter, printTimings
+                maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter,
+                timing, printTimings
             });
         } catch (err) {
             await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -307,7 +329,8 @@ async function RunChat({
     modelPath: modelArg, header: headerArg, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja,
     contextSize, batchSize, flashAttention, noTrimWhitespace, grammar: grammarArg, jsonSchemaGrammarFile: jsonSchemaGrammarFilePath,
     threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine,
-    repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, environmentFunctions, debug, meter, printTimings
+    repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel,
+    tokenPredictionModelContextSize, debug, meter, timing, printTimings
 }: ChatCommand) {
     if (contextSize === -1) contextSize = undefined;
     if (gpuLayers === -1) gpuLayers = undefined;
@@ -334,6 +357,12 @@ async function RunChat({
     const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
         flashAttention
     });
+    const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "")
+        ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, {
+            flashAttention,
+            consoleTitle: "Draft model file"
+        })
+        : undefined;
 
     if (systemInfo)
         console.log(llama.systemInfo);
@@ -393,6 +422,57 @@ async function RunChat({
             }
         }
     });
+    const draftModel = resolvedDraftModelPath == null
+        ? undefined
+        : await withProgressLog({
+            loadingText: chalk.blue.bold("Loading draft model"),
+            successText: chalk.blue("Draft model loaded"),
+            failText: chalk.blue("Failed to load draft model"),
+            liveUpdates: !debug,
+            noProgress: debug,
+            liveCtrlCSendsAbortSignal: true
+        }, async (progressUpdater) => {
+            try {
+                return await llama.loadModel({
+                    modelPath: resolvedDraftModelPath,
+                    defaultContextFlashAttention: flashAttention,
+                    onLoadProgress(loadProgress: number) {
+                        progressUpdater.setProgress(loadProgress);
+                    },
+                    loadSignal: progressUpdater.abortSignal
+                });
+            } catch (err) {
+                if (err === progressUpdater.abortSignal?.reason)
+                    process.exit(0);
+
+                throw err;
+            } finally {
+                if (llama.logLevel === LlamaLogLevel.debug) {
+                    await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
+                    console.info();
+                }
+            }
+        });
+
+    const draftContext = draftModel == null
+        ? undefined
+        : await withOra({
+            loading: chalk.blue("Creating draft context"),
+            success: chalk.blue("Draft context created"),
+            fail: chalk.blue("Failed to create draft context"),
+            useStatusLogs: debug
+        }, async () => {
+            try {
+                return await draftModel.createContext({
+                    contextSize: {max: tokenPredictionModelContextSize}
+                });
+            } finally {
+                if (llama.logLevel === LlamaLogLevel.debug) {
+                    await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
+                    console.info();
+                }
+            }
+        });
     const context = await withOra({
         loading: chalk.blue("Creating context"),
         success: chalk.blue("Context created"),
@@ -414,6 +494,7 @@ async function RunChat({
             }
         }
     });
+
     const grammar = jsonSchemaGrammarFilePath != null
         ? new LlamaJsonSchemaGrammar(
             llama,
@@ -432,13 +513,20 @@ async function RunChat({
         tokenizer: model.tokenizer,
         noJinja
     }) ?? new GeneralChatWrapper();
-    const contextSequence = context.getSequence();
+    const draftContextSequence = draftContext?.getSequence();
+    const contextSequence = draftContextSequence != null
+        ? context.getSequence({
+            tokenPredictor: new DraftSequenceTokenPredictor(draftContextSequence)
+        })
+        : context.getSequence();
     const session = new LlamaChatSession({
         contextSequence,
         systemPrompt,
         chatWrapper: chatWrapper
     });
+    let lastDraftTokenMeterState = draftContextSequence?.tokenMeter.getState();
     let lastTokenMeterState = contextSequence.tokenMeter.getState();
+    let lastTokenPredictionsStats = contextSequence.tokenPredictions;
 
     await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
 
@@ -450,10 +538,9 @@ async function RunChat({
         environmentFunctions = false;
     }
 
-    const padTitle = "Context".length + 1;
-    await printCommonInfoLines({
+    const padTitle = await printCommonInfoLines({
         context,
-        minTitleLength: padTitle,
+        draftContext,
         printBos: true,
         printEos: true,
         logBatchSize,
@@ -492,6 +579,10 @@ async function RunChat({
             show: environmentFunctions,
             title: "Environment functions",
             value: "enabled"
+        }, {
+            show: timing,
+            title: "Response timing",
+            value: "enabled"
         }]
     });
 
@@ -513,7 +604,18 @@ async function RunChat({
         return res;
     }
 
-    if (!printTimings && !meter)
+    if (prompt != null && prompt !== "" && !printTimings && (meter || timing)) {
+        // warm up the context sequence before the first evaluation, to make the timings of the actual evaluations more accurate
+        const contextFirstToken = session.chatWrapper.generateContextState({
+            chatHistory: [
+                ...session.getChatHistory(),
+                {type: "user", text: ""}
+            ]
+        }).contextText.tokenize(model.tokenizer)[0];
+
+        if (contextFirstToken != null)
+            await contextSequence.evaluateWithoutGeneratingNewTokens([contextFirstToken]);
+    } else if (!printTimings && !meter)
         void session.preloadPrompt("")
             .catch(() => void 0); // don't throw an error if preloading fails because a real prompt is sent early
 
@@ -544,6 +646,7 @@ async function RunChat({
             consoleInteraction.stop();
         });
 
+        const timeBeforePrompt = Date.now();
         try {
             process.stdout.write(startColor!);
             consoleInteraction.start();
@@ -608,6 +711,7 @@ async function RunChat({
 
             console.log();
         }
+        const timeAfterPrompt = Date.now();
 
         if (printTimings) {
             if (LlamaLogLevelGreaterThan(llama.logLevel, LlamaLogLevel.info))
@@ -619,12 +723,62 @@ async function RunChat({
             llama.logLevel = llamaLogLevel;
         }
 
+        if (timing)
+            console.info(
+                chalk.dim("Response duration: ") +
+                prettyMilliseconds(timeAfterPrompt - timeBeforePrompt, {
+                    keepDecimalsOnWholeSeconds: true,
+                    secondsDecimalDigits: 2,
+                    separateMilliseconds: true,
+                    compact: false
+                })
+            );
+
         if (meter) {
             const newTokenMeterState = contextSequence.tokenMeter.getState();
             const tokenMeterDiff = TokenMeter.diff(newTokenMeterState, lastTokenMeterState);
             lastTokenMeterState = newTokenMeterState;
 
-            console.info(`${chalk.dim("Input tokens:")} ${String(tokenMeterDiff.usedInputTokens).padEnd(5, " ")}  ${chalk.dim("Output tokens:")} ${tokenMeterDiff.usedOutputTokens}`);
+            const showDraftTokenMeterDiff = lastDraftTokenMeterState != null && draftContextSequence != null;
+
+            const tokenPredictionsStats = contextSequence.tokenPredictions;
+            const validatedTokenPredictions = tokenPredictionsStats.validated - lastTokenPredictionsStats.validated;
+            const refutedTokenPredictions = tokenPredictionsStats.refuted - lastTokenPredictionsStats.refuted;
+            const usedTokenPredictions = tokenPredictionsStats.used - lastTokenPredictionsStats.used;
+            const unusedTokenPredictions = tokenPredictionsStats.unused - lastTokenPredictionsStats.unused;
+            lastTokenPredictionsStats = tokenPredictionsStats;
+
+            console.info([
+                showDraftTokenMeterDiff && (
+                    chalk.yellow("Main".padEnd("Drafter".length))
+                ),
+                chalk.dim("Input tokens:") + " " + String(tokenMeterDiff.usedInputTokens).padEnd(5, " "),
+                chalk.dim("Output tokens:") + " " + String(tokenMeterDiff.usedOutputTokens).padEnd(5, " "),
+                showDraftTokenMeterDiff && (
+                    chalk.dim("Validated predictions:") + " " + String(validatedTokenPredictions).padEnd(5, " ")
+                ),
+                showDraftTokenMeterDiff && (
+                    chalk.dim("Refuted predictions:") + " " + String(refutedTokenPredictions).padEnd(5, " ")
+                ),
+                showDraftTokenMeterDiff && (
+                    chalk.dim("Used predictions:") + " " + String(usedTokenPredictions).padEnd(5, " ")
+                ),
+                showDraftTokenMeterDiff && (
+                    chalk.dim("Unused predictions:") + " " + String(unusedTokenPredictions).padEnd(5, " ")
+                )
+            ].filter(Boolean).join("  "));
+
+            if (lastDraftTokenMeterState != null && draftContextSequence != null) {
+                const newDraftTokenMeterState = draftContextSequence.tokenMeter.getState();
+                const draftTokenMeterDiff = TokenMeter.diff(newDraftTokenMeterState, lastDraftTokenMeterState);
+                lastDraftTokenMeterState = newDraftTokenMeterState;
+
+                console.info([
+                    chalk.yellow("Drafter"),
+                    chalk.dim("Input tokens:") + " " + String(draftTokenMeterDiff.usedInputTokens).padEnd(5, " "),
+                    chalk.dim("Output tokens:") + " " + String(draftTokenMeterDiff.usedOutputTokens).padEnd(5, " ")
+                ].join("  "));
+            }
         }
     }
 }
diff --git a/src/cli/commands/CompleteCommand.ts b/src/cli/commands/CompleteCommand.ts
index 37196c2b..1ca9b9e6 100644
--- a/src/cli/commands/CompleteCommand.ts
+++ b/src/cli/commands/CompleteCommand.ts
@@ -4,6 +4,7 @@ import path from "path";
 import {CommandModule} from "yargs";
 import chalk from "chalk";
 import fs from "fs-extra";
+import prettyMilliseconds from "pretty-ms";
 import {getLlama} from "../../bindings/getLlama.js";
 import {
     BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption
@@ -19,6 +20,7 @@ import {resolveHeaderFlag} from "../utils/resolveHeaderFlag.js";
 import {withCliCommandDescriptionDocsUrl} from "../utils/withCliCommandDescriptionDocsUrl.js";
 import {documentationPageUrls} from "../../config.js";
 import {ConsoleInteraction, ConsoleInteractionKey} from "../utils/ConsoleInteraction.js";
+import {DraftSequenceTokenPredictor} from "../../evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js";
 
 type CompleteCommand = {
     modelPath?: string,
@@ -43,8 +45,11 @@ type CompleteCommand = {
     repeatFrequencyPenalty?: number,
     repeatPresencePenalty?: number,
     maxTokens: number,
+    tokenPredictionDraftModel?: string,
+    tokenPredictionModelContextSize?: number,
     debug: boolean,
     meter: boolean,
+    timing: boolean,
     printTimings: boolean
 };
 
@@ -188,6 +193,17 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
                 default: 0,
                 description: "Maximum number of tokens to generate in responses. Set to `0` to disable. Set to `-1` to set to the context size"
             })
+            .option("tokenPredictionDraftModel", {
+                alias: ["dm", "draftModel"],
+                type: "string",
+                description: "Model file to use for draft sequence token prediction (speculative decoding). Can be a path to a local file or a URI of a model file to download"
+            })
+            .option("tokenPredictionModelContextSize", {
+                alias: ["dc", "draftContextSize", "draftContext"],
+                type: "number",
+                description: "Max context size to use for the draft sequence token prediction model context",
+                default: 4096
+            })
             .option("debug", {
                 alias: "d",
                 type: "boolean",
@@ -199,26 +215,31 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
                 default: false,
                 description: "Log how many tokens were used as input and output for each response"
             })
+            .option("timing", {
+                type: "boolean",
+                default: false,
+                description: "Print how how long it took to generate each response"
+            })
             .option("printTimings", {
                 alias: "pt",
                 type: "boolean",
                 default: false,
-                description: "Print llama.cpp timings after each response"
+                description: "Print llama.cpp's internal timings after each response"
             });
     },
     async handler({
         modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize,
         flashAttention, threads, temperature, minP, topK,
         topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
-        repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
-        debug, meter, printTimings
+        repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize,
+        debug, meter, timing, printTimings
     }) {
         try {
             await RunCompletion({
                 modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention,
                 threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty,
                 repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
-                debug, meter, printTimings
+                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, printTimings
             });
         } catch (err) {
             await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -233,7 +254,7 @@ async function RunCompletion({
     modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention,
     threads, temperature, minP, topK, topP, seed, gpuLayers,
     lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
-    maxTokens, debug, meter, printTimings
+    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, meter, timing, printTimings
 }: CompleteCommand) {
     if (contextSize === -1) contextSize = undefined;
     if (gpuLayers === -1) gpuLayers = undefined;
@@ -259,6 +280,12 @@ async function RunCompletion({
     const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
         flashAttention
     });
+    const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "")
+        ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, {
+            flashAttention,
+            consoleTitle: "Draft model file"
+        })
+        : undefined;
 
     if (systemInfo)
         console.log(llama.systemInfo);
@@ -311,6 +338,57 @@ async function RunCompletion({
             }
         }
     });
+    const draftModel = resolvedDraftModelPath == null
+        ? undefined
+        : await withProgressLog({
+            loadingText: chalk.blue.bold("Loading draft model"),
+            successText: chalk.blue("Draft model loaded"),
+            failText: chalk.blue("Failed to load draft model"),
+            liveUpdates: !debug,
+            noProgress: debug,
+            liveCtrlCSendsAbortSignal: true
+        }, async (progressUpdater) => {
+            try {
+                return await llama.loadModel({
+                    modelPath: resolvedDraftModelPath,
+                    defaultContextFlashAttention: flashAttention,
+                    onLoadProgress(loadProgress: number) {
+                        progressUpdater.setProgress(loadProgress);
+                    },
+                    loadSignal: progressUpdater.abortSignal
+                });
+            } catch (err) {
+                if (err === progressUpdater.abortSignal?.reason)
+                    process.exit(0);
+
+                throw err;
+            } finally {
+                if (llama.logLevel === LlamaLogLevel.debug) {
+                    await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
+                    console.info();
+                }
+            }
+        });
+
+    const draftContext = draftModel == null
+        ? undefined
+        : await withOra({
+            loading: chalk.blue("Creating draft context"),
+            success: chalk.blue("Draft context created"),
+            fail: chalk.blue("Failed to create draft context"),
+            useStatusLogs: debug
+        }, async () => {
+            try {
+                return await draftModel.createContext({
+                    contextSize: {max: tokenPredictionModelContextSize}
+                });
+            } finally {
+                if (llama.logLevel === LlamaLogLevel.debug) {
+                    await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
+                    console.info();
+                }
+            }
+        });
     const context = await withOra({
         loading: chalk.blue("Creating context"),
         success: chalk.blue("Context created"),
@@ -333,18 +411,25 @@ async function RunCompletion({
         }
     });
 
-    const contextSequence = context.getSequence();
+    const draftContextSequence = draftContext?.getSequence();
+    const contextSequence = draftContextSequence != null
+        ? context.getSequence({
+            tokenPredictor: new DraftSequenceTokenPredictor(draftContextSequence)
+        })
+        : context.getSequence();
     const completion = new LlamaCompletion({
         contextSequence
     });
+    let lastDraftTokenMeterState = draftContextSequence?.tokenMeter.getState();
     let lastTokenMeterState = contextSequence.tokenMeter.getState();
+    let lastTokenPredictionsStats = contextSequence.tokenPredictions;
 
     await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
 
-    const padTitle = "Complete".length + 1;
-    await printCommonInfoLines({
+    const padTitle = await printCommonInfoLines({
         context,
-        minTitleLength: padTitle,
+        draftContext,
+        minTitleLength: "Complete".length + 1,
         logBatchSize,
         tokenMeterEnabled: meter
     });
@@ -366,6 +451,10 @@ async function RunCompletion({
             show: !penalizeRepeatingNewLine,
             title: "Penalize repeating new line",
             value: "disabled"
+        }, {
+            show: timing,
+            title: "Response timing",
+            value: "enabled"
         }]
     });
 
@@ -412,6 +501,7 @@ async function RunCompletion({
             consoleInteraction.stop();
         });
 
+        const timeBeforePrompt = Date.now();
         try {
             process.stdout.write(startColor!);
             consoleInteraction.start();
@@ -451,6 +541,7 @@ async function RunCompletion({
 
             console.log();
         }
+        const timeAfterPrompt = Date.now();
 
         if (printTimings) {
             if (LlamaLogLevelGreaterThan(llama.logLevel, LlamaLogLevel.info))
@@ -462,12 +553,62 @@ async function RunCompletion({
             llama.logLevel = llamaLogLevel;
         }
 
+        if (timing)
+            console.info(
+                chalk.dim("Response duration: ") +
+                prettyMilliseconds(timeAfterPrompt - timeBeforePrompt, {
+                    keepDecimalsOnWholeSeconds: true,
+                    secondsDecimalDigits: 2,
+                    separateMilliseconds: true,
+                    compact: false
+                })
+            );
+
         if (meter) {
             const newTokenMeterState = contextSequence.tokenMeter.getState();
             const tokenMeterDiff = TokenMeter.diff(newTokenMeterState, lastTokenMeterState);
             lastTokenMeterState = newTokenMeterState;
 
-            console.info(`${chalk.dim("Input tokens:")} ${String(tokenMeterDiff.usedInputTokens).padEnd(5, " ")}  ${chalk.dim("Output tokens:")} ${tokenMeterDiff.usedOutputTokens}`);
+            const showDraftTokenMeterDiff = lastDraftTokenMeterState != null && draftContextSequence != null;
+
+            const tokenPredictionsStats = contextSequence.tokenPredictions;
+            const validatedTokenPredictions = tokenPredictionsStats.validated - lastTokenPredictionsStats.validated;
+            const refutedTokenPredictions = tokenPredictionsStats.refuted - lastTokenPredictionsStats.refuted;
+            const usedTokenPredictions = tokenPredictionsStats.used - lastTokenPredictionsStats.used;
+            const unusedTokenPredictions = tokenPredictionsStats.unused - lastTokenPredictionsStats.unused;
+            lastTokenPredictionsStats = tokenPredictionsStats;
+
+            console.info([
+                showDraftTokenMeterDiff && (
+                    chalk.yellow("Main".padEnd("Drafter".length))
+                ),
+                chalk.dim("Input tokens:") + " " + String(tokenMeterDiff.usedInputTokens).padEnd(5, " "),
+                chalk.dim("Output tokens:") + " " + String(tokenMeterDiff.usedOutputTokens).padEnd(5, " "),
+                showDraftTokenMeterDiff && (
+                    chalk.dim("Validated predictions:") + " " + String(validatedTokenPredictions).padEnd(5, " ")
+                ),
+                showDraftTokenMeterDiff && (
+                    chalk.dim("Refuted predictions:") + " " + String(refutedTokenPredictions).padEnd(5, " ")
+                ),
+                showDraftTokenMeterDiff && (
+                    chalk.dim("Used predictions:") + " " + String(usedTokenPredictions).padEnd(5, " ")
+                ),
+                showDraftTokenMeterDiff && (
+                    chalk.dim("Unused predictions:") + " " + String(unusedTokenPredictions).padEnd(5, " ")
+                )
+            ].filter(Boolean).join("  "));
+
+            if (lastDraftTokenMeterState != null && draftContextSequence != null) {
+                const newDraftTokenMeterState = draftContextSequence.tokenMeter.getState();
+                const draftTokenMeterDiff = TokenMeter.diff(newDraftTokenMeterState, lastDraftTokenMeterState);
+                lastDraftTokenMeterState = newDraftTokenMeterState;
+
+                console.info([
+                    chalk.yellow("Drafter"),
+                    chalk.dim("Input tokens:") + " " + String(draftTokenMeterDiff.usedInputTokens).padEnd(5, " "),
+                    chalk.dim("Output tokens:") + " " + String(draftTokenMeterDiff.usedOutputTokens).padEnd(5, " ")
+                ].join("  "));
+            }
         }
     }
 }
diff --git a/src/cli/commands/InfillCommand.ts b/src/cli/commands/InfillCommand.ts
index f7dceaf6..bd87a1c9 100644
--- a/src/cli/commands/InfillCommand.ts
+++ b/src/cli/commands/InfillCommand.ts
@@ -4,6 +4,7 @@ import path from "path";
 import {CommandModule} from "yargs";
 import chalk from "chalk";
 import fs from "fs-extra";
+import prettyMilliseconds from "pretty-ms";
 import {getLlama} from "../../bindings/getLlama.js";
 import {
     BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption
@@ -19,6 +20,7 @@ import {resolveHeaderFlag} from "../utils/resolveHeaderFlag.js";
 import {withCliCommandDescriptionDocsUrl} from "../utils/withCliCommandDescriptionDocsUrl.js";
 import {documentationPageUrls} from "../../config.js";
 import {ConsoleInteraction, ConsoleInteractionKey} from "../utils/ConsoleInteraction.js";
+import {DraftSequenceTokenPredictor} from "../../evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js";
 
 type InfillCommand = {
     modelPath?: string,
@@ -45,8 +47,11 @@ type InfillCommand = {
     repeatFrequencyPenalty?: number,
     repeatPresencePenalty?: number,
     maxTokens: number,
+    tokenPredictionDraftModel?: string,
+    tokenPredictionModelContextSize?: number,
     debug: boolean,
     meter: boolean,
+    timing: boolean,
     printTimings: boolean
 };
 
@@ -198,6 +203,17 @@ export const InfillCommand: CommandModule<object, InfillCommand> = {
                 default: 0,
                 description: "Maximum number of tokens to generate in responses. Set to `0` to disable. Set to `-1` to set to the context size"
             })
+            .option("tokenPredictionDraftModel", {
+                alias: ["dm", "draftModel"],
+                type: "string",
+                description: "Model file to use for draft sequence token prediction (speculative decoding). Can be a path to a local file or a URI of a model file to download"
+            })
+            .option("tokenPredictionModelContextSize", {
+                alias: ["dc", "draftContextSize", "draftContext"],
+                type: "number",
+                description: "Max context size to use for the draft sequence token prediction model context",
+                default: 4096
+            })
             .option("debug", {
                 alias: "d",
                 type: "boolean",
@@ -209,26 +225,31 @@ export const InfillCommand: CommandModule<object, InfillCommand> = {
                 default: false,
                 description: "Log how many tokens were used as input and output for each response"
             })
+            .option("timing", {
+                type: "boolean",
+                default: false,
+                description: "Print how how long it took to generate each response"
+            })
             .option("printTimings", {
                 alias: "pt",
                 type: "boolean",
                 default: false,
-                description: "Print llama.cpp timings after each response"
+                description: "Print llama.cpp's internal timings after each response"
             });
     },
     async handler({
         modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize,
         flashAttention, threads, temperature, minP, topK,
         topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
-        repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
-        debug, meter, printTimings
+        repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize,
+        debug, meter, timing, printTimings
     }) {
         try {
             await RunInfill({
                 modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention,
                 threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty,
                 repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
-                debug, meter, printTimings
+                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, printTimings
             });
         } catch (err) {
             await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -243,7 +264,7 @@ async function RunInfill({
     modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention,
     threads, temperature, minP, topK, topP, seed, gpuLayers,
     lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
-    maxTokens, debug, meter, printTimings
+    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, meter, timing, printTimings
 }: InfillCommand) {
     if (contextSize === -1) contextSize = undefined;
     if (gpuLayers === -1) gpuLayers = undefined;
@@ -269,6 +290,12 @@ async function RunInfill({
     const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
         flashAttention
     });
+    const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "")
+        ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, {
+            flashAttention,
+            consoleTitle: "Draft model file"
+        })
+        : undefined;
 
     if (systemInfo)
         console.log(llama.systemInfo);
@@ -335,6 +362,57 @@ async function RunInfill({
             }
         }
     });
+    const draftModel = resolvedDraftModelPath == null
+        ? undefined
+        : await withProgressLog({
+            loadingText: chalk.blue.bold("Loading draft model"),
+            successText: chalk.blue("Draft model loaded"),
+            failText: chalk.blue("Failed to load draft model"),
+            liveUpdates: !debug,
+            noProgress: debug,
+            liveCtrlCSendsAbortSignal: true
+        }, async (progressUpdater) => {
+            try {
+                return await llama.loadModel({
+                    modelPath: resolvedDraftModelPath,
+                    defaultContextFlashAttention: flashAttention,
+                    onLoadProgress(loadProgress: number) {
+                        progressUpdater.setProgress(loadProgress);
+                    },
+                    loadSignal: progressUpdater.abortSignal
+                });
+            } catch (err) {
+                if (err === progressUpdater.abortSignal?.reason)
+                    process.exit(0);
+
+                throw err;
+            } finally {
+                if (llama.logLevel === LlamaLogLevel.debug) {
+                    await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
+                    console.info();
+                }
+            }
+        });
+
+    const draftContext = draftModel == null
+        ? undefined
+        : await withOra({
+            loading: chalk.blue("Creating draft context"),
+            success: chalk.blue("Draft context created"),
+            fail: chalk.blue("Failed to create draft context"),
+            useStatusLogs: debug
+        }, async () => {
+            try {
+                return await draftModel.createContext({
+                    contextSize: {max: tokenPredictionModelContextSize}
+                });
+            } finally {
+                if (llama.logLevel === LlamaLogLevel.debug) {
+                    await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
+                    console.info();
+                }
+            }
+        });
     const context = await withOra({
         loading: chalk.blue("Creating context"),
         success: chalk.blue("Context created"),
@@ -357,18 +435,24 @@ async function RunInfill({
         }
     });
 
-    const contextSequence = context.getSequence();
+    const draftContextSequence = draftContext?.getSequence();
+    const contextSequence = draftContextSequence != null
+        ? context.getSequence({
+            tokenPredictor: new DraftSequenceTokenPredictor(draftContextSequence)
+        })
+        : context.getSequence();
     const completion = new LlamaCompletion({
         contextSequence
     });
+    let lastDraftTokenMeterState = draftContextSequence?.tokenMeter.getState();
     let lastTokenMeterState = contextSequence.tokenMeter.getState();
+    let lastTokenPredictionsStats = contextSequence.tokenPredictions;
 
     await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
 
-    const padTitle = "Context".length + 1;
-    await printCommonInfoLines({
+    const padTitle = await printCommonInfoLines({
         context,
-        minTitleLength: padTitle,
+        draftContext,
         logBatchSize,
         tokenMeterEnabled: meter
     });
@@ -390,6 +474,10 @@ async function RunInfill({
             show: !penalizeRepeatingNewLine,
             title: "Penalize repeating new line",
             value: "disabled"
+        }, {
+            show: timing,
+            title: "Response timing",
+            value: "enabled"
         }]
     });
 
@@ -457,6 +545,7 @@ async function RunInfill({
             consoleInteraction.stop();
         });
 
+        const timeBeforePrompt = Date.now();
         try {
             process.stdout.write(startColor!);
             consoleInteraction.start();
@@ -496,6 +585,7 @@ async function RunInfill({
 
             console.log();
         }
+        const timeAfterPrompt = Date.now();
 
         if (printTimings) {
             if (LlamaLogLevelGreaterThan(llama.logLevel, LlamaLogLevel.info))
@@ -507,12 +597,62 @@ async function RunInfill({
             llama.logLevel = llamaLogLevel;
         }
 
+        if (timing)
+            console.info(
+                chalk.dim("Response duration: ") +
+                prettyMilliseconds(timeAfterPrompt - timeBeforePrompt, {
+                    keepDecimalsOnWholeSeconds: true,
+                    secondsDecimalDigits: 2,
+                    separateMilliseconds: true,
+                    compact: false
+                })
+            );
+
         if (meter) {
             const newTokenMeterState = contextSequence.tokenMeter.getState();
             const tokenMeterDiff = TokenMeter.diff(newTokenMeterState, lastTokenMeterState);
             lastTokenMeterState = newTokenMeterState;
 
-            console.info(`${chalk.dim("Input tokens:")} ${String(tokenMeterDiff.usedInputTokens).padEnd(5, " ")}  ${chalk.dim("Output tokens:")} ${tokenMeterDiff.usedOutputTokens}`);
+            const showDraftTokenMeterDiff = lastDraftTokenMeterState != null && draftContextSequence != null;
+
+            const tokenPredictionsStats = contextSequence.tokenPredictions;
+            const validatedTokenPredictions = tokenPredictionsStats.validated - lastTokenPredictionsStats.validated;
+            const refutedTokenPredictions = tokenPredictionsStats.refuted - lastTokenPredictionsStats.refuted;
+            const usedTokenPredictions = tokenPredictionsStats.used - lastTokenPredictionsStats.used;
+            const unusedTokenPredictions = tokenPredictionsStats.unused - lastTokenPredictionsStats.unused;
+            lastTokenPredictionsStats = tokenPredictionsStats;
+
+            console.info([
+                showDraftTokenMeterDiff && (
+                    chalk.yellow("Main".padEnd("Drafter".length))
+                ),
+                chalk.dim("Input tokens:") + " " + String(tokenMeterDiff.usedInputTokens).padEnd(5, " "),
+                chalk.dim("Output tokens:") + " " + String(tokenMeterDiff.usedOutputTokens).padEnd(5, " "),
+                showDraftTokenMeterDiff && (
+                    chalk.dim("Validated predictions:") + " " + String(validatedTokenPredictions).padEnd(5, " ")
+                ),
+                showDraftTokenMeterDiff && (
+                    chalk.dim("Refuted predictions:") + " " + String(refutedTokenPredictions).padEnd(5, " ")
+                ),
+                showDraftTokenMeterDiff && (
+                    chalk.dim("Used predictions:") + " " + String(usedTokenPredictions).padEnd(5, " ")
+                ),
+                showDraftTokenMeterDiff && (
+                    chalk.dim("Unused predictions:") + " " + String(unusedTokenPredictions).padEnd(5, " ")
+                )
+            ].filter(Boolean).join("  "));
+
+            if (lastDraftTokenMeterState != null && draftContextSequence != null) {
+                const newDraftTokenMeterState = draftContextSequence.tokenMeter.getState();
+                const draftTokenMeterDiff = TokenMeter.diff(newDraftTokenMeterState, lastDraftTokenMeterState);
+                lastDraftTokenMeterState = newDraftTokenMeterState;
+
+                console.info([
+                    chalk.yellow("Drafter"),
+                    chalk.dim("Input tokens:") + " " + String(draftTokenMeterDiff.usedInputTokens).padEnd(5, " "),
+                    chalk.dim("Output tokens:") + " " + String(draftTokenMeterDiff.usedOutputTokens).padEnd(5, " ")
+                ].join("  "));
+            }
         }
     }
 }
diff --git a/src/cli/utils/printCommonInfoLines.ts b/src/cli/utils/printCommonInfoLines.ts
index d828a74f..5eefe335 100644
--- a/src/cli/utils/printCommonInfoLines.ts
+++ b/src/cli/utils/printCommonInfoLines.ts
@@ -6,6 +6,7 @@ import {printInfoLine} from "./printInfoLine.js";
 
 export async function printCommonInfoLines({
     context,
+    draftContext,
     minTitleLength = 0,
     logBatchSize = false,
     tokenMeterEnabled = false,
@@ -13,6 +14,7 @@ export async function printCommonInfoLines({
     printEos = false
 }: {
     context: LlamaContext,
+    draftContext?: LlamaContext,
     minTitleLength?: number,
     logBatchSize?: boolean,
     tokenMeterEnabled?: boolean,
@@ -21,7 +23,13 @@ export async function printCommonInfoLines({
 }) {
     const llama = context._llama;
     const model = context.model;
-    const padTitle = Math.max(minTitleLength, "Context".length + 1);
+    const padTitle = Math.max(
+        minTitleLength,
+        "Context".length + 1,
+        draftContext != null
+            ? ("Draft context".length + 1)
+            : 0
+    );
 
     if (llama.gpu !== false) {
         const [
@@ -98,6 +106,64 @@ export async function printCommonInfoLines({
             value: "enabled"
         }]
     });
+
+    if (draftContext != null) {
+        const draftModel = draftContext.model;
+
+        printInfoLine({
+            title: "Draft model",
+            padTitle: padTitle,
+            info: [{
+                title: "Type",
+                value: toOneLine(draftModel.typeDescription)
+            }, {
+                title: "Size",
+                value: bytes(draftModel.size)
+            }, {
+                show: llama.gpu !== false,
+                title: "GPU layers",
+                value: `${draftModel.gpuLayers}/${draftModel.fileInsights.totalLayers} offloaded ${
+                    chalk.dim(`(${Math.floor((draftModel.gpuLayers / draftModel.fileInsights.totalLayers) * 100)}%)`)
+                }`
+            }, {
+                show: printBos,
+                title: "BOS",
+                value: () => toOneLine(String(draftModel.tokens.bosString))
+            }, {
+                show: printEos,
+                title: "EOS",
+                value: () => toOneLine(String(draftModel.tokens.eosString))
+            }, {
+                title: "Train context size",
+                value: draftModel.trainContextSize.toLocaleString("en-US")
+            }]
+        });
+        printInfoLine({
+            title: "Draft context",
+            padTitle: padTitle,
+            info: [{
+                title: "Size",
+                value: draftContext.contextSize.toLocaleString("en-US")
+            }, {
+                title: "Threads",
+                value: draftContext.currentThreads.toLocaleString("en-US")
+            }, {
+                show: logBatchSize,
+                title: "Batch size",
+                value: draftContext.batchSize.toLocaleString("en-US")
+            }, {
+                show: draftContext.flashAttention,
+                title: "Flash attention",
+                value: "enabled"
+            }, {
+                show: tokenMeterEnabled,
+                title: "Token meter",
+                value: "enabled"
+            }]
+        });
+    }
+
+    return padTitle;
 }
 
 function toOneLine(text: string) {
diff --git a/src/cli/utils/resolveCommandGgufPath.ts b/src/cli/utils/resolveCommandGgufPath.ts
index 6c9d638a..3db583c3 100644
--- a/src/cli/utils/resolveCommandGgufPath.ts
+++ b/src/cli/utils/resolveCommandGgufPath.ts
@@ -10,9 +10,9 @@ import {getReadablePath} from "./getReadablePath.js";
 import {interactivelyAskForModel} from "./interactivelyAskForModel.js";
 
 export async function resolveCommandGgufPath(ggufPath: string | undefined, llama: Llama, fetchHeaders?: Record<string, string>, {
-    targetDirectory = cliModelsDirectory, flashAttention = false
+    targetDirectory = cliModelsDirectory, flashAttention = false, consoleTitle = "File"
 }: {
-    targetDirectory?: string, flashAttention?: boolean
+    targetDirectory?: string, flashAttention?: boolean, consoleTitle?: string
 } = {}) {
     if (ggufPath == null)
         ggufPath = await interactivelyAskForModel({
@@ -50,7 +50,7 @@ export async function resolveCommandGgufPath(ggufPath: string | undefined, llama
         const fileStats = await fs.stat(downloader.entrypointFilePath);
 
         if (downloader.totalSize === fileStats.size) {
-            console.info(`${chalk.yellow("File:")} ${getReadablePath(downloader.entrypointFilePath)}`);
+            console.info(`${chalk.yellow(consoleTitle + ":")} ${getReadablePath(downloader.entrypointFilePath)}`);
 
             return downloader.entrypointFilePath;
         }
@@ -62,7 +62,7 @@ export async function resolveCommandGgufPath(ggufPath: string | undefined, llama
 
         if (!res) {
             console.info("Loading the existing file");
-            console.info(`${chalk.yellow("File:")} ${getReadablePath(downloader.entrypointFilePath)}`);
+            console.info(`${chalk.yellow(consoleTitle + ":")} ${getReadablePath(downloader.entrypointFilePath)}`);
 
             return downloader.entrypointFilePath;
         }
@@ -86,7 +86,7 @@ export async function resolveCommandGgufPath(ggufPath: string | undefined, llama
     await downloader.download();
     consoleInteraction.stop();
 
-    console.info(`${chalk.yellow("File:")} ${getReadablePath(downloader.entrypointFilePath)}`);
+    console.info(`${chalk.yellow(consoleTitle + ":")} ${getReadablePath(downloader.entrypointFilePath)}`);
 
     return downloader.entrypointFilePath;
 }
diff --git a/src/evaluator/LlamaChat/LlamaChat.ts b/src/evaluator/LlamaChat/LlamaChat.ts
index 7e8afa7f..3baeca9d 100644
--- a/src/evaluator/LlamaChat/LlamaChat.ts
+++ b/src/evaluator/LlamaChat/LlamaChat.ts
@@ -416,6 +416,9 @@ export class LlamaChat {
             } = {}
         } = options;
 
+        this.sequence.tokenPredictor?.updateInputTokens?.(
+            this.model.tokenize(findLastUserMessageInChatHistory(history)?.text ?? "")
+        );
         const generateResponseState = new GenerateResponseState<Functions>(
             this,
             this._chatWrapper,
@@ -586,6 +589,13 @@ export class LlamaChat {
             ? lastEvaluationContextWindowHistoryItem.text
             : "";
 
+        this.sequence.tokenPredictor?.updateInputTokens?.(
+            this.model.tokenize(
+                (findLastModelMessageInChatHistory(history)?.response ?? [])
+                    .filter((item) => typeof item === "string")
+                    .join(" ")
+            )
+        );
         const generateResponseState = new GenerateResponseState<Functions>(
             this,
             this._chatWrapper,
@@ -994,6 +1004,26 @@ function setLastTextInChatHistory(itemType: "user" | "model", chatHistory: ChatH
         return setLastModelTextResponseInChatHistory(chatHistory, text);
 }
 
+function findLastUserMessageInChatHistory(chatHistory: readonly ChatHistoryItem[]) {
+    for (let i = chatHistory.length - 1; i >= 0; i--) {
+        const item = chatHistory[i]!;
+        if (item.type === "user")
+            return item;
+    }
+
+    return undefined;
+}
+
+function findLastModelMessageInChatHistory(chatHistory: readonly ChatHistoryItem[]) {
+    for (let i = chatHistory.length - 1; i >= 0; i--) {
+        const item = chatHistory[i]!;
+        if (item.type === "model")
+            return item;
+    }
+
+    return undefined;
+}
+
 function generateContextText(
     endWithUserText: boolean,
     chatWrapper: ChatWrapper,
diff --git a/src/evaluator/LlamaCompletion.ts b/src/evaluator/LlamaCompletion.ts
index 9f782a9c..24f5c017 100644
--- a/src/evaluator/LlamaCompletion.ts
+++ b/src/evaluator/LlamaCompletion.ts
@@ -329,6 +329,7 @@ export class LlamaCompletion {
                     ? Math.min(maxTokens, this._sequence.context.contextSize - inputTokens.length)
                     : this._sequence.context.contextSize - inputTokens.length;
 
+            this._sequence.tokenPredictor?.updateInputTokens?.(inputTokens.slice());
             return await this._generateResponse(inputTokens, {
                 onTextChunk: safeEventCallback(onTextChunk),
                 onToken: safeEventCallback(onToken),
@@ -527,6 +528,7 @@ export class LlamaCompletion {
                     ? Math.min(maxTokens, this._sequence.context.contextSize - inputTokens.length)
                     : this._sequence.context.contextSize - inputTokens.length;
 
+            this._sequence.tokenPredictor?.updateInputTokens?.(inputTokens.slice());
             return await this._generateResponse(inputTokens, {
                 onTextChunk: safeEventCallback(onTextChunk),
                 onToken: safeEventCallback(onToken),
diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts
index 4a241635..3f2321d6 100644
--- a/src/evaluator/LlamaContext/LlamaContext.ts
+++ b/src/evaluator/LlamaContext/LlamaContext.ts
@@ -1,4 +1,4 @@
-import {AsyncDisposeAggregator, DisposeAggregator, DisposedError, EventRelay, withLock} from "lifecycle-utils";
+import {acquireLock, AsyncDisposeAggregator, DisposeAggregator, DisposedError, EventRelay, withLock} from "lifecycle-utils";
 import {removeNullFields} from "../../utils/removeNullFields.js";
 import {Token} from "../../types.js";
 import {AddonContext, AddonModelLora, BatchLogitIndex} from "../../bindings/AddonTypes.js";
@@ -10,12 +10,15 @@ import {TokenBias} from "../TokenBias.js";
 import {LlamaModel} from "../LlamaModel/LlamaModel.js";
 import {UnsupportedError} from "../../utils/UnsupportedError.js";
 import {ThreadsSplitterConsumer} from "../../utils/ThreadsSplitter.js";
+import {pushAll} from "../../utils/pushAll.js";
+import {safeEventCallback} from "../../utils/safeEventCallback.js";
 import {
-    BatchingOptions, BatchItem, ContextShiftOptions, ContextTokensDeleteRange, EvaluationPriority, LlamaContextOptions,
-    LlamaContextSequenceRepeatPenalty, PrioritizedBatchItem
+    BatchingOptions, BatchItem, ContextShiftOptions, ContextTokensDeleteRange, ControlledEvaluateIndexOutput, EvaluationPriority,
+    LlamaContextOptions, LlamaContextSequenceRepeatPenalty, PrioritizedBatchItem, SequenceEvaluateOptions
 } from "./types.js";
 import {resolveBatchItemsPrioritizationStrategy} from "./utils/resolveBatchItemsPrioritizationStrategy.js";
 import {LlamaSampler} from "./LlamaSampler.js";
+import {TokenPredictor} from "./TokenPredictor.js";
 import type {Llama} from "../../bindings/Llama.js";
 
 const defaultLoraScale = 1;
@@ -25,6 +28,7 @@ const defaultFailedCreationRemedy = {
     retries: 6,
     autoContextSizeShrink: 0.16
 } as const satisfies Required<LlamaContextOptions["failedCreationRemedy"]>;
+const defaultEvaluationPriority: EvaluationPriority = 5;
 
 export class LlamaContext {
     /** @internal */ public readonly _llama: Llama;
@@ -237,6 +241,23 @@ export class LlamaContext {
     public getSequence(options: {
         contextShift?: ContextShiftOptions,
 
+        /**
+         * Token predictor to use for the sequence.
+         * Don't share the same token predictor between multiple sequences.
+         *
+         * Using a token predictor doesn't affect the generation output itself -
+         * it only allows for greater parallelization of the token evaluation to speed up the generation.
+         *
+         * > **Note:** that if a token predictor is too resource intensive,
+         * > it can slow down the generation process due to the overhead of running the predictor.
+         * >
+         * > Testing the effectiveness of a token predictor on the target machine is recommended before using it in production.
+         *
+         * Automatically disposed when disposing the sequence.
+         * @see [Using Token Predictors](https://node-llama-cpp.withcat.ai/guide/token-prediction)
+         */
+        tokenPredictor?: TokenPredictor,
+
         /** @internal */
         _tokenMeter?: TokenMeter
     } = {}): LlamaContextSequence {
@@ -245,6 +266,7 @@ export class LlamaContext {
                 size: contextShiftSize = Math.min(100, Math.ceil(this.contextSize / 2)),
                 strategy: contextShiftStrategy = "eraseBeginning"
             } = {},
+            tokenPredictor,
 
             _tokenMeter
         } = options;
@@ -262,7 +284,8 @@ export class LlamaContext {
             contextShift: {
                 size: contextShiftSize,
                 strategy: contextShiftStrategy
-            }
+            },
+            tokenPredictor
         });
     }
 
@@ -281,6 +304,7 @@ export class LlamaContext {
             this._batchDispatchPending = false;
 
             let shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
+            const queuedDecodeToMappedLogits = new Map<InternalQueuedDecode, [tokenIndex: number, value: any][]>();
 
             const resolvePrioritizationStrategy = () => {
                 try {
@@ -302,6 +326,7 @@ export class LlamaContext {
                 for (const queuedDecode of this._queuedDecodes) {
                     const batchItem: BatchItem = {
                         tokens: queuedDecode.tokens,
+                        logits: queuedDecode.logits,
                         evaluationPriority: queuedDecode.evaluationPriority
                     };
                     batchItemToQueuedDecodeMap.set(batchItem, queuedDecode);
@@ -369,9 +394,11 @@ export class LlamaContext {
 
             const decodeTokenBatchItems = async (batchItems: CurrentBatchItem[], currentBatchSize: number) => {
                 const afterDecodeActions: Array<{
-                    batchLogitIndex: BatchLogitIndex | undefined,
-                    response: [accept: (res: any) => void, reject: (reason: unknown) => void],
-                    onDone?: (batchLogitIndex: BatchLogitIndex) => any
+                    queuedDecode: InternalQueuedDecode,
+                    batchLogitIndexes: Uint32Array,
+                    batchLogitTokenIndexes: number[],
+                    firstTokenIndex: number,
+                    returnResults?: true
                 }> = [];
                 const queuedDecodesToDelete = new Set<InternalQueuedDecode>();
                 const currentQueuedDecodeItems = new Set<InternalQueuedDecode>();
@@ -380,22 +407,22 @@ export class LlamaContext {
                     this._ctx.initBatch(currentBatchSize);
 
                 for (const {queuedDecode, processAmount} of batchItems) {
-                    let batchLogitIndex: ReturnType<typeof this._ctx.addToBatch>;
-                    try {
-                        const shouldGenerateLogitAtTheEnd = queuedDecode.generateLogitAtTheEnd &&
-                            processAmount === queuedDecode.tokens.length;
-
-                        const tokensToProcess = queuedDecode.tokens.slice(0, processAmount);
+                    let batchLogitIndexes: ReturnType<typeof this._ctx.addToBatch>;
+                    const tokensToProcess = queuedDecode.tokens.slice(0, processAmount);
+                    const tokenIndexesWithLogitsToProcess = queuedDecode.logits.slice(0, processAmount)
+                        .map((logit, index) => (logit ? index : undefined))
+                        .filter((index) => index != undefined);
 
-                        const numberOfOutputTokens = shouldGenerateLogitAtTheEnd ? 1 : 0;
-                        TokenMeter.useTokens(queuedDecode.tokenMeter, Math.max(0, tokensToProcess.length - numberOfOutputTokens), "input");
-                        TokenMeter.useTokens(queuedDecode.tokenMeter, numberOfOutputTokens, "output");
+                    const numberOfOutputTokens = tokenIndexesWithLogitsToProcess.length;
+                    TokenMeter.useTokens(queuedDecode.tokenMeter, Math.max(0, tokensToProcess.length - numberOfOutputTokens), "input");
+                    TokenMeter.useTokens(queuedDecode.tokenMeter, numberOfOutputTokens, "output");
 
-                        batchLogitIndex = this._ctx.addToBatch(
+                    try {
+                        batchLogitIndexes = this._ctx.addToBatch(
                             queuedDecode.sequenceId,
                             queuedDecode.firstTokenSequenceIndex,
                             Uint32Array.from(tokensToProcess),
-                            shouldGenerateLogitAtTheEnd
+                            Uint32Array.from(tokenIndexesWithLogitsToProcess)
                         );
                     } catch (err) {
                         this._dispatchErrorForQueuedDecodesAndDequeue(new Set([queuedDecode]), err);
@@ -406,12 +433,23 @@ export class LlamaContext {
                     if (queuedDecode.tokens.length === processAmount) {
                         queuedDecodesToDelete.add(queuedDecode);
                         afterDecodeActions.push({
-                            batchLogitIndex,
-                            response: queuedDecode.response,
-                            onDone: queuedDecode.onDone
+                            queuedDecode,
+                            batchLogitIndexes,
+                            batchLogitTokenIndexes: tokenIndexesWithLogitsToProcess,
+                            firstTokenIndex: queuedDecode.firstTokenSequenceIndex,
+                            returnResults: true
                         });
                     } else {
+                        if (batchLogitIndexes.length > 0)
+                            afterDecodeActions.push({
+                                queuedDecode,
+                                batchLogitIndexes,
+                                batchLogitTokenIndexes: tokenIndexesWithLogitsToProcess,
+                                firstTokenIndex: queuedDecode.firstTokenSequenceIndex
+                            });
+
                         queuedDecode.tokens = queuedDecode.tokens.slice(processAmount);
+                        queuedDecode.logits = queuedDecode.logits.slice(processAmount);
                         queuedDecode.firstTokenSequenceIndex += processAmount;
                     }
                 }
@@ -444,18 +482,69 @@ export class LlamaContext {
                     }
                 }
 
-                for (const action of afterDecodeActions) {
-                    const [accept, reject] = action.response;
-                    if (action.onDone != null && action.batchLogitIndex != null) {
-                        try {
-                            accept(action.onDone(action.batchLogitIndex ?? null));
-                        } catch (err) {
-                            reject(err);
-                        }
+                function finishAfterDecodeAction(
+                    action: typeof afterDecodeActions[number],
+                    mappedLogitValues?: [index: number, value: any][]
+                ) {
+                    if (mappedLogitValues != null && mappedLogitValues.length > 0) {
+                        if (queuedDecodeToMappedLogits.has(action.queuedDecode))
+                            pushAll(queuedDecodeToMappedLogits.get(action.queuedDecode)!, mappedLogitValues);
+                        else
+                            queuedDecodeToMappedLogits.set(action.queuedDecode, mappedLogitValues);
                     }
 
-                    accept(undefined);
+                    if (action.returnResults != null) {
+                        const [accept] = action.queuedDecode.response;
+                        const mappedLogits = queuedDecodeToMappedLogits.get(action.queuedDecode) ?? [];
+                        queuedDecodeToMappedLogits.delete(action.queuedDecode);
+                        accept(mappedLogits);
+                    }
                 }
+
+                const afterDecodeActionResults = afterDecodeActions.map((action): Promise<void> | void => {
+                    if (action.batchLogitIndexes.length === 0) {
+                        finishAfterDecodeAction(action);
+                        return undefined;
+                    }
+
+                    const mappedLogitValues: ([index: number, value: any] | Promise<[index: number, value: any]>)[] = [];
+                    let promiseChain: Promise<void> | undefined = undefined;
+
+                    const batchLogitIndexes = action.batchLogitIndexes;
+                    const batchLogitTokenIndexes = action.batchLogitTokenIndexes;
+                    for (let i = 0; i < batchLogitIndexes.length; i++) {
+                        const tokenIndex = batchLogitTokenIndexes[i]!;
+
+                        const mappedValue: Promise<any> | any = promiseChain != null
+                            ? promiseChain
+                                .then(() => action.queuedDecode.logitDataMapper(
+                                    batchLogitIndexes[i]! as BatchLogitIndex,
+                                    tokenIndex + action.firstTokenIndex
+                                ))
+                            : action.queuedDecode.logitDataMapper(
+                                batchLogitIndexes[i]! as BatchLogitIndex,
+                                tokenIndex + action.firstTokenIndex
+                            );
+
+                        if (mappedValue instanceof Promise) {
+                            promiseChain = mappedValue;
+                            mappedLogitValues.push(
+                                mappedValue
+                                    .then((value) => [tokenIndex + action.firstTokenIndex, value])
+                            );
+                        } else
+                            mappedLogitValues.push([tokenIndex + action.firstTokenIndex, mappedValue]);
+                    }
+
+                    if (promiseChain != null)
+                        return Promise.all(mappedLogitValues)
+                            .then((resolvedMappedLogitValues) => finishAfterDecodeAction(action, resolvedMappedLogitValues));
+
+                    finishAfterDecodeAction(action, mappedLogitValues as [index: number, value: any][]);
+                    return undefined;
+                });
+
+                await Promise.all(afterDecodeActionResults);
             };
 
             const prioritizationStrategy = resolvePrioritizationStrategy();
@@ -514,21 +603,21 @@ export class LlamaContext {
 
     /** @internal */
     public async _decodeTokens<T>({
-        sequenceId, firstTokenSequenceIndex, tokens, generateLogitAtTheEnd = false, evaluationPriority = 5, tokenMeter
+        sequenceId, firstTokenSequenceIndex, tokens, logits, evaluationPriority = defaultEvaluationPriority, tokenMeter
     }: {
-        sequenceId: number, firstTokenSequenceIndex: number, tokens: Token[], generateLogitAtTheEnd?: boolean,
+        sequenceId: number, firstTokenSequenceIndex: number, tokens: Token[], logits: (true | undefined)[],
         evaluationPriority?: EvaluationPriority, tokenMeter: TokenMeter
-    }, onDone?: ((batchLogitIndex: BatchLogitIndex) => (T | Promise<T>))): Promise<T> {
+    }, logitDataMapper: ((batchLogitIndex: BatchLogitIndex, tokenIndex: number) => T | Promise<T>)): Promise<[index: number, value: T][]> {
         return await new Promise((accept, reject) => {
             this._queuedDecodes.push({
                 sequenceId,
                 tokens,
+                logits,
                 firstTokenSequenceIndex,
-                generateLogitAtTheEnd,
                 evaluationPriority,
                 tokenMeter,
                 response: [accept, reject],
-                onDone
+                logitDataMapper
             });
             this._queuedDecodeSequenceIds.add(sequenceId);
 
@@ -811,26 +900,37 @@ export class LlamaContextSequence {
     /** @internal */ private readonly _gcRegistry: FinalizationRegistry<number>;
     /** @internal */ private readonly _context: LlamaContext;
     /** @internal */ private readonly _contextShift: Required<ContextShiftOptions>;
+    /** @internal */ private readonly _tokenPredictor?: TokenPredictor;
     /** @internal */ private readonly _tokenMeter: TokenMeter;
     /** @internal */ private readonly _disposeAggregator = new DisposeAggregator();
-    /** @internal */ private _contextTokens: Token[] = [];
+    /** @internal */ private readonly _lock = {};
+    /** @internal */ private _resetTokenPredictor: boolean = false;
+    /** @internal */ private _tokenPredictorOwner: {} = {};
+    /** @internal */ public _contextTokens: Token[] = [];
     /** @internal */ private _nextTokenIndex: number = 0;
+    /** @internal */ private _loadedTokenPredictions: [input: Token, output: Token][] = [];
+    /** @internal */ private _usedTokenPredictions: number = 0;
+    /** @internal */ private _unusedTokenPredictions: number = 0;
+    /** @internal */ private _validatedTokenPredictions: number = 0;
+    /** @internal */ private _refutedTokenPredictions: number = 0;
     /** @internal */ private _disposed = false;
 
     public readonly onDispose = new EventRelay<void>();
 
     private constructor({
-        sequenceId, context, tokenMeter, contextShift
+        sequenceId, context, tokenMeter, contextShift, tokenPredictor
     }: {
         sequenceId: number,
         context: LlamaContext,
         tokenMeter?: TokenMeter,
-        contextShift: Required<ContextShiftOptions>
+        contextShift: Required<ContextShiftOptions>,
+        tokenPredictor?: TokenPredictor
     }) {
         this._sequenceId = sequenceId;
         this._context = context;
         this._tokenMeter = tokenMeter ?? new TokenMeter();
         this._contextShift = contextShift;
+        this._tokenPredictor = tokenPredictor;
         this._gcRegistry = new FinalizationRegistry(this._context._reclaimUnusedSequenceId);
 
         this._gcRegistry.register(this, sequenceId);
@@ -846,6 +946,9 @@ export class LlamaContextSequence {
         this._disposeAggregator.add(() => {
             this._context._reclaimUnusedSequenceId(this._sequenceId);
         });
+
+        if (this._tokenPredictor != null)
+            this._disposeAggregator.add(this._tokenPredictor);
     }
 
     public dispose() {
@@ -877,17 +980,57 @@ export class LlamaContextSequence {
     }
 
     public get nextTokenIndex() {
-        return this._nextTokenIndex;
+        return this._nextTokenIndex - this._loadedTokenPredictions.length;
     }
 
     public get contextTokens() {
-        return this._contextTokens.slice();
+        if (this._loadedTokenPredictions.length === 0)
+            return this._contextTokens.slice();
+
+        return this._contextTokens.slice(0, -this._loadedTokenPredictions.length);
     }
 
     public get tokenMeter() {
         return this._tokenMeter;
     }
 
+    /**
+     * The token predictor used when creating this sequence.
+     */
+    public get tokenPredictor() {
+        return this._tokenPredictor;
+    }
+
+    /**
+     * Statistics of token predictions using the sequence's `tokenPredictor`.
+     *
+     * The statistics change only when token prediction is used in this sequence.
+     *
+     * `validated` + `refuted` = total number of evaluated predictions.
+     *
+     * Prefer using `validated` and `refuted` to evaluate the effectiveness of token prediction.
+     */
+    public get tokenPredictions(): {
+        /** Number of token predictions that were actually used (tokens that were validated and then consumed) */
+        used: number,
+
+        /** Number of token predictions that were not used (tokens that were validated and were not consumed) */
+        unused: number,
+
+        /** Number of token predictions that were validated successfully */
+        validated: number,
+
+        /** Number of token predictions that were refuted */
+        refuted: number
+    } {
+        return {
+            used: this._usedTokenPredictions,
+            unused: this._unusedTokenPredictions,
+            validated: this._validatedTokenPredictions,
+            refuted: this._refutedTokenPredictions
+        };
+    }
+
     public get isLoadedToMemory() {
         return !this._disposed;
     }
@@ -895,7 +1038,7 @@ export class LlamaContextSequence {
     public compareContextTokens(tokens: Token[]): {
         firstDifferentIndex: number
     } {
-        for (let i = 0; i < this._contextTokens.length; i++) {
+        for (let i = 0; i < this._contextTokens.length - this._loadedTokenPredictions.length; i++) {
             if (compareTokens(this._contextTokens[i], tokens[i]))
                 continue;
 
@@ -905,7 +1048,7 @@ export class LlamaContextSequence {
         }
 
         return {
-            firstDifferentIndex: this._contextTokens.length
+            firstDifferentIndex: this._contextTokens.length - this._loadedTokenPredictions.length
         };
     }
 
@@ -922,8 +1065,8 @@ export class LlamaContextSequence {
     public async adaptStateToTokens(tokens: Token[], allowShift: boolean = true) {
         if (this.model.fileInsights.isRecurrent || !allowShift) {
             const {firstDifferentIndex} = this.compareContextTokens(tokens);
-            if (firstDifferentIndex < this._nextTokenIndex)
-                await this.eraseContextTokenRanges([{
+            if (firstDifferentIndex < this.nextTokenIndex)
+                await this._eraseContextTokenRanges([{
                     start: firstDifferentIndex,
                     end: this._nextTokenIndex
                 }]);
@@ -935,7 +1078,7 @@ export class LlamaContextSequence {
 
         let tokensIndex = 0;
         let differentTokenIndex: number | undefined = undefined;
-        for (let i = 0; i < this._contextTokens.length && tokensIndex < tokens.length; i++) {
+        for (let i = 0; i < this._contextTokens.length - this._loadedTokenPredictions.length && tokensIndex < tokens.length; i++) {
             if (compareTokens(this._contextTokens[i], tokens[tokensIndex])) {
                 if (differentTokenIndex != null) {
                     eraseRanges.push({
@@ -961,7 +1104,7 @@ export class LlamaContextSequence {
             });
 
         if (eraseRanges.length > 0)
-            await this.eraseContextTokenRanges(eraseRanges);
+            await this._eraseContextTokenRanges(eraseRanges);
     }
 
     /**
@@ -971,7 +1114,7 @@ export class LlamaContextSequence {
     public async clearHistory() {
         this._ensureNotDisposed();
 
-        await this.eraseContextTokenRanges([{start: 0, end: this._nextTokenIndex}]);
+        await this._eraseContextTokenRanges([{start: 0, end: this._nextTokenIndex}]);
     }
 
     /**
@@ -979,7 +1122,23 @@ export class LlamaContextSequence {
      * The start of each range is inclusive, and the end of each range is exclusive.
      * For example, the range `{start: 0, end: 1}` will remove the token at the `0` index only.
      */
-    public async eraseContextTokenRanges(ranges: ContextTokensDeleteRange[]) {
+    public eraseContextTokenRanges(ranges: ContextTokensDeleteRange[]) {
+        return this._eraseContextTokenRanges(ranges);
+    }
+
+    /** @internal */
+    private async _eraseContextTokenRanges(
+        ranges: ContextTokensDeleteRange[],
+        {
+            canResetTokenPredictor = true,
+            canRemovePredictionTokens = true,
+            skipLock = false
+        }: {
+            canResetTokenPredictor?: boolean,
+            canRemovePredictionTokens?: boolean,
+            skipLock?: boolean
+        } = {}
+    ) {
         this._ensureNotDisposed();
 
         await withLock(this._context, "context", async () => {
@@ -1023,6 +1182,21 @@ export class LlamaContextSequence {
                     return ranges;
                 }, [] as ContextTokensDeleteRange[]);
 
+            const tokenPredictionsToRemove = (resolvedRanges.length > 0 && canRemovePredictionTokens)
+                ? this._loadedTokenPredictions.length
+                : 0;
+            if (tokenPredictionsToRemove > 0) {
+                const startDeleteIndex = this._nextTokenIndex - this._loadedTokenPredictions.length;
+                const lastDeleteRange = resolvedRanges[resolvedRanges.length - 1]!;
+                if (lastDeleteRange.end >= startDeleteIndex)
+                    lastDeleteRange.end = this._nextTokenIndex;
+                else
+                    resolvedRanges.push({start: startDeleteIndex, end: this._nextTokenIndex});
+
+                if (canResetTokenPredictor)
+                    await this._abortTokenPredictor(true);
+            }
+
             let removedTokens = 0;
             let lastDeleteRangeEndPos: number | null = null;
             for (const range of resolvedRanges) {
@@ -1040,6 +1214,9 @@ export class LlamaContextSequence {
                 lastDeleteRangeEndPos = range.end;
             }
 
+            if (tokenPredictionsToRemove > 0)
+                this._loadedTokenPredictions.splice(0, tokenPredictionsToRemove);
+
             if (deletionSuccessful && lastDeleteRangeEndPos != null && removedTokens > 0 &&
                 lastDeleteRangeEndPos !== this._nextTokenIndex
             ) {
@@ -1050,6 +1227,9 @@ export class LlamaContextSequence {
 
             this._nextTokenIndex -= removedTokens;
 
+            if (canResetTokenPredictor && removedTokens > 0)
+                await this._abortTokenPredictor(true);
+
             if (deletionSuccessful)
                 return;
 
@@ -1057,57 +1237,16 @@ export class LlamaContextSequence {
             this._nextTokenIndex = 0;
             this._context._ctx.disposeSequence(this._sequenceId);
 
-            await this.evaluateWithoutGeneratingNewTokens(newSequenceTokens);
+            await this.evaluateWithoutGeneratingNewTokens(newSequenceTokens, {_skipLock: skipLock});
         });
     }
 
-    public evaluate(tokens: Token[], options: {
-        temperature?: number, minP?: number, topK?: number, topP?: number,
-
-        /**
-         * Used to control the randomness of the generated text.
-         *
-         * Change the seed to get different results.
-         *
-         * Defaults to the current epoch time.
-         *
-         * Only relevant when using `temperature`.
-         */
-        seed?: number,
-        grammarEvaluationState?: LlamaGrammarEvaluationState | (() => LlamaGrammarEvaluationState | undefined),
-        repeatPenalty?: LlamaContextSequenceRepeatPenalty,
-
-        /**
-         * Adjust the probability of tokens being generated.
-         * Can be used to bias the model to generate tokens that you want it to lean towards,
-         * or to avoid generating tokens that you want it to avoid.
-         */
-        tokenBias?: TokenBias | (() => TokenBias),
-
-        /**
-         * When a lot of tokens are queued for the next batch, more than the configured `batchSize`, the tokens for each sequence will be
-         * evaluated based on the strategy chosen for the context.
-         * By default, the `"maximumParallelism"` strategy is used, which will try to evaluate as many sequences in parallel as possible,
-         * but at some point, it'll have to choose which sequences to evaluate more tokens of, so it'll prioritize the sequences with the
-         * highest evaluation priority.
-         * Also, a custom strategy can be used to prioritize the sequences differently, but generally, the higher the evaluation priority
-         * is, the more likely and more tokens will be evaluated for that sequence in the next queued batch.
-         */
-        evaluationPriority?: EvaluationPriority,
-
-        /** Override the sequence context shift options for this evaluation */
-        contextShift?: ContextShiftOptions,
-
-        /**
-         * Yield an EOG (End Of Generation) token (like EOS and EOT) when it's generated.
-         * When `false` the generation will stop when an EOG token is generated and the token won't be yielded.
-         * Defaults to `false`.
-         */
-        yieldEogToken?: boolean,
-
-        /** @internal */
-        _noSampling?: boolean
-    } = {}): AsyncGenerator<Token, void | Token> {
+    /**
+     * Evaluate the provided tokens into the context sequence, and continue generating new tokens on iterator iterations.
+     *
+     * This method uses the token predictor (when provided) to generate new tokens faster.
+     */
+    public evaluate(tokens: Token[], options: SequenceEvaluateOptions = {}): AsyncGenerator<Token, void | Token> {
         const {
             temperature = 0,
             minP = 0,
@@ -1117,7 +1256,7 @@ export class LlamaContextSequence {
             grammarEvaluationState,
             repeatPenalty,
             tokenBias,
-            evaluationPriority = 5,
+            evaluationPriority = defaultEvaluationPriority,
             contextShift: {
                 size: contextShiftSize = this._contextShift.size,
                 strategy: contextShiftStrategy = this._contextShift.strategy
@@ -1127,6 +1266,25 @@ export class LlamaContextSequence {
             _noSampling = false
         } = options;
 
+        if (this._tokenPredictor != null && !_noSampling && tokens.length > 0)
+            return this._speculativeEvaluate(tokens, {
+                temperature,
+                minP,
+                topK,
+                topP,
+                seed,
+                grammarEvaluationState,
+                repeatPenalty,
+                tokenBias,
+                evaluationPriority,
+                contextShiftOptions: {
+                    size: contextShiftSize,
+                    strategy: contextShiftStrategy
+                },
+                yieldEogToken,
+                tokenPredictor: this._tokenPredictor
+            });
+
         return this._evaluate(tokens, {
             temperature,
             minP,
@@ -1149,16 +1307,8 @@ export class LlamaContextSequence {
 
     /**
      * Evaluate the provided tokens into the context sequence without generating new tokens.
-     * @param tokens
-     * @param [options]
      */
-    public async evaluateWithoutGeneratingNewTokens(tokens: Token[], {
-        evaluationPriority = 5,
-        contextShift: {
-            size: contextShiftSize = this._contextShift.size,
-            strategy: contextShiftStrategy = this._contextShift.strategy
-        } = {}
-    }: {
+    public async evaluateWithoutGeneratingNewTokens(tokens: Token[], options: {
         /**
          * When a lot of tokens are queued for the next batch, more than the configured `batchSize`, the tokens for each sequence will be
          * evaluated based on the strategy chosen for the context.
@@ -1171,46 +1321,258 @@ export class LlamaContextSequence {
         evaluationPriority?: EvaluationPriority,
 
         /** Override the sequence context shift options for this evaluation */
-        contextShift?: ContextShiftOptions
+        contextShift?: ContextShiftOptions,
+
+        /** @internal */
+        _skipLock?: boolean
     } = {}): Promise<void> {
+        const {
+            evaluationPriority = defaultEvaluationPriority,
+            contextShift: {
+                size: contextShiftSize = this._contextShift.size,
+                strategy: contextShiftStrategy = this._contextShift.strategy
+            } = {},
+            _skipLock = false
+        } = options;
+
         const iterator = this._evaluate(tokens, {
             generateNewTokens: false,
             evaluationPriority,
             contextShiftOptions: {
                 size: contextShiftSize,
                 strategy: contextShiftStrategy
-            }
+            },
+            _skipLock
         });
+        const predictorAlignmentPromise = this.tokenPredictor == null
+            ? undefined
+            : this._tokenPredictor?.reset({
+                stateTokens: [...this._contextTokens, ...tokens],
+                evaluateOptions: {
+                    evaluationPriority,
+                    contextShift: {
+                        size: contextShiftSize,
+                        strategy: contextShiftStrategy
+                    }
+                },
+                targetSequence: this
+            });
+        if (predictorAlignmentPromise != null) {
+            this._tokenPredictorOwner = {};
+            this._resetTokenPredictor = false;
+        }
 
         // eslint-disable-next-line @typescript-eslint/no-unused-vars
         for await (const token of iterator) {
             // Array.from doesn't work with async generators, so we have to iterate over the generator
         }
+
+        if (predictorAlignmentPromise != null)
+            await predictorAlignmentPromise;
+    }
+
+    /**
+     * Evaluate the provided tokens into the context sequence with custom options for each token.
+     *
+     * This method allows for more precise control of the generation process.
+     *
+     * A next token will be generated for a given token only if any of the `generateNext` options for it are used.
+     *
+     * To generate more tokens after this method finishes,
+     * use it again with token(s) you selected to add to the context from the previous evaluation.
+     *
+     * This method doesn't use the token predictor (when provided) since it cannot predict which tokens are actually needed.
+     * Use the `evaluate` method when you need to use token prediction.
+     * @returns An array where for each token in the input array, there can be an output item at the same index in the output array.
+     * For indexes that have no output, there won't be any value at the corresponding index in the output array.
+     *
+     * It's recommended to iterate from `0` up to the length of the input array to check the results in the output array.
+     */
+    public async controlledEvaluate(input: Array<Token | [Token, {
+        generateNext?: {
+            /**
+             * Get the full probabilities list of tokens from the vocabulary to be the next token, after applying the given options.
+             *
+             * Only enable when needed, since it impacts the performance.
+             *
+             * Defaults to `false`.
+             */
+            probabilitiesList?: boolean,
+
+            /**
+             * Generate the next token with the provided options using sampling.
+             *
+             * Setting this to `true` will generate probabilities for the next token and sample it.
+             */
+            singleToken?: boolean,
+
+            options?: {
+                temperature?: number, minP?: number, topK?: number, topP?: number,
+
+                /**
+                 * Used to control the randomness of the generated text.
+                 *
+                 * Change the seed to get different results.
+                 *
+                 * Defaults to the current epoch time.
+                 *
+                 * Only relevant when using `temperature`.
+                 */
+                seed?: number,
+                repeatPenalty?: LlamaContextSequenceRepeatPenalty,
+
+                /**
+                 * Adjust the probability of tokens being generated.
+                 * Can be used to bias the model to generate tokens that you want it to lean towards,
+                 * or to avoid generating tokens that you want it to avoid.
+                 */
+                tokenBias?: TokenBias | (() => TokenBias)
+            }
+        }
+    }]>, options?: {
+        /**
+         * When a lot of tokens are queued for the next batch, more than the configured `batchSize`, the tokens for each sequence will be
+         * evaluated based on the strategy chosen for the context.
+         * By default, the `"maximumParallelism"` strategy is used, which will try to evaluate as many sequences in parallel as possible,
+         * but at some point, it'll have to choose which sequences to evaluate more tokens of, so it'll prioritize the sequences with the
+         * highest evaluation priority.
+         * Also, a custom strategy can be used to prioritize the sequences differently, but generally, the higher the evaluation priority
+         * is, the more likely and more tokens will be evaluated for that sequence in the next queued batch.
+         */
+        evaluationPriority?: EvaluationPriority,
+
+        /** Override the sequence context shift options for this evaluation */
+        contextShift?: ContextShiftOptions,
+
+        /** Called on each token result after it's generated */
+        onTokenResult?(inputTokenIndex: number, result: ControlledEvaluateIndexOutput): void
+    }): Promise<Array<undefined | ControlledEvaluateIndexOutput>> {
+        const {
+            evaluationPriority = defaultEvaluationPriority,
+            contextShift: {
+                size: contextShiftSize = this._contextShift.size,
+                strategy: contextShiftStrategy = this._contextShift.strategy
+            } = {}
+        } = options ?? {};
+        const contextShiftOptions: Required<ContextShiftOptions> = {
+            size: contextShiftSize,
+            strategy: contextShiftStrategy
+        };
+
+        this._ensureNotDisposed();
+
+        if (input.length === 0)
+            return [];
+
+        await this._abortTokenPredictor();
+
+        const sampler = new LlamaSampler(this.model);
+        const onTokenResult = safeEventCallback(options?.onTokenResult);
+
+        const logitsArray: (true | undefined)[] = [];
+        const resolvedTokens = input.map((item, index) => {
+            if (item instanceof Array) {
+                const [token, options] = item;
+                const generateNext = options?.generateNext ?? {};
+                if (generateNext.probabilitiesList || generateNext.singleToken === true || generateNext.singleToken != null)
+                    logitsArray[index] = true;
+
+                return token;
+            }
+
+            return item;
+        });
+
+        const evaluatorLock = await acquireLock(this._lock, "evaluate");
+        try {
+            return await this._decodeTokens(
+                resolvedTokens,
+                logitsArray,
+                evaluationPriority,
+                this._tokenMeter,
+                contextShiftOptions,
+                async (batchLogitIndex, tokenIndex) => {
+                    const inputToken = input[tokenIndex];
+                    const inputOptions = inputToken instanceof Array
+                        ? (inputToken[1] ?? {})
+                        : {};
+                    const generateNext = inputOptions.generateNext;
+
+                    if (generateNext == null || (
+                        (generateNext.probabilitiesList == null || !generateNext.probabilitiesList) &&
+                        (generateNext.singleToken == null || !generateNext.singleToken)
+                    ))
+                        return undefined;
+
+                    const sampleOptions = generateNext.options ?? {};
+                    const samplerConfig = this._resolveSamplerConfig({
+                        temperature: sampleOptions.temperature,
+                        minP: sampleOptions.minP,
+                        topK: sampleOptions.topK,
+                        topP: sampleOptions.topP,
+                        seed: sampleOptions.seed,
+                        repeatPenalty: sampleOptions.repeatPenalty,
+                        tokenBias: sampleOptions.tokenBias
+                    });
+
+                    return await withLock(sampler, "sample", async () => {
+                        if (sampler.disposed)
+                            return undefined;
+
+                        sampler.applyConfig(samplerConfig);
+                        const [token, probabilitiesList] = await this._context._ctx.sampleToken(
+                            batchLogitIndex,
+                            sampler._sampler,
+                            !!generateNext.probabilitiesList
+                        );
+
+                        const output: ControlledEvaluateIndexOutput = {
+                            next: {
+                                token: generateNext.singleToken
+                                    ? token === -1
+                                        ? null
+                                        : (token ?? null)
+                                    : undefined,
+                                probabilities: reviveTokenProbabilities(probabilitiesList)
+                            }
+                        };
+                        onTokenResult?.(tokenIndex, output);
+
+                        return output;
+                    });
+                }
+            );
+        } finally {
+            evaluatorLock.dispose();
+            void withLock(sampler, "sample", sampler.asyncDispose);
+        }
     }
 
     /** @internal */
     private async *_evaluate(tokens: Token[], {
-        temperature = 0,
-        minP = 0,
-        topK = 40,
-        topP = 0.95,
+        temperature,
+        minP,
+        topK,
+        topP,
         seed,
         grammarEvaluationState,
         repeatPenalty,
         tokenBias,
-        evaluationPriority = 5,
+        evaluationPriority = defaultEvaluationPriority,
         generateNewTokens = true,
         contextShiftOptions,
         yieldEogToken = false,
 
-        _noSampling = false
+        _noSampling = false,
+        _skipLock = false
     }: {
         temperature?: number, minP?: number, topK?: number, topP?: number, seed?: number,
         grammarEvaluationState?: LlamaGrammarEvaluationState | (() => LlamaGrammarEvaluationState | undefined),
         repeatPenalty?: LlamaContextSequenceRepeatPenalty, tokenBias?: TokenBias | (() => TokenBias),
         evaluationPriority?: EvaluationPriority, generateNewTokens?: boolean, contextShiftOptions: Required<ContextShiftOptions>,
         yieldEogToken?: boolean,
-        _noSampling?: boolean
+        _noSampling?: boolean,
+        _skipLock?: boolean
     }): AsyncGenerator<Token, void | Token> {
         this._ensureNotDisposed();
 
@@ -1219,81 +1581,294 @@ export class LlamaContextSequence {
         if (evalTokens.length === 0)
             return;
 
+        await this._abortTokenPredictor(false, true);
+
         const sampler = new LlamaSampler(this.model);
         try {
             while (true) {
                 this._ensureNotDisposed();
+                const evaluatorLock = _skipLock
+                    ? undefined
+                    : await acquireLock(this._lock, "evaluate");
+                let nextToken: Token | -1 | null | undefined;
 
-                // Evaluate to get the next token.
-                const nextToken: Token | null = await this._decodeTokens(
-                    evalTokens,
-                    generateNewTokens,
-                    evaluationPriority,
-                    this._tokenMeter,
-                    contextShiftOptions,
-                    (batchLogitIndex) => {
-                        if (_noSampling)
-                            return null;
-
-                        const repeatPenaltyTokens = repeatPenalty?.punishTokens instanceof Function
-                            ? repeatPenalty.punishTokens()
-                            : repeatPenalty?.punishTokens;
-
-                        const maxPunishTokens = Math.max(
-                            repeatPenalty?.maxPunishTokens ?? defaultMaxPunishTokens,
-                            repeatPenaltyTokens?.length ?? 0
-                        );
+                try {
+                    const logitsArray: (true | undefined)[] = [];
+
+                    if (generateNewTokens)
+                        logitsArray[evalTokens.length - 1] = true;
+
+                    // Evaluate to get the next token.
+                    const decodeResult = await this._decodeTokens(
+                        evalTokens,
+                        logitsArray,
+                        evaluationPriority,
+                        this._tokenMeter,
+                        contextShiftOptions,
+                        (batchLogitIndex) => {
+                            if (_noSampling)
+                                return null;
+
+                            const samplerConfig = this._resolveSamplerConfig({
+                                temperature,
+                                minP,
+                                topK,
+                                topP,
+                                seed,
+                                grammarEvaluationState,
+                                repeatPenalty,
+                                tokenBias
+                            });
+
+                            return withLock(sampler, "sample", async () => {
+                                if (sampler.disposed)
+                                    return null;
+
+                                sampler.applyConfig(samplerConfig);
+                                return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler);
+                            });
+                        }
+                    );
+
+                    nextToken = decodeResult[evalTokens.length - 1];
+
+                    if (nextToken === -1)
+                        throw new Error("Failed to sample next token");
+
+                    if (nextToken == null)
+                        return;
+
+                    // the model finished generating text
+                    if (!yieldEogToken && this._context.model.isEogToken(nextToken))
+                        break;
+                } finally {
+                    evaluatorLock?.dispose();
+                }
+
+                const replacementToken = (yield nextToken) as undefined | Token;
+
+                // set the tokens for the next evaluation
+                if (replacementToken != null)
+                    evalTokens = [replacementToken];
+                else
+                    evalTokens = [nextToken];
+            }
+        } finally {
+            void withLock(sampler, "sample", sampler.asyncDispose);
+        }
+    }
+
+    /** @internal */
+    private async *_speculativeEvaluate(tokens: Token[], {
+        temperature,
+        minP,
+        topK,
+        topP,
+        seed,
+        grammarEvaluationState,
+        repeatPenalty,
+        tokenBias,
+        evaluationPriority = defaultEvaluationPriority,
+        contextShiftOptions,
+        yieldEogToken = false,
+        tokenPredictor
+    }: {
+        temperature?: number, minP?: number, topK?: number, topP?: number, seed?: number,
+        grammarEvaluationState?: LlamaGrammarEvaluationState | (() => LlamaGrammarEvaluationState | undefined),
+        repeatPenalty?: LlamaContextSequenceRepeatPenalty, tokenBias?: TokenBias | (() => TokenBias),
+        evaluationPriority?: EvaluationPriority, contextShiftOptions: Required<ContextShiftOptions>,
+        yieldEogToken?: boolean, tokenPredictor: TokenPredictor
+    }): AsyncGenerator<Token, void | Token> {
+        this._ensureNotDisposed();
+
+        let evalTokens = tokens.slice();
+
+        if (evalTokens.length === 0)
+            return;
+
+        const tokenPredictorOwner: {} = {};
+        this._tokenPredictorOwner = tokenPredictorOwner;
+        await this._abortTokenPredictor();
+
+        let logitsArray: (true | undefined)[] = [];
+        let logitsStartIndex = evalTokens.length - 1;
+        const validatedTokens: [input: Token, output: Token][] = [];
+        logitsArray[logitsStartIndex] = true;
+
+        const sampler = new LlamaSampler(this.model);
+        try {
+            while (true) {
+                this._ensureNotDisposed();
+                const evaluatorLock = await acquireLock(this._lock, "evaluate");
+                let nextToken: Token | undefined;
 
+                try {
+                    if (this._tokenPredictorOwner === tokenPredictorOwner &&
+                        this._loadedTokenPredictions.length > 0 &&
+                        evalTokens.length === 1 &&
+                        evalTokens[0] === this._loadedTokenPredictions[0]?.[0]
+                    ) {
+                        nextToken = this._loadedTokenPredictions.shift()![1];
                         const resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
                             ? grammarEvaluationState()
                             : grammarEvaluationState;
 
-                        if (resolvedGrammarEvaluationState != null && resolvedGrammarEvaluationState._llama !== this.model._llama)
-                            throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
-
-                        const {tokenBiasKeys, tokenBiasValues} = getTokenBiasesForAddon(tokenBias, this.model);
-
-                        sampler.applyConfig(removeNullFields({
-                            temperature,
-                            minP,
-                            topK,
-                            topP,
-                            seed: Math.max(
-                                0,
-                                Number.isFinite(seed)
-                                    ? Math.floor(seed ?? (Date.now() / 1000))
-                                    : Math.floor(Date.now() / 1000)
-                            ),
-                            repeatPenalty: repeatPenalty?.penalty,
-                            repeatPenaltyMaxTokens: maxPunishTokens,
-                            repeatPenaltyTokens: repeatPenaltyTokens != null
-                                ? Uint32Array.from(repeatPenaltyTokens)
-                                : undefined,
-                            repeatPenaltyPresencePenalty: repeatPenalty?.presencePenalty,
-                            repeatPenaltyFrequencyPenalty: repeatPenalty?.frequencyPenalty,
-                            tokenBiasKeys,
-                            tokenBiasValues,
-                            grammarEvaluationState: resolvedGrammarEvaluationState?._state
-                        }));
-
-                        return withLock(sampler, "sample", async () => {
-                            if (sampler.disposed)
-                                return null;
+                        if (resolvedGrammarEvaluationState != null)
+                            LlamaSampler._acceptTokenOnGrammarEvaluationState(
+                                this._context._llama,
+                                resolvedGrammarEvaluationState,
+                                nextToken
+                            );
+
+                        this._unusedTokenPredictions--;
+                        this._usedTokenPredictions++;
+                    } else if (this._tokenPredictorOwner === tokenPredictorOwner && this._loadedTokenPredictions.length > 0) {
+                        const deleteStartIndex = Math.max(0, this._nextTokenIndex - this._loadedTokenPredictions.length);
+                        await this._eraseContextTokenRanges(
+                            [{start: deleteStartIndex, end: this._nextTokenIndex}],
+                            {canResetTokenPredictor: true, canRemovePredictionTokens: true, skipLock: true}
+                        );
+                        this._loadedTokenPredictions.length = 0;
+                    }
 
-                            return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler);
+                    if (this._resetTokenPredictor) {
+                        await tokenPredictor.reset({
+                            stateTokens: [...this._contextTokens, ...evalTokens],
+                            evaluateOptions: {
+                                temperature,
+                                minP,
+                                topK,
+                                topP,
+                                seed,
+                                grammarEvaluationState: grammarEvaluationState instanceof Function
+                                    ? grammarEvaluationState()?.clone()
+                                    : grammarEvaluationState?.clone(),
+                                repeatPenalty,
+                                tokenBias,
+                                evaluationPriority,
+                                contextShift: contextShiftOptions,
+                                yieldEogToken: true
+                            },
+                            targetSequence: this
                         });
+                        this._resetTokenPredictor = false;
+                        this._tokenPredictorOwner = tokenPredictorOwner;
                     }
-                );
 
-                if (nextToken === -1)
-                    throw new Error("Failed to sample next token");
+                    if (nextToken == null) {
+                        if (this._tokenPredictorOwner === tokenPredictorOwner &&
+
+                            // prevent incurring context shifts due to token prediction validations
+                            this._nextTokenIndex + evalTokens.length < this._context.contextSize
+                        ) {
+                            const testGrammarClone = grammarEvaluationState instanceof Function
+                                ? grammarEvaluationState()?.clone()
+                                : grammarEvaluationState?.clone();
+                            for (const token of await tokenPredictor.predictTokens()) {
+                                if (testGrammarClone != null) {
+                                    const canAddToken = LlamaSampler._canBeNextTokenForGrammarEvaluationState(
+                                        this.model._llama,
+                                        testGrammarClone,
+                                        token
+                                    );
+
+                                    if (!canAddToken)
+                                        break;
+                                }
+
+                                evalTokens.push(token);
+                                logitsArray[evalTokens.length - 1] = true;
+
+                                // prevent incurring context shifts due to token prediction validations
+                                if (this._nextTokenIndex + evalTokens.length >= this._context.contextSize)
+                                    break;
+                            }
+                        }
 
-                if (nextToken == null)
-                    return;
+                        let resolvedGrammarEvaluationState: LlamaGrammarEvaluationState | undefined = undefined;
+
+                        // Evaluate to get the next token.
+                        const decodeResult = await this._decodeTokens(
+                            evalTokens,
+                            logitsArray,
+                            evaluationPriority,
+                            this._tokenMeter,
+                            contextShiftOptions,
+                            (batchLogitIndex, tokenIndex: number) => {
+                                if (tokenIndex === logitsStartIndex)
+                                    resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
+                                        ? grammarEvaluationState()
+                                        : grammarEvaluationState;
+                                else if (tokenIndex === logitsStartIndex + 1)
+                                    resolvedGrammarEvaluationState = resolvedGrammarEvaluationState?.clone();
+
+                                const samplerConfig = this._resolveSamplerConfig({
+                                    temperature,
+                                    minP,
+                                    topK,
+                                    topP,
+                                    seed,
+                                    grammarEvaluationState: resolvedGrammarEvaluationState,
+                                    repeatPenalty,
+                                    tokenBias
+                                });
+
+                                return withLock(sampler, "sample", async () => {
+                                    if (sampler.disposed)
+                                        return null;
+
+                                    sampler.applyConfig(samplerConfig);
+                                    return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler);
+                                });
+                            }
+                        );
 
-                // the model finished generating text
-                if (!yieldEogToken && this._context.model.isEogToken(nextToken))
-                    break;
+                        for (let i = logitsStartIndex; i < evalTokens.length; i++) {
+                            const resultToken = decodeResult[i];
+
+                            if (i === logitsStartIndex) {
+                                if (resultToken === -1)
+                                    throw new Error("Failed to sample next token");
+
+                                if (resultToken == null)
+                                    return;
+
+                                nextToken = resultToken;
+                            } else {
+                                if (resultToken === -1 || resultToken == null)
+                                    break;
+
+                                const lastValidatedTokenOutput = i === logitsStartIndex + 1
+                                    ? nextToken
+                                    : validatedTokens.at(-1)?.[1];
+                                if (lastValidatedTokenOutput != null && lastValidatedTokenOutput === evalTokens[i]) {
+                                    this._loadedTokenPredictions.push([evalTokens[i]!, resultToken]);
+                                    this._validatedTokenPredictions++;
+                                    this._unusedTokenPredictions++;
+                                } else {
+                                    const deleteSize = Math.min(evalTokens.length - i, this.context.contextSize);
+                                    this._refutedTokenPredictions += deleteSize;
+                                    const deleteStartIndex = this._nextTokenIndex - deleteSize;
+                                    tokenPredictor.stop(true);
+                                    await this._eraseContextTokenRanges([{
+                                        start: deleteStartIndex,
+                                        end: this._nextTokenIndex
+                                    }], {canResetTokenPredictor: false, canRemovePredictionTokens: false, skipLock: true});
+                                    break; // the assumption that this token will be generated was wrong
+                                }
+                            }
+                        }
+                    }
+
+                    if (nextToken == null)
+                        throw new Error("Failed to generated next token");
+
+                    // the model finished generating text
+                    if (!yieldEogToken && this._context.model.isEogToken(nextToken))
+                        break;
+                } finally {
+                    evaluatorLock.dispose();
+                }
 
                 const replacementToken = (yield nextToken) as undefined | Token;
 
@@ -1302,62 +1877,151 @@ export class LlamaContextSequence {
                     evalTokens = [replacementToken];
                 else
                     evalTokens = [nextToken];
+
+                if (this._tokenPredictorOwner === tokenPredictorOwner)
+                    tokenPredictor.pushTokens(evalTokens);
+
+                logitsArray = [];
+                logitsStartIndex = evalTokens.length - 1;
+                logitsArray[logitsStartIndex] = true;
             }
         } finally {
             void withLock(sampler, "sample", sampler.asyncDispose);
+
+            if (this._tokenPredictorOwner === tokenPredictorOwner)
+                tokenPredictor.stop();
         }
     }
 
     /** @internal */
+    private async _abortTokenPredictor(skipClearingPredictionsFromState: boolean = false, skipLock: boolean = false) {
+        this._tokenPredictor?.stop();
+        this._resetTokenPredictor = true;
+
+        if (skipClearingPredictionsFromState)
+            return;
+
+        if (this._loadedTokenPredictions.length > 0)
+            await this._eraseContextTokenRanges([{
+                start: this._nextTokenIndex - this._loadedTokenPredictions.length,
+                end: this._nextTokenIndex
+            }], {canResetTokenPredictor: true, canRemovePredictionTokens: true, skipLock});
+    }
+
+    /** @internal */
+    private _resolveSamplerConfig({
+        temperature = 0,
+        minP = 0,
+        topK = 40,
+        topP = 0.95,
+        seed,
+        grammarEvaluationState,
+        repeatPenalty,
+        tokenBias
+    }: {
+        temperature?: number, minP?: number, topK?: number, topP?: number, seed?: number,
+        grammarEvaluationState?: LlamaGrammarEvaluationState | (() => LlamaGrammarEvaluationState | undefined),
+        repeatPenalty?: LlamaContextSequenceRepeatPenalty, tokenBias?: TokenBias | (() => TokenBias)
+    }) {
+        const repeatPenaltyTokens = repeatPenalty?.punishTokens instanceof Function
+            ? repeatPenalty.punishTokens()
+            : repeatPenalty?.punishTokens;
+
+        const maxPunishTokens = Math.max(
+            repeatPenalty?.maxPunishTokens ?? defaultMaxPunishTokens,
+            repeatPenaltyTokens?.length ?? 0
+        );
+
+        const resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
+            ? grammarEvaluationState()
+            : grammarEvaluationState;
+
+        if (resolvedGrammarEvaluationState != null && resolvedGrammarEvaluationState._llama !== this.model._llama)
+            throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
+
+        const {tokenBiasKeys, tokenBiasValues} = getTokenBiasesForAddon(tokenBias, this.model);
+
+        return removeNullFields<Parameters<typeof LlamaSampler.prototype.applyConfig>[0]>({
+            temperature,
+            minP,
+            topK,
+            topP,
+            seed: Math.max(
+                0,
+                Number.isFinite(seed)
+                    ? Math.floor(seed ?? (Date.now() / 1000))
+                    : Math.floor(Date.now() / 1000)
+            ),
+            repeatPenalty: repeatPenalty?.penalty,
+            repeatPenaltyMaxTokens: maxPunishTokens,
+            repeatPenaltyTokens: repeatPenaltyTokens != null
+                ? Uint32Array.from(repeatPenaltyTokens)
+                : undefined,
+            repeatPenaltyPresencePenalty: repeatPenalty?.presencePenalty,
+            repeatPenaltyFrequencyPenalty: repeatPenalty?.frequencyPenalty,
+            tokenBiasKeys,
+            tokenBiasValues,
+            grammarEvaluationState: resolvedGrammarEvaluationState?._state
+        });
+    }
+
+    /**
+     * The caller of this function has to wrap it with a lock to ensure this function doesn't run concurrently.
+     * @internal
+     */
     private async _decodeTokens<T>(
         tokens: Token[],
-        generateLogit: boolean,
+        logits: (true | undefined)[],
         evaluationPriority: EvaluationPriority,
         tokenMeter: TokenMeter,
         contextShiftOptions: Required<ContextShiftOptions>,
-        onDecodeDone: ((batchLogitIndex: BatchLogitIndex) => T | Promise<T>)
-    ): Promise<T | null> {
+        logitDataMapper: ((batchLogitIndex: BatchLogitIndex, tokenIndex: number) => T | Promise<T>)
+    ): Promise<Array<undefined | T>> {
         this._ensureNotDisposed();
 
         const tokensLeftToDecode = tokens.slice();
+        const tokenLogitsLeftToDecode = logits.slice();
+        let currentTokenIndex = 0;
+        const res: Array<undefined | T> = [];
 
-        return await withLock(this, "evaluate", async (): Promise<T | null> => {
-            while (tokensLeftToDecode.length > 0) {
-                this._ensureNotDisposed();
+        const normalizedLogitDataMapper = (batchLogitIndex: BatchLogitIndex, contextStateTokenIndex: number) => {
+            return logitDataMapper(batchLogitIndex, currentTokenIndex + (contextStateTokenIndex - this._nextTokenIndex));
+        };
 
-                let freeSpace = this._context.contextSize - 1 - this._nextTokenIndex;
+        while (tokensLeftToDecode.length > 0) {
+            this._ensureNotDisposed();
 
-                if (freeSpace <= 0) {
-                    await this._freeUpSpaceForTokens(contextShiftOptions);
-                    freeSpace = this._context.contextSize - 1 - this._nextTokenIndex;
+            let freeSpace = this._context.contextSize - 1 - this._nextTokenIndex;
 
-                    if (freeSpace <= 0)
-                        throw new Error("Failed to free up space for new tokens");
-                }
+            if (freeSpace <= 0) {
+                await this._freeUpSpaceForTokens(contextShiftOptions);
+                freeSpace = this._context.contextSize - 1 - this._nextTokenIndex;
 
-                const tokensToDecode = tokensLeftToDecode.splice(0, freeSpace);
-                const generateLogitAtTheEnd = generateLogit && tokensLeftToDecode.length === 0;
+                if (freeSpace <= 0)
+                    throw new Error("Failed to free up space for new tokens");
+            }
 
-                const nextToken = await this._context._decodeTokens({
-                    sequenceId: this._sequenceId,
-                    tokens: tokensToDecode,
-                    firstTokenSequenceIndex: this._nextTokenIndex,
-                    generateLogitAtTheEnd,
-                    evaluationPriority,
-                    tokenMeter
-                }, !generateLogitAtTheEnd
-                    ? undefined
-                    : onDecodeDone
-                );
-                this._nextTokenIndex += tokensToDecode.length;
-                this._contextTokens = this._contextTokens.concat(tokensToDecode);
+            const tokensToDecode = tokensLeftToDecode.splice(0, freeSpace);
+            const tokensLogits = tokenLogitsLeftToDecode.slice(0, tokensToDecode.length);
 
-                if (generateLogitAtTheEnd && nextToken != null)
-                    return nextToken;
-            }
+            const generatedLogits = await this._context._decodeTokens({
+                sequenceId: this._sequenceId,
+                tokens: tokensToDecode,
+                firstTokenSequenceIndex: this._nextTokenIndex,
+                logits: tokensLogits,
+                evaluationPriority,
+                tokenMeter
+            }, normalizedLogitDataMapper);
 
-            return null;
-        });
+            for (const [index, value] of generatedLogits)
+                res[currentTokenIndex + (index - this._nextTokenIndex)] = value;
+
+            this._nextTokenIndex += tokensToDecode.length;
+            currentTokenIndex += tokensToDecode.length;
+            this._contextTokens = this._contextTokens.concat(tokensToDecode);
+        }
+
+        return res;
     }
 
     /** @internal */
@@ -1381,7 +2045,7 @@ export class LlamaContextSequence {
             if (this.model.tokens.bos != null && this._contextTokens[0] === this.model.tokens.bos)
                 eraseStartIndex = 1;
 
-            await this.eraseContextTokenRanges([{start: eraseStartIndex, end: size + eraseStartIndex}]);
+            await this._eraseContextTokenRanges([{start: eraseStartIndex, end: size + eraseStartIndex}], {skipLock: true});
         } else {
             const ranges = await contextShiftOptions.strategy({
                 sequence: this,
@@ -1391,10 +2055,10 @@ export class LlamaContextSequence {
             if (ranges == null)
                 throw new Error("Invalid delete ranges");
 
-            await this.eraseContextTokenRanges(ranges);
+            await this._eraseContextTokenRanges(ranges, {skipLock: true});
 
-            if (this.nextTokenIndex >= this._context.contextSize - 1)
-                await this.eraseContextTokenRanges([{start: 0, end: size}]);
+            if (this._nextTokenIndex >= this._context.contextSize - 1)
+                await this._eraseContextTokenRanges([{start: 0, end: size}], {skipLock: true});
         }
     }
 
@@ -1413,12 +2077,14 @@ export class LlamaContextSequence {
         contextShift: {
             size: contextShiftSize = Math.min(100, Math.ceil(context.contextSize / 2)),
             strategy: contextShiftStrategy = "eraseBeginning"
-        } = {}
+        } = {},
+        tokenPredictor
     }: {
         sequenceId: number,
         context: LlamaContext,
         tokenMeter?: TokenMeter,
-        contextShift?: ContextShiftOptions
+        contextShift?: ContextShiftOptions,
+        tokenPredictor?: TokenPredictor
     }): LlamaContextSequence {
         return new LlamaContextSequence({
             sequenceId,
@@ -1427,7 +2093,8 @@ export class LlamaContextSequence {
             contextShift: {
                 size: contextShiftSize,
                 strategy: contextShiftStrategy
-            }
+            },
+            tokenPredictor
         });
     }
 }
@@ -1436,11 +2103,11 @@ type InternalQueuedDecode = {
     sequenceId: number,
     firstTokenSequenceIndex: number,
     tokens: readonly Token[],
-    generateLogitAtTheEnd: boolean,
+    logits: (true | undefined)[],
     evaluationPriority: EvaluationPriority,
     tokenMeter: TokenMeter,
     response: [accept: (res: any) => void, reject: (reason: unknown) => void],
-    onDone?: (batchLogitIndex: BatchLogitIndex) => any
+    logitDataMapper: ((batchLogitIndex: BatchLogitIndex, tokenIndex: number) => any | Promise<any>)
 };
 
 type CurrentBatchItem = {
@@ -1485,6 +2152,22 @@ function getTokenBiasesForAddon(tokenBias: undefined | TokenBias | (() => TokenB
     };
 }
 
+function reviveTokenProbabilities(probabilitiesList?: (Token | number)[]) {
+    if (probabilitiesList == null)
+        return undefined;
+
+    const res = new Map<Token, number>();
+
+    for (let i = 1; i < probabilitiesList.length; i++) {
+        const token = probabilitiesList[i - 1]! as Token;
+        const probability = probabilitiesList[i]! as number;
+
+        res.set(token, probability);
+    }
+
+    return res;
+}
+
 function disposeContextIfReferenced(contextRef: WeakRef<LlamaContext>) {
     const context = contextRef.deref();
 
diff --git a/src/evaluator/LlamaContext/TokenPredictor.ts b/src/evaluator/LlamaContext/TokenPredictor.ts
new file mode 100644
index 00000000..f7f7ff9f
--- /dev/null
+++ b/src/evaluator/LlamaContext/TokenPredictor.ts
@@ -0,0 +1,65 @@
+import {Token} from "../../types.js";
+import {SequenceEvaluateOptions} from "./types.js";
+import {LlamaContextSequence} from "./LlamaContext.js";
+
+/**
+ * @see [Using Token Predictors](https://node-llama-cpp.withcat.ai/guide/token-prediction#custom)
+ */
+export abstract class TokenPredictor {
+    /**
+     * Resets the state of the predictor.
+     *
+     * Called before the generation starts.
+     */
+    public abstract reset(params: {
+        /** The target sequence that this token predictor is generating tokens for */
+        targetSequence: LlamaContextSequence,
+
+        /**
+         * The tokens that are or will be loaded into the state.
+         *
+         * The initial predictions should be based on these tokens.
+         *
+         * When additional tokens are pushed into the state, the `pushTokens` method will be called with those tokens.
+         */
+        stateTokens: Token[],
+
+        /**
+         * Options used for the evaluation on the target sequence.
+         *
+         * The `grammarEvaluationState` is cloned before being passed to the token predictor,
+         * so it can be modified without affecting the original state.
+         */
+        evaluateOptions: Readonly<SequenceEvaluateOptions>
+    }): Promise<void> | void;
+    public abstract pushTokens(tokens: Token[]): void;
+
+    /**
+     * Predicts the next tokens based on the current state.
+     *
+     * If the generation should wait until the minimum predications are ready,
+     * this method should return a promise that resolves when the minimum predictions are ready.
+     *
+     * A background prediction process can be started when this function is called,
+     * so that the next predictions will be ready when this function is called again.
+     */
+    public abstract predictTokens(): Promise<Token[]> | Token[];
+
+    /**
+     * Stops the prediction process when it runs in the background.
+     * @param untilPredictionsExhausted - If true, the prediction process should not resume until the current predictions are exhausted.
+     */
+    public stop(untilPredictionsExhausted?: boolean): Promise<void> | void {}
+
+    /**
+     * Called with the input tokens before the generation starts when using `LlamaChatSession`, `LlamaChat`, and `LlamaCompletion`.
+     */
+    public updateInputTokens(tokens: Token[]): void {}
+
+    public dispose(): Promise<void> | void {}
+
+    /** @hidden */
+    public [Symbol.dispose]() {
+        return this.dispose();
+    }
+}
diff --git a/src/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.ts b/src/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.ts
new file mode 100644
index 00000000..5c85d92c
--- /dev/null
+++ b/src/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.ts
@@ -0,0 +1,326 @@
+import {withLock} from "lifecycle-utils";
+import {Token} from "../../../types.js";
+import {LlamaGrammarEvaluationState} from "../../LlamaGrammarEvaluationState.js";
+import {pushAll} from "../../../utils/pushAll.js";
+import {getConsoleLogPrefix} from "../../../utils/getConsoleLogPrefix.js";
+import {SequenceEvaluateOptions} from "../types.js";
+import {LlamaSampler} from "../LlamaSampler.js";
+import {LlamaContextSequence} from "../LlamaContext.js";
+import {TokenPredictor} from "../TokenPredictor.js";
+
+const defaultPredictionMinTokens = 0;
+const defaultPredictionMaxTokens = 16;
+
+/**
+ * Predicts the next tokens by evaluating the current state of the target sequence
+ * on a draft sequence from a smaller and faster draft model.
+ * @see [Using Token Predictors: Draft Model Token Predictor](https://node-llama-cpp.withcat.ai/guide/token-prediction#draft-model)
+ */
+export class DraftSequenceTokenPredictor extends TokenPredictor {
+    /** @internal */ private readonly _draftSequence: LlamaContextSequence;
+    /** @internal */ private readonly _minTokens: number;
+    /** @internal */ private readonly _maxTokens: number;
+    /** @internal */ private _stateTokens: Token[] = [];
+    /** @internal */ private _pendingEvalTokens: Token[] = [];
+    /** @internal */ private _predictedTokens: Token[] = [];
+    /** @internal */ private _evaluateOptions: SequenceEvaluateOptions = {};
+    /** @internal */ private _overrideEvaluateOptions: SequenceEvaluateOptions = {};
+    /** @internal */ private _grammarEvaluationStateOption?: LlamaGrammarEvaluationState;
+    /** @internal */ private _currentEvaluationAbortController: AbortController = new AbortController();
+    /** @internal */ private _resetAbortController: AbortController = new AbortController();
+    /** @internal */ private _stopped: boolean = true;
+    /** @internal */ private _waitForPredictionExhaustion: boolean = false;
+    /** @internal */ private _minTokensCallbacks: Array<() => void> = [];
+    /** @internal */ private _resetPredictions: boolean = false;
+    /** @internal */ private _iterator?: AsyncGenerator<Token, void | Token>;
+    /** @internal */ private _active: boolean = false;
+    /** @internal */ private _disposed: boolean = false;
+
+    public constructor(draftSequence: LlamaContextSequence, options: {
+        /**
+         * The minimum number of tokens to draft.
+         *
+         * Defaults to `0`.
+         */
+        minTokens?: number,
+
+        /**
+         * Maximum number of tokens to draft.
+         *
+         * Defaults to `16`.
+         */
+        maxTokens?: number,
+
+        /**
+         * Evaluate options default to the values of the target sequence.
+         *
+         * You can override any of the options for the prediction here.
+         */
+        evaluateOptions?: Pick<SequenceEvaluateOptions, "temperature" | "minP" | "topK" | "topP" | "seed" | "repeatPenalty" | "tokenBias" | "evaluationPriority" | "contextShift">
+    } = {}) {
+        super();
+
+        this._draftSequence = draftSequence;
+        this._minTokens = Math.floor(Math.max(0, options?.minTokens ?? defaultPredictionMinTokens));
+        this._maxTokens = Math.floor(Math.max(this._minTokens, options?.maxTokens ?? defaultPredictionMaxTokens));
+        this._overrideEvaluateOptions = options.evaluateOptions ?? {};
+
+        if (draftSequence.disposed)
+            throw new Error("The draft sequence is disposed");
+    }
+
+    public get draftSequence() {
+        return this._draftSequence;
+    }
+
+    public get minTokens() {
+        return this._minTokens;
+    }
+
+    public get maxTokens() {
+        return this._maxTokens;
+    }
+
+    public async reset({targetSequence, stateTokens, evaluateOptions}: {
+        targetSequence: LlamaContextSequence,
+        stateTokens: Token[],
+        evaluateOptions: Readonly<SequenceEvaluateOptions>
+    }) {
+        this._currentEvaluationAbortController.abort();
+        this._resetAbortController.abort();
+        this._currentEvaluationAbortController = new AbortController();
+        this._resetAbortController = new AbortController();
+        this._stopped = true;
+        this._waitForPredictionExhaustion = false;
+        this._iterator?.return();
+        this._iterator = undefined;
+        const currentAbortSignal = this._resetAbortController.signal;
+
+        targetSequence.context._ctx.ensureDraftContextIsCompatibleForSpeculative(this._draftSequence.context._ctx);
+
+        try {
+            await withLock(this, "evaluate", currentAbortSignal, async () => {
+                this._stateTokens = stateTokens.slice();
+                this._pendingEvalTokens = [];
+                this._predictedTokens = [];
+                this._resetPredictions = false;
+
+                while (this._minTokensCallbacks.length > 0)
+                    this._minTokensCallbacks.shift()?.();
+
+                const lastToken = this._stateTokens.pop();
+                if (lastToken != null)
+                    this._pendingEvalTokens.push(lastToken);
+
+                this._evaluateOptions = evaluateOptions;
+                this._grammarEvaluationStateOption = this._evaluateOptions.grammarEvaluationState instanceof Function
+                    ? this._evaluateOptions.grammarEvaluationState()?.clone()
+                    : this._evaluateOptions.grammarEvaluationState?.clone();
+
+                const newStateTokens = this._stateTokens.slice(-this._draftSequence.context.contextSize + 1);
+                await this._draftSequence.adaptStateToTokens(newStateTokens, true);
+
+                newStateTokens.splice(0, this._draftSequence.nextTokenIndex);
+
+                await this._draftSequence.evaluateWithoutGeneratingNewTokens(newStateTokens, {
+                    contextShift: this._evaluateOptions.contextShift,
+                    evaluationPriority: this._evaluateOptions.evaluationPriority
+                });
+            });
+        } catch (err) {
+            if (err !== currentAbortSignal.reason)
+                throw err;
+        }
+    }
+
+    public pushTokens(tokens: Token[]) {
+        const grammarEvaluationStateOption = this._evaluateOptions.grammarEvaluationState instanceof Function
+            ? this._evaluateOptions.grammarEvaluationState()?.clone()
+            : this._evaluateOptions.grammarEvaluationState?.clone();
+        void withLock(this, "pushTokens", async () => {
+            this._grammarEvaluationStateOption = grammarEvaluationStateOption;
+
+            const tokensToPush = tokens.slice();
+            while (!this._resetPredictions && tokensToPush.length > 0) {
+                const token = tokensToPush.shift()!;
+
+                if (this._predictedTokens.length > 0 && this._predictedTokens[0] === token) {
+                    this._predictedTokens.shift();
+                } else {
+                    tokensToPush.unshift(token);
+                    break;
+                }
+            }
+
+            if (tokensToPush.length === 0) {
+                if (!this._waitForPredictionExhaustion || this._predictedTokens.length === 0)
+                    this._resume();
+
+                return;
+            }
+
+            this._currentEvaluationAbortController.abort();
+            this._currentEvaluationAbortController = new AbortController();
+
+            pushAll(this._pendingEvalTokens, tokensToPush);
+            this._resetPredictions = true;
+
+            this._resume();
+        });
+    }
+
+    public predictTokens() {
+        if (this._stopped && this._pendingEvalTokens.length === 0 && !this._resetPredictions)
+            return this._predictedTokens;
+
+        this._stopped = false;
+        if (!this._waitForPredictionExhaustion || this._predictedTokens.length === 0) {
+            this._waitForPredictionExhaustion = false;
+            this._resume();
+        }
+
+        if (this._predictedTokens.length >= this._minTokens && !this._resetPredictions)
+            return this._predictedTokens;
+
+        if (!this._active || (this._waitForPredictionExhaustion && this._predictedTokens.length > 0)) {
+            if (this._resetPredictions)
+                return [];
+
+            return this._predictedTokens;
+        }
+
+        return new Promise<void>((accept) => void this._minTokensCallbacks.push(accept))
+            .then(() => {
+                if (this._resetPredictions)
+                    return [];
+
+                return this._predictedTokens;
+            });
+    }
+
+    public override stop(untilPredictionsExhausted: boolean = false) {
+        this._stopped = true;
+        this._currentEvaluationAbortController.abort();
+        this._currentEvaluationAbortController = new AbortController();
+        void withLock(this, "evaluate", async () => {
+            this._iterator?.return();
+            this._iterator = undefined;
+        });
+    }
+
+    public override dispose() {
+        this._disposed = true;
+        this._stopped = true;
+        this._resetAbortController.abort();
+        this._currentEvaluationAbortController.abort();
+
+        void withLock(this, "evaluate", async () => {
+            this._iterator?.return();
+            this._iterator = undefined;
+        });
+    }
+
+    /** @internal */
+    private _canIterate(): boolean {
+        return !this._disposed && !this._stopped && (this._predictedTokens.length < this._maxTokens || this._resetPredictions);
+    }
+
+    /** @internal */
+    private _resume() {
+        if (this._active || !this._canIterate())
+            return;
+
+        this._active = true;
+        void withLock(this, "evaluate", async () => {
+            try {
+                const abortSignal = this._currentEvaluationAbortController.signal;
+
+                if (!this._canIterate() || abortSignal.aborted)
+                    return;
+
+                const resetPredications = async () => {
+                    this._iterator?.return();
+                    this._iterator = undefined;
+                    this._waitForPredictionExhaustion = false;
+                    this._resetPredictions = false;
+                    const tokenToDelete = Math.max(0, Math.min(this._predictedTokens.length - 1, this._draftSequence.context.contextSize));
+                    this._predictedTokens = [];
+                    await this._draftSequence.eraseContextTokenRanges([{
+                        start: this._draftSequence.nextTokenIndex - tokenToDelete,
+                        end: this._draftSequence.nextTokenIndex
+                    }]);
+                };
+
+                const createIterator = () => {
+                    const tokens = this._pendingEvalTokens;
+                    this._pendingEvalTokens = [];
+                    return this.draftSequence.evaluate(tokens, {
+                        ...this._evaluateOptions,
+                        ...this._overrideEvaluateOptions,
+                        grammarEvaluationState: this._getGrammarEvaluationStateWithTokens(tokens)
+                    });
+                };
+
+                if (this._resetPredictions)
+                    await resetPredications();
+
+                if (!this._canIterate() || abortSignal.aborted)
+                    return;
+
+                let iterator = createIterator();
+                this._iterator = iterator;
+                while (this._canIterate() && !abortSignal.aborted) {
+                    const {value, done} = await iterator.next();
+                    if (value != null)
+                        this._predictedTokens.push(value);
+
+                    if (this._resetPredictions && !abortSignal.aborted) {
+                        await resetPredications();
+                        iterator = createIterator();
+                        this._iterator = iterator;
+                        continue;
+                    }
+
+                    if (this._predictedTokens.length >= this._minTokens) {
+                        while (this._minTokensCallbacks.length > 0)
+                            this._minTokensCallbacks.shift()?.();
+                    }
+
+                    if (done) {
+                        this._iterator = undefined;
+                        await iterator.return();
+                        this._waitForPredictionExhaustion = true;
+
+                        while (this._minTokensCallbacks.length > 0)
+                            this._minTokensCallbacks.shift()?.();
+
+                        break;
+                    }
+                }
+            } finally {
+                this._active = false;
+            }
+        });
+    }
+
+    /** @internal */
+    private _getGrammarEvaluationStateWithTokens(tokens: Token[]) {
+        if (this._grammarEvaluationStateOption == null)
+            return undefined;
+
+        const clone = this._grammarEvaluationStateOption.clone();
+        for (const token of tokens) {
+            const canAddToken = LlamaSampler._canBeNextTokenForGrammarEvaluationState(this._draftSequence.model._llama, clone, token);
+
+            if (!canAddToken) {
+                console.warn(getConsoleLogPrefix(false, false), "The pushed tokens are incompatible with the grammar evaluation state. The grammar will be ignored.");
+                this._grammarEvaluationStateOption = undefined;
+                return undefined;
+            }
+
+            LlamaSampler._acceptTokenOnGrammarEvaluationState(this._draftSequence.model._llama, clone, token);
+        }
+
+        return clone;
+    }
+}
diff --git a/src/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.ts b/src/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.ts
new file mode 100644
index 00000000..ba890b53
--- /dev/null
+++ b/src/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.ts
@@ -0,0 +1,221 @@
+import {DisposedError} from "lifecycle-utils";
+import {Token} from "../../../types.js";
+import {pushAll} from "../../../utils/pushAll.js";
+import {TokenPredictor} from "../TokenPredictor.js";
+
+const defaultPatternMinLength = 1;
+const defaultPatternMaxLength = 0;
+const defaultPredictionMinLength = 1;
+const defaultPredictionMaxLength = 3;
+
+/**
+ * Attempts to find the last few generated tokens in the input (prompt) tokens to predict the next tokens.
+ *
+ * This is useful in input-grounded tasks (when the model frequently repeats some of the input tokens in the output,
+ * such as in text summarization or modifying code).
+ *
+ * This works in all completion classes, including `LlamaChatSession`, `LlamaChat`, and `LlamaCompletion`.
+ *
+ * Based on https://github.com/apoorvumang/prompt-lookup-decoding.
+ * @see [Using Token Predictors: Input Lookup Token Predictor](https://node-llama-cpp.withcat.ai/guide/token-prediction#input-lookup)
+ */
+export class InputLookupTokenPredictor extends TokenPredictor {
+    /** @internal */ private readonly _patternMinLength: number;
+    /** @internal */ private readonly _patternMaxLength: number;
+    /** @internal */ private readonly _predictionMinLength: number;
+    /** @internal */ private readonly _predictionMaxLength: number;
+    /** @internal */ private _lastPredictionMatchStartIndex: number | undefined = undefined;
+    /** @internal */ private _lastPredictionMatchLength: number | undefined = undefined;
+    /** @internal */ private _stateTokens: Token[] = [];
+    /** @internal */ private _inputTokens: Token[] = [];
+    /** @internal */ private _disposed = false;
+
+    public constructor(options: {
+        patternLength?: {
+            /**
+             * Min pattern length to look for in the input tokens.
+             *
+             * Defaults to `1`.
+             */
+            min?: number,
+
+            /**
+             * Max pattern length to look for in the input tokens.
+             *
+             * Set to `0` to disable the max pattern size.
+             *
+             * Defaults to `0`.
+             */
+            max?: number
+        },
+
+        predictionLength?: {
+            /**
+             * Minimum number of tokens to predict.
+             *
+             * Defaults to `1`.
+             */
+            min?: number,
+
+            /**
+             * Maximum number of tokens to predict.
+             *
+             * Defaults to `3`.
+             */
+            max?: number
+        }
+    } = {}) {
+        super();
+
+        this._patternMinLength = Math.floor(Math.max(1, options?.patternLength?.min ?? defaultPatternMinLength));
+        this._patternMaxLength = Math.floor(
+            Math.max(
+                0,
+                Math.max(this._patternMinLength, options?.patternLength?.max ?? defaultPatternMaxLength)
+            )
+        );
+        this._predictionMinLength = Math.floor(Math.max(1, options.predictionLength?.min ?? defaultPredictionMinLength));
+        this._predictionMaxLength = Math.floor(
+            Math.max(
+                this._patternMinLength,
+                options.predictionLength?.max ?? defaultPredictionMaxLength
+            )
+        );
+    }
+
+    public get patternMinLength() {
+        return this._patternMinLength;
+    }
+
+    public get patternMaxLength() {
+        return this._patternMaxLength;
+    }
+
+    public get predictionMinLength() {
+        return this._predictionMinLength;
+    }
+
+    public get predictionMaxLength() {
+        return this._predictionMaxLength;
+    }
+
+    public reset({stateTokens}: {
+        stateTokens: Token[]
+    }) {
+        this._stateTokens = stateTokens.slice();
+        delete this._lastPredictionMatchStartIndex;
+        delete this._lastPredictionMatchLength;
+    }
+
+    public override updateInputTokens(tokens: Token[]) {
+        this._inputTokens = tokens.slice();
+        delete this._lastPredictionMatchStartIndex;
+        delete this._lastPredictionMatchLength;
+    }
+
+    public pushTokens(tokens: Token[]) {
+        pushAll(this._stateTokens, tokens);
+
+        if (this._lastPredictionMatchStartIndex != null && this._lastPredictionMatchLength != null) {
+            this._lastPredictionMatchLength += tokens.length;
+        }
+    }
+
+    public predictTokens() {
+        if (this._disposed)
+            throw new DisposedError();
+
+        if (this._inputTokens.length === 0 || this._stateTokens.length === 0)
+            return [];
+
+        if (this._lastPredictionMatchStartIndex != null && this._lastPredictionMatchLength != null) {
+            for (
+                let p = this._lastPredictionMatchStartIndex + this._lastPredictionMatchLength - 1,
+                    s = this._stateTokens.length - 1;
+                p >= this._lastPredictionMatchStartIndex && s >= 0;
+                p--, s--
+            ) {
+                if (this._inputTokens[p] !== this._stateTokens[s]) {
+                    delete this._lastPredictionMatchStartIndex;
+                    delete this._lastPredictionMatchLength;
+                    break;
+                }
+            }
+
+            if (this._lastPredictionMatchStartIndex != null && this._lastPredictionMatchLength != null) {
+                const predictionEndIndex = this._lastPredictionMatchStartIndex + this._lastPredictionMatchLength;
+                if (predictionEndIndex < this._inputTokens.length) {
+                    return this._inputTokens.slice(predictionEndIndex, predictionEndIndex + this._predictionMaxLength);
+                }
+            }
+        }
+
+        const [matchStartIndex, matchLength] = this._findLongestPatternIndex(this._inputTokens, this._stateTokens);
+        if (matchStartIndex == null || matchLength == null)
+            return [];
+
+        const predictionEndIndex = matchStartIndex + matchLength;
+        const res = this._inputTokens.slice(predictionEndIndex, predictionEndIndex + this._predictionMaxLength);
+
+        if (res.length >= this._predictionMinLength) {
+            this._lastPredictionMatchStartIndex = matchStartIndex;
+            this._lastPredictionMatchLength = matchLength;
+            return res;
+        }
+
+        return [];
+    }
+
+    public override dispose() {
+        this._disposed = true;
+        this._stateTokens = [];
+        this._inputTokens = [];
+        delete this._lastPredictionMatchStartIndex;
+        delete this._lastPredictionMatchLength;
+    }
+
+    /** @internal */
+    private _findLongestPatternIndex(findIn: Token[], lookupPattern: Token[]): [index: number, length: number] | [] {
+        const checkIndexes: number[] = [];
+        let bestIndex = -1;
+        let bestIndexDiff = -1;
+
+        for (let i = findIn.length - this._predictionMinLength; i >= 0; i--) {
+            const token = findIn[i];
+
+            for (let j = checkIndexes.length - 1; j >= 0; j--) {
+                const startIndex = checkIndexes[j]!;
+                const indexDiff = startIndex - i;
+                if (lookupPattern[lookupPattern.length - 1 - indexDiff] !== token || (
+                    this._patternMaxLength > 0 && indexDiff >= this._patternMaxLength
+                )) {
+                    checkIndexes.splice(j, 1);
+
+                    if (indexDiff >= this._patternMinLength && indexDiff >= bestIndexDiff) {
+                        bestIndex = startIndex;
+                        bestIndexDiff = indexDiff;
+                    }
+                }
+            }
+
+            if (token === lookupPattern[lookupPattern.length - 1])
+                checkIndexes.unshift(i);
+        }
+
+        for (let j = checkIndexes.length - 1; j >= 0; j--) {
+            const startIndex = checkIndexes[j]!;
+            const indexDiff = startIndex + 1;
+            checkIndexes.splice(j, 1);
+
+            if (indexDiff >= this._patternMinLength && indexDiff >= bestIndexDiff) {
+                bestIndex = startIndex;
+                bestIndexDiff = indexDiff;
+            }
+        }
+
+        if (bestIndex >= 0)
+            return [bestIndex - (bestIndexDiff - 1), bestIndexDiff];
+
+        return [];
+    }
+}
diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts
index 428bd6b1..a87eb9c6 100644
--- a/src/evaluator/LlamaContext/types.ts
+++ b/src/evaluator/LlamaContext/types.ts
@@ -260,6 +260,61 @@ export type ContextTokensDeleteRange = {
     end: number
 };
 
+export type SequenceEvaluateOptions = {
+    temperature?: number, minP?: number, topK?: number, topP?: number,
+
+    /**
+     * Used to control the randomness of the generated text.
+     *
+     * Change the seed to get different results.
+     *
+     * Defaults to the current epoch time.
+     *
+     * Only relevant when using `temperature`.
+     */
+    seed?: number,
+    grammarEvaluationState?: LlamaGrammarEvaluationState | (() => LlamaGrammarEvaluationState | undefined),
+    repeatPenalty?: LlamaContextSequenceRepeatPenalty,
+
+    /**
+     * Adjust the probability of tokens being generated.
+     * Can be used to bias the model to generate tokens that you want it to lean towards,
+     * or to avoid generating tokens that you want it to avoid.
+     */
+    tokenBias?: TokenBias | (() => TokenBias),
+
+    /**
+     * When a lot of tokens are queued for the next batch, more than the configured `batchSize`, the tokens for each sequence will be
+     * evaluated based on the strategy chosen for the context.
+     * By default, the `"maximumParallelism"` strategy is used, which will try to evaluate as many sequences in parallel as possible,
+     * but at some point, it'll have to choose which sequences to evaluate more tokens of, so it'll prioritize the sequences with the
+     * highest evaluation priority.
+     * Also, a custom strategy can be used to prioritize the sequences differently, but generally, the higher the evaluation priority
+     * is, the more likely and more tokens will be evaluated for that sequence in the next queued batch.
+     */
+    evaluationPriority?: EvaluationPriority,
+
+    /** Override the sequence context shift options for this evaluation */
+    contextShift?: ContextShiftOptions,
+
+    /**
+     * Yield an EOG (End Of Generation) token (like EOS and EOT) when it's generated.
+     * When `false` the generation will stop when an EOG token is generated and the token won't be yielded.
+     * Defaults to `false`.
+     */
+    yieldEogToken?: boolean,
+
+    /** @internal */
+    _noSampling?: boolean
+};
+
+export type ControlledEvaluateIndexOutput = {
+    next: {
+        token?: Token | null,
+        probabilities?: Map<Token, number>
+    }
+};
+
 /**
  * 1 - low
  *
@@ -269,6 +324,7 @@ export type EvaluationPriority = 1 | 2 | 3 | 4 | 5;
 
 export type BatchItem = {
     readonly tokens: readonly Token[],
+    readonly logits: readonly (true | undefined)[],
     readonly evaluationPriority: EvaluationPriority
 };
 export type PrioritizedBatchItem = {
diff --git a/src/evaluator/LlamaGrammarEvaluationState.ts b/src/evaluator/LlamaGrammarEvaluationState.ts
index db6518c5..7c9bbbc5 100644
--- a/src/evaluator/LlamaGrammarEvaluationState.ts
+++ b/src/evaluator/LlamaGrammarEvaluationState.ts
@@ -20,15 +20,25 @@ export class LlamaGrammarEvaluationState {
     /** @internal */ public readonly _llama: Llama;
     /** @internal */ public readonly _state: AddonGrammarEvaluationState;
 
-    /**
-     * @param options
-     */
-    public constructor({model, grammar}: LlamaGrammarEvaluationStateOptions) {
-        this._llama = model._llama;
+    public constructor(options: LlamaGrammarEvaluationStateOptions);
+    public constructor(existingState: LlamaGrammarEvaluationState);
+    public constructor(existingStateOrOptions: LlamaGrammarEvaluationStateOptions | LlamaGrammarEvaluationState) {
+        if (existingStateOrOptions instanceof LlamaGrammarEvaluationState) {
+            this._llama = existingStateOrOptions._llama;
+            this._state = new this._llama._bindings.AddonGrammarEvaluationState(existingStateOrOptions._state);
+        } else {
+            const {model, grammar} = existingStateOrOptions;
+            this._llama = model._llama;
 
-        if (model._llama !== grammar._llama)
-            throw new Error("The given LlamaModel and LlamaGrammar must be from the same Llama instance");
+            if (model._llama !== grammar._llama)
+                throw new Error("The given LlamaModel and LlamaGrammar must be from the same Llama instance");
 
-        this._state = new model._llama._bindings.AddonGrammarEvaluationState(model._model, grammar._grammar);
+            this._state = new model._llama._bindings.AddonGrammarEvaluationState(model._model, grammar._grammar);
+        }
+    }
+
+    /** Clone the grammar evaluation state */
+    public clone(): LlamaGrammarEvaluationState {
+        return new LlamaGrammarEvaluationState(this);
     }
 }
diff --git a/src/index.ts b/src/index.ts
index c8277571..37d74ace 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -16,9 +16,9 @@ import {LlamaContext, LlamaContextSequence} from "./evaluator/LlamaContext/Llama
 import {LlamaEmbeddingContext, type LlamaEmbeddingContextOptions} from "./evaluator/LlamaEmbeddingContext.js";
 import {LlamaEmbedding, type LlamaEmbeddingOptions, type LlamaEmbeddingJSON} from "./evaluator/LlamaEmbedding.js";
 import {
-    type LlamaContextOptions, type BatchingOptions, type LlamaContextSequenceRepeatPenalty, type CustomBatchingDispatchSchedule,
-    type CustomBatchingPrioritizationStrategy, type BatchItem, type PrioritizedBatchItem, type ContextShiftOptions,
-    type ContextTokensDeleteRange, type EvaluationPriority
+    type LlamaContextOptions, type SequenceEvaluateOptions, type BatchingOptions, type LlamaContextSequenceRepeatPenalty,
+    type CustomBatchingDispatchSchedule, type CustomBatchingPrioritizationStrategy, type BatchItem, type PrioritizedBatchItem,
+    type ContextShiftOptions, type ContextTokensDeleteRange, type EvaluationPriority, type ControlledEvaluateIndexOutput
 } from "./evaluator/LlamaContext/types.js";
 import {TokenBias} from "./evaluator/TokenBias.js";
 import {
@@ -70,6 +70,9 @@ import {
     type BuiltinSpecialTokenValue
 } from "./utils/LlamaText.js";
 import {appendUserMessageToChatHistory} from "./utils/appendUserMessageToChatHistory.js";
+import {TokenPredictor} from "./evaluator/LlamaContext/TokenPredictor.js";
+import {DraftSequenceTokenPredictor} from "./evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js";
+import {InputLookupTokenPredictor} from "./evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.js";
 import {getModuleVersion} from "./utils/getModuleVersion.js";
 import {readGgufFileInfo} from "./gguf/readGgufFileInfo.js";
 import {GgufInsights, type GgufInsightsResourceRequirements} from "./gguf/insights/GgufInsights.js";
@@ -130,6 +133,7 @@ export {
     LlamaContext,
     LlamaContextSequence,
     type LlamaContextOptions,
+    type SequenceEvaluateOptions,
     type BatchingOptions,
     type CustomBatchingDispatchSchedule,
     type CustomBatchingPrioritizationStrategy,
@@ -139,6 +143,7 @@ export {
     type ContextTokensDeleteRange,
     type EvaluationPriority,
     type LlamaContextSequenceRepeatPenalty,
+    type ControlledEvaluateIndexOutput,
     TokenBias,
     LlamaEmbeddingContext,
     type LlamaEmbeddingContextOptions,
@@ -220,6 +225,9 @@ export {
     type LlamaTextSpecialTokensTextJSON,
     type LlamaTextSpecialTokenJSON,
     type BuiltinSpecialTokenValue,
+    TokenPredictor,
+    DraftSequenceTokenPredictor,
+    InputLookupTokenPredictor,
     appendUserMessageToChatHistory,
     getModuleVersion,
     type ChatHistoryItem,
diff --git a/test/modelDependent/llama3.1/tokenPredictor.test.ts b/test/modelDependent/llama3.1/tokenPredictor.test.ts
new file mode 100644
index 00000000..d20d072f
--- /dev/null
+++ b/test/modelDependent/llama3.1/tokenPredictor.test.ts
@@ -0,0 +1,276 @@
+import {describe, expect, test} from "vitest";
+import {LlamaChatSession, Token, DraftSequenceTokenPredictor, InputLookupTokenPredictor} from "../../../src/index.js";
+import {getModelFile} from "../../utils/modelFiles.js";
+import {getTestLlama} from "../../utils/getTestLlama.js";
+import {compareTokens} from "../../../src/utils/compareTokens.js";
+
+describe("llama 3.1", () => {
+    describe("token predictor", () => {
+        test("DraftModelTokenPredictor", {timeout: 1000 * 60 * 60 * 2}, async () => {
+            const modelPath = await getModelFile("Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf");
+            const llama = await getTestLlama();
+
+            const model = await llama.loadModel({
+                modelPath
+            });
+            const context = await model.createContext({
+                contextSize: 2048,
+                sequences: 2
+            });
+            const draftSequence = context.getSequence();
+            const predictor = new DraftSequenceTokenPredictor(draftSequence, {
+                minTokens: 2,
+                maxTokens: 2
+            });
+
+            const mainSequence = context.getSequence();
+            const chatSession = new LlamaChatSession({
+                contextSequence: mainSequence
+            });
+
+            await chatSession.preloadPrompt("Hello");
+
+            await predictor.reset({
+                targetSequence: mainSequence,
+                stateTokens: mainSequence.contextTokens,
+                evaluateOptions: {}
+            });
+
+            const predictedTokens = await predictor.predictTokens();
+            expect(predictedTokens.map((token) => model.detokenize([token], true))).toMatchInlineSnapshot(`
+              [
+                ",",
+                " I",
+              ]
+            `);
+
+            const textTokens = model.tokenize("! The");
+            predictor.pushTokens(textTokens);
+
+            const predictedTokens2 = await predictor.predictTokens();
+            expect(predictedTokens2.map((token) => model.detokenize([token], true))).toMatchInlineSnapshot(`
+              [
+                " weather",
+                " in",
+              ]
+            `);
+
+
+            await chatSession.preloadPrompt("What");
+
+            await predictor.reset({
+                targetSequence: mainSequence,
+                stateTokens: mainSequence.contextTokens,
+                evaluateOptions: {}
+            });
+
+            const predictedTokens3 = await predictor.predictTokens();
+            expect(predictedTokens3.map((token) => model.detokenize([token], true))).toMatchInlineSnapshot(`
+              [
+                " is",
+                " the",
+              ]
+            `);
+
+            const text2Tokens = model.tokenize("can the");
+            predictor.pushTokens(text2Tokens);
+
+            const predictedTokens4 = await predictor.predictTokens();
+            expect(predictedTokens4.map((token) => model.detokenize([token], true))).toMatchInlineSnapshot(`
+              [
+                " average",
+                " person",
+              ]
+            `);
+        });
+
+        describe("InputLookupTokenPredictor", () => {
+            // made up example paragraph
+            const exampleParagraph = [
+                "The Luminawing (genus: Luxavis, species: nocturna) is a rare and enigmatic nocturnal creature native to the dense forests of the remote continent of Aethoria.",
+                "Characterized by its striking appearance and unique adaptations, this mystical animal has garnered significant attention from scientists and naturalists.",
+                "",
+                "## Physical Characteristics",
+                "The Luminawing's most distinctive feature is its pair of iridescent wings, which reflect the colors of its surroundings through a complex process involving microscopic crystals embedded in the wing membrane.",
+                "This remarkable ability allows the creature to blend seamlessly into the night sky, making it nearly invisible to predators and prey alike.",
+                "",
+                "Its slender body measures approximately 30-40 centimeters in length, covered in soft, glowing fur that shimmers like starlight under ultraviolet light. The Luminawing's large, round eyes are capable of perceiving even the faintest glows, allowing it to navigate through the dark forest with ease.",
+                "",
+                "## Behavior and Habitat",
+                "The Luminawing is a solitary creature, only coming together with others of its kind during the mating season.",
+                "It inhabits the dense forests of Aethoria, where it feeds on the nectar of rare, moon-blooming flowers (genus: Lunaria).",
+                "These flowers are said to possess magical properties, which are believed to be absorbed by the Luminawing through its diet.",
+                "",
+                "The creature's haunting melody can be heard echoing through the forest at dusk, a siren call that beckons in the night creatures and fills the air with wonder. This unique vocalization is thought to play a crucial role in the Luminawing's mating rituals and territorial defense.",
+                "",
+                "## Conservation Status",
+                "Due to its elusive nature and limited range, the Luminawing is currently listed as a species of special concern by the Aethorian Conservation Society.",
+                "Efforts are being made to protect its habitat and study its behavior, but more research is needed to fully understand this enigmatic creature's place in the ecosystem."
+            ].join("\n");
+
+            test("no evaluation", {timeout: 1000 * 60 * 60 * 2}, async () => {
+                const modelPath = await getModelFile("Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf");
+                const llama = await getTestLlama();
+
+                const model = await llama.loadModel({
+                    modelPath
+                });
+                const context = await model.createContext({
+                    contextSize: 2048
+                });
+                const predictor = new InputLookupTokenPredictor({
+                    patternLength: {
+                        min: 4
+                    },
+                    predictionLength: {
+                        min: 1,
+                        max: 5
+                    }
+                });
+
+                const sequence = context.getSequence();
+                const chatSession = new LlamaChatSession({
+                    contextSequence: sequence
+                });
+
+                const paragraphTokens = model.tokenize(exampleParagraph);
+                let endIndex = 3 + 4;
+                const tokensExcerpt = [
+                    ...model.tokenize("Some random text here"),
+                    ...paragraphTokens.slice(3, endIndex)
+                ];
+
+                await chatSession.preloadPrompt("Hello");
+
+                predictor.reset({
+                    stateTokens: tokensExcerpt.slice()
+                });
+                predictor.updateInputTokens(paragraphTokens.slice());
+
+                const predictedTokens = predictor.predictTokens();
+                expect(predictedTokens.map((token) => model.detokenize([token], true))).toMatchInlineSnapshot(`
+                  [
+                    ":",
+                    " Lux",
+                    "avis",
+                    ",",
+                    " species",
+                  ]
+                `);
+
+                predictor.pushTokens(paragraphTokens.slice(endIndex, endIndex + 2));
+                endIndex += 2;
+
+                const predictedTokens2 = predictor.predictTokens();
+                expect(predictedTokens2.map((token) => model.detokenize([token], true))).toMatchInlineSnapshot(`
+                  [
+                    "avis",
+                    ",",
+                    " species",
+                    ":",
+                    " noct",
+                  ]
+                `);
+
+
+                predictor.reset({
+                    stateTokens: [...paragraphTokens, ...tokensExcerpt]
+                });
+                predictor.updateInputTokens(paragraphTokens.slice());
+
+                const predictedTokens3 = predictor.predictTokens();
+                expect(predictedTokens3.map((token) => model.detokenize([token], true))).toMatchInlineSnapshot(`
+                  [
+                    ":",
+                    " Lux",
+                    "avis",
+                    ",",
+                    " species",
+                  ]
+                `);
+            });
+
+            test("with evaluation", {timeout: 1000 * 60 * 60 * 2}, async () => {
+                const modelPath = await getModelFile("Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf");
+                const llama = await getTestLlama();
+
+                const model = await llama.loadModel({
+                    modelPath
+                });
+                const context = await model.createContext({
+                    contextSize: 2048
+                });
+                const predictor = new InputLookupTokenPredictor({
+                    patternLength: {
+                        min: 4
+                    },
+                    predictionLength: {
+                        min: 1,
+                        max: 5
+                    }
+                });
+
+                const sequence = context.getSequence({
+                    tokenPredictor: predictor
+                });
+                const chatSession = new LlamaChatSession({
+                    contextSequence: sequence
+                });
+
+                await chatSession.prompt("Summarize this text:\n\n" + exampleParagraph, {
+                    maxTokens: 80
+                });
+
+                expect(sequence.tokenPredictions.validated).toMatchInlineSnapshot("8");
+                expect(sequence.tokenPredictions.refuted).toMatchInlineSnapshot("47");
+                expect(sequence.tokenPredictions.used).toMatchInlineSnapshot("7");
+                expect(sequence.tokenPredictions.unused).toMatchInlineSnapshot("1");
+
+                const exposedNextTokenIndex = sequence.nextTokenIndex;
+
+                {
+                    const actualContextTokensLength = sequence._contextTokens.length;
+                    const exposedContextTokensLength = sequence.contextTokens.length;
+
+                    expect(exposedContextTokensLength).toMatchInlineSnapshot("598");
+                    expect(actualContextTokensLength).toMatchInlineSnapshot("599");
+                    expect(exposedNextTokenIndex).toMatchInlineSnapshot("598");
+                    expect(exposedContextTokensLength).to.not.be.eql(actualContextTokensLength);
+                }
+
+                const lastToken = sequence.contextTokens.at(-1)!;
+                const exampleToken = sequence.contextTokens
+                    .slice()
+                    .reverse()
+                    .find((token) => !compareTokens(token, lastToken))!;
+
+                const addedTokens: Token[] = [];
+                for await (const token of sequence.evaluate([exampleToken])) {
+                    addedTokens.push(token);
+                    break; // evaluate only one token
+                }
+
+                expect(addedTokens).toMatchInlineSnapshot(`
+                  [
+                    10318,
+                  ]
+                `);
+
+                await sequence.eraseContextTokenRanges([{start: sequence.nextTokenIndex - 1, end: sequence.nextTokenIndex}]);
+
+                {
+                    const actualContextTokensLength = sequence._contextTokens.length;
+                    const exposedContextTokensLength = sequence.contextTokens.length;
+
+                    expect(exposedContextTokensLength).toMatchInlineSnapshot("598");
+                    expect(actualContextTokensLength).toMatchInlineSnapshot("598");
+                    expect(exposedNextTokenIndex).toMatchInlineSnapshot("598");
+                    expect(exposedNextTokenIndex).to.be.eql(sequence.nextTokenIndex);
+                    expect(exposedContextTokensLength).to.be.eql(actualContextTokensLength);
+                    expect(sequence.contextTokens.at(-1)).to.not.be.eql(exampleToken);
+                    expect(sequence.contextTokens.at(-1)).to.be.eql(lastToken);
+                }
+            });
+        });
+    });
+});

From 2c1af7abcab0b8e9b9c144bacba31474e03e7689 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Wed, 25 Dec 2024 03:30:04 +0200
Subject: [PATCH 14/73] fix: typos

---
 README.md                           | 2 +-
 docs/guide/troubleshooting.md       | 2 +-
 src/evaluator/LlamaContext/types.ts | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 9f41fff4..fdfbd69f 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@
 * [Use the CLI to chat with a model without writing any code](#try-it-without-installing)
 * Up-to-date with the latest `llama.cpp`. Download and compile the latest release with a [single CLI command](https://node-llama-cpp.withcat.ai//guide/building-from-source#downloading-a-release)
 * Enforce a model to generate output in a parseable format, [like JSON](https://node-llama-cpp.withcat.ai/guide/chat-session#json-response), or even force it to [follow a specific JSON schema](https://node-llama-cpp.withcat.ai/guide/chat-session#response-json-schema)
-* [Provide a model with functions it can call on demand](https://node-llama-cpp.withcat.ai/guide/chat-session#function-calling) to retrieve information of perform actions
+* [Provide a model with functions it can call on demand](https://node-llama-cpp.withcat.ai/guide/chat-session#function-calling) to retrieve information or perform actions
 * [Embedding support](https://node-llama-cpp.withcat.ai/guide/embedding)
 * [Safe against special token injection attacks](https://node-llama-cpp.withcat.ai/guide/llama-text#input-safety-in-node-llama-cpp)
 * Great developer experience with full TypeScript support, and [complete documentation](https://node-llama-cpp.withcat.ai/guide/)
diff --git a/docs/guide/troubleshooting.md b/docs/guide/troubleshooting.md
index 0899d733..f60a7745 100644
--- a/docs/guide/troubleshooting.md
+++ b/docs/guide/troubleshooting.md
@@ -67,7 +67,7 @@ pkg install vulkan-tools vulkan-loader-android vulkan-headers vulkan-extension-l
 > If that happens, disable Vulkan in your code or uninstall the Vulkan packages.
 
 
-## Crashes With an `illegal hardware instruction` Error or a `SIGILL` Signal
+## Crashes With an `illegal hardware instruction` Error or a `SIGILL` Signal {#illegal-hardware-instruction}
 A common cause for this issue is when the installed nodejs architecture is different from the host machine CPU architecture.
 
 For example, having an x64 nodejs installed on an arm64 machine (such as Apple Silicon Macs).
diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts
index a87eb9c6..1b65e8b4 100644
--- a/src/evaluator/LlamaContext/types.ts
+++ b/src/evaluator/LlamaContext/types.ts
@@ -209,7 +209,7 @@ export type LlamaContextSequenceRepeatPenalty = {
 export type BatchingOptions = {
     /**
      * The strategy used to dispatch items to be processed when there are items pending to be processed.
-     * - **`"nextTick"`** - dispatch the items on the next even loop tick.
+     * - **`"nextTick"`** - dispatch the items on the next event loop tick.
      * You can provide a custom function to define a custom dispatch schedule.
      *
      * Defaults to `"nextTick"`.

From 46c225118ce05fd8bb0bbcd6cb15af540d87769f Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Wed, 25 Dec 2024 03:31:32 +0200
Subject: [PATCH 15/73] fix: improve types

---
 src/evaluator/LlamaContext/types.ts | 10 ++++++++--
 src/gguf/types/GgufMetadataTypes.ts | 18 +++++++++---------
 src/utils/pushAll.ts                |  2 +-
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts
index 1b65e8b4..aee06bd8 100644
--- a/src/evaluator/LlamaContext/types.ts
+++ b/src/evaluator/LlamaContext/types.ts
@@ -1,3 +1,5 @@
+import type {LlamaGrammarEvaluationState} from "../LlamaGrammarEvaluationState.js";
+import type {TokenBias} from "../TokenBias.js";
 import type {Token} from "../../types.js";
 import type {LlamaContextSequence} from "./LlamaContext.js";
 
@@ -89,7 +91,11 @@ export type LlamaContextOptions = {
         min?: number
     },
 
-    /** control the parallel sequences processing behavior */
+    /**
+     * Control the parallel sequences processing behavior.
+     *
+     * See {@link BatchingOptions} for more information.
+     */
     batching?: BatchingOptions,
 
     /**
@@ -221,7 +227,7 @@ export type BatchingOptions = {
      * - **`"maximumParallelism"`** - process as many different sequences in parallel as possible.
      * - **`"firstInFirstOut"`** - process items in the order they were added.
      * - **Custom prioritization function** - a custom function that prioritizes the items to be processed.
-     * See the `CustomBatchingPrioritizationStrategy` type for more information.
+     * See the {@link CustomBatchingPrioritizationStrategy} type for more information.
      *
      * Defaults to `"maximumParallelism"`.
      */
diff --git a/src/gguf/types/GgufMetadataTypes.ts b/src/gguf/types/GgufMetadataTypes.ts
index 89922f20..a7f9e6b7 100644
--- a/src/gguf/types/GgufMetadataTypes.ts
+++ b/src/gguf/types/GgufMetadataTypes.ts
@@ -86,9 +86,9 @@ export enum GgufFileType {
     MOSTLY_F16 = 1,
     MOSTLY_Q4_0 = 2,
     MOSTLY_Q4_1 = 3,
-    MOSTLY_Q4_1_SOME_F16 = 4,
-    MOSTLY_Q4_2 = 5,
-    MOSTLY_Q4_3 = 6,
+    MOSTLY_Q4_1_SOME_F16 = 4, // deprecated
+    MOSTLY_Q4_2 = 5, // deprecated
+    MOSTLY_Q4_3 = 6, // deprecated
     MOSTLY_Q8_0 = 7,
     MOSTLY_Q5_0 = 8,
     MOSTLY_Q5_1 = 9,
@@ -115,11 +115,11 @@ export enum GgufFileType {
     MOSTLY_IQ4_XS = 30,
     MOSTLY_IQ1_M = 31,
     MOSTLY_BF16 = 32,
-    MOSTLY_Q4_0_4_4 = 33,
-    MOSTLY_Q4_0_4_8 = 34,
-    MOSTLY_Q4_0_8_8 = 35,
-    LLAMA_FTYPE_MOSTLY_TQ1_0 = 36,
-    LLAMA_FTYPE_MOSTLY_TQ2_0 = 37
+    MOSTLY_Q4_0_4_4 = 33, // deprecated
+    MOSTLY_Q4_0_4_8 = 34, // deprecated
+    MOSTLY_Q4_0_8_8 = 35, // deprecated
+    LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // deprecated
+    LLAMA_FTYPE_MOSTLY_TQ2_0 = 37 // deprecated
 }
 
 
@@ -224,7 +224,7 @@ export const enum GgufMetadataTokenizerTokenType {
 export type GgufMetadataTokenizer = {
     readonly ggml: {
         readonly model: "no_vocab" | "llama" | "gpt2" | "bert" | string,
-        readonly pre?: "default" | "llama3" | "llama-v3" | "llama-bpe" | "deepseek-llm" | "deepseek-coder" | "falcon" | "mpt" |
+        readonly pre?: "default" | "llama3" | "llama-v3" | "llama-bpe" | "deepseek-llm" | "deepseek-coder" | "falcon" | "falcon3" | "mpt" |
             "starcoder" | "gpt-2" | "phi-2" | "jina-es" | "jina-de" | "jina-v1-en" | "jina-v2-es" | "jina-v2-de" | "jina-v2-code" |
             "refact" | "command-r" | "qwen2" | "stablelm2" | "olmo" | "dbrx" | "smaug-bpe" | "poro-chat" | "chatglm-bpe" | "viking" |
             "jais" | "tekken" | "smollm" | "codeshell" | "bloom" | "gpt3-finnish" | "exaone" | "chameleon" | "minerva-7b" | string,
diff --git a/src/utils/pushAll.ts b/src/utils/pushAll.ts
index 42cbcce7..0f48ea5b 100644
--- a/src/utils/pushAll.ts
+++ b/src/utils/pushAll.ts
@@ -3,7 +3,7 @@
  * @param array - The array to push the items to
  * @param items - The items to push to the array
  */
-export function pushAll<T>(array: T[], items: readonly T[] | ReadonlySet<T>): T[] {
+export function pushAll<const T>(array: T[], items: readonly NoInfer<T>[] | ReadonlySet<NoInfer<T>>): T[] {
     for (const item of items)
         array.push(item);
 

From 31f16bdca1364f8735664c94bad1c327d023b8c0 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Wed, 25 Dec 2024 03:32:24 +0200
Subject: [PATCH 16/73] fix: bug

---
 src/utils/tokenizerUtils.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils/tokenizerUtils.ts b/src/utils/tokenizerUtils.ts
index 12b4327e..987d45dd 100644
--- a/src/utils/tokenizerUtils.ts
+++ b/src/utils/tokenizerUtils.ts
@@ -10,7 +10,7 @@ export function resolveBeginningTokenToPrepend(vocabularyType: LlamaVocabularyTy
         return null;
 
     if (vocabularyType === LlamaVocabularyType.wpm)
-        return tokens.cls;
+        return tokens.cls ?? tokens.bos; // https://github.com/ggerganov/llama.cpp/pull/10930
 
     if (tokens.shouldPrependBosToken)
         return tokens.bos;

From ad95c77dc556bc644e46b20a229a8e8b55bfba58 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Wed, 25 Dec 2024 03:35:47 +0200
Subject: [PATCH 17/73] fix: eslint config

---
 eslint.config.js                                     | 8 ++++++--
 templates/electron-typescript-react/eslint.config.js | 8 ++++++--
 templates/node-typescript/eslint.config.js           | 8 ++++++--
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/eslint.config.js b/eslint.config.js
index 15e7409b..394fc136 100644
--- a/eslint.config.js
+++ b/eslint.config.js
@@ -65,7 +65,11 @@ export default tseslint.config({
             SwitchCase: 1,
             FunctionDeclaration: {
                 parameters: "first"
-            }
+            },
+            ignoredNodes: [
+                // fix for indent warnings on function object return types when the function has no parameters
+                'FunctionExpression[params.length=0][returnType.type="TSTypeAnnotation"]'
+            ]
         }],
         "@stylistic/indent-binary-ops": ["off"],
         "@stylistic/eqeqeq": ["off"],
@@ -142,7 +146,7 @@ export default tseslint.config({
                 {blankLine: "always", prev: "*", next: "method"}
             ]
         }],
-        "@stylistic/no-trailing-spaces": ["warn"],
+        "@stylistic/no-trailing-spaces": ["off"],
         "@stylistic/no-multi-spaces": ["warn"]
     }
 }, {
diff --git a/templates/electron-typescript-react/eslint.config.js b/templates/electron-typescript-react/eslint.config.js
index becfe23a..86f90cd2 100644
--- a/templates/electron-typescript-react/eslint.config.js
+++ b/templates/electron-typescript-react/eslint.config.js
@@ -45,7 +45,11 @@ export default tseslint.config({
             SwitchCase: 1,
             FunctionDeclaration: {
                 parameters: "first"
-            }
+            },
+            ignoredNodes: [
+                // fix for indent warnings on function object return types when the function has no parameters
+                'FunctionExpression[params.length=0][returnType.type="TSTypeAnnotation"]'
+            ]
         }],
         "@stylistic/indent-binary-ops": ["off"],
         "@stylistic/eqeqeq": ["off"],
@@ -119,7 +123,7 @@ export default tseslint.config({
                 {blankLine: "always", prev: "*", next: "method"}
             ]
         }],
-        "@stylistic/no-trailing-spaces": ["warn"],
+        "@stylistic/no-trailing-spaces": ["off"],
         "@stylistic/no-multi-spaces": ["warn"]
     }
 }, {
diff --git a/templates/node-typescript/eslint.config.js b/templates/node-typescript/eslint.config.js
index 180452e1..b1c7faa2 100644
--- a/templates/node-typescript/eslint.config.js
+++ b/templates/node-typescript/eslint.config.js
@@ -47,7 +47,11 @@ export default tseslint.config({
             SwitchCase: 1,
             FunctionDeclaration: {
                 parameters: "first"
-            }
+            },
+            ignoredNodes: [
+                // fix for indent warnings on function object return types when the function has no parameters
+                'FunctionExpression[params.length=0][returnType.type="TSTypeAnnotation"]'
+            ]
         }],
         "@stylistic/indent-binary-ops": ["off"],
         "@stylistic/eqeqeq": ["off"],
@@ -122,7 +126,7 @@ export default tseslint.config({
                 {blankLine: "always", prev: "*", next: "method"}
             ]
         }],
-        "@stylistic/no-trailing-spaces": ["warn"],
+        "@stylistic/no-trailing-spaces": ["off"],
         "@stylistic/no-multi-spaces": ["warn"]
     }
 }, {

From 74daf2799937412a40c4069a85d709087eb5c669 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Wed, 25 Dec 2024 03:45:27 +0200
Subject: [PATCH 18/73] fix: detect running under Rosetta on Apple Silicone and
 show an error message instead of crashing

---
 src/cli/commands/OnPostInstallCommand.ts | 26 ++++++++++++++++++++++--
 src/cli/utils/isRunningUnderRosetta.ts   | 23 +++++++++++++++++++++
 src/config.ts                            |  3 +++
 3 files changed, 50 insertions(+), 2 deletions(-)
 create mode 100644 src/cli/utils/isRunningUnderRosetta.ts

diff --git a/src/cli/commands/OnPostInstallCommand.ts b/src/cli/commands/OnPostInstallCommand.ts
index 68ab3eb4..ee602984 100644
--- a/src/cli/commands/OnPostInstallCommand.ts
+++ b/src/cli/commands/OnPostInstallCommand.ts
@@ -1,7 +1,10 @@
 import {CommandModule} from "yargs";
-import {defaultSkipDownload} from "../../config.js";
+import chalk from "chalk";
+import {defaultSkipDownload, documentationPageUrls} from "../../config.js";
 import {getLlamaForOptions} from "../../bindings/getLlama.js";
 import {setForceShowConsoleLogPrefix} from "../../state.js";
+import {isRunningUnderRosetta} from "../utils/isRunningUnderRosetta.js";
+import {getConsoleLogPrefix} from "../../utils/getConsoleLogPrefix.js";
 
 type OnPostInstallCommand = null;
 
@@ -12,7 +15,26 @@ export const OnPostInstallCommand: CommandModule<object, OnPostInstallCommand> =
         if (defaultSkipDownload)
             return;
 
-        setForceShowConsoleLogPrefix(false);
+        setForceShowConsoleLogPrefix(true);
+
+        if (await isRunningUnderRosetta()) {
+            console.error(
+                getConsoleLogPrefix(false, false),
+                chalk.red(
+                    "llama.cpp is not supported on Rosetta on Apple Silicone Macs. " +
+                    "Ensure that you're using a native arm64 node.js installation.")
+            );
+            console.error(
+                getConsoleLogPrefix(false, false),
+                "process.platform: " + process.platform + ", process.arch: " + process.arch
+            );
+            console.error(
+                getConsoleLogPrefix(false, false),
+                "troubleshooting: " + documentationPageUrls.troubleshooting.RosettaIllegalHardwareInstruction
+            );
+
+            process.exit(1);
+        }
 
         try {
             await getLlamaForOptions({
diff --git a/src/cli/utils/isRunningUnderRosetta.ts b/src/cli/utils/isRunningUnderRosetta.ts
new file mode 100644
index 00000000..e3a25017
--- /dev/null
+++ b/src/cli/utils/isRunningUnderRosetta.ts
@@ -0,0 +1,23 @@
+import path from "path";
+import {fileURLToPath} from "url";
+import process from "process";
+import {getPlatform} from "../../bindings/utils/getPlatform.js";
+import {spawnCommand} from "../../utils/spawnCommand.js";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+export async function isRunningUnderRosetta() {
+    const platform = getPlatform();
+
+    // // only check for rosetta on macOS when x64 is detected
+    if (platform !== "mac" || process.arch !== "x64")
+        return false;
+
+    try {
+        const res = await spawnCommand("sysctl", ["-n", "sysctl.proc_translated"], __dirname, process.env, false);
+
+        return res.combinedStd.trim() === "1";
+    } catch (err) {
+        return false;
+    }
+}
diff --git a/src/config.ts b/src/config.ts
index 65c08e66..a0d8e75e 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -117,6 +117,9 @@ export const documentationPageUrls = {
             Build: documentationCliUrl + "/source/build",
             Clear: documentationCliUrl + "/source/clear"
         }
+    },
+    troubleshooting: {
+        RosettaIllegalHardwareInstruction: documentationUrl + "/guide/troubleshooting#illegal-hardware-instruction"
     }
 } as const;
 export const recommendedBaseDockerImage = "node:20";

From e703a472f6e292ded31d96f31e17b58797c007b0 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sun, 29 Dec 2024 04:39:40 +0200
Subject: [PATCH 19/73] feat: build on arm64 using LLVM, use Visual Studio's
 CMake when available

---
 docs/guide/building-from-source.md            |  45 ++++++++
 llama/CMakeLists.txt                          |  15 ++-
 .../llvm.win32.host-arm64.target-arm64.cmake  |  75 ++++++++++++
 .../llvm.win32.host-x64.target-arm64.cmake    |  72 ++++++++++++
 .../win32.host-arm64.target-arm64.cmake       |  98 ++++++++++++++++
 .../win32.host-x64.target-arm64.cmake         | 108 +++++++++++++-----
 src/bindings/utils/compileLLamaCpp.ts         |  49 +++++---
 .../utils/detectAvailableComputeLayers.ts     |   8 +-
 src/utils/cmake.ts                            |  68 +++++++++++
 9 files changed, 490 insertions(+), 48 deletions(-)
 create mode 100644 llama/profiles/llvm.win32.host-arm64.target-arm64.cmake
 create mode 100644 llama/profiles/llvm.win32.host-x64.target-arm64.cmake
 create mode 100644 llama/toolchains/win32.host-arm64.target-arm64.cmake

diff --git a/docs/guide/building-from-source.md b/docs/guide/building-from-source.md
index be8e695b..60a7d50c 100644
--- a/docs/guide/building-from-source.md
+++ b/docs/guide/building-from-source.md
@@ -34,6 +34,51 @@ If the build fails on macOS with the error `"/usr/bin/cc" is not able to compile
 
 :::
 
+::: details Dependencies for Windows x64
+If the build fails on your machine, ensure you have all the necessary build tools installed.
+
+You can install all the dependencies via [WinGet](https://learn.microsoft.com/en-us/windows/package-manager/winget/) using these commands:
+```shell
+winget install --id Microsoft.VisualStudio.2022.BuildTools --force --override "--add Microsoft.VisualStudio.Component.VC.CMake.Project Microsoft.VisualStudio.Component.VC.CoreBuildTools Microsoft.VisualStudio.Component.VC.Tools.x86.x64 Microsoft.VisualStudio.Component.VC.ATL Microsoft.VisualStudio.Component.VC.ATLMFC Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset Microsoft.VisualStudio.Component.VC.Llvm.Clang Microsoft.VisualStudio.Component.VC.Redist.14.Latest Microsoft.Component.VC.Runtime.UCRTSDK Microsoft.VisualStudio.Component.Windows10SDK Microsoft.VisualStudio.Component.Windows10SDK.20348"
+```
+> WinGet is built-in on Windows 11 and modern Windows 10 versions
+
+---
+
+You can also install all the dependencies manually using the [Visual C++ Build Tools installer](https://visualstudio.microsoft.com/visual-cpp-build-tools/):
+* **`Workloads` tab:** select `Desktop development with C++`
+* **`Individual components` tab**: select the following:
+  * C++ ATL for latest v143 build tools (x86 & x64)
+  * C++ MFC for latest v143 build tools (x86 & x64)
+  * C++ CMake tools for Windows
+  * C++ Clang Compiler for Windows
+  * MSBuild support for LLVM (clang-cl) toolset
+  * Windows Universal CRT SDK
+:::
+
+::: details Dependencies for Windows on Arm
+On Windows on Arm you need to install additional build tools to build `llama.cpp` from source.
+
+You can install all the dependencies via [WinGet](https://learn.microsoft.com/en-us/windows/package-manager/winget/) using these commands:
+```shell
+winget install --id Microsoft.VisualStudio.2022.BuildTools --force --override "--add Microsoft.VisualStudio.Component.VC.CMake.Project Microsoft.VisualStudio.Component.VC.CoreBuildTools Microsoft.VisualStudio.Component.VC.Tools.x86.x64 Microsoft.VisualStudio.Component.VC.Tools.ARM64 Microsoft.VisualStudio.Component.VC.ATL Microsoft.VisualStudio.Component.VC.ATL.ARM64 Microsoft.VisualStudio.Component.VC.ATLMFC Microsoft.VisualStudio.Component.VC.MFC.ARM64 Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset Microsoft.VisualStudio.Component.VC.Llvm.Clang Microsoft.VisualStudio.Component.VC.Redist.14.Latest Microsoft.Component.VC.Runtime.UCRTSDK Microsoft.VisualStudio.Component.Windows10SDK Microsoft.VisualStudio.Component.Windows10SDK.20348"
+```
+> WinGet is built-in on Windows 11 and modern Windows 10 versions
+
+---
+
+You can also install all the dependencies manually using the [Visual C++ Build Tools installer](https://visualstudio.microsoft.com/visual-cpp-build-tools/):
+* **`Workloads` tab:** select `Desktop development with C++`
+* **`Individual components` tab**: select the following:
+  * MSVC v143 - VS 2022 C++ ARM64 build tools (latest)
+  * C++ ATL for latest v143 build tools (ARM64/ARM64EC)
+  * C++ MFC for latest v143 build tools (ARM64/ARM64EC)
+  * C++ CMake tools for Windows
+  * C++ Clang Compiler for Windows
+  * MSBuild support for LLVM (clang-cl) toolset
+  * Windows Universal CRT SDK
+:::
+
 ## `source download` and `source build` Commands
 The difference between the [`source download`](../cli/source/download.md) and [`source build`](../cli/source/build.md) commands
 is that the `source download` command downloads a release of `llama.cpp` and builds it,
diff --git a/llama/CMakeLists.txt b/llama/CMakeLists.txt
index d6413202..647b3e2b 100644
--- a/llama/CMakeLists.txt
+++ b/llama/CMakeLists.txt
@@ -1,5 +1,15 @@
 cmake_minimum_required(VERSION 3.14)
 
+if (NLC_TARGET_PLATFORM STREQUAL "win-arm64" AND (CMAKE_GENERATOR STREQUAL "Ninja" OR CMAKE_GENERATOR STREQUAL "Ninja Multi-Config") AND NOT MINGW)
+    if(NLC_CURRENT_PLATFORM STREQUAL "win-x64")
+        get_filename_component(INCLUDE_PROFILE_ABS "./profiles/llvm.win32.host-x64.target-arm64.cmake" ABSOLUTE)
+        include("${INCLUDE_PROFILE_ABS}")
+    elseif(NLC_CURRENT_PLATFORM STREQUAL "win-arm64")
+        get_filename_component(INCLUDE_PROFILE_ABS "./profiles/llvm.win32.host-arm64.target-arm64.cmake" ABSOLUTE)
+        include("${INCLUDE_PROFILE_ABS}")
+    endif()
+endif()
+
 project("llama-addon" C CXX)
 
 if (MSVC)
@@ -24,8 +34,8 @@ execute_process(COMMAND node -p "require('node-addon-api').include.slice(1,-1)"
 
 set(LLAMA_BUILD_COMMON ON)
 
-if (NOT MINGW)
-    set(GGML_BACKEND_DL ON)
+if (MINGW)
+    set(GGML_BACKEND_DL OFF)
     set(BUILD_SHARED_LIBS ON)
 endif()
 
@@ -46,7 +56,6 @@ endif()
 include_directories(${NODE_ADDON_API_DIR} ${CMAKE_JS_INC})
 
 add_subdirectory("llama.cpp")
-include_directories("gpuInfo")
 include_directories("llama.cpp")
 include_directories("./llama.cpp/common")
 
diff --git a/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake b/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake
new file mode 100644
index 00000000..0ba604b4
--- /dev/null
+++ b/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake
@@ -0,0 +1,75 @@
+set(PROGRAMFILES_ARM64 "$ENV{ProgramFiles\(Arm\)}")
+set(PROGRAMFILES "$ENV{ProgramFiles}")
+set(PROGRAMFILES_X86 "$ENV{ProgramFiles\(x86\)}")
+set(PROGRAMFILES_PATHS
+    "${PROGRAMFILES_ARM64}"
+    "${PROGRAMFILES}"
+    "${PROGRAMFILES_X86}"
+    "C:/Program Files (Arm)"
+    "C:/Program Files"
+    "C:/Program Files (x86)"
+)
+
+if (CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
+    if (NOT DEFINED NODE_LIB_CMAKE_AR)
+        foreach(PATH IN LISTS PROGRAMFILES_PATHS)
+            if(NODE_LIB_CMAKE_AR)
+                break()
+            endif()
+
+            file(GLOB_RECURSE FOUND_LIB_EXE
+                "${PATH}/Microsoft Visual Studio/*/VC/Tools/MSVC/*/bin/Hostarm64/arm64/lib.exe"
+                "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/MSVC/*/bin/Hostarm64/arm64/lib.exe")
+
+            if(FOUND_LIB_EXE)
+                list(GET FOUND_LIB_EXE 0 NODE_LIB_CMAKE_AR)
+                break()
+            endif()
+        endforeach()
+    endif()
+
+    if (EXISTS "${NODE_LIB_CMAKE_AR}")
+        # Generate node.lib
+        execute_process(COMMAND ${NODE_LIB_CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS} /MACHINE:ARM64 /nologo)
+    else()
+        message(FATAL_ERROR "Windows Resource Compiler (lib.exe) not found. Please install Visual Studio Build Tools.")
+    endif()
+endif()
+
+# adapt cmake-js to work with llvm in GNU mode
+if (NOT CMAKE_SHARED_LINKER_FLAGS MATCHES "-Xlinker /DELAYLOAD:NODE.EXE")
+    string(REPLACE "/DELAYLOAD:NODE.EXE" "-Xlinker /DELAYLOAD:NODE.EXE -Xlinker /defaultlib:delayimp"
+        CMAKE_SHARED_LINKER_FLAGS
+        "${CMAKE_SHARED_LINKER_FLAGS}")
+endif()
+
+set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Xclang --dependent-lib=msvcrt")
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Xclang --dependent-lib=msvcrt")
+
+# ensure CMAKE_AR is configured
+if (NOT DEFINED CMAKE_AR OR NOT EXISTS "${CMAKE_AR}")
+    set(LLVM_INSTALL_PATHS "")
+    foreach(PATH IN LISTS PROGRAMFILES_PATHS)
+        list(APPEND LLVM_INSTALL_PATHS "${PATH}/LLVM")
+
+        file(GLOB_RECURSE FOUND_LLVM_ROOT
+            "${PATH}/Microsoft Visual Studio/*/VC/Tools/Llvm/ARM64"
+            "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/Llvm/ARM64")
+
+        if(FOUND_LLVM_ROOT)
+            list(APPEND LLVM_INSTALL_PATHS ${FOUND_LLVM_ROOT})
+        endif()
+    endforeach()
+
+    if(DEFINED LLVM_ROOT AND EXISTS "${LLVM_ROOT}")
+        list(INSERT LLVM_INSTALL_PATHS 0 "${LLVM_ROOT}")
+    endif()
+
+    foreach(PATH IN LISTS LLVM_INSTALL_PATHS)
+        if(EXISTS "${PATH}/bin/llvm-ar.exe" AND EXISTS "${PATH}/bin/llvm-ar.exe")
+            set(CMAKE_AR "${PATH}/bin/llvm-ar.exe")
+            break()
+        endif()
+    endforeach()
+endif()
\ No newline at end of file
diff --git a/llama/profiles/llvm.win32.host-x64.target-arm64.cmake b/llama/profiles/llvm.win32.host-x64.target-arm64.cmake
new file mode 100644
index 00000000..7eff4f49
--- /dev/null
+++ b/llama/profiles/llvm.win32.host-x64.target-arm64.cmake
@@ -0,0 +1,72 @@
+set(PROGRAMFILES "$ENV{ProgramFiles}")
+set(PROGRAMFILES_X86 "$ENV{ProgramFiles\(x86\)}")
+set(PROGRAMFILES_PATHS
+    "${PROGRAMFILES}"
+    "${PROGRAMFILES_X86}"
+    "C:/Program Files"
+    "C:/Program Files (x86)"
+)
+
+if (CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
+    if (NOT DEFINED NODE_LIB_CMAKE_AR)
+        foreach(PATH IN LISTS PROGRAMFILES_PATHS)
+            if(NODE_LIB_CMAKE_AR)
+                break()
+            endif()
+
+            file(GLOB_RECURSE FOUND_LIB_EXE
+                "${PATH}/Microsoft Visual Studio/*/VC/Tools/MSVC/*/bin/Hostx64/arm64/lib.exe"
+                "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/MSVC/*/bin/Hostx64/arm64/lib.exe")
+
+            if(FOUND_LIB_EXE)
+                list(GET FOUND_LIB_EXE 0 NODE_LIB_CMAKE_AR)
+                break()
+            endif()
+        endforeach()
+    endif()
+
+    if (EXISTS "${NODE_LIB_CMAKE_AR}")
+        # Generate node.lib
+        execute_process(COMMAND ${NODE_LIB_CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS} /MACHINE:ARM64 /nologo)
+    else()
+        message(FATAL_ERROR "Windows Resource Compiler (lib.exe) not found. Please install Visual Studio Build Tools.")
+    endif()
+endif()
+
+# adapt cmake-js to work with llvm in GNU mode
+if (NOT CMAKE_SHARED_LINKER_FLAGS MATCHES "-Xlinker /DELAYLOAD:NODE.EXE")
+    string(REPLACE "/DELAYLOAD:NODE.EXE" "-Xlinker /DELAYLOAD:NODE.EXE -Xlinker /defaultlib:delayimp"
+        CMAKE_SHARED_LINKER_FLAGS
+        "${CMAKE_SHARED_LINKER_FLAGS}")
+endif()
+
+set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Xclang --dependent-lib=msvcrt")
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Xclang --dependent-lib=msvcrt")
+
+# ensure CMAKE_AR is configured
+if (NOT DEFINED CMAKE_AR OR NOT EXISTS "${CMAKE_AR}")
+    set(LLVM_INSTALL_PATHS "")
+    foreach(PATH IN LISTS PROGRAMFILES_PATHS)
+        list(APPEND LLVM_INSTALL_PATHS "${PATH}/LLVM")
+
+        file(GLOB_RECURSE FOUND_LLVM_ROOT
+            "${PATH}/Microsoft Visual Studio/*/VC/Tools/Llvm/x64"
+            "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/Llvm/x64")
+
+        if(FOUND_LLVM_ROOT)
+            list(APPEND LLVM_INSTALL_PATHS ${FOUND_LLVM_ROOT})
+        endif()
+    endforeach()
+
+    if(DEFINED LLVM_ROOT AND EXISTS "${LLVM_ROOT}")
+        list(INSERT LLVM_INSTALL_PATHS 0 "${LLVM_ROOT}")
+    endif()
+
+    foreach(PATH IN LISTS LLVM_INSTALL_PATHS)
+        if(EXISTS "${PATH}/bin/llvm-ar.exe" AND EXISTS "${PATH}/bin/llvm-ar.exe")
+            set(CMAKE_AR "${PATH}/bin/llvm-ar.exe")
+            break()
+        endif()
+    endforeach()
+endif()
\ No newline at end of file
diff --git a/llama/toolchains/win32.host-arm64.target-arm64.cmake b/llama/toolchains/win32.host-arm64.target-arm64.cmake
new file mode 100644
index 00000000..9e7c94ef
--- /dev/null
+++ b/llama/toolchains/win32.host-arm64.target-arm64.cmake
@@ -0,0 +1,98 @@
+set(CMAKE_SYSTEM_NAME Windows)
+set(CMAKE_SYSTEM_PROCESSOR arm64)
+
+set(target arm64-pc-windows-msvc)
+set(CMAKE_C_COMPILER_TARGET ${target})
+set(CMAKE_CXX_COMPILER_TARGET ${target})
+
+set(CMAKE_C_COMPILER clang)
+set(CMAKE_CXX_COMPILER clang++)
+set(CMAKE_RC_COMPILER llvm-rc)
+
+set(LLVM_INSTALLATION_URL "https://github.com/llvm/llvm-project/releases/tag/llvmorg-19.1.5")
+
+set(PROGRAMFILES_ARM64 "$ENV{ProgramFiles\(Arm\)}")
+set(PROGRAMFILES "$ENV{ProgramFiles}")
+set(PROGRAMFILES_X86 "$ENV{ProgramFiles\(x86\)}")
+set(PROGRAMFILES_PATHS
+    "${PROGRAMFILES_ARM64}"
+    "${PROGRAMFILES}"
+    "${PROGRAMFILES_X86}"
+    "C:/Program Files (Arm)"
+    "C:/Program Files"
+    "C:/Program Files (x86)"
+)
+
+set(LLVM_INSTALL_PATHS "")
+foreach(PATH IN LISTS PROGRAMFILES_PATHS)
+    list(APPEND LLVM_INSTALL_PATHS "${PATH}/LLVM")
+
+    file(GLOB_RECURSE FOUND_LLVM_ROOT
+        "${PATH}/Microsoft Visual Studio/*/VC/Tools/Llvm/ARM64"
+        "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/Llvm/ARM64")
+
+    if(FOUND_LLVM_ROOT)
+        list(APPEND LLVM_INSTALL_PATHS ${FOUND_LLVM_ROOT})
+    endif()
+endforeach()
+
+if(DEFINED LLVM_ROOT AND EXISTS "${LLVM_ROOT}")
+    list(INSERT LLVM_INSTALL_PATHS 0 "${LLVM_ROOT}")
+endif()
+
+set(LLVM_ROOT "")
+foreach(PATH IN LISTS LLVM_INSTALL_PATHS)
+    if(EXISTS "${PATH}/bin/clang.exe" AND EXISTS "${PATH}/bin/clang++.exe" AND EXISTS "${PATH}/bin/llvm-rc.exe")
+        set(LLVM_ROOT "${PATH}")
+        break()
+    endif()
+endforeach()
+
+if(LLVM_ROOT STREQUAL "")
+    message(FATAL_ERROR "LLVM installation was not found. Please install LLVM for WoA (Windows on Arm): ${LLVM_INSTALLATION_URL}")
+endif()
+
+if (NOT EXISTS "${CMAKE_C_COMPILER}" OR NOT EXISTS "${CMAKE_CXX_COMPILER}" OR NOT EXISTS "${CMAKE_RC_COMPILER}")
+    set(CMAKE_C_COMPILER "${LLVM_ROOT}/bin/clang.exe")
+    set(CMAKE_CXX_COMPILER "${LLVM_ROOT}/bin/clang++.exe")
+    set(CMAKE_RC_COMPILER "${LLVM_ROOT}/bin/llvm-rc.exe")
+endif()
+
+if (NOT EXISTS "${CMAKE_C_COMPILER}")
+    message(FATAL_ERROR "Clang compiler not found at ${CMAKE_C_COMPILER}. Please reinstall LLVM for WoA (Windows on Arm): ${LLVM_INSTALLATION_URL}")
+endif()
+if (NOT EXISTS "${CMAKE_CXX_COMPILER}")
+    message(FATAL_ERROR "Clang++ compiler not found at ${CMAKE_CXX_COMPILER}. Please reinstall LLVM for WoA (Windows on Arm): ${LLVM_INSTALLATION_URL}")
+endif()
+if (NOT EXISTS "${CMAKE_RC_COMPILER}")
+    message(FATAL_ERROR "LLVM Resource Compiler not found at ${CMAKE_RC_COMPILER}. Please reinstall LLVM for WoA (Windows on Arm): ${LLVM_INSTALLATION_URL}")
+endif()
+
+set(arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only")
+set(warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments")
+
+set(CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}")
+set(CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}")
+
+if ((NOT DEFINED CMAKE_MAKE_PROGRAM OR NOT EXISTS CMAKE_MAKE_PROGRAM) AND (CMAKE_GENERATOR STREQUAL "Ninja" OR CMAKE_GENERATOR STREQUAL "Ninja Multi-Config"))
+    find_program(NINJA_EXECUTABLE ninja)
+
+    if(NINJA_EXECUTABLE AND EXISTS "${NINJA_EXECUTABLE}")
+        set(CMAKE_MAKE_PROGRAM "${NINJA_EXECUTABLE}")
+    else()
+        foreach(PATH IN LISTS PROGRAMFILES_PATHS)
+            file(GLOB_RECURSE FOUND_NINJA_EXE
+                "${PATH}/Microsoft Visual Studio/*/CMake/Ninja/ninja.exe"
+                "${PATH}/Microsoft Visual Studio/**/*/CMake/Ninja/ninja.exe")
+
+            if(FOUND_NINJA_EXE)
+                list(GET FOUND_NINJA_EXE 0 CMAKE_MAKE_PROGRAM)
+                break()
+            endif()
+        endforeach()
+    endif()
+
+    if (NOT CMAKE_MAKE_PROGRAM OR NOT EXISTS "${CMAKE_MAKE_PROGRAM}")
+        message(FATAL_ERROR "Ninja build system not found. Please install Ninja or Visual Studio Build Tools.")
+    endif()
+endif()
diff --git a/llama/toolchains/win32.host-x64.target-arm64.cmake b/llama/toolchains/win32.host-x64.target-arm64.cmake
index c5ae267e..da82b61b 100644
--- a/llama/toolchains/win32.host-x64.target-arm64.cmake
+++ b/llama/toolchains/win32.host-x64.target-arm64.cmake
@@ -1,41 +1,95 @@
 set(CMAKE_SYSTEM_NAME Windows)
-set(CMAKE_SYSTEM_PROCESSOR ARM64)
+set(CMAKE_SYSTEM_PROCESSOR arm64)
+
+set(target arm64-pc-windows-msvc)
+set(CMAKE_C_COMPILER_TARGET ${target})
+set(CMAKE_CXX_COMPILER_TARGET ${target})
+
+set(CMAKE_C_COMPILER clang)
+set(CMAKE_CXX_COMPILER clang++)
+set(CMAKE_RC_COMPILER llvm-rc)
+
+set(LLVM_INSTALLATION_URL "https://github.com/llvm/llvm-project/releases/tag/llvmorg-19.1.5")
 
-# Look for cl.exe in the Visual Studio installation directories
 set(PROGRAMFILES "$ENV{ProgramFiles}")
 set(PROGRAMFILES_X86 "$ENV{ProgramFiles\(x86\)}")
-
-set(VS_INSTALL_PATHS
-    "${PROGRAMFILES_X86}/Microsoft Visual Studio"
-    "${PROGRAMFILES}/Microsoft Visual Studio"
-    "C:/Program Files (x86)/Microsoft Visual Studio"
-    "C:/Program Files/Microsoft Visual Studio"
+set(PROGRAMFILES_PATHS
+    "${PROGRAMFILES}"
+    "${PROGRAMFILES_X86}"
+    "C:/Program Files"
+    "C:/Program Files (x86)"
 )
-foreach(PATH IN LISTS VS_INSTALL_PATHS)
-    if(CL_EXE_PATH)
-        break()
-    endif()
 
-    file(GLOB_RECURSE FOUND_CL_EXE "${PATH}/*/VC/Tools/MSVC/*/bin/Hostx64/arm64/cl.exe")
-    if(FOUND_CL_EXE)
-        list(GET FOUND_CL_EXE 0 CL_EXE_PATH)
-        break()
-    endif()
+set(LLVM_INSTALL_PATHS "")
+foreach(PATH IN LISTS PROGRAMFILES_PATHS)
+    list(APPEND LLVM_INSTALL_PATHS "${PATH}/LLVM")
 
-    if(CL_EXE_PATH)
-        break()
+    file(GLOB_RECURSE FOUND_LLVM_ROOT
+        "${PATH}/Microsoft Visual Studio/*/VC/Tools/Llvm/x64"
+        "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/Llvm/x64")
+
+    if(FOUND_LLVM_ROOT)
+        list(APPEND LLVM_INSTALL_PATHS ${FOUND_LLVM_ROOT})
     endif()
+endforeach()
 
-    file(GLOB_RECURSE FOUND_CL_EXE "${PATH}/**/*/VC/Tools/MSVC/*/bin/Hostx64/arm64/cl.exe")
-    if(FOUND_CL_EXE)
-        list(GET FOUND_CL_EXE 0 CL_EXE_PATH)
+if(DEFINED LLVM_ROOT AND EXISTS "${LLVM_ROOT}")
+    list(INSERT LLVM_INSTALL_PATHS 0 "${LLVM_ROOT}")
+endif()
+
+set(LLVM_ROOT "")
+foreach(PATH IN LISTS LLVM_INSTALL_PATHS)
+    if(EXISTS "${PATH}/bin/clang.exe" AND EXISTS "${PATH}/bin/clang++.exe" AND EXISTS "${PATH}/bin/llvm-rc.exe")
+        set(LLVM_ROOT "${PATH}")
         break()
     endif()
 endforeach()
 
-if(NOT CL_EXE_PATH)
-    message(FATAL_ERROR "cl.exe not found for ARM architecture.")
-else()
-    set(CMAKE_C_COMPILER "${CL_EXE_PATH}")
-    set(CMAKE_CXX_COMPILER "${CL_EXE_PATH}")
+if(LLVM_ROOT STREQUAL "")
+    message(FATAL_ERROR "LLVM installation was not found. Please install LLVM: ${LLVM_INSTALLATION_URL}")
+endif()
+
+if (NOT EXISTS "${CMAKE_C_COMPILER}" OR NOT EXISTS "${CMAKE_CXX_COMPILER}" OR NOT EXISTS "${CMAKE_RC_COMPILER}")
+    set(CMAKE_C_COMPILER "${LLVM_ROOT}/bin/clang.exe")
+    set(CMAKE_CXX_COMPILER "${LLVM_ROOT}/bin/clang++.exe")
+    set(CMAKE_RC_COMPILER "${LLVM_ROOT}/bin/llvm-rc.exe")
+endif()
+
+if (NOT EXISTS "${CMAKE_C_COMPILER}")
+    message(FATAL_ERROR "Clang compiler not found at ${CMAKE_C_COMPILER}. Please reinstall LLVM: ${LLVM_INSTALLATION_URL}")
+endif()
+if (NOT EXISTS "${CMAKE_CXX_COMPILER}")
+    message(FATAL_ERROR "Clang++ compiler not found at ${CMAKE_CXX_COMPILER}. Please reinstall LLVM: ${LLVM_INSTALLATION_URL}")
+endif()
+if (NOT EXISTS "${CMAKE_RC_COMPILER}")
+    message(FATAL_ERROR "LLVM Resource Compiler not found at ${CMAKE_RC_COMPILER}. Please reinstall LLVM: ${LLVM_INSTALLATION_URL}")
+endif()
+
+set(arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only")
+set(warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments")
+
+set(CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}")
+set(CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}")
+
+if ((NOT DEFINED CMAKE_MAKE_PROGRAM OR NOT EXISTS CMAKE_MAKE_PROGRAM) AND (CMAKE_GENERATOR STREQUAL "Ninja" OR CMAKE_GENERATOR STREQUAL "Ninja Multi-Config"))
+    find_program(NINJA_EXECUTABLE ninja)
+
+    if(NINJA_EXECUTABLE AND EXISTS "${NINJA_EXECUTABLE}")
+        set(CMAKE_MAKE_PROGRAM "${NINJA_EXECUTABLE}")
+    else()
+        foreach(PATH IN LISTS PROGRAMFILES_PATHS)
+            file(GLOB_RECURSE FOUND_NINJA_EXE
+                "${PATH}/Microsoft Visual Studio/*/CMake/Ninja/ninja.exe"
+                "${PATH}/Microsoft Visual Studio/**/*/CMake/Ninja/ninja.exe")
+
+            if(FOUND_NINJA_EXE)
+                list(GET FOUND_NINJA_EXE 0 CMAKE_MAKE_PROGRAM)
+                break()
+            endif()
+        endforeach()
+    endif()
+
+    if (NOT CMAKE_MAKE_PROGRAM OR NOT EXISTS "${CMAKE_MAKE_PROGRAM}")
+        message(FATAL_ERROR "Ninja build system not found. Please install Ninja or Visual Studio Build Tools.")
+    endif()
 endif()
diff --git a/src/bindings/utils/compileLLamaCpp.ts b/src/bindings/utils/compileLLamaCpp.ts
index e898829a..29a4967e 100644
--- a/src/bindings/utils/compileLLamaCpp.ts
+++ b/src/bindings/utils/compileLLamaCpp.ts
@@ -24,6 +24,7 @@ import {testCmakeBinary} from "./testCmakeBinary.js";
 import {getCudaNvccPaths} from "./detectAvailableComputeLayers.js";
 
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
+const buildConfigType: "Release" | "RelWithDebInfo" | "Debug" = "Release";
 
 export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions: {
     nodeTarget?: string,
@@ -75,18 +76,20 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                     await downloadCmakeIfNeeded(buildOptions.progressLogs);
 
                 const cmakePathArgs = await getCmakePathArgs();
+                const cmakeGeneratorArgs = getCmakeGeneratorArgs(buildOptions.platform, buildOptions.arch);
                 const toolchainFile = await getToolchainFileForArch(buildOptions.arch);
                 const runtimeVersion = nodeTarget.startsWith("v") ? nodeTarget.slice("v".length) : nodeTarget;
                 const cmakeCustomOptions = new Map(buildOptions.customCmakeOptions);
 
+                cmakeCustomOptions.set("CMAKE_CONFIGURATION_TYPES", buildConfigType);
+                cmakeCustomOptions.set("NLC_CURRENT_PLATFORM", platform + "-" + process.arch);
+                cmakeCustomOptions.set("NLC_TARGET_PLATFORM", buildOptions.platform + "-" + buildOptions.arch);
+
                 if (buildOptions.gpu === "metal" && process.platform === "darwin" && !cmakeCustomOptions.has("GGML_METAL"))
                     cmakeCustomOptions.set("GGML_METAL", "1");
                 else if (!cmakeCustomOptions.has("GGML_METAL"))
                     cmakeCustomOptions.set("GGML_METAL", "OFF");
 
-                // if (cmakeCustomOptions.get("GGML_METAL") === "1" && !cmakeCustomOptions.has("GGML_METAL_EMBED_LIBRARY"))
-                //     cmakeCustomOptions.set("GGML_METAL_EMBED_LIBRARY", "1");
-
                 if (buildOptions.gpu === "cuda" && !cmakeCustomOptions.has("GGML_CUDA"))
                     cmakeCustomOptions.set("GGML_CUDA", "1");
 
@@ -99,15 +102,21 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                 if (toolchainFile != null && !cmakeCustomOptions.has("CMAKE_TOOLCHAIN_FILE"))
                     cmakeCustomOptions.set("CMAKE_TOOLCHAIN_FILE", toolchainFile);
 
+                if (buildOptions.platform === "win" && buildOptions.arch === "arm64" && !cmakeCustomOptions.has("GGML_OPENMP"))
+                    cmakeCustomOptions.set("GGML_OPENMP", "OFF");
+
                 if (ciMode) {
                     if (!cmakeCustomOptions.has("GGML_OPENMP"))
                         cmakeCustomOptions.set("GGML_OPENMP", "OFF");
 
-                    if (!cmakeCustomOptions.has("GGML_NATIVE") && !(buildOptions.platform === "mac" && buildOptions.arch === "arm64")) {
+                    if (!cmakeCustomOptions.has("GGML_NATIVE") || isCmakeValueOff(cmakeCustomOptions.get("GGML_NATIVE"))) {
                         cmakeCustomOptions.set("GGML_NATIVE", "OFF");
 
-                        if (!cmakeCustomOptions.has("GGML_CPU_ALL_VARIANTS"))
+                        if (buildOptions.arch === "x64" && !cmakeCustomOptions.has("GGML_CPU_ALL_VARIANTS")) {
                             cmakeCustomOptions.set("GGML_CPU_ALL_VARIANTS", "ON");
+                            cmakeCustomOptions.set("GGML_BACKEND_DL", "ON");
+                        } else if (!cmakeCustomOptions.has("GGML_BACKEND_DL"))
+                            cmakeCustomOptions.set("GGML_BACKEND_DL", "ON");
                     }
                 }
 
@@ -131,11 +140,12 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                     [
                         "run", "-s", "cmake-js-llama", "--", "compile",
                         "--log-level", "warn",
-                        "--config", "Release",
+                        "--config", buildConfigType,
                         "--arch=" + buildOptions.arch,
                         "--out", path.relative(llamaDirectory, outDirectory),
                         "--runtime-version=" + runtimeVersion,
                         "--parallel=" + parallelBuildThreads,
+                        ...cmakeGeneratorArgs,
                         ...cmakePathArgs,
                         ...(
                             [...cmakeCustomOptions].map(([key, value]) => "--CD" + key + "=" + value)
@@ -150,10 +160,10 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                     path.join(outDirectory, "bin"),
                     path.join(outDirectory, "llama.cpp", "bin")
                 ];
-                const compiledResultDirPath = path.join(outDirectory, "Release");
+                const compiledResultDirPath = path.join(outDirectory, buildConfigType);
 
                 if (!await fs.pathExists(compiledResultDirPath))
-                    throw new Error("Could not find Release directory");
+                    throw new Error(`Could not find ${buildConfigType} directory`);
 
                 for (const binFilesDirPath of binFilesDirPaths) {
                     if (await fs.pathExists(binFilesDirPath)) {
@@ -302,8 +312,8 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
 }
 
 export async function getLocalBuildBinaryPath(folderName: string) {
-    const binaryPath = path.join(llamaLocalBuildBinsDirectory, folderName, "Release", "llama-addon.node");
-    const buildMetadataFilePath = path.join(llamaLocalBuildBinsDirectory, folderName, "Release", buildMetadataFileName);
+    const binaryPath = path.join(llamaLocalBuildBinsDirectory, folderName, buildConfigType, "llama-addon.node");
+    const buildMetadataFilePath = path.join(llamaLocalBuildBinsDirectory, folderName, buildConfigType, buildMetadataFileName);
     const buildDoneStatusPath = path.join(llamaLocalBuildBinsDirectory, folderName, "buildDone.status");
 
     const [
@@ -323,7 +333,7 @@ export async function getLocalBuildBinaryPath(folderName: string) {
 }
 
 export async function getLocalBuildBinaryBuildMetadata(folderName: string) {
-    const buildMetadataFilePath = path.join(llamaLocalBuildBinsDirectory, folderName, "Release", buildMetadataFileName);
+    const buildMetadataFilePath = path.join(llamaLocalBuildBinsDirectory, folderName, buildConfigType, buildMetadataFileName);
 
     if (!(await fs.pathExists(buildMetadataFilePath)))
         throw new Error(`Could not find build metadata file for local build "${folderName}"`);
@@ -372,7 +382,7 @@ export async function getPrebuiltBinaryBuildMetadata(folderPath: string, folderN
 }
 
 async function applyResultDirFixes(resultDirPath: string, tempDirPath: string) {
-    const releaseDirPath = path.join(resultDirPath, "Release");
+    const releaseDirPath = path.join(resultDirPath, buildConfigType);
 
     if (await fs.pathExists(releaseDirPath)) {
         await fs.remove(tempDirPath);
@@ -489,7 +499,7 @@ async function getCmakePathArgs() {
 }
 
 async function getToolchainFileForArch(targetArch: string) {
-    if (process.arch === targetArch)
+    if (process.arch === targetArch || (process.platform === "win32" && process.arch === "arm64"))
         return null;
 
     const platform = process.platform;
@@ -500,11 +510,18 @@ async function getToolchainFileForArch(targetArch: string) {
     const filePath = path.join(llamaToolchainsDirectory, toolchainFilename);
 
     if (await fs.pathExists(filePath))
-        return filePath;
+        return path.resolve(filePath);
 
     return null;
 }
 
+function getCmakeGeneratorArgs(targetPlatform: BinaryPlatform, targetArch: string) {
+    if (targetPlatform === "win" && targetArch === "arm64")
+        return ["--generator", "Ninja Multi-Config"];
+
+    return [];
+}
+
 function getParallelBuildThreadsToUse(platform: BinaryPlatform, gpu?: BuildGpu, ciMode: boolean = false) {
     const cpuCount = os.cpus().length;
 
@@ -523,3 +540,7 @@ function getParallelBuildThreadsToUse(platform: BinaryPlatform, gpu?: BuildGpu,
 function reduceParallelBuildThreads(originalParallelBuildThreads: number) {
     return Math.max(1, Math.round(originalParallelBuildThreads / 2));
 }
+
+function isCmakeValueOff(value?: string) {
+    return value === "OFF" || value === "0";
+}
diff --git a/src/bindings/utils/detectAvailableComputeLayers.ts b/src/bindings/utils/detectAvailableComputeLayers.ts
index 3cae6d79..f04ea6d1 100644
--- a/src/bindings/utils/detectAvailableComputeLayers.ts
+++ b/src/bindings/utils/detectAvailableComputeLayers.ts
@@ -377,15 +377,15 @@ function getWindir() {
 }
 
 
-async function getWindowsProgramFilesPaths() {
+export async function getWindowsProgramFilesPaths() {
     const potentialPaths = await Promise.all(
         [
+            process.env["ProgramFiles(Arm)"],
             process.env.ProgramFiles,
             process.env["ProgramFiles(x86)"],
-            process.env["ProgramFiles(Arm)"],
+            `${process.env.SystemDrive ?? "C:"}\\Program Files (Arm)`,
             `${process.env.SystemDrive ?? "C:"}\\Program Files`,
-            `${process.env.SystemDrive ?? "C:"}\\Program Files (x86)`,
-            `${process.env.SystemDrive ?? "C:"}\\Program Files (Arm)`
+            `${process.env.SystemDrive ?? "C:"}\\Program Files (x86)`
         ]
             .map(async (programFilesPath) => {
                 if (programFilesPath == null)
diff --git a/src/utils/cmake.ts b/src/utils/cmake.ts
index cf80d6a1..8195591e 100644
--- a/src/utils/cmake.ts
+++ b/src/utils/cmake.ts
@@ -8,6 +8,8 @@ import {
     xpackDirectory, xpmVersion
 } from "../config.js";
 import {logDistroInstallInstruction} from "../bindings/utils/logDistroInstallInstruction.js";
+import {getPlatform} from "../bindings/utils/getPlatform.js";
+import {getWindowsProgramFilesPaths} from "../bindings/utils/detectAvailableComputeLayers.js";
 import {spawnCommand} from "./spawnCommand.js";
 import withStatusLogs from "./withStatusLogs.js";
 import {withLockfile} from "./withLockfile.js";
@@ -32,6 +34,12 @@ export async function getCmakePath() {
             return resolvedPath;
     } catch (err) {}
 
+    try {
+        const existingCmake = await findExistingCmake();
+        if (existingCmake != null)
+            return existingCmake;
+    } catch (err) {}
+
     try {
         let resolvedPath = await which("cmake", {
             path: path.join(llamaDirectory, "xpack", "xpacks", ".bin")
@@ -99,6 +107,66 @@ export async function fixXpackPermissions() {
     } catch (err) {}
 }
 
+async function findExistingCmake() {
+    const platform = getPlatform();
+
+    if (platform === "win") {
+        const programFilesPaths = await getWindowsProgramFilesPaths();
+        const potentialVisualStudioPaths = programFilesPaths
+            .map((programFilesPath) => `${programFilesPath}/Microsoft Visual Studio`);
+
+        const versionPaths = (await Promise.all(
+            potentialVisualStudioPaths.map(async (vsPath) => {
+                if (await fs.pathExists(vsPath)) {
+                    const versions = await fs.readdir(vsPath);
+                    return versions
+                        .sort((a, b) => {
+                            const aNumber = parseInt(a);
+                            const bNumber = parseInt(b);
+
+                            if (Number.isFinite(aNumber) && Number.isFinite(bNumber))
+                                return bNumber - aNumber;
+                            else if (Number.isFinite(aNumber))
+                                return -1;
+                            else if (Number.isFinite(bNumber))
+                                return 1;
+
+                            return 0;
+                        })
+                        .map((version) => path.join(vsPath, version));
+                }
+
+                return [];
+            })
+        )).flat();
+
+        const editionPaths = (await Promise.all(
+            versionPaths.map(async (versionPath) => {
+                const editions = await fs.readdir(versionPath);
+                return editions.map((edition) => path.join(versionPath, edition));
+            })
+        )).flat();
+
+        const potentialCmakePaths = editionPaths.map((editionPath) => (
+            path.join(editionPath, "Common7", "IDE", "CommonExtensions", "Microsoft", "CMake", "CMake", "bin", "cmake.exe")
+        ));
+
+        const cmakePaths = (await Promise.all(
+            potentialCmakePaths.map(async (cmakePath) => {
+                if (await fs.pathExists(cmakePath))
+                    return cmakePath;
+
+                return null;
+            })
+        ))
+            .filter((cmakePath) => cmakePath != null);
+
+        return cmakePaths[0];
+    }
+
+    return undefined;
+}
+
 async function downloadCmake({progressLogs = true}: {progressLogs?: boolean} = {}) {
     await withLockfile({
         resourcePath: path.join(xpackDirectory, "cmakeInstall")

From 602cb3ce93603c008c1decdef75eb361df169854 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sun, 29 Dec 2024 04:41:46 +0200
Subject: [PATCH 20/73] fix: embedding context deadlock

---
 package-lock.json                      | 14 ++++++++++----
 package.json                           |  2 +-
 src/evaluator/LlamaModel/LlamaModel.ts |  9 +--------
 3 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/package-lock.json b/package-lock.json
index 4286a8eb..d5869f27 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -24,7 +24,7 @@
         "ignore": "^5.3.2",
         "ipull": "^3.9.2",
         "is-unicode-supported": "^2.1.0",
-        "lifecycle-utils": "^1.7.2",
+        "lifecycle-utils": "^2.0.0",
         "log-symbols": "^7.0.0",
         "nanoid": "^5.0.9",
         "node-addon-api": "^8.3.0",
@@ -9721,6 +9721,12 @@
         "@reflink/reflink": "^0.1.16"
       }
     },
+    "node_modules/ipull/node_modules/lifecycle-utils": {
+      "version": "1.7.3",
+      "resolved": "https://registry.npmjs.org/lifecycle-utils/-/lifecycle-utils-1.7.3.tgz",
+      "integrity": "sha512-T7zs7J6/sgsqwVyG34Sfo5LTQmlPmmqaUe3yBhdF8nq24RtR/HtbkNZRhNbr9BEaKySdSgH+P9H5U9X+p0WjXw==",
+      "license": "MIT"
+    },
     "node_modules/ipull/node_modules/parse-ms": {
       "version": "3.0.0",
       "resolved": "https://registry.npmjs.org/parse-ms/-/parse-ms-3.0.0.tgz",
@@ -10458,9 +10464,9 @@
       }
     },
     "node_modules/lifecycle-utils": {
-      "version": "1.7.2",
-      "resolved": "https://registry.npmjs.org/lifecycle-utils/-/lifecycle-utils-1.7.2.tgz",
-      "integrity": "sha512-HwjXBOf6FUun3liJ2XA3bNR7WZoWn0b3ji9fSvQ7qvnfhPd1UCuq3M6If44Rl3K14373gm/eyeIYbpDAfB9cbw==",
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/lifecycle-utils/-/lifecycle-utils-2.0.0.tgz",
+      "integrity": "sha512-KIkV6NeD2n0jZnO+fdIGKI5Or7alyhb6UTFzeaqf6EnE5y3pdK821+kd7yOMBUL/sPYhHU5ny74J0QKslLikGw==",
       "license": "MIT"
     },
     "node_modules/lines-and-columns": {
diff --git a/package.json b/package.json
index 152007c3..c61fd7ce 100644
--- a/package.json
+++ b/package.json
@@ -190,7 +190,7 @@
     "ignore": "^5.3.2",
     "ipull": "^3.9.2",
     "is-unicode-supported": "^2.1.0",
-    "lifecycle-utils": "^1.7.2",
+    "lifecycle-utils": "^2.0.0",
     "log-symbols": "^7.0.0",
     "nanoid": "^5.0.9",
     "node-addon-api": "^8.3.0",
diff --git a/src/evaluator/LlamaModel/LlamaModel.ts b/src/evaluator/LlamaModel/LlamaModel.ts
index ca2e2c4e..f8d609e8 100644
--- a/src/evaluator/LlamaModel/LlamaModel.ts
+++ b/src/evaluator/LlamaModel/LlamaModel.ts
@@ -522,14 +522,7 @@ export class LlamaModel {
         if (this._vocabOnly)
             throw new Error("Model is loaded in vocabOnly mode, so no context can be created");
 
-        return await withLock(this._llama._memoryLock, LlamaLocks.loadToMemory, options.createSignal, async () => {
-            const preventDisposalHandle = this._backendModelDisposeGuard.createPreventDisposalHandle();
-            try {
-                return await LlamaEmbeddingContext._create({_model: this}, options);
-            } finally {
-                preventDisposalHandle.dispose();
-            }
-        });
+        return await LlamaEmbeddingContext._create({_model: this}, options);
     }
 
     /**

From 275c0052137569bd266407240ad3fa5a0df02d8f Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Mon, 30 Dec 2024 00:03:28 +0200
Subject: [PATCH 21/73] feat: try compiling with LLVM on Windows x64 when
 available

for improved performance and compatibility
---
 llama/CMakeLists.txt                          |   3 +
 .../llvm.win32.host-x64.target-x64.cmake      |  72 ++++++++
 .../llvm.win32.host-x64.target-x64.cmake      |  90 ++++++++++
 src/bindings/utils/asyncSome.ts               |   3 +
 src/bindings/utils/compileLLamaCpp.ts         |  44 ++++-
 src/bindings/utils/detectBuildTools.ts        | 163 ++++++++++++++++++
 src/utils/cmake.ts                            |  38 +---
 7 files changed, 370 insertions(+), 43 deletions(-)
 create mode 100644 llama/profiles/llvm.win32.host-x64.target-x64.cmake
 create mode 100644 llama/toolchains/llvm.win32.host-x64.target-x64.cmake
 create mode 100644 src/bindings/utils/detectBuildTools.ts

diff --git a/llama/CMakeLists.txt b/llama/CMakeLists.txt
index 647b3e2b..7e9f18a1 100644
--- a/llama/CMakeLists.txt
+++ b/llama/CMakeLists.txt
@@ -8,6 +8,9 @@ if (NLC_TARGET_PLATFORM STREQUAL "win-arm64" AND (CMAKE_GENERATOR STREQUAL "Ninj
         get_filename_component(INCLUDE_PROFILE_ABS "./profiles/llvm.win32.host-arm64.target-arm64.cmake" ABSOLUTE)
         include("${INCLUDE_PROFILE_ABS}")
     endif()
+elseif (NLC_CURRENT_PLATFORM STREQUAL "win-x64" AND NLC_TARGET_PLATFORM STREQUAL "win-x64" AND (CMAKE_GENERATOR STREQUAL "Ninja" OR CMAKE_GENERATOR STREQUAL "Ninja Multi-Config") AND NOT MINGW)
+    get_filename_component(INCLUDE_PROFILE_ABS "./profiles/llvm.win32.host-x64.target-x64.cmake" ABSOLUTE)
+    include("${INCLUDE_PROFILE_ABS}")
 endif()
 
 project("llama-addon" C CXX)
diff --git a/llama/profiles/llvm.win32.host-x64.target-x64.cmake b/llama/profiles/llvm.win32.host-x64.target-x64.cmake
new file mode 100644
index 00000000..f02b0e8e
--- /dev/null
+++ b/llama/profiles/llvm.win32.host-x64.target-x64.cmake
@@ -0,0 +1,72 @@
+set(PROGRAMFILES "$ENV{ProgramFiles}")
+set(PROGRAMFILES_X86 "$ENV{ProgramFiles\(x86\)}")
+set(PROGRAMFILES_PATHS
+    "${PROGRAMFILES}"
+    "${PROGRAMFILES_X86}"
+    "C:/Program Files"
+    "C:/Program Files (x86)"
+)
+
+if (CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
+    if (NOT DEFINED NODE_LIB_CMAKE_AR)
+        foreach(PATH IN LISTS PROGRAMFILES_PATHS)
+            if(NODE_LIB_CMAKE_AR)
+                break()
+            endif()
+
+            file(GLOB_RECURSE FOUND_LIB_EXE
+                "${PATH}/Microsoft Visual Studio/*/VC/Tools/MSVC/*/bin/Hostx64/x64/lib.exe"
+                "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/MSVC/*/bin/Hostx64/x64/lib.exe")
+
+            if(FOUND_LIB_EXE)
+                list(GET FOUND_LIB_EXE 0 NODE_LIB_CMAKE_AR)
+                break()
+            endif()
+        endforeach()
+    endif()
+
+    if (EXISTS "${NODE_LIB_CMAKE_AR}")
+        # Generate node.lib
+        execute_process(COMMAND ${NODE_LIB_CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS} /MACHINE:X64 /nologo)
+    else()
+        message(FATAL_ERROR "Windows Resource Compiler (lib.exe) not found. Please install Visual Studio Build Tools.")
+    endif()
+endif()
+
+# adapt cmake-js to work with llvm in GNU mode
+if (NOT CMAKE_SHARED_LINKER_FLAGS MATCHES "-Xlinker /DELAYLOAD:NODE.EXE")
+    string(REPLACE "/DELAYLOAD:NODE.EXE" "-Xlinker /DELAYLOAD:NODE.EXE -Xlinker /defaultlib:delayimp"
+        CMAKE_SHARED_LINKER_FLAGS
+        "${CMAKE_SHARED_LINKER_FLAGS}")
+endif()
+
+set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Xclang --dependent-lib=msvcrt")
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Xclang --dependent-lib=msvcrt")
+
+# ensure CMAKE_AR is configured
+if (NOT DEFINED CMAKE_AR OR NOT EXISTS "${CMAKE_AR}")
+    set(LLVM_INSTALL_PATHS "")
+    foreach(PATH IN LISTS PROGRAMFILES_PATHS)
+        list(APPEND LLVM_INSTALL_PATHS "${PATH}/LLVM")
+
+        file(GLOB_RECURSE FOUND_LLVM_ROOT
+            "${PATH}/Microsoft Visual Studio/*/VC/Tools/Llvm/x64"
+            "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/Llvm/x64")
+
+        if(FOUND_LLVM_ROOT)
+            list(APPEND LLVM_INSTALL_PATHS ${FOUND_LLVM_ROOT})
+        endif()
+    endforeach()
+
+    if(DEFINED LLVM_ROOT AND EXISTS "${LLVM_ROOT}")
+        list(INSERT LLVM_INSTALL_PATHS 0 "${LLVM_ROOT}")
+    endif()
+
+    foreach(PATH IN LISTS LLVM_INSTALL_PATHS)
+        if(EXISTS "${PATH}/bin/llvm-ar.exe" AND EXISTS "${PATH}/bin/llvm-ar.exe")
+            set(CMAKE_AR "${PATH}/bin/llvm-ar.exe")
+            break()
+        endif()
+    endforeach()
+endif()
\ No newline at end of file
diff --git a/llama/toolchains/llvm.win32.host-x64.target-x64.cmake b/llama/toolchains/llvm.win32.host-x64.target-x64.cmake
new file mode 100644
index 00000000..ec4d495c
--- /dev/null
+++ b/llama/toolchains/llvm.win32.host-x64.target-x64.cmake
@@ -0,0 +1,90 @@
+set(CMAKE_SYSTEM_NAME Windows)
+set(CMAKE_SYSTEM_PROCESSOR x86_64)
+
+set(CMAKE_C_COMPILER clang)
+set(CMAKE_CXX_COMPILER clang++)
+set(CMAKE_RC_COMPILER llvm-rc)
+
+set(LLVM_INSTALLATION_URL "https://github.com/llvm/llvm-project/releases/tag/llvmorg-19.1.5")
+
+set(PROGRAMFILES "$ENV{ProgramFiles}")
+set(PROGRAMFILES_X86 "$ENV{ProgramFiles\(x86\)}")
+set(PROGRAMFILES_PATHS
+    "${PROGRAMFILES}"
+    "${PROGRAMFILES_X86}"
+    "C:/Program Files"
+    "C:/Program Files (x86)"
+)
+
+set(LLVM_INSTALL_PATHS "")
+foreach(PATH IN LISTS PROGRAMFILES_PATHS)
+    list(APPEND LLVM_INSTALL_PATHS "${PATH}/LLVM")
+
+    file(GLOB_RECURSE FOUND_LLVM_ROOT
+        "${PATH}/Microsoft Visual Studio/*/VC/Tools/Llvm/x64"
+        "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/Llvm/x64")
+
+    if(FOUND_LLVM_ROOT)
+        list(APPEND LLVM_INSTALL_PATHS ${FOUND_LLVM_ROOT})
+    endif()
+endforeach()
+
+if(DEFINED LLVM_ROOT AND EXISTS "${LLVM_ROOT}")
+    list(INSERT LLVM_INSTALL_PATHS 0 "${LLVM_ROOT}")
+endif()
+
+set(LLVM_ROOT "")
+foreach(PATH IN LISTS LLVM_INSTALL_PATHS)
+    if(EXISTS "${PATH}/bin/clang.exe" AND EXISTS "${PATH}/bin/clang++.exe" AND EXISTS "${PATH}/bin/llvm-rc.exe")
+        set(LLVM_ROOT "${PATH}")
+        break()
+    endif()
+endforeach()
+
+if(LLVM_ROOT STREQUAL "")
+    message(FATAL_ERROR "LLVM installation was not found. Please install LLVM: ${LLVM_INSTALLATION_URL}")
+endif()
+
+if (NOT EXISTS "${CMAKE_C_COMPILER}" OR NOT EXISTS "${CMAKE_CXX_COMPILER}" OR NOT EXISTS "${CMAKE_RC_COMPILER}")
+    set(CMAKE_C_COMPILER "${LLVM_ROOT}/bin/clang.exe")
+    set(CMAKE_CXX_COMPILER "${LLVM_ROOT}/bin/clang++.exe")
+    set(CMAKE_RC_COMPILER "${LLVM_ROOT}/bin/llvm-rc.exe")
+endif()
+
+if (NOT EXISTS "${CMAKE_C_COMPILER}")
+    message(FATAL_ERROR "Clang compiler not found at ${CMAKE_C_COMPILER}. Please reinstall LLVM: ${LLVM_INSTALLATION_URL}")
+endif()
+if (NOT EXISTS "${CMAKE_CXX_COMPILER}")
+    message(FATAL_ERROR "Clang++ compiler not found at ${CMAKE_CXX_COMPILER}. Please reinstall LLVM: ${LLVM_INSTALLATION_URL}")
+endif()
+if (NOT EXISTS "${CMAKE_RC_COMPILER}")
+    message(FATAL_ERROR "LLVM Resource Compiler not found at ${CMAKE_RC_COMPILER}. Please reinstall LLVM: ${LLVM_INSTALLATION_URL}")
+endif()
+
+set(arch_c_flags "-march=native")
+
+set(CMAKE_C_FLAGS_INIT "${arch_c_flags}")
+set(CMAKE_CXX_FLAGS_INIT "${arch_c_flags}")
+
+if ((NOT DEFINED CMAKE_MAKE_PROGRAM OR NOT EXISTS CMAKE_MAKE_PROGRAM) AND (CMAKE_GENERATOR STREQUAL "Ninja" OR CMAKE_GENERATOR STREQUAL "Ninja Multi-Config"))
+    find_program(NINJA_EXECUTABLE ninja)
+
+    if(NINJA_EXECUTABLE AND EXISTS "${NINJA_EXECUTABLE}")
+        set(CMAKE_MAKE_PROGRAM "${NINJA_EXECUTABLE}")
+    else()
+        foreach(PATH IN LISTS PROGRAMFILES_PATHS)
+            file(GLOB_RECURSE FOUND_NINJA_EXE
+                "${PATH}/Microsoft Visual Studio/*/CMake/Ninja/ninja.exe"
+                "${PATH}/Microsoft Visual Studio/**/*/CMake/Ninja/ninja.exe")
+
+            if(FOUND_NINJA_EXE)
+                list(GET FOUND_NINJA_EXE 0 CMAKE_MAKE_PROGRAM)
+                break()
+            endif()
+        endforeach()
+    endif()
+
+    if (NOT CMAKE_MAKE_PROGRAM OR NOT EXISTS "${CMAKE_MAKE_PROGRAM}")
+        message(FATAL_ERROR "Ninja build system not found. Please install Ninja or Visual Studio Build Tools.")
+    endif()
+endif()
diff --git a/src/bindings/utils/asyncSome.ts b/src/bindings/utils/asyncSome.ts
index 3cd38f16..d3e979cd 100644
--- a/src/bindings/utils/asyncSome.ts
+++ b/src/bindings/utils/asyncSome.ts
@@ -5,6 +5,9 @@ import {getConsoleLogPrefix} from "../../utils/getConsoleLogPrefix.js";
  * Note that this function will not throw on error and instead will log the error to the console.
  */
 export async function asyncSome(promises: Promise<boolean>[]): Promise<boolean> {
+    if (promises.length === 0)
+        return Promise.resolve(false);
+
     return new Promise((resolve) => {
         let fulfilled = 0;
 
diff --git a/src/bindings/utils/compileLLamaCpp.ts b/src/bindings/utils/compileLLamaCpp.ts
index 29a4967e..7f90aa4f 100644
--- a/src/bindings/utils/compileLLamaCpp.ts
+++ b/src/bindings/utils/compileLLamaCpp.ts
@@ -22,6 +22,7 @@ import {BinaryPlatform, getPlatform} from "./getPlatform.js";
 import {logDistroInstallInstruction} from "./logDistroInstallInstruction.js";
 import {testCmakeBinary} from "./testCmakeBinary.js";
 import {getCudaNvccPaths} from "./detectAvailableComputeLayers.js";
+import {detectWindowsBuildTools} from "./detectBuildTools.js";
 
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 const buildConfigType: "Release" | "RelWithDebInfo" | "Debug" = "Release";
@@ -32,7 +33,7 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
     includeBuildOptionsInBinaryFolderName?: boolean,
     ensureLlamaCppRepoIsCloned?: boolean,
     downloadCmakeIfNeeded?: boolean,
-    ignoreWorkarounds?: ("cudaArchitecture" | "reduceParallelBuildThreads" | "singleBuildThread")[],
+    ignoreWorkarounds?: ("cudaArchitecture" | "reduceParallelBuildThreads" | "singleBuildThread" | "avoidWindowsLlvm")[],
     envVars?: typeof process.env,
     ciMode?: boolean
 }): Promise<void> {
@@ -52,6 +53,9 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
     const finalBuildFolderName = includeBuildOptionsInBinaryFolderName
         ? buildFolderName.withCustomCmakeOptions
         : buildFolderName.withoutCustomCmakeOptions;
+    const useWindowsLlvm = (platform === "win" && !ignoreWorkarounds.includes("avoidWindowsLlvm"))
+        ? areWindowsBuildToolsCapableForLlvmBuild(await detectWindowsBuildTools())
+        : false;
 
     const outDirectory = path.join(llamaLocalBuildBinsDirectory, finalBuildFolderName);
 
@@ -76,8 +80,8 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                     await downloadCmakeIfNeeded(buildOptions.progressLogs);
 
                 const cmakePathArgs = await getCmakePathArgs();
-                const cmakeGeneratorArgs = getCmakeGeneratorArgs(buildOptions.platform, buildOptions.arch);
-                const toolchainFile = await getToolchainFileForArch(buildOptions.arch);
+                const cmakeGeneratorArgs = getCmakeGeneratorArgs(buildOptions.platform, buildOptions.arch, useWindowsLlvm);
+                const toolchainFile = await getToolchainFileForArch(buildOptions.arch, useWindowsLlvm);
                 const runtimeVersion = nodeTarget.startsWith("v") ? nodeTarget.slice("v".length) : nodeTarget;
                 const cmakeCustomOptions = new Map(buildOptions.customCmakeOptions);
 
@@ -306,6 +310,20 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                 chalk.yellow("To resolve errors related to Vulkan compilation, see the Vulkan guide: ") +
                 documentationPageUrls.Vulkan
             );
+        else if (useWindowsLlvm) {
+            if (buildOptions.progressLogs)
+                console.info(getConsoleLogPrefix(true) + "Trying to compile again without LLVM");
+
+            try {
+                return await compileLlamaCpp(buildOptions, {
+                    ...compileOptions,
+                    ignoreWorkarounds: [...ignoreWorkarounds, "avoidWindowsLlvm"]
+                });
+            } catch (err) {
+                if (buildOptions.progressLogs)
+                    console.error(getConsoleLogPrefix(true, false), err);
+            }
+        }
 
         throw err;
     }
@@ -498,14 +516,20 @@ async function getCmakePathArgs() {
     return ["--cmake-path", cmakePath];
 }
 
-async function getToolchainFileForArch(targetArch: string) {
-    if (process.arch === targetArch || (process.platform === "win32" && process.arch === "arm64"))
+async function getToolchainFileForArch(targetArch: string, windowsLlvmSupport: boolean = false) {
+    let toolchainPrefix = "";
+
+    if (process.platform === "win32" && process.arch === "arm64") {
+        // a toolchain is needed to cross-compile to arm64 on Windows, and to compile on arm64 on Windows
+    } else if (process.platform === "win32" && process.arch === "x64" && targetArch === "x64" && windowsLlvmSupport) {
+        toolchainPrefix = "llvm.";
+    } else if (process.arch === targetArch)
         return null;
 
     const platform = process.platform;
     const hostArch = process.arch;
 
-    const toolchainFilename = `${platform}.host-${hostArch}.target-${targetArch}.cmake`;
+    const toolchainFilename = `${toolchainPrefix}${platform}.host-${hostArch}.target-${targetArch}.cmake`;
 
     const filePath = path.join(llamaToolchainsDirectory, toolchainFilename);
 
@@ -515,9 +539,11 @@ async function getToolchainFileForArch(targetArch: string) {
     return null;
 }
 
-function getCmakeGeneratorArgs(targetPlatform: BinaryPlatform, targetArch: string) {
+function getCmakeGeneratorArgs(targetPlatform: BinaryPlatform, targetArch: string, windowsLlvmSupport: boolean) {
     if (targetPlatform === "win" && targetArch === "arm64")
         return ["--generator", "Ninja Multi-Config"];
+    else if (windowsLlvmSupport && targetPlatform === "win" && process.arch === "x64" && targetArch === "x64")
+        return ["--generator", "Ninja Multi-Config"];
 
     return [];
 }
@@ -544,3 +570,7 @@ function reduceParallelBuildThreads(originalParallelBuildThreads: number) {
 function isCmakeValueOff(value?: string) {
     return value === "OFF" || value === "0";
 }
+
+function areWindowsBuildToolsCapableForLlvmBuild(detectedBuildTools: Awaited<ReturnType<typeof detectWindowsBuildTools>>) {
+    return detectedBuildTools.hasLlvm && detectedBuildTools.hasNinja && detectedBuildTools.hasLibExe;
+}
diff --git a/src/bindings/utils/detectBuildTools.ts b/src/bindings/utils/detectBuildTools.ts
new file mode 100644
index 00000000..e10884cc
--- /dev/null
+++ b/src/bindings/utils/detectBuildTools.ts
@@ -0,0 +1,163 @@
+import path from "path";
+import fs from "fs-extra";
+import {getWindowsProgramFilesPaths} from "./detectAvailableComputeLayers.js";
+import {asyncSome} from "./asyncSome.js";
+import {asyncEvery} from "./asyncEvery.js";
+import {getPlatform} from "./getPlatform.js";
+
+/**
+ * On platforms other than Windows, this function will return an empty array
+ * @returns Visual Studio edition installation paths - the paths are ordered from the most recent version to the oldest
+ */
+export async function getWindowsVisualStudioEditionPaths() {
+    const platform = getPlatform();
+
+    if (platform !== "win")
+        return [];
+
+    const programFilesPaths = await getWindowsProgramFilesPaths();
+    const potentialVisualStudioPaths = programFilesPaths
+        .map((programFilesPath) => `${programFilesPath}/Microsoft Visual Studio`);
+
+    const versionPaths = (await Promise.all(
+        potentialVisualStudioPaths.map(async (vsPath) => {
+            if (await fs.pathExists(vsPath)) {
+                const versions = await fs.readdir(vsPath);
+                return versions
+                    .sort((a, b) => {
+                        const aNumber = parseInt(a);
+                        const bNumber = parseInt(b);
+
+                        if (Number.isFinite(aNumber) && Number.isFinite(bNumber))
+                            return bNumber - aNumber;
+                        else if (Number.isFinite(aNumber))
+                            return -1;
+                        else if (Number.isFinite(bNumber))
+                            return 1;
+
+                        return 0;
+                    })
+                    .map((version) => path.join(vsPath, version));
+            }
+
+            return [];
+        })
+    )).flat();
+
+    const editionPaths = (await Promise.all(
+        versionPaths.map(async (versionPath) => {
+            const editions = await fs.readdir(versionPath);
+            return editions.map((edition) => path.join(versionPath, edition));
+        })
+    )).flat();
+
+    return editionPaths;
+}
+
+export async function detectWindowsBuildTools(targetArch: typeof process.arch = process.arch) {
+    try {
+        const currentArch = process.arch;
+        const editionPaths = await getWindowsVisualStudioEditionPaths();
+
+        if (editionPaths.length === 0)
+            return {
+                hasCmake: false,
+                hasNinja: false,
+                hasLlvm: false,
+                hasLibExe: false
+            };
+
+        const msvcPaths = (await Promise.all(
+            editionPaths.map(async (editionPath) => {
+                const msvcVersionsPath = path.join(editionPath, "VC", "Tools", "MSVC");
+
+                if (await fs.pathExists(msvcVersionsPath)) {
+                    const msvcVersions = await fs.readdir(msvcVersionsPath);
+                    return msvcVersions
+                        .sort((a, b) => {
+                            const aNumber = parseInt(a);
+                            const bNumber = parseInt(b);
+
+                            if (Number.isFinite(aNumber) && Number.isFinite(bNumber))
+                                return bNumber - aNumber;
+                            else if (Number.isFinite(aNumber))
+                                return -1;
+                            else if (Number.isFinite(bNumber))
+                                return 1;
+
+                            return 0;
+                        })
+                        .map((msvcVersion) => path.join(msvcVersionsPath, msvcVersion));
+                }
+
+                return [];
+            })
+        )).flat();
+
+        const potentialCmakePaths = editionPaths.map((editionPath) => (
+            path.join(editionPath, "Common7", "IDE", "CommonExtensions", "Microsoft", "CMake", "CMake", "bin", "cmake.exe")
+        ));
+        const potentialNinjaPaths = editionPaths.map((editionPath) => (
+            path.join(editionPath, "Common7", "IDE", "CommonExtensions", "Microsoft", "CMake", "Ninja", "ninja.exe")
+        ));
+        const potentialLlvmPaths = editionPaths.map((editionPath) => {
+            if (currentArch === "x64")
+                return path.join(editionPath, "VC", "Tools", "Llvm", "x64", "bin");
+            else if (currentArch === "arm64")
+                return path.join(editionPath, "VC", "Tools", "Llvm", "ARM64", "bin");
+
+            return path.join(editionPath, "VC", "Tools", "Llvm", "bin");
+        });
+        const potentialLibExePaths = msvcPaths.map((msvcPath) => {
+            const hostArchDirName = currentArch === "x64"
+                ? "Hostx64"
+                : currentArch === "arm64"
+                    ? "Hostarm64"
+                    : "_";
+            const targetArchDirName = targetArch === "x64"
+                ? "x64"
+                : targetArch === "arm64"
+                    ? "arm64"
+                    : "_";
+
+            return path.join(msvcPath, "bin", hostArchDirName, targetArchDirName, "lib.exe");
+        });
+
+        const [
+            hasCmake,
+            hasNinja,
+            hasLibExe,
+            hasLlvm
+        ] = await Promise.all([
+            asyncSome(potentialCmakePaths.map((cmakePath) => fs.pathExists(cmakePath))),
+            asyncSome(potentialNinjaPaths.map((ninjaPath) => fs.pathExists(ninjaPath))),
+            asyncSome(potentialLibExePaths.map((libExePath) => fs.pathExists(libExePath))),
+            asyncSome(potentialLlvmPaths.map((llvmPath) => isLlvmPathValid(llvmPath)))
+        ]);
+
+        return {
+            hasCmake,
+            hasNinja,
+            hasLlvm,
+            hasLibExe
+        };
+    } catch (err) {
+        return {
+            hasCmake: false,
+            hasNinja: false,
+            hasLlvm: false,
+            hasLibExe: false
+        };
+    }
+}
+
+async function isLlvmPathValid(llvmPath: string): Promise<boolean> {
+    if (!(await fs.pathExists(llvmPath)))
+        return false;
+
+    return await asyncEvery([
+        fs.pathExists(path.join(llvmPath, "clang.exe")),
+        fs.pathExists(path.join(llvmPath, "clang++.exe")),
+        fs.pathExists(path.join(llvmPath, "llvm-rc.exe"))
+    ]);
+}
diff --git a/src/utils/cmake.ts b/src/utils/cmake.ts
index 8195591e..e72ba175 100644
--- a/src/utils/cmake.ts
+++ b/src/utils/cmake.ts
@@ -9,7 +9,7 @@ import {
 } from "../config.js";
 import {logDistroInstallInstruction} from "../bindings/utils/logDistroInstallInstruction.js";
 import {getPlatform} from "../bindings/utils/getPlatform.js";
-import {getWindowsProgramFilesPaths} from "../bindings/utils/detectAvailableComputeLayers.js";
+import {getWindowsVisualStudioEditionPaths} from "../bindings/utils/detectBuildTools.js";
 import {spawnCommand} from "./spawnCommand.js";
 import withStatusLogs from "./withStatusLogs.js";
 import {withLockfile} from "./withLockfile.js";
@@ -111,41 +111,7 @@ async function findExistingCmake() {
     const platform = getPlatform();
 
     if (platform === "win") {
-        const programFilesPaths = await getWindowsProgramFilesPaths();
-        const potentialVisualStudioPaths = programFilesPaths
-            .map((programFilesPath) => `${programFilesPath}/Microsoft Visual Studio`);
-
-        const versionPaths = (await Promise.all(
-            potentialVisualStudioPaths.map(async (vsPath) => {
-                if (await fs.pathExists(vsPath)) {
-                    const versions = await fs.readdir(vsPath);
-                    return versions
-                        .sort((a, b) => {
-                            const aNumber = parseInt(a);
-                            const bNumber = parseInt(b);
-
-                            if (Number.isFinite(aNumber) && Number.isFinite(bNumber))
-                                return bNumber - aNumber;
-                            else if (Number.isFinite(aNumber))
-                                return -1;
-                            else if (Number.isFinite(bNumber))
-                                return 1;
-
-                            return 0;
-                        })
-                        .map((version) => path.join(vsPath, version));
-                }
-
-                return [];
-            })
-        )).flat();
-
-        const editionPaths = (await Promise.all(
-            versionPaths.map(async (versionPath) => {
-                const editions = await fs.readdir(versionPath);
-                return editions.map((edition) => path.join(versionPath, edition));
-            })
-        )).flat();
+        const editionPaths = await getWindowsVisualStudioEditionPaths();
 
         const potentialCmakePaths = editionPaths.map((editionPath) => (
             path.join(editionPath, "Common7", "IDE", "CommonExtensions", "Microsoft", "CMake", "CMake", "bin", "cmake.exe")

From 729de00990bf19d33759bff4238df910146f8301 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Mon, 30 Dec 2024 00:07:40 +0200
Subject: [PATCH 22/73] fix: switch from `"nextTick"` to `"nextCycle"` for the
 default batch dispatcher

for improved performance
---
 src/evaluator/LlamaContext/LlamaContext.ts | 17 +++++++++++++----
 src/evaluator/LlamaContext/types.ts        |  6 +++---
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts
index 3f2321d6..47ea4bfa 100644
--- a/src/evaluator/LlamaContext/LlamaContext.ts
+++ b/src/evaluator/LlamaContext/LlamaContext.ts
@@ -74,7 +74,7 @@ export class LlamaContext {
         flashAttention = _model.defaultContextFlashAttention,
         threads,
         batching: {
-            dispatchSchedule: batchingDispatchSchedule = "nextTick",
+            dispatchSchedule: batchingDispatchSchedule = "nextCycle",
             itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism"
         } = {},
         performanceTracking = false,
@@ -675,10 +675,19 @@ export class LlamaContext {
 
         if (this._queuedDecodeSequenceIds.size === this._totalSequences)
             dispatch();
-        if (dispatchSchedule === "nextTick")
-            setTimeout(dispatch, 0);
-        else
+        if (dispatchSchedule === "nextCycle") {
+            if (typeof setImmediate === "function")
+                setImmediate(dispatch);
+            else
+                setTimeout(dispatch, 0);
+        } else if (typeof dispatchSchedule === "function")
             dispatchSchedule(dispatch);
+        else {
+            if (typeof setImmediate === "function")
+                setImmediate(dispatch);
+            else
+                setTimeout(dispatch, 0);
+        }
     }
 
     /** @internal */
diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts
index aee06bd8..18db4525 100644
--- a/src/evaluator/LlamaContext/types.ts
+++ b/src/evaluator/LlamaContext/types.ts
@@ -215,12 +215,12 @@ export type LlamaContextSequenceRepeatPenalty = {
 export type BatchingOptions = {
     /**
      * The strategy used to dispatch items to be processed when there are items pending to be processed.
-     * - **`"nextTick"`** - dispatch the items on the next event loop tick.
+     * - **`"nextCycle"`** - dispatch the items on the next event loop cycle.
      * You can provide a custom function to define a custom dispatch schedule.
      *
-     * Defaults to `"nextTick"`.
+     * Defaults to `"nextCycle"`.
      */
-    dispatchSchedule?: "nextTick" | CustomBatchingDispatchSchedule,
+    dispatchSchedule?: "nextCycle" | CustomBatchingDispatchSchedule,
 
     /**
      * The strategy used to prioritize pending items to be processed.

From 93346f98dcea1d93571fd6babfbe5dfbc327dc36 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Mon, 30 Dec 2024 03:58:29 +0200
Subject: [PATCH 23/73] feat(minor): improve memory usage estimation

---
 src/gguf/insights/GgufInsights.ts | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/gguf/insights/GgufInsights.ts b/src/gguf/insights/GgufInsights.ts
index 633fe0df..babf2565 100644
--- a/src/gguf/insights/GgufInsights.ts
+++ b/src/gguf/insights/GgufInsights.ts
@@ -355,6 +355,7 @@ export class GgufInsights {
         gpu: GgufTensorInfo[]
     } {
         const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? [];
+        const architecture = this._ggufFileInfo.metadata?.general?.architecture;
 
         if (gpuLayers === 0) {
             return {
@@ -369,7 +370,15 @@ export class GgufInsights {
         const gpuTensors: GgufTensorInfo[] = [];
         const cpuTensors: GgufTensorInfo[] = [];
 
+        let tokenEmbedLayer: GgufTensorInfo | undefined;
+        let mainOutputLayer: GgufTensorInfo | undefined;
+
         for (const singleTensorInfo of tensorInfo) {
+            if (isMainOutputLayer(singleTensorInfo.name))
+                mainOutputLayer = singleTensorInfo;
+            else if (isTokenEmbedLayer(singleTensorInfo.name))
+                tokenEmbedLayer = singleTensorInfo;
+
             // in the implementation of `llm_load_tensors`, layers with `LLM_TENSOR_LAYER_INPUT` are always
             // loaded with `model.dev_input`, which is always set to the CPU
             if (isInputLayer(singleTensorInfo.name)) {
@@ -391,8 +400,6 @@ export class GgufInsights {
             const {layerNumber} = parseTensorName(singleTensorInfo.name);
 
             if (gpuLayers !== this.totalLayers) {
-                const architecture = this._ggufFileInfo.metadata?.general?.architecture;
-
                 if (architecture === GgufArchitectureType.qwen2 || architecture === GgufArchitectureType.gemma) {
                     if (layerNumber != null && layerNumber >= startGpuLayer)
                         gpuTensors.push(singleTensorInfo);
@@ -409,6 +416,9 @@ export class GgufInsights {
                 cpuTensors.push(singleTensorInfo);
         }
 
+        if (mainOutputLayer == null && tokenEmbedLayer != null && gpuLayers === this.totalLayers && !gpuTensors.includes(tokenEmbedLayer))
+            gpuTensors.push(tokenEmbedLayer);
+
         return {
             cpu: cpuTensors,
             gpu: gpuTensors
@@ -626,3 +636,15 @@ function isOutputLayer(layerName: string) {
 
     return false;
 }
+
+function isMainOutputLayer(layerName: string) {
+    const [firstPart] = layerName.split(".");
+
+    return firstPart === "output";
+}
+
+function isTokenEmbedLayer(layerName: string) {
+    const [firstPart] = layerName.split(".");
+
+    return firstPart === "token_embd";
+}

From a5365d2441250825fe1676f3d5f61554e52884bf Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Tue, 31 Dec 2024 02:52:17 +0200
Subject: [PATCH 24/73] feat(minor): improve memory usage estimation

---
 llama/addon/addon.cpp                         |  18 +
 llama/addon/globals/getGpuInfo.cpp            |  24 +-
 llama/addon/globals/getGpuInfo.h              |   4 +
 llama/addon/globals/getMemoryInfo.cpp         |  62 +++
 llama/addon/globals/getMemoryInfo.h           |   4 +
 src/bindings/AddonTypes.ts                    |   4 +
 src/bindings/Llama.ts                         |   6 +
 src/cli/commands/ChatCommand.ts               |  20 +-
 src/cli/commands/CompleteCommand.ts           |  20 +-
 src/cli/commands/InfillCommand.ts             |  20 +-
 .../commands/InspectEstimateCommand.ts        |  14 +-
 .../inspect/commands/InspectGgufCommand.ts    |  14 +
 .../inspect/commands/InspectGpuCommand.ts     |   4 +-
 .../inspect/commands/InspectMeasureCommand.ts | 361 ++++++++++++++----
 src/cli/utils/ConsoleTable.ts                 |  10 +-
 src/cli/utils/interactivelyAskForModel.ts     |  25 +-
 src/cli/utils/printCommonInfoLines.ts         |   9 +
 src/cli/utils/resolveCommandGgufPath.ts       |   7 +-
 src/evaluator/LlamaModel/LlamaModel.ts        |  14 +-
 src/gguf/insights/GgufInsights.ts             |  67 +++-
 .../GgufInsightsConfigurationResolver.ts      |  30 +-
 .../utils/resolveModelGpuLayersOption.ts      |  28 +-
 src/gguf/parser/GgufV2Parser.ts               |  44 ++-
 src/gguf/parser/parseGguf.ts                  |  22 +-
 src/gguf/readGgufFileInfo.ts                  |  15 +-
 src/gguf/types/GgufFileInfoTypes.ts           |   3 +-
 src/gguf/types/GgufMetadataTypes.ts           |   2 +-
 src/gguf/types/GgufTensorInfoTypes.ts         |  17 +-
 .../functionaryModelGpuLayersOptions.test.ts  | 197 +++++-----
 .../__snapshots__/ggufParser.test.ts.snap     |  32 ++
 .../functionary/gguf/ggufInsights.test.ts     |  35 +-
 .../llama3.1/tokenPredictor.test.ts           |  41 +-
 .../stableCodeModelGpuLayersOptions.test.ts   |  76 ++--
 .../ggufStandaloneParser.test.ts.snap         |  16 +
 34 files changed, 935 insertions(+), 330 deletions(-)
 create mode 100644 llama/addon/globals/getMemoryInfo.cpp
 create mode 100644 llama/addon/globals/getMemoryInfo.h

diff --git a/llama/addon/addon.cpp b/llama/addon/addon.cpp
index ed4b2fb9..f2a6669c 100644
--- a/llama/addon/addon.cpp
+++ b/llama/addon/addon.cpp
@@ -9,6 +9,7 @@
 #include "globals/addonProgress.h"
 #include "globals/getGpuInfo.h"
 #include "globals/getSwapInfo.h"
+#include "globals/getMemoryInfo.h"
 
 bool backendInitialized = false;
 bool backendDisposed = false;
@@ -25,6 +26,21 @@ Napi::Value addonGetSupportsMmap(const Napi::CallbackInfo& info) {
     return Napi::Boolean::New(info.Env(), llama_supports_mmap());
 }
 
+Napi::Value addonGetGpuSupportsMmap(const Napi::CallbackInfo& info) {
+    const auto llamaSupportsMmap = llama_supports_mmap();
+    const auto gpuDevice = getGpuDevice().first;
+
+    if (gpuDevice == nullptr) {
+        return Napi::Boolean::New(info.Env(), false);
+    }
+
+    ggml_backend_dev_props props;
+    ggml_backend_dev_get_props(gpuDevice, &props);
+
+    const bool gpuSupportsMmap = llama_supports_mmap() && props.caps.buffer_from_host_ptr;
+    return Napi::Boolean::New(info.Env(), gpuSupportsMmap);
+}
+
 Napi::Value addonGetSupportsMlock(const Napi::CallbackInfo& info) {
     return Napi::Boolean::New(info.Env(), llama_supports_mlock());
 }
@@ -210,6 +226,7 @@ Napi::Object registerCallback(Napi::Env env, Napi::Object exports) {
         Napi::PropertyDescriptor::Function("systemInfo", systemInfo),
         Napi::PropertyDescriptor::Function("getSupportsGpuOffloading", addonGetSupportsGpuOffloading),
         Napi::PropertyDescriptor::Function("getSupportsMmap", addonGetSupportsMmap),
+        Napi::PropertyDescriptor::Function("getGpuSupportsMmap", addonGetGpuSupportsMmap),
         Napi::PropertyDescriptor::Function("getSupportsMlock", addonGetSupportsMlock),
         Napi::PropertyDescriptor::Function("getMathCores", addonGetMathCores),
         Napi::PropertyDescriptor::Function("getBlockSizeForGgmlType", addonGetBlockSizeForGgmlType),
@@ -221,6 +238,7 @@ Napi::Object registerCallback(Napi::Env env, Napi::Object exports) {
         Napi::PropertyDescriptor::Function("getGpuDeviceInfo", getGpuDeviceInfo),
         Napi::PropertyDescriptor::Function("getGpuType", getGpuType),
         Napi::PropertyDescriptor::Function("getSwapInfo", getSwapInfo),
+        Napi::PropertyDescriptor::Function("getMemoryInfo", getMemoryInfo),
         Napi::PropertyDescriptor::Function("loadBackends", addonLoadBackends),
         Napi::PropertyDescriptor::Function("init", addonInit),
         Napi::PropertyDescriptor::Function("dispose", addonDispose),
diff --git a/llama/addon/globals/getGpuInfo.cpp b/llama/addon/globals/getGpuInfo.cpp
index cb15501f..61715c3c 100644
--- a/llama/addon/globals/getGpuInfo.cpp
+++ b/llama/addon/globals/getGpuInfo.cpp
@@ -89,17 +89,17 @@ Napi::Value getGpuDeviceInfo(const Napi::CallbackInfo& info) {
     return result;
 }
 
-Napi::Value getGpuType(const Napi::CallbackInfo& info) {
+std::pair<ggml_backend_dev_t, std::string> getGpuDevice() {
     for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
         ggml_backend_dev_t device = ggml_backend_dev_get(i);
         const auto deviceName = std::string(ggml_backend_dev_name(device));
         
         if (deviceName == "Metal") {
-            return Napi::String::New(info.Env(), "metal");
+            return std::pair<ggml_backend_dev_t, std::string>(device, "metal");
         } else if (std::string(deviceName).find("Vulkan") == 0) {
-            return Napi::String::New(info.Env(), "vulkan");
+            return std::pair<ggml_backend_dev_t, std::string>(device, "vulkan");
         } else if (std::string(deviceName).find("CUDA") == 0 || std::string(deviceName).find("ROCm") == 0 || std::string(deviceName).find("MUSA") == 0) {
-            return Napi::String::New(info.Env(), "cuda");
+            return std::pair<ggml_backend_dev_t, std::string>(device, "cuda");
         }
     }
 
@@ -108,9 +108,23 @@ Napi::Value getGpuType(const Napi::CallbackInfo& info) {
         const auto deviceName = std::string(ggml_backend_dev_name(device));
         
         if (deviceName == "CPU") {
-            return Napi::Boolean::New(info.Env(), false);
+            return std::pair<ggml_backend_dev_t, std::string>(device, "cpu");
         }
     }
 
+    return std::pair<ggml_backend_dev_t, std::string>(nullptr, "");
+}
+
+Napi::Value getGpuType(const Napi::CallbackInfo& info) {
+    const auto gpuDeviceRes = getGpuDevice();
+    const auto device = gpuDeviceRes.first;
+    const auto deviceType = gpuDeviceRes.second;
+    
+    if (deviceType == "cpu") {
+        return Napi::Boolean::New(info.Env(), false);
+    } else if (device != nullptr && deviceType != "") {
+        return Napi::String::New(info.Env(), deviceType);
+    }
+
     return info.Env().Undefined();
 }
diff --git a/llama/addon/globals/getGpuInfo.h b/llama/addon/globals/getGpuInfo.h
index c32de9d5..ef8509ed 100644
--- a/llama/addon/globals/getGpuInfo.h
+++ b/llama/addon/globals/getGpuInfo.h
@@ -1,6 +1,10 @@
 #pragma once
+#include <utility>
+#include <string>
 #include "napi.h"
+#include "llama.h"
 
 Napi::Value getGpuVramInfo(const Napi::CallbackInfo& info);
 Napi::Value getGpuDeviceInfo(const Napi::CallbackInfo& info);
+std::pair<ggml_backend_dev_t, std::string> getGpuDevice();
 Napi::Value getGpuType(const Napi::CallbackInfo& info);
\ No newline at end of file
diff --git a/llama/addon/globals/getMemoryInfo.cpp b/llama/addon/globals/getMemoryInfo.cpp
new file mode 100644
index 00000000..26472d3a
--- /dev/null
+++ b/llama/addon/globals/getMemoryInfo.cpp
@@ -0,0 +1,62 @@
+#include "getMemoryInfo.h"
+#include "addonLog.h"
+
+#ifdef __APPLE__
+#include <iostream>
+#include <mach/mach.h>
+#include <sys/sysctl.h>
+#elif __linux__
+#include <iostream>
+#include <sys/sysinfo.h>
+#elif _WIN32
+#include <iostream>
+#include <windows.h>
+#include <psapi.h>
+#endif
+
+
+Napi::Value getMemoryInfo(const Napi::CallbackInfo& info) {
+    uint64_t totalMemoryUsage = 0;
+
+#ifdef __APPLE__
+    struct mach_task_basic_info taskInfo;
+    mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT;
+    if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&taskInfo, &infoCount) == KERN_SUCCESS) {
+        totalMemoryUsage = taskInfo.virtual_size;
+    } else {
+        addonLlamaCppLogCallback(GGML_LOG_LEVEL_ERROR, std::string("Failed to get memory usage info").c_str(), nullptr);
+    }
+#elif __linux__
+    std::ifstream procStatus("/proc/self/status");
+    std::string line;
+    bool foundMemoryUsage = false;
+    while (std::getline(procStatus, line)) {
+        if (line.rfind("VmSize:", 0) == 0) { // Resident Set Size (current memory usage)
+            std::istringstream iss(line);
+            std::string key, unit;
+            size_t value;
+            if (iss >> key >> value >> unit) {
+                totalMemoryUsage = value * 1024; // Convert from kB to bytes
+                foundMemoryUsage = true;
+            }
+            break;
+        }
+    }
+
+    if (!foundMemoryUsage) {
+        addonLlamaCppLogCallback(GGML_LOG_LEVEL_ERROR, std::string("Failed to get memory usage info").c_str(), nullptr);
+    }
+#elif _WIN32
+    PROCESS_MEMORY_COUNTERS memCounters;
+    
+    if (GetProcessMemoryInfo(GetCurrentProcess(), &memCounters, sizeof(memCounters))) {
+        totalMemoryUsage = memCounters.PrivateUsage;
+    } else {
+        addonLlamaCppLogCallback(GGML_LOG_LEVEL_ERROR, std::string("Failed to get memory usage info").c_str(), nullptr);
+    }
+#endif
+    
+    Napi::Object obj = Napi::Object::New(info.Env());
+    obj.Set("total", Napi::Number::New(info.Env(), totalMemoryUsage));
+    return obj;
+}
diff --git a/llama/addon/globals/getMemoryInfo.h b/llama/addon/globals/getMemoryInfo.h
new file mode 100644
index 00000000..fbcf2de4
--- /dev/null
+++ b/llama/addon/globals/getMemoryInfo.h
@@ -0,0 +1,4 @@
+#pragma once
+#include "napi.h"
+
+Napi::Value getMemoryInfo(const Napi::CallbackInfo& info);
diff --git a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts
index c5d1459d..d1ec4a12 100644
--- a/src/bindings/AddonTypes.ts
+++ b/src/bindings/AddonTypes.ts
@@ -48,6 +48,7 @@ export type BindingModule = {
     systemInfo(): string,
     getSupportsGpuOffloading(): boolean,
     getSupportsMmap(): boolean,
+    getGpuSupportsMmap(): boolean,
     getSupportsMlock(): boolean,
     getMathCores(): number,
     getBlockSizeForGgmlType(ggmlType: number): number | undefined,
@@ -76,6 +77,9 @@ export type BindingModule = {
         maxSize: number,
         free: number
     },
+    getMemoryInfo(): {
+        total: number
+    },
     init(): Promise<void>,
     loadBackends(forceLoadLibrariesSearchPath?: string): void,
     dispose(): Promise<void>
diff --git a/src/bindings/Llama.ts b/src/bindings/Llama.ts
index 8ae6644b..dfeb5578 100644
--- a/src/bindings/Llama.ts
+++ b/src/bindings/Llama.ts
@@ -46,6 +46,7 @@ export class Llama {
     /** @internal */ private readonly _cmakeOptions: Readonly<Record<string, string>>;
     /** @internal */ private readonly _supportsGpuOffloading: boolean;
     /** @internal */ private readonly _supportsMmap: boolean;
+    /** @internal */ private readonly _gpuSupportsMmap: boolean;
     /** @internal */ private readonly _supportsMlock: boolean;
     /** @internal */ private readonly _mathCores: number;
     /** @internal */ private readonly _llamaCppRelease: {
@@ -110,6 +111,7 @@ export class Llama {
         this._gpu = bindings.getGpuType() ?? false;
         this._supportsGpuOffloading = bindings.getSupportsGpuOffloading();
         this._supportsMmap = bindings.getSupportsMmap();
+        this._gpuSupportsMmap = bindings.getGpuSupportsMmap();
         this._supportsMlock = bindings.getSupportsMlock();
         this._mathCores = bindings.getMathCores();
         this._consts = bindings.getConsts();
@@ -175,6 +177,10 @@ export class Llama {
         return this._supportsMmap;
     }
 
+    public get gpuSupportsMmap() {
+        return this._gpuSupportsMmap;
+    }
+
     public get supportsMlock() {
         return this._supportsMlock;
     }
diff --git a/src/cli/commands/ChatCommand.ts b/src/cli/commands/ChatCommand.ts
index 621dbb68..401f7711 100644
--- a/src/cli/commands/ChatCommand.ts
+++ b/src/cli/commands/ChatCommand.ts
@@ -68,6 +68,7 @@ type ChatCommand = {
     debug: boolean,
     meter: boolean,
     timing: boolean,
+    noMmap: boolean,
     printTimings: boolean
 };
 
@@ -293,6 +294,11 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
                 default: false,
                 description: "Print how how long it took to generate each response"
             })
+            .option("noMmap", {
+                type: "boolean",
+                default: false,
+                description: "Disable mmap (memory-mapped file) usage"
+            })
             .option("printTimings", {
                 alias: "pt",
                 type: "boolean",
@@ -306,7 +312,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
         noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK,
         topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
         repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory,
-        environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, printTimings
+        environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
     }) {
         try {
             await RunChat({
@@ -314,7 +320,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
                 batchSize, flashAttention, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed,
                 gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
                 maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter,
-                timing, printTimings
+                timing, noMmap, printTimings
             });
         } catch (err) {
             await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -330,7 +336,7 @@ async function RunChat({
     contextSize, batchSize, flashAttention, noTrimWhitespace, grammar: grammarArg, jsonSchemaGrammarFile: jsonSchemaGrammarFilePath,
     threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine,
     repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel,
-    tokenPredictionModelContextSize, debug, meter, timing, printTimings
+    tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
 }: ChatCommand) {
     if (contextSize === -1) contextSize = undefined;
     if (gpuLayers === -1) gpuLayers = undefined;
@@ -353,13 +359,16 @@ async function RunChat({
             logLevel: llamaLogLevel
         });
     const logBatchSize = batchSize != null;
+    const useMmap = !noMmap && llama.supportsMmap;
 
     const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
-        flashAttention
+        flashAttention,
+        useMmap
     });
     const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "")
         ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, {
             flashAttention,
+            useMmap,
             consoleTitle: "Draft model file"
         })
         : undefined;
@@ -404,6 +413,7 @@ async function RunChat({
                         ? {fitContext: {contextSize}}
                         : undefined,
                 defaultContextFlashAttention: flashAttention,
+                useMmap,
                 ignoreMemorySafetyChecks: gpuLayers != null,
                 onLoadProgress(loadProgress: number) {
                     progressUpdater.setProgress(loadProgress);
@@ -436,6 +446,7 @@ async function RunChat({
                 return await llama.loadModel({
                     modelPath: resolvedDraftModelPath,
                     defaultContextFlashAttention: flashAttention,
+                    useMmap,
                     onLoadProgress(loadProgress: number) {
                         progressUpdater.setProgress(loadProgress);
                     },
@@ -541,6 +552,7 @@ async function RunChat({
     const padTitle = await printCommonInfoLines({
         context,
         draftContext,
+        useMmap,
         printBos: true,
         printEos: true,
         logBatchSize,
diff --git a/src/cli/commands/CompleteCommand.ts b/src/cli/commands/CompleteCommand.ts
index 1ca9b9e6..f8c7790e 100644
--- a/src/cli/commands/CompleteCommand.ts
+++ b/src/cli/commands/CompleteCommand.ts
@@ -50,6 +50,7 @@ type CompleteCommand = {
     debug: boolean,
     meter: boolean,
     timing: boolean,
+    noMmap: boolean,
     printTimings: boolean
 };
 
@@ -220,6 +221,11 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
                 default: false,
                 description: "Print how how long it took to generate each response"
             })
+            .option("noMmap", {
+                type: "boolean",
+                default: false,
+                description: "Disable mmap (memory-mapped file) usage"
+            })
             .option("printTimings", {
                 alias: "pt",
                 type: "boolean",
@@ -232,14 +238,14 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
         flashAttention, threads, temperature, minP, topK,
         topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
         repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize,
-        debug, meter, timing, printTimings
+        debug, meter, timing, noMmap, printTimings
     }) {
         try {
             await RunCompletion({
                 modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention,
                 threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty,
                 repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
-                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, printTimings
+                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
             });
         } catch (err) {
             await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -254,7 +260,7 @@ async function RunCompletion({
     modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention,
     threads, temperature, minP, topK, topP, seed, gpuLayers,
     lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
-    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, meter, timing, printTimings
+    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, meter, timing, noMmap, printTimings
 }: CompleteCommand) {
     if (contextSize === -1) contextSize = undefined;
     if (gpuLayers === -1) gpuLayers = undefined;
@@ -276,13 +282,16 @@ async function RunCompletion({
             logLevel: llamaLogLevel
         });
     const logBatchSize = batchSize != null;
+    const useMmap = !noMmap && llama.supportsMmap;
 
     const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
-        flashAttention
+        flashAttention,
+        useMmap
     });
     const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "")
         ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, {
             flashAttention,
+            useMmap,
             consoleTitle: "Draft model file"
         })
         : undefined;
@@ -320,6 +329,7 @@ async function RunCompletion({
                         ? {fitContext: {contextSize}}
                         : undefined,
                 defaultContextFlashAttention: flashAttention,
+                useMmap,
                 ignoreMemorySafetyChecks: gpuLayers != null,
                 onLoadProgress(loadProgress: number) {
                     progressUpdater.setProgress(loadProgress);
@@ -352,6 +362,7 @@ async function RunCompletion({
                 return await llama.loadModel({
                     modelPath: resolvedDraftModelPath,
                     defaultContextFlashAttention: flashAttention,
+                    useMmap,
                     onLoadProgress(loadProgress: number) {
                         progressUpdater.setProgress(loadProgress);
                     },
@@ -429,6 +440,7 @@ async function RunCompletion({
     const padTitle = await printCommonInfoLines({
         context,
         draftContext,
+        useMmap,
         minTitleLength: "Complete".length + 1,
         logBatchSize,
         tokenMeterEnabled: meter
diff --git a/src/cli/commands/InfillCommand.ts b/src/cli/commands/InfillCommand.ts
index bd87a1c9..7a4a536b 100644
--- a/src/cli/commands/InfillCommand.ts
+++ b/src/cli/commands/InfillCommand.ts
@@ -52,6 +52,7 @@ type InfillCommand = {
     debug: boolean,
     meter: boolean,
     timing: boolean,
+    noMmap: boolean,
     printTimings: boolean
 };
 
@@ -230,6 +231,11 @@ export const InfillCommand: CommandModule<object, InfillCommand> = {
                 default: false,
                 description: "Print how how long it took to generate each response"
             })
+            .option("noMmap", {
+                type: "boolean",
+                default: false,
+                description: "Disable mmap (memory-mapped file) usage"
+            })
             .option("printTimings", {
                 alias: "pt",
                 type: "boolean",
@@ -242,14 +248,14 @@ export const InfillCommand: CommandModule<object, InfillCommand> = {
         flashAttention, threads, temperature, minP, topK,
         topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
         repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize,
-        debug, meter, timing, printTimings
+        debug, meter, timing, noMmap, printTimings
     }) {
         try {
             await RunInfill({
                 modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention,
                 threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty,
                 repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
-                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, printTimings
+                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
             });
         } catch (err) {
             await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -264,7 +270,7 @@ async function RunInfill({
     modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention,
     threads, temperature, minP, topK, topP, seed, gpuLayers,
     lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
-    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, meter, timing, printTimings
+    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, meter, timing, noMmap, printTimings
 }: InfillCommand) {
     if (contextSize === -1) contextSize = undefined;
     if (gpuLayers === -1) gpuLayers = undefined;
@@ -286,13 +292,16 @@ async function RunInfill({
             logLevel: llamaLogLevel
         });
     const logBatchSize = batchSize != null;
+    const useMmap = !noMmap && llama.supportsMmap;
 
     const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
-        flashAttention
+        flashAttention,
+        useMmap
     });
     const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "")
         ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, {
             flashAttention,
+            useMmap,
             consoleTitle: "Draft model file"
         })
         : undefined;
@@ -344,6 +353,7 @@ async function RunInfill({
                         ? {fitContext: {contextSize}}
                         : undefined,
                 defaultContextFlashAttention: flashAttention,
+                useMmap,
                 ignoreMemorySafetyChecks: gpuLayers != null,
                 onLoadProgress(loadProgress: number) {
                     progressUpdater.setProgress(loadProgress);
@@ -376,6 +386,7 @@ async function RunInfill({
                 return await llama.loadModel({
                     modelPath: resolvedDraftModelPath,
                     defaultContextFlashAttention: flashAttention,
+                    useMmap,
                     onLoadProgress(loadProgress: number) {
                         progressUpdater.setProgress(loadProgress);
                     },
@@ -453,6 +464,7 @@ async function RunInfill({
     const padTitle = await printCommonInfoLines({
         context,
         draftContext,
+        useMmap,
         logBatchSize,
         tokenMeterEnabled: meter
     });
diff --git a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts
index 85156f8e..7c3f4f5b 100644
--- a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts
+++ b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts
@@ -27,7 +27,8 @@ type InspectEstimateCommand = {
     gpu?: BuildGpu | "auto",
     gpuLayers?: number | "max",
     contextSize?: number | "train",
-    embedding?: boolean
+    embedding?: boolean,
+    noMmap?: boolean
 };
 
 export const InspectEstimateCommand: CommandModule<object, InspectEstimateCommand> = {
@@ -105,10 +106,15 @@ export const InspectEstimateCommand: CommandModule<object, InspectEstimateComman
                 description: "Whether to estimate for creating an embedding context",
                 default: false,
                 group: "Optional:"
+            })
+            .option("noMmap", {
+                type: "boolean",
+                default: false,
+                description: "Disable mmap (memory-mapped file) usage"
             });
     },
     async handler({
-        modelPath: ggufPath, header: headerArg, gpu, gpuLayers, contextSize: contextSizeArg, embedding
+        modelPath: ggufPath, header: headerArg, gpu, gpuLayers, contextSize: contextSizeArg, embedding, noMmap
     }: InspectEstimateCommand) {
         if (gpuLayers === -1) gpuLayers = undefined;
         if (gpuLayers === -2) gpuLayers = "max";
@@ -131,6 +137,7 @@ export const InspectEstimateCommand: CommandModule<object, InspectEstimateComman
                 logLevel: LlamaLogLevel.error
             });
 
+        const useMmap = !noMmap && llama.supportsMmap;
         printModelDestination(resolvedModelDestination);
 
         if (embedding)
@@ -159,7 +166,8 @@ export const InspectEstimateCommand: CommandModule<object, InspectEstimateComman
                 flashAttention,
                 targetContextSize: contextSize,
                 targetGpuLayers: gpuLayers,
-                embeddingContext: embedding
+                embeddingContext: embedding,
+                useMmap
             });
         }
 
diff --git a/src/cli/commands/inspect/commands/InspectGgufCommand.ts b/src/cli/commands/inspect/commands/InspectGgufCommand.ts
index 49afe0ed..9be14cdd 100644
--- a/src/cli/commands/inspect/commands/InspectGgufCommand.ts
+++ b/src/cli/commands/inspect/commands/InspectGgufCommand.ts
@@ -14,6 +14,7 @@ import withOra from "../../../../utils/withOra.js";
 import {resolveModelDestination} from "../../../../utils/resolveModelDestination.js";
 import {printModelDestination} from "../../../utils/printModelDestination.js";
 import {getGgufMetadataKeyValue} from "../../../../gguf/utils/getGgufMetadataKeyValue.js";
+import {GgufTensorInfo} from "../../../../gguf/types/GgufTensorInfoTypes.js";
 
 type InspectGgufCommand = {
     modelPath: string,
@@ -121,6 +122,8 @@ export const InspectGgufCommand: CommandModule<object, InspectGgufCommand> = {
                 });
             });
 
+        removeAdditionalTensorInfoFields(parsedMetadata.fullTensorInfo);
+
         const fileTypeName = getGgufFileTypeName(parsedMetadata.metadata.general?.file_type);
 
         if (plainJson || outputToJsonFile != null) {
@@ -211,3 +214,14 @@ export const InspectGgufCommand: CommandModule<object, InspectGgufCommand> = {
         }
     }
 };
+
+// these fields are added by the parser for ease of use and are not found in the gguf file itself
+function removeAdditionalTensorInfoFields(tensorInfo?: GgufTensorInfo[]) {
+    if (tensorInfo == null)
+        return;
+
+    for (const tensor of tensorInfo) {
+        delete (tensor as {fileOffset?: GgufTensorInfo["fileOffset"]}).fileOffset;
+        delete (tensor as {filePart?: GgufTensorInfo["filePart"]}).filePart;
+    }
+}
diff --git a/src/cli/commands/inspect/commands/InspectGpuCommand.ts b/src/cli/commands/inspect/commands/InspectGpuCommand.ts
index 4e8e47ac..a8edc254 100644
--- a/src/cli/commands/inspect/commands/InspectGpuCommand.ts
+++ b/src/cli/commands/inspect/commands/InspectGpuCommand.ts
@@ -144,8 +144,10 @@ export const InspectGpuCommand: CommandModule<object, InspectGpuCommand> = {
         console.info();
         await logRamUsage(lastLlama?.cpuMathCores);
 
-        if (lastLlama != null)
+        if (lastLlama != null) {
             await logSwapUsage(lastLlama);
+            console.info(`${chalk.yellow("mmap:")} ${lastLlama.supportsMmap ? "supported" : "unsupported"}`);
+        }
     }
 };
 
diff --git a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts
index fea3c525..9491f116 100644
--- a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts
+++ b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts
@@ -2,6 +2,7 @@ import path from "path";
 import process from "process";
 import {fileURLToPath} from "url";
 import {fork} from "node:child_process";
+import os from "os";
 import {CommandModule} from "yargs";
 import chalk from "chalk";
 import bytes from "bytes";
@@ -19,6 +20,7 @@ import {getPrettyBuildGpuName} from "../../../../bindings/consts.js";
 import {getReadablePath} from "../../../utils/getReadablePath.js";
 import {withCliCommandDescriptionDocsUrl} from "../../../utils/withCliCommandDescriptionDocsUrl.js";
 import {documentationPageUrls} from "../../../../config.js";
+import {Llama} from "../../../../bindings/Llama.js";
 
 type InspectMeasureCommand = {
     modelPath?: string,
@@ -30,6 +32,8 @@ type InspectMeasureCommand = {
     maxContextSize?: number,
     flashAttention?: boolean,
     measures: number,
+    memory: "vram" | "ram" | "all",
+    noMmap: boolean,
     printHeaderBeforeEachLayer?: boolean,
     evaluateText?: string,
     repeatEvaluateText?: number
@@ -106,6 +110,17 @@ export const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>
                 default: 10,
                 description: "Number of context size measures to take for each gpu layers count"
             })
+            .option("memory", {
+                type: "string",
+                choices: ["vram", "ram", "all"] as const,
+                default: "vram" as const,
+                description: "Type of memory to measure"
+            })
+            .option("noMmap", {
+                type: "boolean",
+                default: false,
+                description: "Disable mmap (memory-mapped file) usage"
+            })
             .option("printHeaderBeforeEachLayer", {
                 alias: "ph",
                 type: "boolean",
@@ -126,12 +141,13 @@ export const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>
     },
     async handler({
         modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, measures = 10,
-        printHeaderBeforeEachLayer = true, evaluateText, repeatEvaluateText
+        memory: measureMemoryType, noMmap, printHeaderBeforeEachLayer = true, evaluateText, repeatEvaluateText
     }: InspectMeasureCommand) {
         if (maxLayers === -1) maxLayers = undefined;
         if (maxContextSize === -1) maxContextSize = undefined;
         if (minLayers < 1) minLayers = 1;
 
+        const exitAfterEachMeasurement = measureMemoryType === "ram" || measureMemoryType === "all";
         const headers = resolveHeaderFlag(headerArg);
 
         // ensure a llama build is available
@@ -144,10 +160,23 @@ export const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>
                 logLevel: LlamaLogLevel.error
             });
 
-        const resolvedGgufPath = await resolveCommandGgufPath(ggufPath, llama, headers);
+        const useMmap = !noMmap && llama.supportsMmap;
+        const resolvedGgufPath = await resolveCommandGgufPath(ggufPath, llama, headers, {
+            flashAttention, useMmap
+        });
 
         console.info(`${chalk.yellow("File:")} ${getReadablePath(resolvedGgufPath)}`);
         console.info(`${chalk.yellow("GPU:")} ${getPrettyBuildGpuName(llama.gpu)}${gpu == null ? chalk.gray(" (last build)") : ""}`);
+        console.info(chalk.yellow("mmap:") + " " + (
+            !llama.supportsMmap
+                ? "unsupported"
+                : useMmap
+                    ? "enabled"
+                    : "disabled"
+        ));
+        if (measureMemoryType === "ram" || measureMemoryType === "all")
+            console.warn(chalk.yellow("RAM measurements are greatly inaccurate due to OS optimizations that prevent released memory from being immediately available"));
+
         console.info();
 
         const ggufMetadata = await readGgufFileInfo(resolvedGgufPath, {
@@ -155,10 +184,13 @@ export const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>
         });
         const ggufInsights = await GgufInsights.from(ggufMetadata, llama);
         const totalVram = (await llama.getVramState()).total;
+        const totalRam = os.totalmem();
 
         let lastGpuLayers = maxLayers ?? ggufInsights.totalLayers;
         let previousContextSizeCheck: undefined | number = undefined;
 
+        const measureTable = getMeasureTable(measureMemoryType);
+
         measureTable.logHeader({drawRowSeparator: !printHeaderBeforeEachLayer});
 
         while (lastGpuLayers >= (minLayers ?? 0)) {
@@ -174,6 +206,7 @@ export const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>
 
             const done = await measureModel({
                 modelPath: resolvedGgufPath,
+                useMmap,
                 gpu: gpu == null
                     ? undefined
                     : llama.gpu,
@@ -187,6 +220,7 @@ export const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>
                 evaluateText: evaluateText == null
                     ? undefined
                     : evaluateText.repeat(repeatEvaluateText ?? 1),
+                exitAfterMeasurement: exitAfterEachMeasurement,
                 onInfo({gpuLayers, result}) {
                     if (lastGpuLayers !== gpuLayers) {
                         lastGpuLayers = gpuLayers;
@@ -231,19 +265,31 @@ export const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>
                         previousContextSizeCheck = result.contextSize;
                         hadSuccessInThisProcess = true;
 
-                        const modelVramEstimation = ggufInsights.estimateModelResourceRequirements({gpuLayers: lastGpuLayers}).gpuVram;
+                        const modelResourceEstimation = ggufInsights.estimateModelResourceRequirements({
+                            gpuLayers: lastGpuLayers,
+                            useMmap
+                        });
+                        const modelVramEstimation = modelResourceEstimation.gpuVram;
                         const modelVramEstimationDiffBytes = (modelVramEstimation < result.modelVramUsage ? "-" : "") +
                             bytes(Math.abs(result.modelVramUsage - modelVramEstimation));
                         const modelVramEstimationDiffText = modelVramEstimationDiffBytes.padEnd(9, " ") + " " +
                             padStartAnsi("(" + renderDiffPercentageWithColors(((modelVramEstimation / result.modelVramUsage) - 1) * 100) + ")", 9);
 
-                        const contextVramEstimation = previousContextSizeCheck == null
+                        const modelRamEstimation = modelResourceEstimation.cpuRam;
+                        const modelRamEstimationDiffBytes = (modelRamEstimation < result.modelRamUsage ? "-" : "") +
+                            bytes(Math.abs(result.modelRamUsage - modelRamEstimation));
+                        const modelRamEstimationDiffText = modelRamEstimationDiffBytes.padEnd(9, " ") + " " +
+                            padStartAnsi("(" + renderDiffPercentageWithColors(((modelRamEstimation / result.modelRamUsage) - 1) * 100) + ")", 9);
+
+                        const contextResourceEstimation = previousContextSizeCheck == null
                             ? undefined
                             : ggufInsights.estimateContextResourceRequirements({
                                 contextSize: previousContextSizeCheck,
                                 modelGpuLayers: lastGpuLayers,
                                 flashAttention
-                            }).gpuVram;
+                            });
+
+                        const contextVramEstimation = contextResourceEstimation?.gpuVram;
                         const contextVramEstimationDiffBytes = (result.contextVramUsage == null || contextVramEstimation == null)
                             ? undefined
                             : (
@@ -259,6 +305,22 @@ export const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>
                                 padStartAnsi("(" + renderDiffPercentageWithColors(((contextVramEstimation / result.contextVramUsage) - 1) * 100) + ")", 9)
                             );
 
+                        const contextRamEstimation = contextResourceEstimation?.cpuRam;
+                        const contextRamEstimationDiffBytes = (result.contextRamUsage == null || contextRamEstimation == null)
+                            ? undefined
+                            : (
+                                (contextRamEstimation < result.contextRamUsage ? "-" : "") +
+                                bytes(Math.abs(result.contextRamUsage - contextRamEstimation))
+                            );
+                        const contextRamEstimationDiffText = (
+                            contextRamEstimation == null || contextRamEstimationDiffBytes == null || result.contextRamUsage == null
+                        )
+                            ? undefined
+                            : (
+                                contextRamEstimationDiffBytes.padEnd(9, " ") + " " +
+                                padStartAnsi("(" + renderDiffPercentageWithColors(((contextRamEstimation / result.contextRamUsage) - 1) * 100) + ")", 9)
+                            );
+
                         measureTable.logLine({
                             newProcess: getNewProccessValue(),
                             type: previousContextSizeCheck == null
@@ -271,7 +333,11 @@ export const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>
 
                             estimatedModelVram: bytes(modelVramEstimation),
                             actualModelVram: bytes(result.modelVramUsage),
-                            modelEstimationDiff: modelVramEstimationDiffText,
+                            modelVramEstimationDiff: modelVramEstimationDiffText,
+
+                            estimatedModelRam: bytes(modelRamEstimation),
+                            actualModelRam: bytes(result.modelRamUsage),
+                            modelRamEstimationDiff: modelRamEstimationDiffText,
 
                             estimatedContextVram: contextVramEstimation == null
                                 ? undefined
@@ -279,9 +345,19 @@ export const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>
                             actualContextVram: result.contextVramUsage == null
                                 ? undefined
                                 : bytes(result.contextVramUsage),
-                            contextEstimationDiff: contextVramEstimationDiffText,
+                            contextVramEstimationDiff: contextVramEstimationDiffText,
                             totalVramUsage: ((result.totalVramUsage / totalVram) * 100).toFixed(2).padStart(5, "0") + "% " +
-                                chalk.gray("(" + bytes(result.totalVramUsage) + "/" + bytes(totalVram) + ")")
+                                chalk.gray("(" + bytes(result.totalVramUsage) + "/" + bytes(totalVram) + ")"),
+
+                            estimatedContextRam: contextRamEstimation == null
+                                ? undefined
+                                : bytes(contextRamEstimation),
+                            actualContextRam: result.contextRamUsage == null
+                                ? undefined
+                                : bytes(result.contextRamUsage),
+                            contextRamEstimationDiff: contextRamEstimationDiffText,
+                            totalRamUsage: ((result.totalRamUsage / totalRam) * 100).toFixed(2).padStart(5, "0") + "% " +
+                                chalk.gray("(" + bytes(result.totalRamUsage) + "/" + bytes(totalRam) + ")")
                         });
                     }
                 }
@@ -293,55 +369,100 @@ export const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>
     }
 };
 
-const measureTable = new ConsoleTable([{
-    key: "newProcess",
-    title: " ",
-    width: 1
-}, {
-    key: "type",
-    title: "Type",
-    width: Math.max("Type".length, "Model".length, "Context".length),
-    canSpanOverEmptyColumns: true
-}, {
-    key: "gpuLayers",
-    title: "Layers",
-    width: "Layers".length,
-    canSpanOverEmptyColumns: true
-}, {
-    key: "contextSize",
-    title: "Context size",
-    width: "Context size".length,
-    canSpanOverEmptyColumns: true
-}, {
-    key: "estimatedModelVram",
-    title: "Estimated model VRAM",
-    width: "Estimated model VRAM".length,
-    canSpanOverEmptyColumns: true
-}, {
-    key: "actualModelVram",
-    title: "Model VRAM",
-    width: "Model VRAM".length
-}, {
-    key: "modelEstimationDiff",
-    title: "Diff",
-    width: Math.max("Diff".length, 9 + 1 + 9)
-}, {
-    key: "estimatedContextVram",
-    title: "Estimated context VRAM",
-    width: "Estimated context VRAM".length
-}, {
-    key: "actualContextVram",
-    title: "Context VRAM",
-    width: "Context VRAM".length
-}, {
-    key: "contextEstimationDiff",
-    title: "Diff",
-    width: Math.max("Diff".length, 9 + 1 + 9)
-}, {
-    key: "totalVramUsage",
-    title: "VRAM usage",
-    width: Math.max("VRAM usage".length, 8 + 1 + 8 + 1 + 8)
-}] as const satisfies readonly ConsoleTableColumn[]);
+function getMeasureTable(memoryType: InspectMeasureCommand["memory"]) {
+    return new ConsoleTable([{
+        key: "newProcess",
+        title: " ",
+        width: 1
+    }, {
+        key: "type",
+        title: "Type",
+        width: Math.max("Type".length, "Model".length, "Context".length),
+        canSpanOverEmptyColumns: true
+    }, {
+        key: "gpuLayers",
+        title: "Layers",
+        width: "Layers".length,
+        canSpanOverEmptyColumns: true
+    }, {
+        key: "contextSize",
+        title: "Context size",
+        width: "Context size".length,
+        canSpanOverEmptyColumns: true
+    }, {
+        key: "estimatedModelVram",
+        visible: memoryType === "vram" || memoryType === "all",
+        title: "Estimated model VRAM",
+        width: "Estimated model VRAM".length,
+        canSpanOverEmptyColumns: true
+    }, {
+        key: "actualModelVram",
+        visible: memoryType === "vram" || memoryType === "all",
+        title: "Model VRAM",
+        width: "Model VRAM".length
+    }, {
+        key: "modelVramEstimationDiff",
+        visible: memoryType === "vram" || memoryType === "all",
+        title: "Diff",
+        width: Math.max("Diff".length, 9 + 1 + 9)
+    }, {
+        key: "estimatedModelRam",
+        visible: memoryType === "ram" || memoryType === "all",
+        title: "Estimated model RAM",
+        width: "Estimated model RAM".length,
+        canSpanOverEmptyColumns: true
+    }, {
+        key: "actualModelRam",
+        visible: memoryType === "ram" || memoryType === "all",
+        title: "Model RAM",
+        width: "Model RAM".length
+    }, {
+        key: "modelRamEstimationDiff",
+        visible: memoryType === "ram" || memoryType === "all",
+        title: "Diff",
+        width: Math.max("Diff".length, 9 + 1 + 9)
+    }, {
+        key: "estimatedContextVram",
+        visible: memoryType === "vram" || memoryType === "all",
+        title: "Estimated context VRAM",
+        width: "Estimated context VRAM".length
+    }, {
+        key: "actualContextVram",
+        visible: memoryType === "vram" || memoryType === "all",
+        title: "Context VRAM",
+        width: "Context VRAM".length
+    }, {
+        key: "contextVramEstimationDiff",
+        visible: memoryType === "vram" || memoryType === "all",
+        title: "Diff",
+        width: Math.max("Diff".length, 9 + 1 + 9)
+    }, {
+        key: "totalVramUsage",
+        visible: memoryType === "vram" || memoryType === "all",
+        title: "VRAM usage",
+        width: Math.max("VRAM usage".length, 8 + 1 + 8 + 1 + 8)
+    }, {
+        key: "estimatedContextRam",
+        visible: memoryType === "ram" || memoryType === "all",
+        title: "Estimated context RAM",
+        width: "Estimated context RAM".length
+    }, {
+        key: "actualContextRam",
+        visible: memoryType === "ram" || memoryType === "all",
+        title: "Context RAM",
+        width: "Context RAM".length
+    }, {
+        key: "contextRamEstimationDiff",
+        visible: memoryType === "ram" || memoryType === "all",
+        title: "Diff",
+        width: Math.max("Diff".length, 9 + 1 + 9)
+    }, {
+        key: "totalRamUsage",
+        visible: memoryType === "ram" || memoryType === "all",
+        title: "RAM usage",
+        width: Math.max("RAM usage".length, 8 + 1 + 8 + 1 + 8)
+    }] as const satisfies readonly ConsoleTableColumn[]);
+}
 
 function renderDiffPercentageWithColors(percentage: number, {
     greenBright = 2,
@@ -374,10 +495,11 @@ const detectedFileName = path.basename(__filename);
 const expectedFileName = "InspectMeasureCommand";
 
 async function measureModel({
-    modelPath, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, flashAttention, evaluateText,
-    onInfo
+    modelPath, useMmap, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, flashAttention,
+    evaluateText, exitAfterMeasurement = false, onInfo
 }: {
     modelPath: string,
+    useMmap?: boolean,
     gpu?: BuildGpu | "auto",
     tests: number,
     initialMaxContextSize?: number,
@@ -387,6 +509,7 @@ async function measureModel({
     minGpuLayers?: number,
     flashAttention?: boolean,
     evaluateText?: string,
+    exitAfterMeasurement?: boolean,
     onInfo(data: {
         gpuLayers: number,
         result: {
@@ -399,10 +522,13 @@ async function measureModel({
         } | {
             type: "success",
             modelVramUsage: number,
+            modelRamUsage: number,
             contextSize?: number,
             contextVramUsage?: number,
+            contextRamUsage?: number,
             contextStateSize?: number,
-            totalVramUsage: number
+            totalVramUsage: number,
+            totalRamUsage: number
         }
     }): void
 }) {
@@ -429,6 +555,7 @@ async function measureModel({
         }
     });
     let isPlannedExit = false;
+    let isDone = false;
     let forkSucceeded = false;
     let timeoutHandle: ReturnType<typeof setTimeout> | null = null;
     const processCreationTimeout = 1000 * 60 * 5;
@@ -464,12 +591,12 @@ async function measureModel({
                 }
             }, processCreationTimeout);
         }),
-        new Promise<boolean>((resolve, reject) => {
+        new Promise<boolean | undefined>((resolve, reject) => {
             function done() {
                 if (!forkSucceeded)
                     reject(new Error(`Measuring a model failed to run a sub-process via file "${__filename}"`));
-                else
-                    resolve(isPlannedExit);
+                else if (isPlannedExit)
+                    resolve(isPlannedExit && isDone);
 
                 cleanup();
             }
@@ -480,6 +607,7 @@ async function measureModel({
                     subProcess.send({
                         type: "start",
                         modelPath,
+                        useMmap,
                         tests,
                         initialMaxContextSize,
                         maxContextSize,
@@ -487,7 +615,8 @@ async function measureModel({
                         maxGpuLayers,
                         minGpuLayers,
                         flashAttention,
-                        evaluateText
+                        evaluateText,
+                        exitAfterMeasurement
                     } satisfies ParentToChildMessage);
 
                     if (timeoutHandle != null) {
@@ -495,6 +624,10 @@ async function measureModel({
                         timeoutHandle = null;
                     }
                 } else if (message.type === "done") {
+                    isPlannedExit = true;
+                    isDone = true;
+                    subProcess.send({type: "exit"} satisfies ParentToChildMessage);
+                } else if (message.type === "exit") {
                     isPlannedExit = true;
                     subProcess.send({type: "exit"} satisfies ParentToChildMessage);
                 } else if (message.type === "error") {
@@ -516,10 +649,13 @@ async function measureModel({
                         result: {
                             type: "success",
                             modelVramUsage: message.modelVramUsage,
+                            modelRamUsage: message.modelRamUsage,
                             contextSize: message.contextSize,
                             contextVramUsage: message.contextVramUsage,
+                            contextRamUsage: message.contextRamUsage,
                             contextStateSize: message.contextStateSize,
-                            totalVramUsage: message.totalVramUsage
+                            totalVramUsage: message.totalVramUsage,
+                            totalRamUsage: message.totalRamUsage
                         }
                     });
                 }
@@ -580,11 +716,13 @@ async function runTestWorkerLogic() {
     }
 
     async function testContextSizes({
-        model, modelVramUsage, startContextSize, maxContextSize, minContextSize, tests, flashAttention, evaluateText
+        model, modelVramUsage, modelRamUsage, startContextSize, maxContextSize, minContextSize, tests, flashAttention, evaluateText,
+        exitAfterMeasurement = false
     }: {
-        model: LlamaModel, modelVramUsage: number, startContextSize?: number, maxContextSize?: number, minContextSize?: number,
-        tests: number, flashAttention?: boolean, evaluateText?: string
+        model: LlamaModel, modelVramUsage: number, modelRamUsage: number, startContextSize?: number, maxContextSize?: number,
+        minContextSize?: number, tests: number, flashAttention?: boolean, evaluateText?: string, exitAfterMeasurement?: boolean
     }) {
+        let measurementsDone: number = 0;
         const contextSizeCheckPlan = getContextSizesCheckPlan(
             maxContextSize != null
                 ? Math.min(model.trainContextSize, maxContextSize)
@@ -603,6 +741,7 @@ async function runTestWorkerLogic() {
 
             try {
                 const preContextVramUsage = (await llama.getVramState()).used;
+                const preContextRamUsage = getMemoryUsage(llama);
                 const context = await model.createContext({
                     contextSize: currentContextSizeCheck ?? (
                         maxContextSize != null
@@ -620,15 +759,20 @@ async function runTestWorkerLogic() {
                 }
 
                 const postContextVramUsage = (await llama.getVramState()).used;
+                const postContextRamUsage = getMemoryUsage(llama);
+                measurementsDone++;
 
                 sendInfoBack({
                     type: "stats",
                     gpuLayers: model.gpuLayers,
                     modelVramUsage,
+                    modelRamUsage,
                     contextSize: context.contextSize,
                     contextVramUsage: postContextVramUsage - preContextVramUsage,
+                    contextRamUsage: postContextRamUsage - preContextRamUsage,
                     contextStateSize: context.stateSize,
-                    totalVramUsage: postContextVramUsage
+                    totalVramUsage: postContextVramUsage,
+                    totalRamUsage: postContextRamUsage
                 });
                 currentContextSizeCheck = context.contextSize;
 
@@ -650,44 +794,59 @@ async function runTestWorkerLogic() {
             }
 
             currentContextSizeCheck = getNextItemInCheckContextSizesPlan(contextSizeCheckPlan, currentContextSizeCheck);
+
+            if (exitAfterMeasurement)
+                return measurementsDone;
         }
+
+        return measurementsDone;
     }
 
     async function testWithGpuLayers({
-        modelPath, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, evaluateText
+        modelPath, useMmap, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, evaluateText,
+        exitAfterMeasurement = false
     }: {
-        modelPath: string, gpuLayers: number, tests: number, startContextSize?: number, maxContextSize?: number, minContextSize?: number,
-        flashAttention?: boolean, evaluateText?: string
+        modelPath: string, useMmap?: boolean, gpuLayers: number, tests: number, startContextSize?: number, maxContextSize?: number,
+        minContextSize?: number, flashAttention?: boolean, evaluateText?: string, exitAfterMeasurement?: boolean
     }) {
         try {
             const preModelVramUsage = (await llama.getVramState()).used;
+            const preModelRamUsage = getMemoryUsage(llama);
             const model = await llama.loadModel({
                 modelPath,
+                useMmap,
                 gpuLayers,
                 defaultContextFlashAttention: flashAttention,
                 ignoreMemorySafetyChecks: true
             });
             const postModelVramUsage = (await llama.getVramState()).used;
+            const postModelRamUsage = getMemoryUsage(llama);
 
             sendInfoBack({
                 type: "stats",
                 gpuLayers: model.gpuLayers,
                 modelVramUsage: postModelVramUsage - preModelVramUsage,
-                totalVramUsage: postModelVramUsage
+                modelRamUsage: postModelRamUsage - preModelRamUsage,
+                totalVramUsage: postModelVramUsage,
+                totalRamUsage: postModelRamUsage
             });
 
-            await testContextSizes({
+            const measurementsDone = await testContextSizes({
                 model,
                 modelVramUsage: postModelVramUsage - preModelVramUsage,
+                modelRamUsage: postModelRamUsage - preModelRamUsage,
                 startContextSize,
                 maxContextSize,
                 minContextSize,
                 flashAttention,
                 tests,
-                evaluateText
+                evaluateText,
+                exitAfterMeasurement
             });
 
             await model.dispose();
+
+            return measurementsDone;
         } catch (err) {
             sendInfoBack({
                 type: "error",
@@ -695,13 +854,31 @@ async function runTestWorkerLogic() {
                 gpuLayers: gpuLayers
             });
         }
+
+        return 0;
     }
 
     process.on("message", async (message: ParentToChildMessage) => {
         if (message.type === "start") {
             for (let gpuLayers = message.maxGpuLayers; gpuLayers >= (message.minGpuLayers ?? 0); gpuLayers--) {
-                await testWithGpuLayers({
+                if (gpuLayers == message.maxGpuLayers && message.initialMaxContextSize != null) {
+                    const ggufInsights = await GgufInsights.from(await readGgufFileInfo(message.modelPath), llama);
+                    const contextSizeCheckPlan = getContextSizesCheckPlan(
+                        message.maxContextSize != null
+                            ? Math.min(ggufInsights.trainContextSize ?? 4096, message.maxContextSize)
+                            : ggufInsights.trainContextSize ?? 4096,
+                        message.tests,
+                        message.minContextSize
+                    );
+
+                    const firstContextSizeCheck = getNextItemInCheckContextSizesPlan(contextSizeCheckPlan, message.initialMaxContextSize);
+                    if (firstContextSizeCheck == null)
+                        continue;
+                }
+
+                const measurementsDone = await testWithGpuLayers({
                     modelPath: message.modelPath,
+                    useMmap: message.useMmap,
                     gpuLayers,
                     tests: message.tests,
                     startContextSize: gpuLayers == message.maxGpuLayers
@@ -710,8 +887,14 @@ async function runTestWorkerLogic() {
                     maxContextSize: message.maxContextSize,
                     minContextSize: message.minContextSize,
                     flashAttention: message.flashAttention,
-                    evaluateText: message.evaluateText
+                    evaluateText: message.evaluateText,
+                    exitAfterMeasurement: message.exitAfterMeasurement
                 });
+
+                if (measurementsDone > 0 && message.exitAfterMeasurement) {
+                    sendInfoBack({type: "exit"});
+                    return;
+                }
             }
 
             sendInfoBack({type: "done"});
@@ -788,6 +971,7 @@ function getNextItemInCheckContextSizesPlan(plan: number[], currentSize: number)
 type ParentToChildMessage = {
     type: "start",
     modelPath: string,
+    useMmap?: boolean,
     tests: number,
     maxGpuLayers: number,
     minGpuLayers?: number,
@@ -795,21 +979,25 @@ type ParentToChildMessage = {
     initialMaxContextSize?: number,
     maxContextSize?: number,
     minContextSize?: number,
-    evaluateText?: string
+    evaluateText?: string,
+    exitAfterMeasurement?: boolean
 } | {
     type: "exit"
 };
 
 type ChildToParentMessage = {
-    type: "ready" | "done"
+    type: "ready" | "done" | "exit"
 } | {
     type: "stats",
     gpuLayers: number,
     modelVramUsage: number,
+    modelRamUsage: number,
     contextSize?: number,
     contextVramUsage?: number,
+    contextRamUsage?: number,
     contextStateSize?: number,
-    totalVramUsage: number
+    totalVramUsage: number,
+    totalRamUsage: number
 } | {
     type: "error",
     error: string,
@@ -822,3 +1010,16 @@ function padStartAnsi(text: string, length: number, padChar: string = " ") {
 
     return padChar.repeat(Math.max(0, length - textWithoutAnsi.length)) + text;
 }
+
+function getMemoryUsage(llama: Llama) {
+    const totalMemoryUsage = llama._bindings.getMemoryInfo().total;
+    const vramUsage = llama._bindings.getGpuVramInfo();
+
+    let memoryUsage = totalMemoryUsage;
+
+    const unifiedMemoryVramUsage = Math.min(vramUsage.unifiedSize, vramUsage.used);
+    if (unifiedMemoryVramUsage <= memoryUsage)
+        memoryUsage -= unifiedMemoryVramUsage;
+
+    return memoryUsage;
+}
diff --git a/src/cli/utils/ConsoleTable.ts b/src/cli/utils/ConsoleTable.ts
index a458c9fb..98644778 100644
--- a/src/cli/utils/ConsoleTable.ts
+++ b/src/cli/utils/ConsoleTable.ts
@@ -14,7 +14,7 @@ export class ConsoleTable<const T extends readonly ConsoleTableColumn[]> {
         columnSeparator?: string,
         drawHeaderRowSeparator?: boolean
     } = {}) {
-        this._columns = columns;
+        this._columns = filterHiddenColumns(columns);
         this._columnSeparator = columnSeparator;
         this._drawHeaderRowSeparator = drawHeaderRowSeparator;
     }
@@ -120,7 +120,8 @@ export type ConsoleTableColumn<K extends string = string> = {
     readonly titleFormatter?: (value: string) => string,
     readonly width?: number,
     readonly valueFormatter?: (value: string) => string,
-    readonly canSpanOverEmptyColumns?: boolean
+    readonly canSpanOverEmptyColumns?: boolean,
+    readonly visible?: boolean
 };
 
 function getColumnWidth(column: ConsoleTableColumn) {
@@ -130,3 +131,8 @@ function getColumnWidth(column: ConsoleTableColumn) {
 function toOneLine(text: string) {
     return text.replaceAll("\n", chalk.gray("\\n"));
 }
+
+function filterHiddenColumns<const T extends readonly ConsoleTableColumn[]>(columns: T): T {
+    return columns
+        .filter((column) => column.visible !== false) as readonly ConsoleTableColumn[] as T;
+}
diff --git a/src/cli/utils/interactivelyAskForModel.ts b/src/cli/utils/interactivelyAskForModel.ts
index 3d3907ca..8748e26f 100644
--- a/src/cli/utils/interactivelyAskForModel.ts
+++ b/src/cli/utils/interactivelyAskForModel.ts
@@ -59,13 +59,15 @@ export async function interactivelyAskForModel({
     modelsDirectory,
     allowLocalModels = true,
     downloadIntent = true,
-    flashAttention = false
+    flashAttention = false,
+    useMmap
 }: {
     llama: Llama,
     modelsDirectory?: string,
     allowLocalModels?: boolean,
     downloadIntent?: boolean,
-    flashAttention?: boolean
+    flashAttention?: boolean,
+    useMmap?: boolean
 }): Promise<string> {
     let localModelFileOptions: (ModelOption & {type: "localModel"})[] = [];
     const recommendedModelOptions: (ModelOption & {type: "recommendedModel"})[] = [];
@@ -117,7 +119,8 @@ export async function interactivelyAskForModel({
                         progressUpdater.setProgress(readItems / ggufFileNames.length, renderProgress());
 
                         const compatibilityScore = await ggufInsights?.configurationResolver.scoreModelConfigurationCompatibility({
-                            flashAttention: flashAttention && ggufInsights?.flashAttentionSupported
+                            flashAttention: flashAttention && ggufInsights?.flashAttentionSupported,
+                            useMmap
                         });
 
                         return {
@@ -289,7 +292,7 @@ export async function interactivelyAskForModel({
                 },
                 items: options,
                 renderItem(item, focused, rerender) {
-                    return renderSelectionItem(item, focused, rerender, activeInteractionController.signal, llama, flashAttention);
+                    return renderSelectionItem(item, focused, rerender, activeInteractionController.signal, llama, flashAttention, useMmap);
                 },
                 canFocusItem(item) {
                     return item.type === "recommendedModel" || item.type === "localModel" || item.type === "action";
@@ -404,7 +407,8 @@ async function askForModelUriOrPath(allowLocalModels: boolean): Promise<string |
 }
 
 function renderSelectionItem(
-    item: ModelOption, focused: boolean, rerender: () => void, abortSignal: AbortSignal, llama: Llama, flashAttention: boolean
+    item: ModelOption, focused: boolean, rerender: () => void, abortSignal: AbortSignal, llama: Llama, flashAttention: boolean,
+    useMmap?: boolean
 ) {
     if (item.type === "localModel") {
         let modelText = item.title instanceof Function
@@ -430,7 +434,8 @@ function renderSelectionItem(
                     abortSignal,
                     rerenderOption: rerender,
                     llama,
-                    flashAttention
+                    flashAttention,
+                    useMmap
                 });
             }
 
@@ -552,13 +557,14 @@ function renderRecommendedModelTechnicalInfo(
 }
 
 async function selectFileForModelRecommendation({
-    recommendedModelOption, llama, abortSignal, rerenderOption, flashAttention
+    recommendedModelOption, llama, abortSignal, rerenderOption, flashAttention, useMmap
 }: {
     recommendedModelOption: ModelOption & {type: "recommendedModel"},
     llama: Llama,
     abortSignal: AbortSignal,
     rerenderOption(): void,
-    flashAttention: boolean
+    flashAttention: boolean,
+    useMmap?: boolean
 }) {
     try {
         let bestScore: number | undefined = undefined;
@@ -579,7 +585,8 @@ async function selectFileForModelRecommendation({
                     return;
 
                 const compatibilityScore = await ggufInsights.configurationResolver.scoreModelConfigurationCompatibility({
-                    flashAttention
+                    flashAttention,
+                    useMmap
                 });
 
                 if (bestScore == null || compatibilityScore.compatibilityScore > bestScore) {
diff --git a/src/cli/utils/printCommonInfoLines.ts b/src/cli/utils/printCommonInfoLines.ts
index 5eefe335..59268c3a 100644
--- a/src/cli/utils/printCommonInfoLines.ts
+++ b/src/cli/utils/printCommonInfoLines.ts
@@ -8,6 +8,7 @@ export async function printCommonInfoLines({
     context,
     draftContext,
     minTitleLength = 0,
+    useMmap,
     logBatchSize = false,
     tokenMeterEnabled = false,
     printBos = false,
@@ -16,6 +17,7 @@ export async function printCommonInfoLines({
     context: LlamaContext,
     draftContext?: LlamaContext,
     minTitleLength?: number,
+    useMmap?: boolean,
     logBatchSize?: boolean,
     tokenMeterEnabled?: boolean,
     printBos?: boolean,
@@ -70,6 +72,13 @@ export async function printCommonInfoLines({
             value: `${model.gpuLayers}/${model.fileInsights.totalLayers} offloaded ${
                 chalk.dim(`(${Math.floor((model.gpuLayers / model.fileInsights.totalLayers) * 100)}%)`)
             }`
+        }, {
+            title: "mmap",
+            value: !model._llama.supportsMmap
+                ? "unsupported"
+                : (useMmap || useMmap == null)
+                    ? "enabled"
+                    : "disabled"
         }, {
             show: printBos,
             title: "BOS",
diff --git a/src/cli/utils/resolveCommandGgufPath.ts b/src/cli/utils/resolveCommandGgufPath.ts
index 3db583c3..464a7ab9 100644
--- a/src/cli/utils/resolveCommandGgufPath.ts
+++ b/src/cli/utils/resolveCommandGgufPath.ts
@@ -10,9 +10,9 @@ import {getReadablePath} from "./getReadablePath.js";
 import {interactivelyAskForModel} from "./interactivelyAskForModel.js";
 
 export async function resolveCommandGgufPath(ggufPath: string | undefined, llama: Llama, fetchHeaders?: Record<string, string>, {
-    targetDirectory = cliModelsDirectory, flashAttention = false, consoleTitle = "File"
+    targetDirectory = cliModelsDirectory, flashAttention = false, useMmap, consoleTitle = "File"
 }: {
-    targetDirectory?: string, flashAttention?: boolean, consoleTitle?: string
+    targetDirectory?: string, flashAttention?: boolean, useMmap?: boolean, consoleTitle?: string
 } = {}) {
     if (ggufPath == null)
         ggufPath = await interactivelyAskForModel({
@@ -20,7 +20,8 @@ export async function resolveCommandGgufPath(ggufPath: string | undefined, llama
             modelsDirectory: targetDirectory,
             allowLocalModels: true,
             downloadIntent: true,
-            flashAttention
+            flashAttention,
+            useMmap
         });
 
     const resolvedModelDestination = resolveModelDestination(ggufPath);
diff --git a/src/evaluator/LlamaModel/LlamaModel.ts b/src/evaluator/LlamaModel/LlamaModel.ts
index f8d609e8..208c7a9c 100644
--- a/src/evaluator/LlamaModel/LlamaModel.ts
+++ b/src/evaluator/LlamaModel/LlamaModel.ts
@@ -292,7 +292,9 @@ export class LlamaModel {
     }
 
     /**
-     * Total model size in memory in bytes
+     * Total model size in memory in bytes.
+     *
+     * When using mmap, actual memory usage may be higher than this value due to `llama.cpp`'s performance optimizations.
      */
     public get size() {
         this._ensureNotDisposed();
@@ -666,7 +668,7 @@ export class LlamaModel {
         _llama: Llama
     }) {
         const {loadSignal, defaultContextFlashAttention} = modelOptions;
-        const useMmap = modelOptions.useMmap ?? defaultUseMmap;
+        const useMmap = _llama.supportsMmap && (modelOptions.useMmap ?? defaultUseMmap);
 
         const fileInfo = await readGgufFileInfo(modelOptions.modelPath, {
             sourceType: "filesystem",
@@ -680,9 +682,13 @@ export class LlamaModel {
             : false;
         const gpuLayers = await ggufInsights.configurationResolver.resolveModelGpuLayers(modelOptions.gpuLayers, {
             ignoreMemorySafetyChecks: modelOptions.ignoreMemorySafetyChecks,
-            defaultContextFlashAttention: resolvedDefaultContextFlashAttention
+            defaultContextFlashAttention: resolvedDefaultContextFlashAttention,
+            useMmap
+        });
+        const resourceRequirementsEstimation = ggufInsights.estimateModelResourceRequirements({
+            gpuLayers: gpuLayers,
+            useMmap
         });
-        const resourceRequirementsEstimation = ggufInsights.estimateModelResourceRequirements({gpuLayers: gpuLayers});
 
         const model = new LlamaModel({...modelOptions, gpuLayers, useMmap}, {
             _fileInfo: fileInfo,
diff --git a/src/gguf/insights/GgufInsights.ts b/src/gguf/insights/GgufInsights.ts
index babf2565..04fd3687 100644
--- a/src/gguf/insights/GgufInsights.ts
+++ b/src/gguf/insights/GgufInsights.ts
@@ -23,7 +23,7 @@ export class GgufInsights {
         this._llama = llama;
         this._ggufFileInfo = ggufFileInfo;
 
-        this._modelSize = calculateTensorsSize(ggufFileInfo.fullTensorInfo ?? [], llama);
+        this._modelSize = calculateTensorsSize(ggufFileInfo.fullTensorInfo ?? [], llama, true, true);
         this._configurationResolver = GgufInsightsConfigurationResolver._create(this);
     }
 
@@ -133,12 +133,16 @@ export class GgufInsights {
         return false;
     }
 
-    public estimateModelResourceRequirements({gpuLayers}: {gpuLayers: number}): GgufInsightsResourceRequirements {
+    public estimateModelResourceRequirements({
+        gpuLayers, useMmap = this._llama.supportsMmap, gpuSupportsMmap = this._llama.gpuSupportsMmap
+    }: {
+        gpuLayers: number, useMmap?: boolean, gpuSupportsMmap?: boolean
+    }): GgufInsightsResourceRequirements {
         const {cpu, gpu} = this._getTensorResourceSplit(gpuLayers);
 
         return {
-            cpuRam: calculateTensorsSize(cpu, this._llama),
-            gpuVram: calculateTensorsSize(gpu, this._llama)
+            cpuRam: calculateTensorsSize(cpu, this._llama, false),
+            gpuVram: calculateTensorsSize(gpu, this._llama, useMmap && gpuSupportsMmap)
         };
     }
 
@@ -524,10 +528,59 @@ function parseTensorName(tensorName?: string): {
     return {layerNumber: undefined};
 }
 
-function calculateTensorsSize(tensorsInfo: GgufTensorInfo[], llama: Llama) {
+function calculateTensorsSize(
+    tensorsInfo: GgufTensorInfo[],
+    llama: Llama,
+    useMmap: boolean,
+    startFromTensorDataOffset: boolean = false
+) {
+    if (!useMmap) {
+        let size = 0;
+        for (const tensorInfo of tensorsInfo)
+            size += calculateTensorSize(tensorInfo, llama);
+
+        return size;
+    }
+
+    const fileStats = new Map<number, {
+        tensorsSize: number,
+        startOffset?: number | bigint,
+        endOffset?: number | bigint
+    }>();
+    for (const tensorInfo of tensorsInfo) {
+        let stats = fileStats.get(tensorInfo.filePart);
+        if (stats == null) {
+            stats = {
+                tensorsSize: 0
+            };
+            fileStats.set(tensorInfo.filePart, stats);
+        }
+
+        const tensorSize = calculateTensorSize(tensorInfo, llama);
+        stats.tensorsSize += tensorSize;
+        const startOffset = tensorInfo.offset;
+        const endOffset = typeof startOffset === "number"
+            ? startOffset + tensorSize
+            : startOffset + BigInt(tensorSize);
+
+        if (startFromTensorDataOffset)
+            stats.startOffset = Number(BigInt(tensorInfo.fileOffset) - BigInt(tensorInfo.offset));
+        else if (stats.startOffset == null || startOffset < stats.startOffset)
+            stats.startOffset = startOffset;
+
+        if (stats.endOffset == null || endOffset > stats.endOffset)
+            stats.endOffset = endOffset;
+    }
+
     let size = 0;
-    for (const tensorInfo of tensorsInfo)
-        size += calculateTensorSize(tensorInfo, llama);
+    for (const [, stats] of fileStats) {
+        const offsetSize = (stats.endOffset == null || stats.startOffset == null)
+            ? 0
+            : Number(BigInt(stats.endOffset) - BigInt(stats.startOffset));
+        const tensorsSize = stats.tensorsSize;
+
+        size += Math.max(offsetSize, tensorsSize);
+    }
 
     return size;
 }
diff --git a/src/gguf/insights/GgufInsightsConfigurationResolver.ts b/src/gguf/insights/GgufInsightsConfigurationResolver.ts
index aab938af..05595c98 100644
--- a/src/gguf/insights/GgufInsightsConfigurationResolver.ts
+++ b/src/gguf/insights/GgufInsightsConfigurationResolver.ts
@@ -38,12 +38,14 @@ export class GgufInsightsConfigurationResolver {
         targetGpuLayers,
         targetContextSize,
         embeddingContext = false,
-        flashAttention = false
+        flashAttention = false,
+        useMmap = this._ggufInsights._llama.supportsMmap
     }: {
         targetGpuLayers?: number | "max",
         targetContextSize?: number,
         embeddingContext?: boolean,
-        flashAttention?: boolean
+        flashAttention?: boolean,
+        useMmap?: boolean
     } = {}, {
         getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()),
         getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()),
@@ -64,7 +66,8 @@ export class GgufInsightsConfigurationResolver {
             contextSize: targetContextSize,
             embeddingContext,
             forceGpuLayers: targetGpuLayers,
-            forceStrictContextSize: targetContextSize != null
+            forceStrictContextSize: targetContextSize != null,
+            useMmap
         }, {
             getVramState,
             getRamState,
@@ -105,7 +108,8 @@ export class GgufInsightsConfigurationResolver {
         maximumFittedContextSizeMultiplier = 100,
         maximumUnfitConfigurationResourceMultiplier = 100,
         forceStrictContextSize = false,
-        forceGpuLayers
+        forceGpuLayers,
+        useMmap = this._ggufInsights._llama.supportsMmap
     }: {
         contextSize?: number,
         embeddingContext?: boolean,
@@ -120,7 +124,8 @@ export class GgufInsightsConfigurationResolver {
          */
         forceStrictContextSize?: boolean,
 
-        forceGpuLayers?: number | "max"
+        forceGpuLayers?: number | "max",
+        useMmap?: boolean
     } = {}, {
         getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()),
         getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()),
@@ -204,7 +209,8 @@ export class GgufInsightsConfigurationResolver {
                     llamaGpu,
                     llamaSupportsGpuOffloading,
                     defaultContextFlashAttention: flashAttention,
-                    ignoreMemorySafetyChecks: forceGpuLayers != null
+                    ignoreMemorySafetyChecks: forceGpuLayers != null,
+                    useMmap
                 }
             );
             gpuLayersFitMemory = true;
@@ -215,7 +221,8 @@ export class GgufInsightsConfigurationResolver {
 
         const canUseGpu = llamaSupportsGpuOffloading && llamaGpu !== false;
         const estimatedModelResourceUsage = this._ggufInsights.estimateModelResourceRequirements({
-            gpuLayers: resolvedGpuLayers
+            gpuLayers: resolvedGpuLayers,
+            useMmap
         });
 
         let resolvedContextSize = Math.min(
@@ -363,10 +370,12 @@ export class GgufInsightsConfigurationResolver {
         getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()),
         llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize, llamaGpu = this._ggufInsights._llama.gpu,
         llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading,
-        defaultContextFlashAttention = false
+        defaultContextFlashAttention = false,
+        useMmap = this._ggufInsights._llama.supportsMmap
     }: {
         ignoreMemorySafetyChecks?: boolean, getVramState?(): Promise<{total: number, free: number}>,
-        llamaVramPaddingSize?: number, llamaGpu?: BuildGpu, llamaSupportsGpuOffloading?: boolean, defaultContextFlashAttention?: boolean
+        llamaVramPaddingSize?: number, llamaGpu?: BuildGpu, llamaSupportsGpuOffloading?: boolean, defaultContextFlashAttention?: boolean,
+        useMmap?: boolean
     } = {}) {
         return resolveModelGpuLayersOption(gpuLayers, {
             ggufInsights: this._ggufInsights,
@@ -375,7 +384,8 @@ export class GgufInsightsConfigurationResolver {
             llamaVramPaddingSize,
             llamaGpu,
             llamaSupportsGpuOffloading,
-            defaultContextFlashAttention
+            defaultContextFlashAttention,
+            useMmap
         });
     }
 
diff --git a/src/gguf/insights/utils/resolveModelGpuLayersOption.ts b/src/gguf/insights/utils/resolveModelGpuLayersOption.ts
index d9dc4369..c6065b7f 100644
--- a/src/gguf/insights/utils/resolveModelGpuLayersOption.ts
+++ b/src/gguf/insights/utils/resolveModelGpuLayersOption.ts
@@ -11,11 +11,11 @@ const fitContextExtraMemoryPaddingPercentage = 0.5;
 
 export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions["gpuLayers"], {
     ggufInsights, ignoreMemorySafetyChecks = false, getVramState, llamaVramPaddingSize,
-    llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention
+    llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, useMmap
 }: {
     ggufInsights: GgufInsights, ignoreMemorySafetyChecks?: boolean,
     getVramState(): Promise<{total: number, free: number}>, llamaVramPaddingSize: number, llamaGpu: BuildGpu,
-    llamaSupportsGpuOffloading: boolean, defaultContextFlashAttention: boolean
+    llamaSupportsGpuOffloading: boolean, defaultContextFlashAttention: boolean, useMmap?: boolean
 }): Promise<number> {
     if (gpuLayers == null)
         gpuLayers = "auto";
@@ -36,7 +36,8 @@ export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions["
             gpuLayers: resolvedGpuLayers,
             ggufInsights,
             currentVram: vramState.free,
-            defaultContextFlashAttention
+            defaultContextFlashAttention,
+            useMmap
         });
 
         if (maxLayersRequirements == null)
@@ -71,7 +72,8 @@ export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions["
             maxGpuLayers: typeof gpuLayers === "object"
                 ? gpuLayers.max
                 : undefined,
-            defaultContextFlashAttention
+            defaultContextFlashAttention,
+            useMmap
         });
 
         const hasGpuLayersRequirements = typeof gpuLayers === "object" &&
@@ -92,14 +94,16 @@ function getBestGpuLayersForFreeVram({
     fitContext,
     minGpuLayers,
     maxGpuLayers,
-    defaultContextFlashAttention
+    defaultContextFlashAttention,
+    useMmap
 }: {
     ggufInsights: GgufInsights,
     freeVram: number,
     fitContext?: {contextSize?: number, embeddingContext?: boolean},
     minGpuLayers?: number,
     maxGpuLayers?: number,
-    defaultContextFlashAttention: boolean
+    defaultContextFlashAttention: boolean,
+    useMmap?: boolean
 }) {
     return findBestOption({
         *generator() {
@@ -118,7 +122,8 @@ function getBestGpuLayersForFreeVram({
                 ggufInsights,
                 currentVram: freeVram,
                 fitContext,
-                defaultContextFlashAttention
+                defaultContextFlashAttention,
+                useMmap
             });
 
             if (layersRequirements == null)
@@ -177,12 +182,15 @@ function scoreGpuLayersAndContextCombination({gpuLayers, contextSize}: {gpuLayer
 }
 
 function getVramRequiredForGpuLayers({
-    gpuLayers, ggufInsights, currentVram, fitContext, defaultContextFlashAttention = false
+    gpuLayers, ggufInsights, currentVram, fitContext, defaultContextFlashAttention = false, useMmap
 }: {
     gpuLayers: number, ggufInsights: GgufInsights, currentVram: number, fitContext?: {contextSize?: number, embeddingContext?: boolean},
-    defaultContextFlashAttention: boolean
+    defaultContextFlashAttention: boolean, useMmap?: boolean
 }) {
-    const modelVram = ggufInsights.estimateModelResourceRequirements({gpuLayers}).gpuVram;
+    const modelVram = ggufInsights.estimateModelResourceRequirements({
+        gpuLayers,
+        useMmap
+    }).gpuVram;
 
     if (modelVram > currentVram)
         return null;
diff --git a/src/gguf/parser/GgufV2Parser.ts b/src/gguf/parser/GgufV2Parser.ts
index 4c0f922e..10a6f557 100644
--- a/src/gguf/parser/GgufV2Parser.ts
+++ b/src/gguf/parser/GgufV2Parser.ts
@@ -9,6 +9,9 @@ import {GgmlType, GgufTensorInfo} from "../types/GgufTensorInfoTypes.js";
 import {convertMetadataKeyValueRecordToNestedObject} from "../utils/convertMetadataKeyValueRecordToNestedObject.js";
 import {promisableLoop, Promisable, transformPromisable, transformPromisables} from "../../utils/transformPromisable.js";
 import {noDirectSubNestingGGufMetadataKeys} from "../consts.js";
+import {Writable} from "../../utils/utilTypes.js";
+
+const ggufDefaultAlignment = 32;
 
 export class GgufV2Parser {
     private readonly _fileReader: GgufFileReader;
@@ -33,8 +36,16 @@ export class GgufV2Parser {
         const headerReadResult = headerReadResultPromisable instanceof Promise
             ? await headerReadResultPromisable
             : headerReadResultPromisable;
+        const alignmentHeader = headerReadResult.metadata["general.alignment"];
+        const ggufAlignment = (
+            alignmentHeader != null &&
+            (typeof alignmentHeader === "number" || typeof alignmentHeader === "bigint") &&
+            Number.isFinite(Number(alignmentHeader))
+        )
+            ? Number(alignmentHeader)
+            : ggufDefaultAlignment;
         const tensorReadResultPromisable = this._shouldReadTensorInfo
-            ? await this._readTensorInfo(headerReadResult.tensorCount, readOffset)
+            ? await this._readTensorInfo(headerReadResult.tensorCount, readOffset, ggufAlignment)
             : null;
         const tensorReadResult = tensorReadResultPromisable instanceof Promise
             ? await tensorReadResultPromisable
@@ -50,7 +61,8 @@ export class GgufV2Parser {
             metadata: metadata as any as GgufMetadata,
             tensorInfo: tensorReadResult?.tensorInfo,
             metadataSize: headerReadResult.headerSize + initialOffset,
-            tensorInfoSize: tensorReadResult?.tensorInfoSize
+            tensorInfoSize: tensorReadResult?.tensorInfoSize,
+            tensorDataOffset: tensorReadResult?.tensorDataOffset
         };
     }
 
@@ -133,7 +145,7 @@ export class GgufV2Parser {
         });
     }
 
-    private _readTensorInfo(tensorCount: number | bigint, readOffset: GgufReadOffset) {
+    private _readTensorInfo(tensorCount: number | bigint, readOffset: GgufReadOffset, ggufAlignment: number) {
         const initialOffset = readOffset.offset;
         const tensorInfo: GgufTensorInfo[] = [];
 
@@ -164,7 +176,9 @@ export class GgufV2Parser {
                                     name,
                                     dimensions,
                                     ggmlType: ggmlType as GgmlType,
-                                    offset: GgufFileReader.castNumberIfSafe(offset)
+                                    offset: GgufFileReader.castNumberIfSafe(offset),
+                                    fileOffset: 0, // will be set later
+                                    filePart: 1 // will be updated later if needed
                                 });
                             });
                         }
@@ -172,10 +186,24 @@ export class GgufV2Parser {
                 });
             },
             afterthought: () => void i++,
-            returnValue: () => ({
-                tensorInfo,
-                tensorInfoSize: readOffset.offset - initialOffset
-            })
+            returnValue: () => {
+                const fileTensorDataOffset = alignOffset(readOffset.offset, ggufAlignment);
+
+                for (const tensor of tensorInfo)
+                    (tensor as Writable<GgufTensorInfo>).fileOffset = typeof tensor.offset === "bigint"
+                        ? BigInt(fileTensorDataOffset) + tensor.offset
+                        : fileTensorDataOffset + tensor.offset;
+
+                return {
+                    tensorInfo,
+                    tensorInfoSize: readOffset.offset - initialOffset,
+                    tensorDataOffset: fileTensorDataOffset
+                };
+            }
         });
     }
 }
+
+function alignOffset(offset: number, alignment: number) {
+    return offset + (alignment - (offset % alignment)) % alignment;
+}
diff --git a/src/gguf/parser/parseGguf.ts b/src/gguf/parser/parseGguf.ts
index 8f79b130..8f7a9919 100644
--- a/src/gguf/parser/parseGguf.ts
+++ b/src/gguf/parser/parseGguf.ts
@@ -23,7 +23,7 @@ export async function parseGguf({
 }): Promise<GgufFileInfo> {
     const readOffset = new GgufReadOffset(0);
     const magicAndVersion = await parseMagicAndVersion(fileReader, readOffset);
-    const gguifInfo = await parseGgufUsingASpecificVersionParser({
+    const ggufInfo = await parseGgufUsingASpecificVersionParser({
         fileReader,
         readTensorInfo,
         ignoreKeys,
@@ -32,21 +32,21 @@ export async function parseGguf({
         readOffset,
         logWarnings
     });
-    const architectureMetadata = getGgufMetadataArchitectureData(gguifInfo.metadata);
+    const architectureMetadata = getGgufMetadataArchitectureData(ggufInfo.metadata);
 
     return {
         version: magicAndVersion.version,
-        tensorCount: gguifInfo.tensorCount,
-        metadata: gguifInfo.metadata,
+        tensorCount: ggufInfo.tensorCount,
+        metadata: ggufInfo.metadata,
         architectureMetadata: architectureMetadata,
-        tensorInfo: gguifInfo.tensorInfo,
-        metadataSize: gguifInfo.metadataSize,
+        tensorInfo: ggufInfo.tensorInfo,
+        metadataSize: ggufInfo.metadataSize,
         splicedParts: 1,
-        totalTensorInfoSize: gguifInfo.tensorInfoSize,
-        totalTensorCount: gguifInfo.tensorCount,
-        totalMetadataSize: gguifInfo.metadataSize,
-        fullTensorInfo: gguifInfo.tensorInfo,
-        tensorInfoSize: gguifInfo.tensorInfoSize
+        totalTensorInfoSize: ggufInfo.tensorInfoSize,
+        totalTensorCount: ggufInfo.tensorCount,
+        totalMetadataSize: ggufInfo.metadataSize,
+        fullTensorInfo: ggufInfo.tensorInfo,
+        tensorInfoSize: ggufInfo.tensorInfoSize
     };
 }
 
diff --git a/src/gguf/readGgufFileInfo.ts b/src/gguf/readGgufFileInfo.ts
index 90ded44d..10595bd7 100644
--- a/src/gguf/readGgufFileInfo.ts
+++ b/src/gguf/readGgufFileInfo.ts
@@ -2,6 +2,7 @@ import retry from "async-retry";
 import {isUrl} from "../utils/isUrl.js";
 import {ModelFileAccessTokens} from "../utils/modelFileAccesTokens.js";
 import {isModelUri, parseModelUri} from "../utils/parseModelUri.js";
+import {Writable} from "../utils/utilTypes.js";
 import {parseGguf} from "./parser/parseGguf.js";
 import {GgufNetworkFetchFileReader} from "./fileReaders/GgufNetworkFetchFileReader.js";
 import {GgufFsFileReader} from "./fileReaders/GgufFsFileReader.js";
@@ -9,6 +10,7 @@ import {ggufDefaultFetchRetryOptions} from "./consts.js";
 import {normalizeGgufDownloadUrl} from "./utils/normalizeGgufDownloadUrl.js";
 import {resolveSplitGgufParts} from "./utils/resolveSplitGgufParts.js";
 import {GgufFileInfo} from "./types/GgufFileInfoTypes.js";
+import {GgufTensorInfo} from "./types/GgufTensorInfoTypes.js";
 
 
 /**
@@ -94,14 +96,21 @@ export async function readGgufFileInfo(pathOrUri: string, {
         throw new Error(`Unsupported sourceType: ${sourceType}`);
     }
 
-    async function readSingleFile(pathOrUri: string) {
+    async function readSingleFile(pathOrUri: string, splitPartNumber: number = 1) {
         const fileReader = createFileReader(pathOrUri);
-        return await parseGguf({
+        const res = await parseGguf({
             fileReader,
             ignoreKeys,
             readTensorInfo,
             logWarnings
         });
+
+        if (splitPartNumber > 1) {
+            for (const tensor of res.tensorInfo ?? [])
+                (tensor as Writable<GgufTensorInfo>).filePart = splitPartNumber;
+        }
+
+        return res;
     }
 
     if (!spliceSplitFiles)
@@ -113,7 +122,7 @@ export async function readGgufFileInfo(pathOrUri: string, {
         return await readSingleFile(allSplitPartPaths[0]!);
 
     const [first, ...rest] = await Promise.all(
-        allSplitPartPaths.map((partPath) => readSingleFile(partPath))
+        allSplitPartPaths.map((partPath, index) => readSingleFile(partPath, index + 1))
     );
 
     if (first == null)
diff --git a/src/gguf/types/GgufFileInfoTypes.ts b/src/gguf/types/GgufFileInfoTypes.ts
index bda0fb02..0ff9a20c 100644
--- a/src/gguf/types/GgufFileInfoTypes.ts
+++ b/src/gguf/types/GgufFileInfoTypes.ts
@@ -96,5 +96,6 @@ export type GgufVersionParserResult = {
     metadata: GgufMetadata,
     tensorInfo?: GgufTensorInfo[],
     metadataSize: number,
-    tensorInfoSize?: number
+    tensorInfoSize?: number,
+    tensorDataOffset?: number
 };
diff --git a/src/gguf/types/GgufMetadataTypes.ts b/src/gguf/types/GgufMetadataTypes.ts
index a7f9e6b7..3fc686dd 100644
--- a/src/gguf/types/GgufMetadataTypes.ts
+++ b/src/gguf/types/GgufMetadataTypes.ts
@@ -142,7 +142,7 @@ export type GgufMetadataGeneral<A extends GgufArchitectureType = GgufArchitectur
      * writers may not write the alignment. If the alignment is not specified,
      * assume it is `32`.
      */
-    readonly alignment?: string,
+    readonly alignment?: number,
 
     /**
      * The name of the model. This should be a human-readable name that can be
diff --git a/src/gguf/types/GgufTensorInfoTypes.ts b/src/gguf/types/GgufTensorInfoTypes.ts
index d144935e..28ae45c3 100644
--- a/src/gguf/types/GgufTensorInfoTypes.ts
+++ b/src/gguf/types/GgufTensorInfoTypes.ts
@@ -2,7 +2,22 @@ export type GgufTensorInfo = {
     readonly name: string,
     readonly dimensions: readonly (number | bigint)[],
     readonly ggmlType: GgmlType,
-    readonly offset: number | bigint
+    readonly offset: number | bigint,
+
+    /**
+     * Adjusted offset relative to the file.
+     * 
+     * Added by the GGUF parser - not part of the file's metadata.
+     */
+    readonly fileOffset: number | bigint,
+
+    /**
+     * For spliced metadata of multiple file parts, this will be the file part number.
+     * Starts from `1`.
+     *
+     * Added by the GGUF parser - not part of the file's metadata.
+     */
+    readonly filePart: number
 };
 
 export const enum GgmlType {
diff --git a/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts b/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts
index 69b6128f..03d5942a 100644
--- a/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts
+++ b/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts
@@ -36,12 +36,14 @@ describe("functionary", () => {
                     }),
                     llamaVramPaddingSize: defaultLlamaVramPadding(llamaGpu === false ? 0 : totalVram),
                     llamaGpu,
-                    llamaSupportsGpuOffloading: llamaGpu !== false
+                    llamaSupportsGpuOffloading: llamaGpu !== false,
+                    useMmap: true
                 });
 
                 async function resolveAutoContextSize() {
                     const resolvedConfig = await ggufInsights.configurationResolver.resolveAndScoreConfig({
-                        targetGpuLayers: resolvedGpuLayers
+                        targetGpuLayers: resolvedGpuLayers,
+                        useMmap: true
                     }, {
                         llamaGpu,
                         getVramState: async () => ({
@@ -249,11 +251,11 @@ describe("functionary", () => {
                         const res = await resolveGpuLayers(16, {
                             totalVram: s1GB * 6,
                             freeVram: s1GB * 3,
-                            totalRam: s1GB * 3,
-                            freeRam: s1GB * 2
+                            totalRam: s1GB * 5,
+                            freeRam: s1GB * 4.5
                         });
                         expect(res.gpuLayers).to.eql(16);
-                        expect(res.contextSize).to.toMatchInlineSnapshot("1924");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("3202");
                     }
                     try {
                         await resolveGpuLayers(16, {
@@ -323,44 +325,44 @@ describe("functionary", () => {
                 test("some unified RAM", async () => {
                     {
                         const res = await resolveGpuLayers(16, {
-                            totalVram: s1GB * 6,
-                            freeVram: s1GB * 6,
-                            totalRam: s1GB * 6,
-                            freeRam: s1GB * 6,
-                            unifiedMemorySize: s1GB * 6
+                            totalVram: s1GB * 8,
+                            freeVram: s1GB * 8,
+                            totalRam: s1GB * 8,
+                            freeRam: s1GB * 8,
+                            unifiedMemorySize: s1GB * 8
                         });
                         expect(res.gpuLayers).to.eql(16);
-                        expect(res.contextSize).to.toMatchInlineSnapshot("7411");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                     {
                         const res = await resolveGpuLayers(16, {
-                            totalVram: s1GB * 6,
-                            freeVram: s1GB * 6,
-                            totalRam: s1GB * 6,
-                            freeRam: s1GB * 5,
-                            unifiedMemorySize: s1GB * 6
+                            totalVram: s1GB * 7,
+                            freeVram: s1GB * 7,
+                            totalRam: s1GB * 7,
+                            freeRam: s1GB * 5.5,
+                            unifiedMemorySize: s1GB * 7
                         });
                         expect(res.gpuLayers).to.eql(16);
-                        expect(res.contextSize).to.toMatchInlineSnapshot("2168");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("2086");
                     }
                     {
                         const res = await resolveGpuLayers(16, {
-                            totalVram: s1GB * 6,
-                            freeVram: s1GB * 6,
-                            totalRam: s1GB * 6,
-                            freeRam: s1GB * 5,
-                            unifiedMemorySize: s1GB * 5
+                            totalVram: s1GB * 6.4,
+                            freeVram: s1GB * 6.4,
+                            totalRam: s1GB * 6.4,
+                            freeRam: s1GB * 5.3,
+                            unifiedMemorySize: s1GB * 5.3
                         });
                         expect(res.gpuLayers).to.eql(16);
-                        expect(res.contextSize).to.toMatchInlineSnapshot("7411");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("6804");
                     }
                     try {
                         await resolveGpuLayers(16, {
-                            totalVram: s1GB * 6,
+                            totalVram: s1GB * 8,
                             freeVram: s1GB * 0,
                             totalRam: s1GB * 3,
                             freeRam: s1GB * 2,
-                            unifiedMemorySize: s1GB * 6
+                            unifiedMemorySize: s1GB * 8
                         });
                         expect.unreachable("Should have thrown an error");
                     } catch (err) {
@@ -368,11 +370,11 @@ describe("functionary", () => {
                     }
                     try {
                         await resolveGpuLayers(16, {
-                            totalVram: s1GB * 6,
+                            totalVram: s1GB * 8,
                             freeVram: s1GB * 0.2,
                             totalRam: s1GB * 3,
                             freeRam: s1GB * 2,
-                            unifiedMemorySize: s1GB * 6
+                            unifiedMemorySize: s1GB * 8
                         });
                         expect.unreachable("Should have thrown an error");
                     } catch (err) {
@@ -380,7 +382,7 @@ describe("functionary", () => {
                     }
                     {
                         const res = await resolveGpuLayers(16, {
-                            totalVram: s1GB * 6,
+                            totalVram: s1GB * 8,
 
                             // play with this number to make the test pass, it should be low enough so that there won't be any VRAM left
                             // to create a context
@@ -388,7 +390,7 @@ describe("functionary", () => {
 
                             totalRam: s1GB * 3,
                             freeRam: s1GB * 2,
-                            unifiedMemorySize: s1GB * 6,
+                            unifiedMemorySize: s1GB * 8,
 
                             ignoreMemorySafetyChecks: true
                         });
@@ -435,7 +437,7 @@ describe("functionary", () => {
                             freeSwap: s1GB * 3
                         });
                         expect(res.gpuLayers).to.eql(16);
-                        expect(res.contextSize).to.toMatchInlineSnapshot("1924");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("2048");
                     }
                     try {
                         await resolveGpuLayers(16, {
@@ -572,7 +574,7 @@ describe("functionary", () => {
                             freeRam: s1GB * 6
                         });
                         expect(res.gpuLayers).to.eql(32);
-                        expect(res.contextSize).to.toMatchInlineSnapshot("7562");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                     try {
                         await resolveGpuLayers(32, {
@@ -637,12 +639,13 @@ describe("functionary", () => {
             describe("attempts to resolve 33 gpuLayers", () => {
                 test("no RAM", async () => {
                     {
+                        // some RAM is always used to load the model (for the input layer)
                         const res = await resolveGpuLayers(33, {
-                            totalVram: s1GB * 6,
-                            freeVram: s1GB * 6
+                            totalVram: s1GB * 8,
+                            freeVram: s1GB * 8
                         });
                         expect(res.gpuLayers).to.eql(33);
-                        expect(res.contextSize).to.toMatchInlineSnapshot("7562");
+                        expect(res.contextSize).to.eql(null);
                     }
                     try {
                         await resolveGpuLayers(33, {
@@ -693,7 +696,7 @@ describe("functionary", () => {
                             freeRam: s1GB * 6
                         });
                         expect(res.gpuLayers).to.eql(33);
-                        expect(res.contextSize).to.toMatchInlineSnapshot("7562");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                     {
                         const res = await resolveGpuLayers(33, {
@@ -703,7 +706,7 @@ describe("functionary", () => {
                             freeRam: s1GB * 5
                         });
                         expect(res.gpuLayers).to.eql(33);
-                        expect(res.contextSize).to.toMatchInlineSnapshot("7562");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                     {
                         const res = await resolveGpuLayers(33, {
@@ -713,7 +716,7 @@ describe("functionary", () => {
                             freeRam: s1GB * 4
                         });
                         expect(res.gpuLayers).to.eql(33);
-                        expect(res.contextSize).to.toMatchInlineSnapshot("7562");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                     try {
                         await resolveGpuLayers(33, {
@@ -900,18 +903,22 @@ describe("functionary", () => {
                 {
                     const res = await resolveGpuLayers("max", {
                         totalVram: s1GB * 6,
-                        freeVram: s1GB * 4.7
+                        freeVram: s1GB * 4.4,
+                        totalRam: s1GB * 1,
+                        freeRam: s1GB * 1
                     });
                     expect(res.gpuLayers).to.eql(33);
-                    expect(res.contextSize).to.toMatchInlineSnapshot("607");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("502");
                 }
                 {
                     const res = await resolveGpuLayers("max", {
                         totalVram: s1GB * 6,
-                        freeVram: s1GB * 4.8
+                        freeVram: s1GB * 4.5,
+                        totalRam: s1GB * 1,
+                        freeRam: s1GB * 1
                     });
                     expect(res.gpuLayers).to.eql(33);
-                    expect(res.contextSize).to.toMatchInlineSnapshot("1142");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("1010");
                 }
             });
 
@@ -944,8 +951,8 @@ describe("functionary", () => {
                             totalRam: s1GB * 8,
                             freeRam: s1GB * 8
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("1");
-                        expect(res.contextSize).to.toMatchInlineSnapshot("5192");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("4");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("3606");
                     }
                     {
                         const res = await resolveGpuLayers("auto", {
@@ -955,7 +962,7 @@ describe("functionary", () => {
                             freeRam: s1GB * 8
                         });
                         expect(res.gpuLayers).to.toMatchInlineSnapshot("4");
-                        expect(res.contextSize).to.toMatchInlineSnapshot("5164");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                     {
                         const res = await resolveGpuLayers("auto", {
@@ -964,8 +971,8 @@ describe("functionary", () => {
                             totalRam: s1GB * 8,
                             freeRam: s1GB * 8
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("6");
-                        expect(res.contextSize).to.toMatchInlineSnapshot("8192");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("10");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("5856");
                     }
                     {
                         const res = await resolveGpuLayers("auto", {
@@ -974,7 +981,7 @@ describe("functionary", () => {
                             totalRam: s1GB * 8,
                             freeRam: s1GB * 8
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("11");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("12");
                         expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                     {
@@ -984,7 +991,7 @@ describe("functionary", () => {
                             totalRam: s1GB * 8,
                             freeRam: s1GB * 8
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("12");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("13");
                         expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                     {
@@ -994,7 +1001,7 @@ describe("functionary", () => {
                             totalRam: s1GB * 8,
                             freeRam: s1GB * 8
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("14");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("15");
                         expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                     {
@@ -1004,7 +1011,7 @@ describe("functionary", () => {
                             totalRam: s1GB * 8,
                             freeRam: s1GB * 8
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("16");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("17");
                         expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                     {
@@ -1014,7 +1021,7 @@ describe("functionary", () => {
                             totalRam: s1GB * 8,
                             freeRam: s1GB * 8
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("17");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("18");
                         expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                     {
@@ -1024,7 +1031,7 @@ describe("functionary", () => {
                             totalRam: s1GB * 8,
                             freeRam: s1GB * 8
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("19");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("20");
                         expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                     {
@@ -1034,8 +1041,8 @@ describe("functionary", () => {
                             totalRam: s1GB * 8,
                             freeRam: s1GB * 8
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("21");
-                        expect(res.contextSize).to.toMatchInlineSnapshot("8076");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("23");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("7977");
                     }
                     {
                         const res = await resolveGpuLayers("auto", {
@@ -1044,8 +1051,8 @@ describe("functionary", () => {
                             totalRam: s1GB * 8,
                             freeRam: s1GB * 8
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("23");
-                        expect(res.contextSize).to.toMatchInlineSnapshot("8140");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("25");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("8043");
                     }
                     {
                         const res = await resolveGpuLayers("auto", {
@@ -1055,7 +1062,7 @@ describe("functionary", () => {
                             freeRam: s1GB * 8
                         });
                         expect(res.gpuLayers).to.toMatchInlineSnapshot("33");
-                        expect(res.contextSize).to.toMatchInlineSnapshot("3282");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("4754");
                     }
                     {
                         const res = await resolveGpuLayers("auto", {
@@ -1065,7 +1072,7 @@ describe("functionary", () => {
                             freeRam: s1GB * 8
                         });
                         expect(res.gpuLayers).to.toMatchInlineSnapshot("33");
-                        expect(res.contextSize).to.toMatchInlineSnapshot("6492");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("7964");
                     }
                     {
                         const res = await resolveGpuLayers("auto", {
@@ -1075,7 +1082,7 @@ describe("functionary", () => {
                             freeRam: s1GB * 8
                         });
                         expect(res.gpuLayers).to.toMatchInlineSnapshot("33");
-                        expect(res.contextSize).to.toMatchInlineSnapshot("7562");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                 });
 
@@ -1107,8 +1114,8 @@ describe("functionary", () => {
                             totalRam: s1GB * 5,
                             freeRam: s1GB * 5
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("1");
-                        expect(res.contextSize).to.toMatchInlineSnapshot("5192");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("4");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("3606");
                     }
                     {
                         const res = await resolveGpuLayers("auto", {
@@ -1118,7 +1125,7 @@ describe("functionary", () => {
                             freeRam: s1GB * 5
                         });
                         expect(res.gpuLayers).to.toMatchInlineSnapshot("4");
-                        expect(res.contextSize).to.toMatchInlineSnapshot("5164");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                     {
                         const res = await resolveGpuLayers("auto", {
@@ -1127,8 +1134,8 @@ describe("functionary", () => {
                             totalRam: s1GB * 5,
                             freeRam: s1GB * 5
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("6");
-                        expect(res.contextSize).to.toMatchInlineSnapshot("8192");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("10");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("5856");
                     }
                     {
                         const res = await resolveGpuLayers("auto", {
@@ -1137,7 +1144,7 @@ describe("functionary", () => {
                             totalRam: s1GB * 5,
                             freeRam: s1GB * 5
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("11");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("12");
                         expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                     {
@@ -1147,7 +1154,7 @@ describe("functionary", () => {
                             totalRam: s1GB * 5,
                             freeRam: s1GB * 5
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("12");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("13");
                         expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                     {
@@ -1157,7 +1164,7 @@ describe("functionary", () => {
                             totalRam: s1GB * 5,
                             freeRam: s1GB * 5
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("14");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("15");
                         expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                     {
@@ -1167,7 +1174,7 @@ describe("functionary", () => {
                             totalRam: s1GB * 5,
                             freeRam: s1GB * 5
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("16");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("17");
                         expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                     {
@@ -1177,7 +1184,7 @@ describe("functionary", () => {
                             totalRam: s1GB * 5,
                             freeRam: s1GB * 5
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("17");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("18");
                         expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                     {
@@ -1187,7 +1194,7 @@ describe("functionary", () => {
                             totalRam: s1GB * 5,
                             freeRam: s1GB * 5
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("19");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("20");
                         expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                     {
@@ -1197,8 +1204,8 @@ describe("functionary", () => {
                             totalRam: s1GB * 5,
                             freeRam: s1GB * 5
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("21");
-                        expect(res.contextSize).to.toMatchInlineSnapshot("8076");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("23");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("7977");
                     }
                     {
                         const res = await resolveGpuLayers("auto", {
@@ -1207,8 +1214,8 @@ describe("functionary", () => {
                             totalRam: s1GB * 5,
                             freeRam: s1GB * 5
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("23");
-                        expect(res.contextSize).to.toMatchInlineSnapshot("8140");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("25");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("8043");
                     }
                     {
                         const res = await resolveGpuLayers("auto", {
@@ -1218,7 +1225,7 @@ describe("functionary", () => {
                             freeRam: s1GB * 5
                         });
                         expect(res.gpuLayers).to.toMatchInlineSnapshot("33");
-                        expect(res.contextSize).to.toMatchInlineSnapshot("3282");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("4754");
                     }
                     {
                         const res = await resolveGpuLayers("auto", {
@@ -1228,7 +1235,7 @@ describe("functionary", () => {
                             freeRam: s1GB * 5
                         });
                         expect(res.gpuLayers).to.toMatchInlineSnapshot("33");
-                        expect(res.contextSize).to.toMatchInlineSnapshot("6492");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("7964");
                     }
                     {
                         const res = await resolveGpuLayers("auto", {
@@ -1238,7 +1245,7 @@ describe("functionary", () => {
                             freeRam: s1GB * 5
                         });
                         expect(res.gpuLayers).to.toMatchInlineSnapshot("33");
-                        expect(res.contextSize).to.toMatchInlineSnapshot("7562");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                 });
             });
@@ -1317,7 +1324,7 @@ describe("functionary", () => {
                             freeRam: s1GB * 8
                         });
                         expect(res.gpuLayers).to.be.gte(16);
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("17");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("18");
                         expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                     {
@@ -1329,7 +1336,7 @@ describe("functionary", () => {
                         });
                         expect(res.gpuLayers).to.be.gte(16);
                         expect(res.gpuLayers).to.be.lte(24);
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("17");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("18");
                         expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                     {
@@ -1342,7 +1349,7 @@ describe("functionary", () => {
                         expect(res.gpuLayers).to.be.gte(16);
                         expect(res.gpuLayers).to.be.lte(24);
                         expect(res.gpuLayers).to.toMatchInlineSnapshot("16");
-                        expect(res.contextSize).to.toMatchInlineSnapshot("1924");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("3202");
                     }
                 });
 
@@ -1419,7 +1426,7 @@ describe("functionary", () => {
                             freeRam: s1GB * 5
                         });
                         expect(res.gpuLayers).to.be.gte(16);
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("17");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("18");
                         expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                     {
@@ -1431,7 +1438,7 @@ describe("functionary", () => {
                         });
                         expect(res.gpuLayers).to.be.gte(16);
                         expect(res.gpuLayers).to.be.lte(24);
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("17");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("18");
                         expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                     }
                     {
@@ -1444,7 +1451,7 @@ describe("functionary", () => {
                         expect(res.gpuLayers).to.be.gte(16);
                         expect(res.gpuLayers).to.be.lte(24);
                         expect(res.gpuLayers).to.toMatchInlineSnapshot("16");
-                        expect(res.contextSize).to.toMatchInlineSnapshot("1924");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("3202");
                     }
                 });
             });
@@ -1472,8 +1479,8 @@ describe("functionary", () => {
                             totalRam: s1GB * 8,
                             freeRam: s1GB * 8
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("20");
-                        expect(res.contextSize).to.toMatchInlineSnapshot("5561");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("21");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("5737");
                         expect(res.contextSize).to.be.gte(contextSize);
                     }
                     {
@@ -1484,8 +1491,8 @@ describe("functionary", () => {
                             totalRam: s1GB * 8,
                             freeRam: s1GB * 8
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("4");
-                        expect(res.contextSize).to.toMatchInlineSnapshot("5164");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("6");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("5246");
                         expect(res.contextSize).to.be.gte(contextSize);
                     }
                     {
@@ -1496,7 +1503,7 @@ describe("functionary", () => {
                             totalRam: s1GB * 8,
                             freeRam: s1GB * 8
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("16");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("17");
                         expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                         expect(res.contextSize).to.be.gte(contextSize);
                     }
@@ -1508,7 +1515,7 @@ describe("functionary", () => {
                             totalRam: s1GB * 8,
                             freeRam: s1GB * 8
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("2");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("4");
                         expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                         expect(res.contextSize).to.be.gte(contextSize);
                     }
@@ -1561,8 +1568,8 @@ describe("functionary", () => {
                             totalRam: s1GB * 7,
                             freeRam: s1GB * 7
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("19");
-                        expect(res.contextSize).to.toMatchInlineSnapshot("6548");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("21");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("5737");
                         expect(res.contextSize).to.be.gte(contextSize);
                     }
                     {
@@ -1573,8 +1580,8 @@ describe("functionary", () => {
                             totalRam: s1GB * 7,
                             freeRam: s1GB * 7
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("4");
-                        expect(res.contextSize).to.toMatchInlineSnapshot("5164");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("6");
+                        expect(res.contextSize).to.toMatchInlineSnapshot("5246");
                         expect(res.contextSize).to.be.gte(contextSize);
                     }
                     {
@@ -1585,7 +1592,7 @@ describe("functionary", () => {
                             totalRam: s1GB * 7,
                             freeRam: s1GB * 7
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("16");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("17");
                         expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                         expect(res.contextSize).to.be.gte(contextSize);
                     }
@@ -1597,7 +1604,7 @@ describe("functionary", () => {
                             totalRam: s1GB * 7,
                             freeRam: s1GB * 7
                         });
-                        expect(res.gpuLayers).to.toMatchInlineSnapshot("2");
+                        expect(res.gpuLayers).to.toMatchInlineSnapshot("4");
                         expect(res.contextSize).to.toMatchInlineSnapshot("8192");
                         expect(res.contextSize).to.be.gte(contextSize);
                     }
diff --git a/test/modelDependent/functionary/gguf/__snapshots__/ggufParser.test.ts.snap b/test/modelDependent/functionary/gguf/__snapshots__/ggufParser.test.ts.snap
index f21a39e4..51500140 100644
--- a/test/modelDependent/functionary/gguf/__snapshots__/ggufParser.test.ts.snap
+++ b/test/modelDependent/functionary/gguf/__snapshots__/ggufParser.test.ts.snap
@@ -24,6 +24,8 @@ exports[`gguf > parser > should fetch GGUF metadata 1`] = `
         4096,
         128256,
       ],
+      "fileOffset": 7836512,
+      "filePart": 1,
       "ggmlType": 2,
       "name": "token_embd.weight",
       "offset": 0,
@@ -32,6 +34,8 @@ exports[`gguf > parser > should fetch GGUF metadata 1`] = `
       "dimensions": [
         4096,
       ],
+      "fileOffset": 303338336,
+      "filePart": 1,
       "ggmlType": 0,
       "name": "blk.0.attn_norm.weight",
       "offset": 295501824,
@@ -41,6 +45,8 @@ exports[`gguf > parser > should fetch GGUF metadata 1`] = `
         14336,
         4096,
       ],
+      "fileOffset": 303354720,
+      "filePart": 1,
       "ggmlType": 2,
       "name": "blk.0.ffn_down.weight",
       "offset": 295518208,
@@ -50,6 +56,8 @@ exports[`gguf > parser > should fetch GGUF metadata 1`] = `
         4096,
         14336,
       ],
+      "fileOffset": 336384864,
+      "filePart": 1,
       "ggmlType": 2,
       "name": "blk.0.ffn_gate.weight",
       "offset": 328548352,
@@ -156,6 +164,8 @@ exports[`gguf > parser > should fetch GGUF metadata 1`] = `
         4096,
         128256,
       ],
+      "fileOffset": 7836512,
+      "filePart": 1,
       "ggmlType": 2,
       "name": "token_embd.weight",
       "offset": 0,
@@ -164,6 +174,8 @@ exports[`gguf > parser > should fetch GGUF metadata 1`] = `
       "dimensions": [
         4096,
       ],
+      "fileOffset": 303338336,
+      "filePart": 1,
       "ggmlType": 0,
       "name": "blk.0.attn_norm.weight",
       "offset": 295501824,
@@ -173,6 +185,8 @@ exports[`gguf > parser > should fetch GGUF metadata 1`] = `
         14336,
         4096,
       ],
+      "fileOffset": 303354720,
+      "filePart": 1,
       "ggmlType": 2,
       "name": "blk.0.ffn_down.weight",
       "offset": 295518208,
@@ -182,6 +196,8 @@ exports[`gguf > parser > should fetch GGUF metadata 1`] = `
         4096,
         14336,
       ],
+      "fileOffset": 336384864,
+      "filePart": 1,
       "ggmlType": 2,
       "name": "blk.0.ffn_gate.weight",
       "offset": 328548352,
@@ -219,6 +235,8 @@ exports[`gguf > parser > should parse local gguf model 1`] = `
         4096,
         128256,
       ],
+      "fileOffset": 7836512,
+      "filePart": 1,
       "ggmlType": 2,
       "name": "token_embd.weight",
       "offset": 0,
@@ -227,6 +245,8 @@ exports[`gguf > parser > should parse local gguf model 1`] = `
       "dimensions": [
         4096,
       ],
+      "fileOffset": 303338336,
+      "filePart": 1,
       "ggmlType": 0,
       "name": "blk.0.attn_norm.weight",
       "offset": 295501824,
@@ -236,6 +256,8 @@ exports[`gguf > parser > should parse local gguf model 1`] = `
         14336,
         4096,
       ],
+      "fileOffset": 303354720,
+      "filePart": 1,
       "ggmlType": 2,
       "name": "blk.0.ffn_down.weight",
       "offset": 295518208,
@@ -245,6 +267,8 @@ exports[`gguf > parser > should parse local gguf model 1`] = `
         4096,
         14336,
       ],
+      "fileOffset": 336384864,
+      "filePart": 1,
       "ggmlType": 2,
       "name": "blk.0.ffn_gate.weight",
       "offset": 328548352,
@@ -351,6 +375,8 @@ exports[`gguf > parser > should parse local gguf model 1`] = `
         4096,
         128256,
       ],
+      "fileOffset": 7836512,
+      "filePart": 1,
       "ggmlType": 2,
       "name": "token_embd.weight",
       "offset": 0,
@@ -359,6 +385,8 @@ exports[`gguf > parser > should parse local gguf model 1`] = `
       "dimensions": [
         4096,
       ],
+      "fileOffset": 303338336,
+      "filePart": 1,
       "ggmlType": 0,
       "name": "blk.0.attn_norm.weight",
       "offset": 295501824,
@@ -368,6 +396,8 @@ exports[`gguf > parser > should parse local gguf model 1`] = `
         14336,
         4096,
       ],
+      "fileOffset": 303354720,
+      "filePart": 1,
       "ggmlType": 2,
       "name": "blk.0.ffn_down.weight",
       "offset": 295518208,
@@ -377,6 +407,8 @@ exports[`gguf > parser > should parse local gguf model 1`] = `
         4096,
         14336,
       ],
+      "fileOffset": 336384864,
+      "filePart": 1,
       "ggmlType": 2,
       "name": "blk.0.ffn_gate.weight",
       "offset": 328548352,
diff --git a/test/modelDependent/functionary/gguf/ggufInsights.test.ts b/test/modelDependent/functionary/gguf/ggufInsights.test.ts
index c79a18d2..6ce0ed4f 100644
--- a/test/modelDependent/functionary/gguf/ggufInsights.test.ts
+++ b/test/modelDependent/functionary/gguf/ggufInsights.test.ts
@@ -38,38 +38,38 @@ describe("gguf", async () => {
             `);
             expect(makeEstimationReadable(ggufInsights.estimateModelResourceRequirements({gpuLayers: 1}))).toMatchInlineSnapshot(`
               {
-                "cpuRam": "3.54GB",
-                "gpuVram": "809.84MB",
+                "cpuRam": "4.22GB",
+                "gpuVram": "528.01MB",
               }
             `);
             expect(makeEstimationReadable(ggufInsights.estimateModelResourceRequirements({gpuLayers: 8}))).toMatchInlineSnapshot(`
               {
-                "cpuRam": "2.74GB",
-                "gpuVram": "1.59GB",
+                "cpuRam": "3.42GB",
+                "gpuVram": "1.32GB",
               }
             `);
             expect(makeEstimationReadable(ggufInsights.estimateModelResourceRequirements({gpuLayers: 16}))).toMatchInlineSnapshot(`
               {
-                "cpuRam": "1.83GB",
-                "gpuVram": "2.51GB",
+                "cpuRam": "2.51GB",
+                "gpuVram": "2.34GB",
               }
             `);
             expect(makeEstimationReadable(ggufInsights.estimateModelResourceRequirements({gpuLayers: 24}))).toMatchInlineSnapshot(`
               {
-                "cpuRam": "936.25MB",
-                "gpuVram": "3.42GB",
+                "cpuRam": "1.59GB",
+                "gpuVram": "3.14GB",
               }
             `);
             expect(makeEstimationReadable(ggufInsights.estimateModelResourceRequirements({gpuLayers: 32}))).toMatchInlineSnapshot(`
               {
-                "cpuRam": "0B",
-                "gpuVram": "4.33GB",
+                "cpuRam": "692.8MB",
+                "gpuVram": "4.06GB",
               }
             `);
             expect(makeEstimationReadable(ggufInsights.estimateModelResourceRequirements({gpuLayers: 33}))).toMatchInlineSnapshot(`
               {
-                "cpuRam": "0B",
-                "gpuVram": "4.33GB",
+                "cpuRam": "281.81MB",
+                "gpuVram": "4.06GB",
               }
             `);
         });
@@ -95,11 +95,14 @@ describe("gguf", async () => {
             const s300MB = 300 * Math.pow(1024, 2);
             const s5MB = 5 * Math.pow(1024, 2);
 
-            const estimatedModelVramUsage = ggufInsights.estimateModelResourceRequirements({gpuLayers: ggufInsights.totalLayers}).gpuVram;
-            expect(bytes(estimatedModelVramUsage)).toMatchInlineSnapshot('"4.33GB"');
-            expect(Math.abs(modelVramUsageDiff - estimatedModelVramUsage)).to.be.lte(s300MB);
+            const estimatedModelResourceUsage = ggufInsights.estimateModelResourceRequirements({
+                gpuLayers: ggufInsights.totalLayers
+            });
+            expect(bytes(estimatedModelResourceUsage.gpuVram)).toMatchInlineSnapshot('"4.06GB"');
+            expect(bytes(estimatedModelResourceUsage.cpuRam)).toMatchInlineSnapshot('"281.81MB"');
+            expect(Math.abs(modelVramUsageDiff - estimatedModelResourceUsage.gpuVram)).to.be.lte(s300MB);
 
-            const modelEstimationDiffWithActual = estimatedModelVramUsage - model.size;
+            const modelEstimationDiffWithActual = estimatedModelResourceUsage.gpuVram + estimatedModelResourceUsage.cpuRam - model.size;
             expect(Math.abs(modelEstimationDiffWithActual)).to.be.lte(s5MB); // tolerate such a small difference
 
             if (modelEstimationDiffWithActual !== 0)
diff --git a/test/modelDependent/llama3.1/tokenPredictor.test.ts b/test/modelDependent/llama3.1/tokenPredictor.test.ts
index d20d072f..e1854d9d 100644
--- a/test/modelDependent/llama3.1/tokenPredictor.test.ts
+++ b/test/modelDependent/llama3.1/tokenPredictor.test.ts
@@ -213,17 +213,36 @@ describe("llama 3.1", () => {
                 const sequence = context.getSequence({
                     tokenPredictor: predictor
                 });
+
+                // // script to find the right maxTokens value for this test
+                // {
+                //     for (let maxTokens = 0; maxTokens < 80; maxTokens++) {
+                //         const chatSession = new LlamaChatSession({
+                //             contextSequence: sequence
+                //         });
+                //
+                //         await chatSession.prompt("Summarize this text:\n\n" + exampleParagraph, {
+                //             maxTokens
+                //         });
+                //         const actualContextTokensLength = sequence._contextTokens.length;
+                //         const exposedContextTokensLength = sequence.contextTokens.length;
+                //
+                //         if (actualContextTokensLength !== exposedContextTokensLength)
+                //             console.log("max tokens with validated predictions:", maxTokens);
+                //     }
+                // }
+
                 const chatSession = new LlamaChatSession({
                     contextSequence: sequence
                 });
 
                 await chatSession.prompt("Summarize this text:\n\n" + exampleParagraph, {
-                    maxTokens: 80
+                    maxTokens: 23
                 });
 
-                expect(sequence.tokenPredictions.validated).toMatchInlineSnapshot("8");
-                expect(sequence.tokenPredictions.refuted).toMatchInlineSnapshot("47");
-                expect(sequence.tokenPredictions.used).toMatchInlineSnapshot("7");
+                expect(sequence.tokenPredictions.validated).toMatchInlineSnapshot("2");
+                expect(sequence.tokenPredictions.refuted).toMatchInlineSnapshot("8");
+                expect(sequence.tokenPredictions.used).toMatchInlineSnapshot("1");
                 expect(sequence.tokenPredictions.unused).toMatchInlineSnapshot("1");
 
                 const exposedNextTokenIndex = sequence.nextTokenIndex;
@@ -232,9 +251,9 @@ describe("llama 3.1", () => {
                     const actualContextTokensLength = sequence._contextTokens.length;
                     const exposedContextTokensLength = sequence.contextTokens.length;
 
-                    expect(exposedContextTokensLength).toMatchInlineSnapshot("598");
-                    expect(actualContextTokensLength).toMatchInlineSnapshot("599");
-                    expect(exposedNextTokenIndex).toMatchInlineSnapshot("598");
+                    expect(exposedContextTokensLength).toMatchInlineSnapshot("541");
+                    expect(actualContextTokensLength).toMatchInlineSnapshot("542");
+                    expect(exposedNextTokenIndex).toMatchInlineSnapshot("541");
                     expect(exposedContextTokensLength).to.not.be.eql(actualContextTokensLength);
                 }
 
@@ -252,7 +271,7 @@ describe("llama 3.1", () => {
 
                 expect(addedTokens).toMatchInlineSnapshot(`
                   [
-                    10318,
+                    315,
                   ]
                 `);
 
@@ -262,9 +281,9 @@ describe("llama 3.1", () => {
                     const actualContextTokensLength = sequence._contextTokens.length;
                     const exposedContextTokensLength = sequence.contextTokens.length;
 
-                    expect(exposedContextTokensLength).toMatchInlineSnapshot("598");
-                    expect(actualContextTokensLength).toMatchInlineSnapshot("598");
-                    expect(exposedNextTokenIndex).toMatchInlineSnapshot("598");
+                    expect(exposedContextTokensLength).toMatchInlineSnapshot("541");
+                    expect(actualContextTokensLength).toMatchInlineSnapshot("541");
+                    expect(exposedNextTokenIndex).toMatchInlineSnapshot("541");
                     expect(exposedNextTokenIndex).to.be.eql(sequence.nextTokenIndex);
                     expect(exposedContextTokensLength).to.be.eql(actualContextTokensLength);
                     expect(sequence.contextTokens.at(-1)).to.not.be.eql(exampleToken);
diff --git a/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts b/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts
index 23947b5b..43145a6d 100644
--- a/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts
+++ b/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts
@@ -36,12 +36,14 @@ describe("stableCode", () => {
                     }),
                     llamaVramPaddingSize: defaultLlamaVramPadding(llamaGpu === false ? 0 : totalVram),
                     llamaGpu,
-                    llamaSupportsGpuOffloading: llamaGpu !== false
+                    llamaSupportsGpuOffloading: llamaGpu !== false,
+                    useMmap: true
                 });
 
                 async function resolveAutoContextSize() {
                     const resolvedConfig = await ggufInsights.configurationResolver.resolveAndScoreConfig({
-                        targetGpuLayers: resolvedGpuLayers
+                        targetGpuLayers: resolvedGpuLayers,
+                        useMmap: true
                     }, {
                         llamaGpu,
                         getVramState: async () => ({
@@ -109,7 +111,7 @@ describe("stableCode", () => {
                         freeVram: s1GB * 3
                     });
                     expect(res.gpuLayers).to.eql(16);
-                    expect(res.contextSize).to.toMatchInlineSnapshot("8687");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("7177");
                 }
                 try {
                     await resolveGpuLayers(16, {
@@ -172,7 +174,7 @@ describe("stableCode", () => {
                         freeVram: s1GB * 6
                     });
                     expect(res.gpuLayers).to.eql(32);
-                    expect(res.contextSize).to.toMatchInlineSnapshot("10905");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("11125");
                 }
                 try {
                     await resolveGpuLayers(32, {
@@ -221,7 +223,7 @@ describe("stableCode", () => {
                         freeVram: s1GB * 6
                     });
                     expect(res.gpuLayers).to.eql(33);
-                    expect(res.contextSize).to.toMatchInlineSnapshot("10905");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("11125");
                 }
                 try {
                     await resolveGpuLayers(33, {
@@ -309,7 +311,7 @@ describe("stableCode", () => {
                         freeVram: s1GB * 4
                     });
                     expect(res.gpuLayers).to.eql(33);
-                    expect(res.contextSize).to.toMatchInlineSnapshot("5583");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("5802");
                 }
                 {
                     const res = await resolveGpuLayers("max", {
@@ -317,7 +319,7 @@ describe("stableCode", () => {
                         freeVram: s1GB * 4.4
                     });
                     expect(res.gpuLayers).to.eql(33);
-                    expect(res.contextSize).to.toMatchInlineSnapshot("6647");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("6866");
                 }
                 {
                     const res = await resolveGpuLayers("max", {
@@ -325,7 +327,7 @@ describe("stableCode", () => {
                         freeVram: s1GB * 4.8
                     });
                     expect(res.gpuLayers).to.eql(33);
-                    expect(res.contextSize).to.toMatchInlineSnapshot("7712");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("7931");
                 }
             });
 
@@ -351,16 +353,16 @@ describe("stableCode", () => {
                         totalVram: s1GB * 6,
                         freeVram: s1GB * 0.8
                     });
-                    expect(res.gpuLayers).to.toMatchInlineSnapshot("4");
-                    expect(res.contextSize).to.toMatchInlineSnapshot("3287");
+                    expect(res.gpuLayers).to.toMatchInlineSnapshot("1");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("8724");
                 }
                 {
                     const res = await resolveGpuLayers("auto", {
                         totalVram: s1GB * 6,
                         freeVram: s1GB * 1.4
                     });
-                    expect(res.gpuLayers).to.toMatchInlineSnapshot("9");
-                    expect(res.contextSize).to.toMatchInlineSnapshot("4478");
+                    expect(res.gpuLayers).to.toMatchInlineSnapshot("4");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("6203");
                 }
                 {
                     const res = await resolveGpuLayers("auto", {
@@ -368,7 +370,7 @@ describe("stableCode", () => {
                         freeVram: s1GB * 2.4
                     });
                     expect(res.gpuLayers).to.toMatchInlineSnapshot("33");
-                    expect(res.contextSize).to.toMatchInlineSnapshot("1325");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("1544");
                 }
                 {
                     const res = await resolveGpuLayers("auto", {
@@ -376,7 +378,7 @@ describe("stableCode", () => {
                         freeVram: s1GB * 3.1
                     });
                     expect(res.gpuLayers).to.toMatchInlineSnapshot("33");
-                    expect(res.contextSize).to.toMatchInlineSnapshot("3187");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("3407");
                 }
                 {
                     const res = await resolveGpuLayers("auto", {
@@ -384,7 +386,7 @@ describe("stableCode", () => {
                         freeVram: s1GB * 3.3
                     });
                     expect(res.gpuLayers).to.toMatchInlineSnapshot("33");
-                    expect(res.contextSize).to.toMatchInlineSnapshot("3720");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("3939");
                 }
                 {
                     const res = await resolveGpuLayers("auto", {
@@ -392,7 +394,7 @@ describe("stableCode", () => {
                         freeVram: s1GB * 3.5
                     });
                     expect(res.gpuLayers).to.toMatchInlineSnapshot("33");
-                    expect(res.contextSize).to.toMatchInlineSnapshot("4252");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("4471");
                 }
                 {
                     const res = await resolveGpuLayers("auto", {
@@ -400,7 +402,7 @@ describe("stableCode", () => {
                         freeVram: s1GB * 3.8
                     });
                     expect(res.gpuLayers).to.toMatchInlineSnapshot("33");
-                    expect(res.contextSize).to.toMatchInlineSnapshot("5050");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("5270");
                 }
                 {
                     const res = await resolveGpuLayers("auto", {
@@ -408,7 +410,7 @@ describe("stableCode", () => {
                         freeVram: s1GB * 4
                     });
                     expect(res.gpuLayers).to.toMatchInlineSnapshot("33");
-                    expect(res.contextSize).to.toMatchInlineSnapshot("5583");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("5802");
                 }
                 {
                     const res = await resolveGpuLayers("auto", {
@@ -416,7 +418,7 @@ describe("stableCode", () => {
                         freeVram: s1GB * 4.3
                     });
                     expect(res.gpuLayers).to.toMatchInlineSnapshot("33");
-                    expect(res.contextSize).to.toMatchInlineSnapshot("6381");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("6600");
                 }
                 {
                     const res = await resolveGpuLayers("auto", {
@@ -424,7 +426,7 @@ describe("stableCode", () => {
                         freeVram: s1GB * 4.5
                     });
                     expect(res.gpuLayers).to.toMatchInlineSnapshot("33");
-                    expect(res.contextSize).to.toMatchInlineSnapshot("6913");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("7133");
                 }
                 {
                     const res = await resolveGpuLayers("auto", {
@@ -432,7 +434,7 @@ describe("stableCode", () => {
                         freeVram: s1GB * 4.8
                     });
                     expect(res.gpuLayers).to.toMatchInlineSnapshot("33");
-                    expect(res.contextSize).to.toMatchInlineSnapshot("7712");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("7931");
                 }
                 {
                     const res = await resolveGpuLayers("auto", {
@@ -440,7 +442,7 @@ describe("stableCode", () => {
                         freeVram: s1GB * 5.2
                     });
                     expect(res.gpuLayers).to.toMatchInlineSnapshot("33");
-                    expect(res.contextSize).to.toMatchInlineSnapshot("8776");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("8995");
                 }
                 {
                     const res = await resolveGpuLayers("auto", {
@@ -448,7 +450,7 @@ describe("stableCode", () => {
                         freeVram: s1GB * 5.8
                     });
                     expect(res.gpuLayers).to.toMatchInlineSnapshot("33");
-                    expect(res.contextSize).to.toMatchInlineSnapshot("10373");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("10592");
                 }
                 {
                     const res = await resolveGpuLayers("auto", {
@@ -456,7 +458,7 @@ describe("stableCode", () => {
                         freeVram: s1GB * 6
                     });
                     expect(res.gpuLayers).to.toMatchInlineSnapshot("33");
-                    expect(res.contextSize).to.toMatchInlineSnapshot("10905");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("11125");
                 }
             });
 
@@ -502,7 +504,7 @@ describe("stableCode", () => {
                         freeVram: s1GB * 4
                     });
                     expect(res.gpuLayers).to.eql(16);
-                    expect(res.contextSize).to.toMatchInlineSnapshot("13167");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("11658");
                 }
                 try {
                     await resolveGpuLayers({min: 16}, {
@@ -520,7 +522,7 @@ describe("stableCode", () => {
                     });
                     expect(res.gpuLayers).to.be.gte(16);
                     expect(res.gpuLayers).to.toMatchInlineSnapshot("33");
-                    expect(res.contextSize).to.toMatchInlineSnapshot("5583");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("5802");
                 }
                 {
                     const res = await resolveGpuLayers({min: 16, max: 24}, {
@@ -529,8 +531,8 @@ describe("stableCode", () => {
                     });
                     expect(res.gpuLayers).to.be.gte(16);
                     expect(res.gpuLayers).to.be.lte(24);
-                    expect(res.gpuLayers).to.toMatchInlineSnapshot("24");
-                    expect(res.contextSize).to.toMatchInlineSnapshot("8405");
+                    expect(res.gpuLayers).to.toMatchInlineSnapshot("22");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("8160");
                 }
                 {
                     const res = await resolveGpuLayers({min: 16, max: 24}, {
@@ -539,8 +541,8 @@ describe("stableCode", () => {
                     });
                     expect(res.gpuLayers).to.be.gte(16);
                     expect(res.gpuLayers).to.be.lte(24);
-                    expect(res.gpuLayers).to.toMatchInlineSnapshot("17");
-                    expect(res.contextSize).to.toMatchInlineSnapshot("8112");
+                    expect(res.gpuLayers).to.toMatchInlineSnapshot("16");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("7177");
                 }
             });
 
@@ -563,7 +565,7 @@ describe("stableCode", () => {
                         freeVram: s1GB * 4
                     });
                     expect(res.gpuLayers).to.toMatchInlineSnapshot("33");
-                    expect(res.contextSize).to.toMatchInlineSnapshot("5583");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("5802");
                     expect(res.contextSize).to.be.gte(contextSize);
                 }
                 {
@@ -572,8 +574,8 @@ describe("stableCode", () => {
                         totalVram: s1GB * 2,
                         freeVram: s1GB * 1
                     });
-                    expect(res.gpuLayers).to.toMatchInlineSnapshot("4");
-                    expect(res.contextSize).to.toMatchInlineSnapshot("5127");
+                    expect(res.gpuLayers).to.toMatchInlineSnapshot("2");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("9426");
                     expect(res.contextSize).to.be.gte(contextSize);
                 }
                 {
@@ -582,8 +584,8 @@ describe("stableCode", () => {
                         totalVram: s1GB * 6,
                         freeVram: s1GB * 4
                     });
-                    expect(res.gpuLayers).to.toMatchInlineSnapshot("23");
-                    expect(res.contextSize).to.toMatchInlineSnapshot("8867");
+                    expect(res.gpuLayers).to.toMatchInlineSnapshot("20");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("9167");
                     expect(res.contextSize).to.be.gte(contextSize);
                 }
                 {
@@ -592,8 +594,8 @@ describe("stableCode", () => {
                         totalVram: s1GB * 1,
                         freeVram: s1GB * 1
                     });
-                    expect(res.gpuLayers).to.toMatchInlineSnapshot("1");
-                    expect(res.contextSize).to.toMatchInlineSnapshot("8962");
+                    expect(res.gpuLayers).to.toMatchInlineSnapshot("2");
+                    expect(res.contextSize).to.toMatchInlineSnapshot("9426");
                     expect(res.contextSize).to.be.gte(contextSize);
                 }
                 {
diff --git a/test/standalone/gguf/__snapshots__/ggufStandaloneParser.test.ts.snap b/test/standalone/gguf/__snapshots__/ggufStandaloneParser.test.ts.snap
index ba18991a..062de497 100644
--- a/test/standalone/gguf/__snapshots__/ggufStandaloneParser.test.ts.snap
+++ b/test/standalone/gguf/__snapshots__/ggufStandaloneParser.test.ts.snap
@@ -20,6 +20,8 @@ exports[`gguf > parser > should parse remote gguf model 1`] = `
         14848,
         59392,
       ],
+      "fileOffset": 2585504,
+      "filePart": 1,
       "ggmlType": 14,
       "name": "blk.0.ffn_up.weight",
       "offset": 0,
@@ -29,6 +31,8 @@ exports[`gguf > parser > should parse remote gguf model 1`] = `
         14848,
         14848,
       ],
+      "fileOffset": 725980064,
+      "filePart": 1,
       "ggmlType": 14,
       "name": "blk.0.attn_output.weight",
       "offset": 723394560,
@@ -38,6 +42,8 @@ exports[`gguf > parser > should parse remote gguf model 1`] = `
         14848,
         15872,
       ],
+      "fileOffset": 906828704,
+      "filePart": 1,
       "ggmlType": 14,
       "name": "blk.0.attn_qkv.weight",
       "offset": 904243200,
@@ -47,6 +53,8 @@ exports[`gguf > parser > should parse remote gguf model 1`] = `
         14848,
         65024,
       ],
+      "fileOffset": 1100149664,
+      "filePart": 1,
       "ggmlType": 14,
       "name": "token_embd.weight",
       "offset": 1097564160,
@@ -135,6 +143,8 @@ exports[`gguf > parser > should parse remote gguf model 1`] = `
         14848,
         59392,
       ],
+      "fileOffset": 2585504,
+      "filePart": 1,
       "ggmlType": 14,
       "name": "blk.0.ffn_up.weight",
       "offset": 0,
@@ -144,6 +154,8 @@ exports[`gguf > parser > should parse remote gguf model 1`] = `
         14848,
         14848,
       ],
+      "fileOffset": 725980064,
+      "filePart": 1,
       "ggmlType": 14,
       "name": "blk.0.attn_output.weight",
       "offset": 723394560,
@@ -153,6 +165,8 @@ exports[`gguf > parser > should parse remote gguf model 1`] = `
         14848,
         15872,
       ],
+      "fileOffset": 906828704,
+      "filePart": 1,
       "ggmlType": 14,
       "name": "blk.0.attn_qkv.weight",
       "offset": 904243200,
@@ -162,6 +176,8 @@ exports[`gguf > parser > should parse remote gguf model 1`] = `
         14848,
         65024,
       ],
+      "fileOffset": 1100149664,
+      "filePart": 1,
       "ggmlType": 14,
       "name": "token_embd.weight",
       "offset": 1097564160,

From 6c4e11db6df4be79fb7aedd8389cddccb0408a6e Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Tue, 31 Dec 2024 02:54:19 +0200
Subject: [PATCH 25/73] docs: improve building from source

---
 docs/guide/building-from-source.md | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/docs/guide/building-from-source.md b/docs/guide/building-from-source.md
index 60a7d50c..cd1ac0bb 100644
--- a/docs/guide/building-from-source.md
+++ b/docs/guide/building-from-source.md
@@ -25,19 +25,23 @@ This is useful for building from source on machines that aren't connected to the
 :::
 
 ::: info
-
 If `cmake` is not installed on your machine, `node-llama-cpp` will automatically download `cmake` to an internal directory and try to use it to build `llama.cpp` from source.
 
 If the build fails, make sure you have the required dependencies of `cmake` installed on your machine. More info is available [here](https://github.com/cmake-js/cmake-js#:~:text=projectRoot/build%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%5Bstring%5D-,Requirements%3A,-CMake) (you don't have to install `cmake` or `cmake-js`, just the dependencies).
+:::
 
-If the build fails on macOS with the error `"/usr/bin/cc" is not able to compile a simple test program`, try running `xcode-select --install` to install the Xcode command line tools.
-
+::: details Dependencies for macOS
+If the build fails on macOS with the error `"/usr/bin/cc" is not able to compile a simple test program`,
+try running this command to install the Xcode command line tools:
+```shell
+xcode-select --install
+```
 :::
 
 ::: details Dependencies for Windows x64
 If the build fails on your machine, ensure you have all the necessary build tools installed.
 
-You can install all the dependencies via [WinGet](https://learn.microsoft.com/en-us/windows/package-manager/winget/) using these commands:
+You can install all the dependencies via [WinGet](https://learn.microsoft.com/en-us/windows/package-manager/winget/) using this command:
 ```shell
 winget install --id Microsoft.VisualStudio.2022.BuildTools --force --override "--add Microsoft.VisualStudio.Component.VC.CMake.Project Microsoft.VisualStudio.Component.VC.CoreBuildTools Microsoft.VisualStudio.Component.VC.Tools.x86.x64 Microsoft.VisualStudio.Component.VC.ATL Microsoft.VisualStudio.Component.VC.ATLMFC Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset Microsoft.VisualStudio.Component.VC.Llvm.Clang Microsoft.VisualStudio.Component.VC.Redist.14.Latest Microsoft.Component.VC.Runtime.UCRTSDK Microsoft.VisualStudio.Component.Windows10SDK Microsoft.VisualStudio.Component.Windows10SDK.20348"
 ```
@@ -59,7 +63,7 @@ You can also install all the dependencies manually using the [Visual C++ Build T
 ::: details Dependencies for Windows on Arm
 On Windows on Arm you need to install additional build tools to build `llama.cpp` from source.
 
-You can install all the dependencies via [WinGet](https://learn.microsoft.com/en-us/windows/package-manager/winget/) using these commands:
+You can install all the dependencies via [WinGet](https://learn.microsoft.com/en-us/windows/package-manager/winget/) using this command:
 ```shell
 winget install --id Microsoft.VisualStudio.2022.BuildTools --force --override "--add Microsoft.VisualStudio.Component.VC.CMake.Project Microsoft.VisualStudio.Component.VC.CoreBuildTools Microsoft.VisualStudio.Component.VC.Tools.x86.x64 Microsoft.VisualStudio.Component.VC.Tools.ARM64 Microsoft.VisualStudio.Component.VC.ATL Microsoft.VisualStudio.Component.VC.ATL.ARM64 Microsoft.VisualStudio.Component.VC.ATLMFC Microsoft.VisualStudio.Component.VC.MFC.ARM64 Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset Microsoft.VisualStudio.Component.VC.Llvm.Clang Microsoft.VisualStudio.Component.VC.Redist.14.Latest Microsoft.Component.VC.Runtime.UCRTSDK Microsoft.VisualStudio.Component.Windows10SDK Microsoft.VisualStudio.Component.Windows10SDK.20348"
 ```

From 07bbc4e0d3f733bf7e5f474814c05c7e0fb2b0ca Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Tue, 31 Dec 2024 03:15:04 +0200
Subject: [PATCH 26/73] fix: check for Rosetta usage on macOS x64 when using
 the `inspect gpu` command

---
 src/cli/commands/OnPostInstallCommand.ts             |  5 +++--
 .../commands/inspect/commands/InspectGpuCommand.ts   | 12 ++++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/cli/commands/OnPostInstallCommand.ts b/src/cli/commands/OnPostInstallCommand.ts
index ee602984..e81e9b0b 100644
--- a/src/cli/commands/OnPostInstallCommand.ts
+++ b/src/cli/commands/OnPostInstallCommand.ts
@@ -21,8 +21,9 @@ export const OnPostInstallCommand: CommandModule<object, OnPostInstallCommand> =
             console.error(
                 getConsoleLogPrefix(false, false),
                 chalk.red(
-                    "llama.cpp is not supported on Rosetta on Apple Silicone Macs. " +
-                    "Ensure that you're using a native arm64 node.js installation.")
+                    "llama.cpp is not supported under Rosetta on Apple Silicone Macs. " +
+                    "Ensure that you're using a native arm64 node.js installation."
+                )
             );
             console.error(
                 getConsoleLogPrefix(false, false),
diff --git a/src/cli/commands/inspect/commands/InspectGpuCommand.ts b/src/cli/commands/inspect/commands/InspectGpuCommand.ts
index a8edc254..199b0e06 100644
--- a/src/cli/commands/inspect/commands/InspectGpuCommand.ts
+++ b/src/cli/commands/inspect/commands/InspectGpuCommand.ts
@@ -13,6 +13,7 @@ import {documentationPageUrls} from "../../../../config.js";
 import {Llama} from "../../../../bindings/Llama.js";
 import {getPlatformInfo} from "../../../../bindings/utils/getPlatformInfo.js";
 import {getLinuxDistroInfo} from "../../../../bindings/utils/getLinuxDistroInfo.js";
+import {isRunningUnderRosetta} from "../../../utils/isRunningUnderRosetta.js";
 
 type InspectGpuCommand = {
     // no options for now
@@ -91,6 +92,17 @@ export const InspectGpuCommand: CommandModule<object, InspectGpuCommand> = {
                 gpusToLogVramUsageOf.push("metal");
             }
         } else if (platform === "mac") {
+            if (await isRunningUnderRosetta()) {
+                console.error(
+                    chalk.red(
+                        "llama.cpp is not supported under Rosetta on Apple Silicone Macs. " +
+                        "Ensure that you're using a native arm64 node.js installation."
+                    )
+                );
+                console.error("process.platform: " + process.platform + ", process.arch: " + process.arch);
+                console.error("troubleshooting: " + documentationPageUrls.troubleshooting.RosettaIllegalHardwareInstruction);
+            }
+
             console.info(`${chalk.yellow("Metal:")} ${chalk.red("not supported by llama.cpp on Intel Macs")}`);
 
             const llama = await loadLlamaForGpu(false);

From 6a13bbfd56ab5967838eb6fc9730c8c6f202010e Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Wed, 1 Jan 2025 05:09:53 +0200
Subject: [PATCH 27/73] feat: `experimentalChunkDocument`

---
 .vitepress/config.ts                          |   1 +
 docs/guide/index.md                           |   4 +
 docs/guide/low-level-api.md                   | 322 ++++++++++++++++
 eslint.config.js                              |   3 +-
 llama/addon/AddonContext.cpp                  |  25 +-
 src/chatWrappers/utils/resolveChatWrapper.ts  |  13 +-
 src/evaluator/LlamaChat/LlamaChat.ts          |   9 +-
 src/evaluator/LlamaContext/LlamaContext.ts    |  89 ++---
 src/evaluator/LlamaContext/types.ts           |  54 +++
 src/evaluator/utils/chunkDocument.ts          | 358 ++++++++++++++++++
 src/index.ts                                  |   8 +-
 .../llama3.1/chunkDocument.test.ts            |  82 ++++
 12 files changed, 899 insertions(+), 69 deletions(-)
 create mode 100644 docs/guide/low-level-api.md
 create mode 100644 src/evaluator/utils/chunkDocument.ts
 create mode 100644 test/modelDependent/llama3.1/chunkDocument.test.ts

diff --git a/.vitepress/config.ts b/.vitepress/config.ts
index c4cd0c9a..1ae85a08 100644
--- a/.vitepress/config.ts
+++ b/.vitepress/config.ts
@@ -491,6 +491,7 @@ export default defineConfig({
                     {text: "Chat Context Shift", link: "/chat-context-shift"},
                     {text: "Batching", link: "/batching"},
                     {text: "Token Prediction", link: "/token-prediction"},
+                    {text: "Low Level API", link: "/low-level-api"},
                     {text: "Awesome List", link: "/awesome"},
                     {text: "Troubleshooting", link: "/troubleshooting"},
                     {text: "Tips and Tricks", link: "/tips-and-tricks"}
diff --git a/docs/guide/index.md b/docs/guide/index.md
index ac218614..7f7eb6f0 100644
--- a/docs/guide/index.md
+++ b/docs/guide/index.md
@@ -264,6 +264,10 @@ console.log("AI: " + a1);
 ```
 
 ### Raw
+::: tip NOTE
+To learn more about using low level APIs, read the [low level API guide](./low-level-api.md).
+:::
+
 ```typescript
 import {fileURLToPath} from "url";
 import path from "path";
diff --git a/docs/guide/low-level-api.md b/docs/guide/low-level-api.md
new file mode 100644
index 00000000..a92104fc
--- /dev/null
+++ b/docs/guide/low-level-api.md
@@ -0,0 +1,322 @@
+---
+outline: deep
+description: Learn how to use the low-level API of node-llama-cpp
+---
+# Low Level API
+`node-llama-cpp` provides high-level APIs for the most common use cases to make it easy to use.
+However, it also provides low-level APIs for more advanced use cases.
+
+There are various low-level APIs that you can use - the more high level you can go, the more optimizations and features you can leverage. 
+
+## Background {#background}
+Before you can use the low-level API, here are a few concepts you should be familiar with:
+
+### Context Sequence {#context-sequence}
+A [`LlamaContextSequence`](../api/classes/LlamaContextSequence.md) is an isolated component that holds an inference state.
+
+The state is constructed from tokens you evaluate to "append" to the state, and you can access the current state tokens using [`.contextTokens`](../api/classes/LlamaContextSequence.md#contexttokens).
+
+When evaluating input (tokens) onto a context sequence, you can choose to generate a "next token" for each of the input tokens you evaluate.
+When choosing to generate a "next token" for a given token,
+the model will "see" all the tokens up to it (input tokens and the current context sequence state tokens),
+and the generated token will be in the generation result you get from the API and won't be appended to the context sequence state.
+
+### Probabilities List {#probabilities-list}
+When generating a token, the model actually generates a list of probabilities for each token in the vocabulary to be the next token.
+
+It then uses the probabilities to choose the next token based on the heuristics you provide (like [`temperature`](../api/type-aliases/SequenceEvaluateOptions#temperature), for example).
+
+The operation of applying such heuristics to choose the next token is also called _sampling_.
+
+When you pass sampling options (like [`temperature`](../api/type-aliases/SequenceEvaluateOptions#temperature), for example) for the generation of a token,
+it may make adjustments to the probabilities list so it can choose the next token based on the heuristics you provide.
+
+The sampling is done on the native side of `node-llama-cpp` for performance reasons.
+However, you can still opt to get the full probabilities list after the sampling is done,
+and you can pass no sampling options to avoid making any adjustments to the probabilities list.
+
+It's best to avoid getting the full probabilities list unless you really need it,
+as passing it to the JavaScript side can be slow.
+
+## Simple Evaluation {#simple-evaluation}
+You can evaluate the given input tokens onto a context sequence using [`.evaluate`](../api/classes/LlamaContextSequence.md#evaluate)
+and generate the next token for the last input token.
+
+On each iteration of the returned iterator, the generated token is then added to the context sequence state and the next token is generated for it, and so on.
+
+When using [`.evaluate`](../api/classes/LlamaContextSequence.md#evaluate), the configured [token predictor](./token-prediction.md) is used to speed up the generation process.
+
+```typescript
+import {fileURLToPath} from "url";
+import path from "path";
+import {getLlama, Token, SequenceEvaluateOptions} from "node-llama-cpp";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+const llama = await getLlama();
+const model = await llama.loadModel({
+    modelPath: path.join(__dirname, "models", "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf")
+});
+const context = await model.createContext();
+const sequence = context.getSequence();
+
+const input = "The best way to";
+const tokens = model.tokenize(input);
+const maxTokens = 10;
+const res: Token[] = [];
+const options: SequenceEvaluateOptions = {
+    temperature: 0.8
+};
+
+for await (const generatedToken of sequence.evaluate(tokens, options)) {
+    res.push(generatedToken);
+    if (res.length >= maxTokens)
+        break;
+}
+
+const resText = model.detokenize(res);
+console.log("Result: " + resText);
+```
+> For generating text completion, it's better to use [`LlamaCompletion`](./text-completion.md) instead of manually evaluating input,
+> since it supports all models, and provides many more features and optimizations
+
+### Replacement Token(s) {#replacement-tokens}
+You can manually iterate over the evaluation iterator and provide a replacement to the generated token.
+You you provide a replacement token(s), it'll be appended to the context sequence state instead of the generated token.
+
+```typescript
+import {fileURLToPath} from "url";
+import path from "path";
+import {getLlama, Token, SequenceEvaluateOptions} from "node-llama-cpp";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+const llama = await getLlama();
+const model = await llama.loadModel({
+    modelPath: path.join(__dirname, "models", "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf")
+});
+const context = await model.createContext();
+const sequence = context.getSequence();
+
+const input = "The best way to";
+const tokens = model.tokenize(input);
+const options: SequenceEvaluateOptions = {
+    temperature: 0.8
+};
+const maxTokens = 10;
+const res: Token[] = [];
+
+// fill this with tokens to replace
+const replacementMap = new Map<Token, Token>();
+
+const iterator = sequence.evaluate(tokens, options);
+let replacementToken: Token | undefined;
+
+while (true) {
+    const {value: token, done} = await iterator.next(replacementToken);
+    replacementToken = undefined;
+    if (done || token == null)
+        break;
+
+    replacementToken = replacementMap.get(token);
+
+    res.push(replacementToken ?? token);
+    if (res.length >= maxTokens)
+        break;
+}
+
+const resText = model.detokenize(res);
+console.log("Result: " + resText);
+```
+> If you want to adjust the token probabilities when generating output, consider using [token bias](./token-bias.md) instead
+
+### No Generation {#evaluation-without-generation}
+To evaluate the input tokens onto a context sequence without generating new tokens,
+you can use [`.evaluateWithoutGeneratingNewTokens`](../api/classes/LlamaContextSequence.md#evaluatewithoutgeneratingnewtokens).
+
+```typescript
+import {fileURLToPath} from "url";
+import path from "path";
+import {getLlama} from "node-llama-cpp";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+const llama = await getLlama();
+const model = await llama.loadModel({
+    modelPath: path.join(__dirname, "models", "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf")
+});
+const context = await model.createContext();
+const sequence = context.getSequence();
+
+const input = "The best way to";
+const tokens = model.tokenize(input);
+await sequence.evaluateWithoutGeneratingNewTokens(tokens);
+```
+
+## Controlled Evaluation {#controlled-evaluation}
+To manually control for which of the input tokens to generate output, you can use [`.controlledEvaluate`](../api/classes/LlamaContextSequence.md#controlledevaluate).
+
+```typescript
+import {fileURLToPath} from "url";
+import path from "path";
+import {getLlama, Token, ControlledEvaluateInputItem} from "node-llama-cpp";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+const llama = await getLlama();
+const model = await llama.loadModel({
+    modelPath: path.join(__dirname, "models", "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf")
+});
+const context = await model.createContext();
+const sequence = context.getSequence();
+
+const input = "The best way to";
+const tokens = model.tokenize(input);
+const evaluateInput: ControlledEvaluateInputItem[] = tokens.slice();
+
+// generate output for the last token only
+const lastToken = evaluateInput.pop() as Token;
+if (lastToken != null)
+    evaluateInput.push([lastToken, {
+        generateNext: {
+            singleToken: true,
+            probabilitiesList: true,
+            options: {
+                temperature: 0.8
+            }
+        }
+    }])
+
+const res = await sequence.controlledEvaluate(evaluateInput);
+const lastTokenResult = res[evaluateInput.length - 1];
+if (lastTokenResult != null) {
+    const {next} = lastTokenResult;
+
+    if (next.token != null)
+        console.log(
+            "next token",
+            next.token,
+            model.detokenize([next.token], true)
+        );
+
+    if (next.probabilities != null)
+        console.log(
+            "next probabilities",
+            [...next.probabilities.entries()]
+                .slice(0, 5) // top 5 probabilities
+                .map(([token, probability]) => (
+                    [model.detokenize([token], true), probability]
+                ))
+        );
+    
+    // next: evalute `next.token` onto the context sequence
+    // and generate the next token for it
+}
+```
+
+## State Manipulation {#state-manipulation}
+You can manipulate the context sequence state by erasing tokens from it or shifting tokens in it.
+
+Make sure that you don't attempt to manipulate the state while waiting for a generation result from an evaluation operation,
+as it may lead to unexpected results.
+
+### Erase State Ranges {#erase-state-ranges}
+To erase a range of tokens from the context sequence state,
+you can use [`.eraseContextTokenRanges`](../api/classes/LlamaContextSequence.md#erasecontexttokenranges).
+
+```typescript
+import {fileURLToPath} from "url";
+import path from "path";
+import {getLlama} from "node-llama-cpp";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+const llama = await getLlama();
+const model = await llama.loadModel({
+    modelPath: path.join(__dirname, "models", "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf")
+});
+const context = await model.createContext();
+const sequence = context.getSequence();
+
+const input = "The best way to";
+const tokens = model.tokenize(input);
+await sequence.evaluateWithoutGeneratingNewTokens(tokens);
+
+console.log(
+    "Current state:",
+    model.detokenize(sequence.contextTokens, true),
+    sequence.contextTokens
+);
+
+// erase the last token from the state
+if (sequence.nextTokenIndex > 0)
+    await sequence.eraseContextTokenRanges([{
+        start: sequence.nextTokenIndex - 1,
+        end: sequence.nextTokenIndex
+    }]);
+
+console.log(
+    "Current state:",
+    model.detokenize(sequence.contextTokens, true),
+    sequence.contextTokens
+);
+```
+
+### Adapt State to Tokens {#adapt-state-to-tokens}
+You can adapt the existing context state to a new input to avoid re-evaluating some of the tokens you've already evaluated.
+
+::: tip NOTE
+All the high-level APIs provided by `node-llama-cpp` automatically do this to improve efficiency and performance.
+:::
+
+```typescript
+import {fileURLToPath} from "url";
+import path from "path";
+import {getLlama} from "node-llama-cpp";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+const llama = await getLlama();
+const model = await llama.loadModel({
+    modelPath: path.join(__dirname, "models", "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf")
+});
+const context = await model.createContext();
+const sequence = context.getSequence();
+
+const input = "The best way to";
+const tokens = model.tokenize(input);
+await sequence.evaluateWithoutGeneratingNewTokens(tokens);
+
+console.log(
+    "Current state:",
+    model.detokenize(sequence.contextTokens, true),
+    sequence.contextTokens
+);
+
+const newInput = "The best method to";
+const newTokens = model.tokenize(newInput);
+
+// only align the current state if the length
+// of the new tokens won't incur a context shift
+if (newTokens.length < sequence.contextSize && newTokens.length > 0) {
+    // ensure we have at least one token to evalute
+    const lastToken = newTokens.pop()!;
+
+    await sequence.adaptStateToTokens(newTokens);
+    newTokens.push(lastToken);
+
+    // remove the tokens that already exist in the state
+    newTokens.splice(0, sequence.nextTokenIndex)
+}
+
+console.log(
+    "Current state:",
+    model.detokenize(sequence.contextTokens, true),
+    sequence.contextTokens
+);
+console.log(
+    "New tokens:",
+    model.detokenize(newTokens, true),
+    newTokens
+);
+```
diff --git a/eslint.config.js b/eslint.config.js
index 394fc136..77950eba 100644
--- a/eslint.config.js
+++ b/eslint.config.js
@@ -55,7 +55,8 @@ export default tseslint.config({
             exemptDestructuredRootsFromChecks: true,
 
             tagNamePreference: {
-                hidden: "hidden"
+                hidden: "hidden",
+                experimental: "experimental"
             }
         }
     },
diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp
index 6abe8cf9..3cfb9567 100644
--- a/llama/addon/AddonContext.cpp
+++ b/llama/addon/AddonContext.cpp
@@ -192,8 +192,10 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker {
         AddonSampler* sampler;
         bool arrayResult = false;
         bool returnLogprobs = false;
-        Napi::Array logprobs;
         bool has_logprobs = false;
+        size_t logprobs_size;
+        llama_token * logprobs_tokens;
+        float * logprobs_probs;
         int32_t batchLogitIndex;
         llama_token result;
         bool no_output = false;
@@ -213,6 +215,11 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker {
         ~AddonContextSampleTokenWorker() {
             ctx->Unref();
             sampler->Unref();
+
+            if (has_logprobs) {
+                delete[] logprobs_tokens;
+                delete[] logprobs_probs;
+            }
         }
 
         Napi::Promise GetPromise() {
@@ -245,7 +252,7 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker {
 
             auto & candidates = sampler->tokenCandidates;
             for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                candidates[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};;
+                candidates[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
             }
 
             llama_token_data_array cur_p = {
@@ -265,10 +272,13 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker {
                     cur_p.sorted = true;
                 }
 
-                logprobs = Napi::Array::New(Env(), cur_p.size * 2);
+                logprobs_size = cur_p.size;
+                logprobs_tokens = new llama_token[logprobs_size];
+                logprobs_probs = new float[logprobs_size];
+
                 for (size_t i = 0; i < cur_p.size; i++) {
-                    logprobs.Set(i * 2, Napi::Number::New(Env(), cur_p.data[i].id));
-                    logprobs.Set(i * 2 + 1, Napi::Number::New(Env(), cur_p.data[i].logit));
+                    logprobs_tokens[i] = cur_p.data[i].id;
+                    logprobs_probs[i] = cur_p.data[i].logit;
                 }
 
                 has_logprobs = true;
@@ -300,6 +310,11 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker {
             resultArray.Set(Napi::Number::New(Env(), 0), resultToken);
             
             if (has_logprobs) {
+                Napi::Array logprobs = Napi::Array::New(Env(), logprobs_size * 2);
+                for (size_t i = 0; i < logprobs_size; i++) {
+                    logprobs.Set(i * 2, Napi::Number::New(Env(), logprobs_tokens[i]));
+                    logprobs.Set(i * 2 + 1, Napi::Number::New(Env(), logprobs_probs[i]));
+                }
                 resultArray.Set(1, logprobs);
             }
 
diff --git a/src/chatWrappers/utils/resolveChatWrapper.ts b/src/chatWrappers/utils/resolveChatWrapper.ts
index 8be8dc27..d19cec7a 100644
--- a/src/chatWrappers/utils/resolveChatWrapper.ts
+++ b/src/chatWrappers/utils/resolveChatWrapper.ts
@@ -15,6 +15,7 @@ import {Llama3_2LightweightChatWrapper} from "../Llama3_2LightweightChatWrapper.
 import {MistralChatWrapper} from "../MistralChatWrapper.js";
 import {Tokenizer} from "../../types.js";
 import {includesText} from "../../utils/includesText.js";
+import {LlamaModel} from "../../evaluator/LlamaModel/LlamaModel.js";
 import {isJinjaTemplateEquivalentToSpecializedChatWrapper} from "./isJinjaTemplateEquivalentToSpecializedChatWrapper.js";
 import {getModelLinageNames} from "./getModelLinageNames.js";
 import type {GgufFileInfo} from "../../gguf/types/GgufFileInfoTypes.js";
@@ -123,7 +124,17 @@ export type ResolveChatWrapperOptions = {
  * }) ?? new GeneralChatWrapper()
  * ```
  */
-export function resolveChatWrapper(options: ResolveChatWrapperOptions): BuiltInChatWrapperType | null {
+export function resolveChatWrapper(options: ResolveChatWrapperOptions): BuiltInChatWrapperType | null;
+export function resolveChatWrapper(options: LlamaModel): BuiltInChatWrapperType;
+export function resolveChatWrapper(options: ResolveChatWrapperOptions | LlamaModel): BuiltInChatWrapperType | null {
+    if (options instanceof LlamaModel)
+        return resolveChatWrapper({
+            bosString: options.tokens.bosString,
+            filename: options.filename,
+            fileInfo: options.fileInfo,
+            tokenizer: options.tokenizer
+        }) ?? new GeneralChatWrapper();
+
     const {
         type = "auto",
         bosString,
diff --git a/src/evaluator/LlamaChat/LlamaChat.ts b/src/evaluator/LlamaChat/LlamaChat.ts
index 3baeca9d..d76a0f14 100644
--- a/src/evaluator/LlamaChat/LlamaChat.ts
+++ b/src/evaluator/LlamaChat/LlamaChat.ts
@@ -330,14 +330,7 @@ export class LlamaChat {
         this._disposeAggregator.add(this.onDispose.dispatchEvent);
 
         this._chatWrapper = chatWrapper === "auto"
-            ? (
-                resolveChatWrapper({
-                    bosString: contextSequence.model.tokens.bosString,
-                    filename: contextSequence.model.filename,
-                    fileInfo: contextSequence.model.fileInfo,
-                    tokenizer: contextSequence.model.tokenizer
-                }) ?? new GeneralChatWrapper()
-            )
+            ? resolveChatWrapper(contextSequence.model)
             : chatWrapper;
     }
 
diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts
index 47ea4bfa..ab579581 100644
--- a/src/evaluator/LlamaContext/LlamaContext.ts
+++ b/src/evaluator/LlamaContext/LlamaContext.ts
@@ -13,8 +13,8 @@ import {ThreadsSplitterConsumer} from "../../utils/ThreadsSplitter.js";
 import {pushAll} from "../../utils/pushAll.js";
 import {safeEventCallback} from "../../utils/safeEventCallback.js";
 import {
-    BatchingOptions, BatchItem, ContextShiftOptions, ContextTokensDeleteRange, ControlledEvaluateIndexOutput, EvaluationPriority,
-    LlamaContextOptions, LlamaContextSequenceRepeatPenalty, PrioritizedBatchItem, SequenceEvaluateOptions
+    BatchingOptions, BatchItem, ContextShiftOptions, ContextTokensDeleteRange, ControlledEvaluateIndexOutput, ControlledEvaluateInputItem,
+    EvaluationPriority, LlamaContextOptions, LlamaContextSequenceRepeatPenalty, PrioritizedBatchItem, SequenceEvaluateOptions
 } from "./types.js";
 import {resolveBatchItemsPrioritizationStrategy} from "./utils/resolveBatchItemsPrioritizationStrategy.js";
 import {LlamaSampler} from "./LlamaSampler.js";
@@ -988,10 +988,17 @@ export class LlamaContextSequence {
         return this._context.model;
     }
 
+    /** The maximum number of tokens that the sequence state can hold */
+    public get contextSize() {
+        return this._context.contextSize;
+    }
+
+    /** The index where the next evaluated token will be placed in the context */
     public get nextTokenIndex() {
         return this._nextTokenIndex - this._loadedTokenPredictions.length;
     }
 
+    /** The current context state tokens */
     public get contextTokens() {
         if (this._loadedTokenPredictions.length === 0)
             return this._contextTokens.slice();
@@ -1255,7 +1262,7 @@ export class LlamaContextSequence {
      *
      * This method uses the token predictor (when provided) to generate new tokens faster.
      */
-    public evaluate(tokens: Token[], options: SequenceEvaluateOptions = {}): AsyncGenerator<Token, void | Token> {
+    public evaluate(tokens: Token[], options: SequenceEvaluateOptions = {}): AsyncGenerator<Token, void, void | Token | Token[]> {
         const {
             temperature = 0,
             minP = 0,
@@ -1397,48 +1404,7 @@ export class LlamaContextSequence {
      *
      * It's recommended to iterate from `0` up to the length of the input array to check the results in the output array.
      */
-    public async controlledEvaluate(input: Array<Token | [Token, {
-        generateNext?: {
-            /**
-             * Get the full probabilities list of tokens from the vocabulary to be the next token, after applying the given options.
-             *
-             * Only enable when needed, since it impacts the performance.
-             *
-             * Defaults to `false`.
-             */
-            probabilitiesList?: boolean,
-
-            /**
-             * Generate the next token with the provided options using sampling.
-             *
-             * Setting this to `true` will generate probabilities for the next token and sample it.
-             */
-            singleToken?: boolean,
-
-            options?: {
-                temperature?: number, minP?: number, topK?: number, topP?: number,
-
-                /**
-                 * Used to control the randomness of the generated text.
-                 *
-                 * Change the seed to get different results.
-                 *
-                 * Defaults to the current epoch time.
-                 *
-                 * Only relevant when using `temperature`.
-                 */
-                seed?: number,
-                repeatPenalty?: LlamaContextSequenceRepeatPenalty,
-
-                /**
-                 * Adjust the probability of tokens being generated.
-                 * Can be used to bias the model to generate tokens that you want it to lean towards,
-                 * or to avoid generating tokens that you want it to avoid.
-                 */
-                tokenBias?: TokenBias | (() => TokenBias)
-            }
-        }
-    }]>, options?: {
+    public async controlledEvaluate(input: ControlledEvaluateInputItem[], options?: {
         /**
          * When a lot of tokens are queued for the next batch, more than the configured `batchSize`, the tokens for each sequence will be
          * evaluated based on the strategy chosen for the context.
@@ -1582,7 +1548,7 @@ export class LlamaContextSequence {
         yieldEogToken?: boolean,
         _noSampling?: boolean,
         _skipLock?: boolean
-    }): AsyncGenerator<Token, void | Token> {
+    }): AsyncGenerator<Token, void, void | Token | Token[]> {
         this._ensureNotDisposed();
 
         let evalTokens = tokens;
@@ -1654,10 +1620,12 @@ export class LlamaContextSequence {
                     evaluatorLock?.dispose();
                 }
 
-                const replacementToken = (yield nextToken) as undefined | Token;
+                const replacementToken = yield nextToken;
 
                 // set the tokens for the next evaluation
-                if (replacementToken != null)
+                if (replacementToken instanceof Array)
+                    evalTokens = replacementToken.slice();
+                else if (replacementToken != null)
                     evalTokens = [replacementToken];
                 else
                     evalTokens = [nextToken];
@@ -1687,7 +1655,7 @@ export class LlamaContextSequence {
         repeatPenalty?: LlamaContextSequenceRepeatPenalty, tokenBias?: TokenBias | (() => TokenBias),
         evaluationPriority?: EvaluationPriority, contextShiftOptions: Required<ContextShiftOptions>,
         yieldEogToken?: boolean, tokenPredictor: TokenPredictor
-    }): AsyncGenerator<Token, void | Token> {
+    }): AsyncGenerator<Token, void, void | Token | Token[]> {
         this._ensureNotDisposed();
 
         let evalTokens = tokens.slice();
@@ -1879,10 +1847,12 @@ export class LlamaContextSequence {
                     evaluatorLock.dispose();
                 }
 
-                const replacementToken = (yield nextToken) as undefined | Token;
+                const replacementToken = yield nextToken;
 
                 // set the tokens for the next evaluation
-                if (replacementToken != null)
+                if (replacementToken instanceof Array)
+                    evalTokens = replacementToken.slice();
+                else if (replacementToken != null)
                     evalTokens = [replacementToken];
                 else
                     evalTokens = [nextToken];
@@ -2166,14 +2136,29 @@ function reviveTokenProbabilities(probabilitiesList?: (Token | number)[]) {
         return undefined;
 
     const res = new Map<Token, number>();
+    let maxLogit: number | undefined = undefined;
 
-    for (let i = 1; i < probabilitiesList.length; i++) {
+    for (let i = 1; i < probabilitiesList.length; i += 2) {
         const token = probabilitiesList[i - 1]! as Token;
         const probability = probabilitiesList[i]! as number;
 
+        if (maxLogit == null || probability > maxLogit)
+            maxLogit = probability;
+
         res.set(token, probability);
     }
 
+    if (maxLogit != null) {
+        for (const [token, logit] of res)
+            res.set(token, Math.exp(logit - maxLogit));
+
+        const sum = Array.from(res.values()).reduce((a, b) => a + b, 0);
+
+        // normalize the probabilities
+        for (const [token, logit] of res)
+            res.set(token, logit / sum);
+    }
+
     return res;
 }
 
diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts
index 18db4525..5c78f7e2 100644
--- a/src/evaluator/LlamaContext/types.ts
+++ b/src/evaluator/LlamaContext/types.ts
@@ -314,9 +314,63 @@ export type SequenceEvaluateOptions = {
     _noSampling?: boolean
 };
 
+export type ControlledEvaluateInputItem = Token | [token: Token, options: {
+    generateNext?: {
+        /**
+         * Get the full probabilities list of tokens from the vocabulary to be the next token, after applying the given options.
+         *
+         * Only enable when needed, since it impacts the performance.
+         *
+         * Defaults to `false`.
+         */
+        probabilitiesList?: boolean,
+
+        /**
+         * Generate the next token with the provided options using sampling.
+         *
+         * Setting this to `true` will generate probabilities for the next token and sample it.
+         */
+        singleToken?: boolean,
+
+        options?: {
+            temperature?: number, minP?: number, topK?: number, topP?: number,
+
+            /**
+             * Used to control the randomness of the generated text.
+             *
+             * Change the seed to get different results.
+             *
+             * Defaults to the current epoch time.
+             *
+             * Only relevant when using `temperature`.
+             */
+            seed?: number,
+            repeatPenalty?: LlamaContextSequenceRepeatPenalty,
+
+            /**
+             * Adjust the probability of tokens being generated.
+             * Can be used to bias the model to generate tokens that you want it to lean towards,
+             * or to avoid generating tokens that you want it to avoid.
+             */
+            tokenBias?: TokenBias | (() => TokenBias)
+        }
+    }
+}];
+
 export type ControlledEvaluateIndexOutput = {
     next: {
         token?: Token | null,
+
+        /**
+         * The probabilities of the tokens from the vocabulary to be the next token.
+         *
+         * A probability is a number from `0` to `1`.
+         *
+         * The map is sorted by the probability of the tokens from the highest to the lowest,
+         * and is reflected in the order of the entries when iterating over the map.
+         * Use `.entries().next().value` to get the top probability pair
+         * ([learn more](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/entries)).
+         */
         probabilities?: Map<Token, number>
     }
 };
diff --git a/src/evaluator/utils/chunkDocument.ts b/src/evaluator/utils/chunkDocument.ts
new file mode 100644
index 00000000..0d40be7a
--- /dev/null
+++ b/src/evaluator/utils/chunkDocument.ts
@@ -0,0 +1,358 @@
+import {LlamaContextSequence} from "../LlamaContext/LlamaContext.js";
+import {ChatHistoryItem, Token, Tokenizer} from "../../types.js";
+import {LlamaModel} from "../LlamaModel/LlamaModel.js";
+import {LlamaText, SpecialTokensText} from "../../utils/LlamaText.js";
+import {resolveChatWrapper} from "../../chatWrappers/utils/resolveChatWrapper.js";
+import {ControlledEvaluateInputItem} from "../LlamaContext/types.js";
+import {safeEventCallback} from "../../utils/safeEventCallback.js";
+import {maxRecentDetokenizerTokens} from "../../consts.js";
+
+/**
+ * Chunk the given document using a given context sequence to use the chunks for RAG (Retrieval Augmented Generation) embeddings.
+ *
+ * This chunking method is fast and efficient, and utilizes as much parallelization as your hardware allows.
+ *
+ * Based on https://github.com/ZeroEntropy-AI/llama-chunk
+ * @experimental - this API is experimental and may change or be removed in subsequent releases
+ * @hidden
+ */
+export async function experimentalChunkDocument(options: {
+    contextSequence: LlamaContextSequence,
+    document: string,
+
+    /**
+     * The tokens to use as separators for chunking the document.
+     * Passed to the `getSystemPrompt` function to generate the prompt.
+     */
+    separatorTokens?: Token[],
+    getSystemPrompt?(options: {separatorTokens: Token[], tokenizer: Tokenizer, maxChunkSize?: number}): LlamaText | string,
+
+    /**
+     * Maximum number of tokens to allow in a chunk.
+     *
+     * As a chunk size approaches this limit, the higher the probability of a separator token being inserted.
+     *
+     * Set to `0` to disable this mechanism.
+     *
+     * Defaults to `500`.
+     */
+    maxChunkSize?: number,
+
+    /**
+     * The alignment curve for the maximum chunk size mechanism.
+     *
+     * Adjust the value based on the behavior of the model.
+     *
+     * Play around with values between `1` and `4` to see what works best for you.
+     *
+     * Set to `1` to disable this mechanism.
+     *
+     * Defaults to `4`.
+     */
+    maxChunkSizeAlignmentCurve?: number,
+
+    /**
+     * Append the next few tokens (up to `maxTokens`) to the current chunk if their trimmed content
+     * matches any of the texts in `trimmedTexts`
+     */
+    syntaxAlignment?: {
+        /**
+         * The maximum number of tokens to append to the current chunk if their trimmed content matches any of the texts in `trimmedTexts`.
+         *
+         * Default: `4`
+         */
+        maxTokens?: number,
+
+        /**
+         * The trimmed texts to match for, to append the token to the current chunk.
+         *
+         * Default: `["", ".", ";"]`
+         */
+        trimmedTexts?: string[]
+    },
+
+    /**
+     * The number of tokens to skip before starting to use the generated separator tokens to split the document.
+     */
+    skipFirstTokens?: number,
+
+    /**
+     * The number of recent probabilities to keep in the trail for normalization.
+     *
+     * Adjust the value based on the behavior of the model.
+     *
+     * Defaults to `200`.
+     */
+    normalizationTrailSize?: number,
+
+    /**
+     * Called when a chunk is generated with the tokens that make up the chunk and the separator token used to split the chunk.
+     */
+    onChunkTokens?(chunkTokens: Token[], usedSeparatorToken: Token): void,
+
+    /**
+     * Called when a chunk is generated with the text that makes up the chunk and the separator token used to split the chunk.
+     */
+    onChunkText?(chunkText: string, usedSeparatorToken: Token): void
+}) {
+    const {
+        contextSequence,
+        document,
+        separatorTokens = findAppropriateSeparatorTokens(contextSequence.model),
+        getSystemPrompt = getDefaultPrompt,
+        maxChunkSize = 500,
+        maxChunkSizeAlignmentCurve = 4,
+        syntaxAlignment: {
+            maxTokens: maxSyntaxAlignment = 4,
+            trimmedTexts: syntaxAlignmentTrimmedTexts = ["", ".", ";"]
+        } = {},
+        skipFirstTokens = 3,
+        normalizationTrailSize = 100
+    } = options;
+
+    const onChunkTokens = safeEventCallback(options.onChunkTokens);
+    const onChunkText = safeEventCallback(options.onChunkText);
+
+    if (separatorTokens.length === 0)
+        throw new Error("Separator tokens must be provided");
+
+    const chatHistory: ChatHistoryItem[] = [{
+        type: "system",
+        text: LlamaText(getSystemPrompt({
+            separatorTokens,
+            tokenizer: contextSequence.model.tokenizer,
+            maxChunkSize: maxChunkSize <= 0
+                ? undefined
+                : maxChunkSize
+        })).toJSON()
+    }, {
+        type: "user",
+        text: document
+    }, {
+        type: "model",
+        response: [""]
+    }];
+    const chatWrapper = resolveChatWrapper(contextSequence.model);
+    const {contextText} = chatWrapper.generateContextState({chatHistory});
+    const initialContextTokens = contextText.tokenize(contextSequence.model.tokenizer, "trimLeadingSpace");
+    const documentTokens = contextSequence.model.tokenize(document, false, "trimLeadingSpace");
+    const syntaxAlignmentTrimmedTextsSet = new Set(syntaxAlignmentTrimmedTexts);
+
+    if (initialContextTokens.length + documentTokens.length > contextSequence.context.contextSize)
+        throw new Error("The context size is too small to chunk the given document");
+
+    const evaluateInput: ControlledEvaluateInputItem[] = initialContextTokens.slice();
+    for (let i = 0; i < documentTokens.length - 1; i++) {
+        const token = documentTokens[i]!;
+        evaluateInput.push([token, {
+            generateNext: {
+                probabilitiesList: true
+            }
+        }]);
+    }
+
+    let weight = 1;
+    const recentProbabilitiesTrail: number[] = [];
+
+    let chunkStartIndex = 0;
+    let lastPushedSeparatorIndex = 0;
+    const chunks: Token[][] = [];
+    const res: string[] = [];
+    function pushSeparatorIndex(separateIndex: number, separatorToken: Token) {
+        lastPushedSeparatorIndex = separateIndex;
+
+        if (separateIndex <= chunkStartIndex)
+            return;
+
+        let endIndex = separateIndex;
+        for (let i = 0; i < maxSyntaxAlignment && documentTokens[endIndex + i] != null; i++) {
+            const text = contextSequence.model.detokenize([documentTokens[endIndex + i]!]);
+            if (!syntaxAlignmentTrimmedTextsSet.has(text.trim()))
+                break;
+
+            endIndex++;
+        }
+
+        const chunk = documentTokens.slice(chunkStartIndex, endIndex);
+        const text = contextSequence.model.detokenize(
+            chunk,
+            false,
+            documentTokens.slice(chunkStartIndex - maxRecentDetokenizerTokens, chunkStartIndex)
+        );
+        chunks.push(chunk);
+        chunkStartIndex = endIndex;
+
+        onChunkTokens?.(chunk, separatorToken);
+        onChunkText?.(text, separatorToken);
+        res.push(text);
+    }
+
+    await contextSequence.controlledEvaluate(evaluateInput, {
+        onTokenResult(inputTokenIndex, result) {
+            const i = inputTokenIndex - initialContextTokens.length;
+            const nextProbabilities = result?.next?.probabilities;
+            const nextDocumentToken = documentTokens[i + 1];
+
+            if (nextProbabilities == null)
+                throw new Error("received no result for token " + i);
+
+            const topProbabilityScore = nextProbabilities.entries()
+                .next().value?.[1];
+            const [usedSeparatorToken, separatorProbability] = separatorTokens
+                .filter((token) => token !== nextDocumentToken) // avoid splitting on document tokens
+                .map((token) => [token, nextProbabilities.get(token)] as [token: Token, probability: number | undefined])
+                .filter((pair): pair is [token: Token, probability: number] => pair[1] != null)
+                .reduce(([tokenA, probabilityA], [tokenB, probabilityB]) => {
+                    if (probabilityA >= probabilityB)
+                        return [tokenA, probabilityA];
+
+                    return [tokenB, probabilityB];
+                }, [separatorTokens[0]!, 0]);
+
+            if (topProbabilityScore == null || separatorProbability == null || separatorProbability === 0)
+                return;
+
+            // console.log(
+            //     i, contextSequence.model.detokenize([documentTokens[i]!]),
+            //     Array.from(nextProbabilities.entries()).slice(0, 5)
+            //         .map(([token, probability]) => [contextSequence.model.detokenize([token], true), probability])
+            // );
+
+            if (separatorProbability >= topProbabilityScore)
+                pushSeparatorIndex(i + 1, usedSeparatorToken);
+            else if (i > skipFirstTokens) {
+                const adjustedProbability = separatorProbability + (weight * (1 - separatorProbability));
+                let maxChunkSizeAlignment = 0;
+                if (maxChunkSize !== 0 && adjustedProbability < topProbabilityScore) {
+                    const leftProbability = 1 - adjustedProbability;
+                    const currentChunkSize = Math.max(0, 1 + i - chunkStartIndex);
+                    maxChunkSizeAlignment = currentChunkSize === 0
+                        ? 0
+                        : adjustExponential(
+                            leftProbability * Math.min(1, currentChunkSize / maxChunkSize),
+                            maxChunkSizeAlignmentCurve <= 0
+                                ? 1
+                                : maxChunkSizeAlignmentCurve,
+                            0.8
+                        );
+
+                    if (currentChunkSize === maxChunkSize)
+                        maxChunkSizeAlignment = 1;
+                }
+
+                if (adjustedProbability + maxChunkSizeAlignment >= topProbabilityScore && adjustedProbability > 0) {
+                    pushSeparatorIndex(i + 1, usedSeparatorToken);
+
+                    // update the weight of the current token with the adjusted probability in the trail
+                    if (recentProbabilitiesTrail.length > 1) {
+                        weight /= recentProbabilitiesTrail.pop()!;
+                        recentProbabilitiesTrail.push(adjustedProbability);
+                        weight *= adjustedProbability;
+                    }
+                }
+            }
+
+            const nextDocumentTokenProbability = nextDocumentToken == null
+                ? undefined
+                : nextProbabilities.get(nextDocumentToken);
+            if (nextDocumentTokenProbability != null && nextDocumentTokenProbability > 0) {
+                recentProbabilitiesTrail.push(nextDocumentTokenProbability);
+                weight *= nextDocumentTokenProbability;
+
+                if (recentProbabilitiesTrail.length > normalizationTrailSize)
+                    weight /= recentProbabilitiesTrail.shift()!;
+            }
+        }
+    });
+
+    if (lastPushedSeparatorIndex !== documentTokens.length)
+        pushSeparatorIndex(documentTokens.length, separatorTokens[0]!);
+
+    return res;
+}
+
+const idealTokenTexts = [
+    "\u6bb5", // means "section" in Chinese (according to https://github.com/ZeroEntropy-AI/llama-chunk)
+    "\u987f", // means "pause" in Chinese (according to Llama 3.1 8B and Qwen 2.5 3B)
+    "\u00a1", // inverted exclamation mark
+    "|",
+    "_"
+];
+function findAppropriateSeparatorTokens(model: LlamaModel, maxTokens: number = 2): Token[] {
+    const idealTextsSet = new Set(idealTokenTexts);
+    const foundTokens: Token[] = [];
+
+    for (const token of model.iterateAllTokens()) {
+        if (model.isSpecialToken(token))
+            continue;
+
+        const text = model.detokenize([token]);
+        const trimmedText = text.trim();
+        if (idealTextsSet.has(trimmedText)) {
+            const textIndex = idealTokenTexts.findIndex((idealText) => idealText === trimmedText);
+
+            if (foundTokens[textIndex] == null || text === trimmedText)
+                foundTokens[textIndex] = token;
+        }
+    }
+
+    const res: Token[] = [];
+    for (let i = 0; i < idealTokenTexts.length; i++) {
+        const token = foundTokens[i];
+
+        if (token != null)
+            res.push(token);
+    }
+
+    return res.slice(0, maxTokens);
+}
+
+function getDefaultPrompt({
+    separatorTokens, tokenizer, maxChunkSize = 500
+}: {
+    separatorTokens: Token[], tokenizer: Tokenizer, maxChunkSize?: number
+}): LlamaText {
+    if (separatorTokens.length === 0)
+        throw new Error("No separator tokens provided");
+    else if (separatorTokens.length > 2)
+        throw new Error("Maximum of 2 separator tokens are supported");
+
+    return LlamaText.joinValues("\n", [
+        'Your job is to act as a "Chunker", for usage in RAG pipelines. The user will provide a long document.',
+        "",
+        "You should repeat the exact same message verbatim. EXCEPT, you should insert split tokens throughout the document.",
+        "",
+        "# Instructions",
+        LlamaText([
+            "- For splits, use `",
+            new SpecialTokensText(tokenizer.detokenize([separatorTokens[0]!])),
+            '` as the "big split token" separator.'
+        ]),
+        separatorTokens.length > 1 && (
+            LlamaText([
+                "- For small splits, use `",
+                new SpecialTokensText(tokenizer.detokenize([separatorTokens[1]!])),
+                '` as the "big split token" separator.'
+            ])
+        ),
+        "- For example, in text document, small splits will be per-sentence, and big splits will be per-section. Do a big split BEFORE the header that defines a section.",
+        LlamaText([
+            "- You may get a user message that is unstructured or not structured cleanly. " +
+            "Still try to split that input as best as you can, even if it means doing a small split every ", Math.ceil(maxChunkSize / 5),
+            " characters, and a big split every ", Math.floor(maxChunkSize), " characters."
+        ]),
+        "- You should prefer to wait until the end of a newline or period to break, instead of breaking one or two tokens before that. If there are no newlines or periods, pick some other reasonable breakpoints instead.",
+        "- Your input could be anything - code, HTML, markdown, etc. You MUST try to output SOME split regardless of the input. Pick something reasonable! E.g. for nodejs, do a small split after every line or code block, and a big split after every function or class definitions.",
+        '- For HTML, add a small split token after every closing tag and sentence. Add a big split token after every closing tag of an "important" tag.',
+        "- Please note that you will sometimes not see your own splits in your previous output, that's OK, you MUST continue to try to output split tokens"
+    ].filter((x) => x !== false));
+}
+
+function adjustExponential(value: number, exponent: number, weight: number) {
+    if (value < 0)
+        return 0;
+    else if (value > 1)
+        return 1;
+
+    return (value * (1 - weight)) + (weight * Math.pow(value, exponent));
+}
diff --git a/src/index.ts b/src/index.ts
index 37d74ace..04f97ce8 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -18,7 +18,8 @@ import {LlamaEmbedding, type LlamaEmbeddingOptions, type LlamaEmbeddingJSON} fro
 import {
     type LlamaContextOptions, type SequenceEvaluateOptions, type BatchingOptions, type LlamaContextSequenceRepeatPenalty,
     type CustomBatchingDispatchSchedule, type CustomBatchingPrioritizationStrategy, type BatchItem, type PrioritizedBatchItem,
-    type ContextShiftOptions, type ContextTokensDeleteRange, type EvaluationPriority, type ControlledEvaluateIndexOutput
+    type ContextShiftOptions, type ContextTokensDeleteRange, type EvaluationPriority, type ControlledEvaluateInputItem,
+    type ControlledEvaluateIndexOutput
 } from "./evaluator/LlamaContext/types.js";
 import {TokenBias} from "./evaluator/TokenBias.js";
 import {
@@ -82,6 +83,7 @@ import {
     type CombinedModelDownloaderOptions
 } from "./utils/createModelDownloader.js";
 import {jsonDumps} from "./chatWrappers/utils/jsonDumps.js";
+import {experimentalChunkDocument} from "./evaluator/utils/chunkDocument.js";
 
 import {
     type ChatHistoryItem, type ChatModelFunctionCall, type ChatModelFunctions, type ChatModelResponse,
@@ -143,6 +145,7 @@ export {
     type ContextTokensDeleteRange,
     type EvaluationPriority,
     type LlamaContextSequenceRepeatPenalty,
+    type ControlledEvaluateInputItem,
     type ControlledEvaluateIndexOutput,
     TokenBias,
     LlamaEmbeddingContext,
@@ -290,5 +293,6 @@ export {
     CombinedModelDownloader,
     type CombinedModelDownloaderOptions,
     jsonDumps,
-    type OverridesObject
+    type OverridesObject,
+    experimentalChunkDocument
 };
diff --git a/test/modelDependent/llama3.1/chunkDocument.test.ts b/test/modelDependent/llama3.1/chunkDocument.test.ts
new file mode 100644
index 00000000..4484b3c2
--- /dev/null
+++ b/test/modelDependent/llama3.1/chunkDocument.test.ts
@@ -0,0 +1,82 @@
+import {describe, expect, test} from "vitest";
+import {getModelFile} from "../../utils/modelFiles.js";
+import {getTestLlama} from "../../utils/getTestLlama.js";
+import {experimentalChunkDocument} from "../../../src/index.js";
+
+// made up example paragraph
+const exampleParagraph = [
+    "The Luminawing (genus: Luxavis, species: nocturna) is a rare and enigmatic nocturnal creature native to the dense forests of the remote continent of Aethoria.",
+    "Characterized by its striking appearance and unique adaptations, this mystical animal has garnered significant attention from scientists and naturalists.",
+    "",
+    "## Physical Characteristics",
+    "The Luminawing's most distinctive feature is its pair of iridescent wings, which reflect the colors of its surroundings through a complex process involving microscopic crystals embedded in the wing membrane.",
+    "This remarkable ability allows the creature to blend seamlessly into the night sky, making it nearly invisible to predators and prey alike.",
+    "",
+    "Its slender body measures approximately 30-40 centimeters in length, covered in soft, glowing fur that shimmers like starlight under ultraviolet light. The Luminawing's large, round eyes are capable of perceiving even the faintest glows, allowing it to navigate through the dark forest with ease.",
+    "",
+    "## Behavior and Habitat",
+    "The Luminawing is a solitary creature, only coming together with others of its kind during the mating season.",
+    "It inhabits the dense forests of Aethoria, where it feeds on the nectar of rare, moon-blooming flowers (genus: Lunaria).",
+    "These flowers are said to possess magical properties, which are believed to be absorbed by the Luminawing through its diet.",
+    "",
+    "The creature's haunting melody can be heard echoing through the forest at dusk, a siren call that beckons in the night creatures and fills the air with wonder. This unique vocalization is thought to play a crucial role in the Luminawing's mating rituals and territorial defense.",
+    "",
+    "## Conservation Status",
+    "Due to its elusive nature and limited range, the Luminawing is currently listed as a species of special concern by the Aethorian Conservation Society.",
+    "Efforts are being made to protect its habitat and study its behavior, but more research is needed to fully understand this enigmatic creature's place in the ecosystem."
+].join("\n");
+
+describe("llama 3.1", () => {
+    describe("chunk document", () => {
+        test("DraftModelTokenPredictor", {timeout: 1000 * 60 * 60 * 2}, async () => {
+            const modelPath = await getModelFile("Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf");
+            const llama = await getTestLlama();
+
+            const model = await llama.loadModel({
+                modelPath
+            });
+            const context = await model.createContext({
+                contextSize: 4096
+            });
+            const contextSequence = context.getSequence();
+
+            const res = await experimentalChunkDocument({
+                contextSequence,
+                document: exampleParagraph
+            });
+
+            expect(res.join("\n---\n")).toMatchInlineSnapshot(`
+              "The Luminawing
+              ---
+               (genus: Luxavis, species: nocturna)
+              ---
+               is a rare and enigmatic nocturnal creature native to the dense forests of the remote continent of Aethoria.
+
+              ---
+              Characterized by its striking appearance and unique adaptations, this mystical animal has garnered significant attention from scientists and naturalists.
+
+              ## Physical Characteristics
+
+              ---
+              The Luminawing's most distinctive feature is its pair of iridescent wings, which reflect the colors of its surroundings through a complex process involving microscopic crystals embedded in the wing membrane.
+              This remarkable ability allows the creature to blend seamlessly into the night sky, making it nearly invisible to predators and prey alike.
+
+              Its slender body measures approximately 30-40 centimeters in length, covered in soft, glowing fur that shimmers like starlight under ultraviolet light. The Luminawing's large, round eyes are capable of perceiving even the faintest glows, allowing it to navigate through the dark forest with ease.
+
+              ## Behavior and Habitat
+              The Luminawing is a solitary creature, only coming together with others of its kind during the mating season.
+              It inhabits the dense forests of Aethoria, where it feeds on the nectar of rare, moon-blooming flowers (genus: Lunaria).
+              These flowers are said to possess magical properties, which are believed to be absorbed by the Luminawing through its diet.
+
+              The creature's haunting melody can be heard echoing through the forest at dusk, a siren call that beckons in the night creatures and fills the air with wonder. This unique vocalization is thought to play a crucial role in the Luminawing's mating rituals and territorial defense.
+
+              ## Conservation Status
+
+              ---
+              Due to its elusive nature and limited range, the Luminawing is currently listed as a species of special concern by the Aethorian Conservation Society.
+              Efforts are being made to protect its habitat and study its behavior, but more research is needed to fully understand this enigmatic creature's place in the ecosystem."
+            `);
+        });
+    });
+});
+

From 8c2a54bd93678f4b838628431edda92718812d7b Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Wed, 1 Jan 2025 05:14:21 +0200
Subject: [PATCH 28/73] fix: switch back to the latest `llama.cpp` release in
 the CI

---
 .github/workflows/build.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 7f3da7a2..7d9960d5 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -23,8 +23,7 @@ jobs:
       - name: Download latest llama.cpp release
         env:
           CI: true
-        # pinned to `b4291` temporarily until the Windows on Arm64 build is fixed
-        run: node ./dist/cli/cli.js source download --release b4291 --skipBuild --noBundle --noUsageExample --updateBinariesReleaseMetadataAndSaveGitBundle
+        run: node ./dist/cli/cli.js source download --release latest --skipBuild --noBundle --noUsageExample --updateBinariesReleaseMetadataAndSaveGitBundle
       - name: Upload build artifact
         uses: actions/upload-artifact@v4
         with:

From 084dfd4a30f20b07b2b4dffed93eb7d41ca94f04 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Wed, 1 Jan 2025 05:30:35 +0200
Subject: [PATCH 29/73] fix: missing includes

---
 llama/addon/globals/getMemoryInfo.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llama/addon/globals/getMemoryInfo.cpp b/llama/addon/globals/getMemoryInfo.cpp
index 26472d3a..50477caf 100644
--- a/llama/addon/globals/getMemoryInfo.cpp
+++ b/llama/addon/globals/getMemoryInfo.cpp
@@ -6,8 +6,9 @@
 #include <mach/mach.h>
 #include <sys/sysctl.h>
 #elif __linux__
-#include <iostream>
-#include <sys/sysinfo.h>
+#include <fstream>
+#include <sstream>
+#include <string>
 #elif _WIN32
 #include <iostream>
 #include <windows.h>

From a5b8ad450676129263c94de1789f9345548b2eca Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Wed, 1 Jan 2025 05:37:37 +0200
Subject: [PATCH 30/73] fix: Windows cmake build

---
 llama/profiles/llvm.win32.host-arm64.target-arm64.cmake | 3 +++
 llama/profiles/llvm.win32.host-x64.target-arm64.cmake   | 2 ++
 llama/profiles/llvm.win32.host-x64.target-x64.cmake     | 2 ++
 llama/toolchains/llvm.win32.host-x64.target-x64.cmake   | 2 ++
 llama/toolchains/win32.host-arm64.target-arm64.cmake    | 3 +++
 llama/toolchains/win32.host-x64.target-arm64.cmake      | 2 ++
 6 files changed, 14 insertions(+)

diff --git a/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake b/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake
index 0ba604b4..c1422b27 100644
--- a/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake
+++ b/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake
@@ -1,6 +1,9 @@
 set(PROGRAMFILES_ARM64 "$ENV{ProgramFiles\(Arm\)}")
 set(PROGRAMFILES "$ENV{ProgramFiles}")
 set(PROGRAMFILES_X86 "$ENV{ProgramFiles\(x86\)}")
+file(TO_CMAKE_PATH "${PROGRAMFILES_ARM64}" PROGRAMFILES_ARM64)
+file(TO_CMAKE_PATH "${PROGRAMFILES}" PROGRAMFILES)
+file(TO_CMAKE_PATH "${PROGRAMFILES_X86}" PROGRAMFILES_X86)
 set(PROGRAMFILES_PATHS
     "${PROGRAMFILES_ARM64}"
     "${PROGRAMFILES}"
diff --git a/llama/profiles/llvm.win32.host-x64.target-arm64.cmake b/llama/profiles/llvm.win32.host-x64.target-arm64.cmake
index 7eff4f49..e9796e02 100644
--- a/llama/profiles/llvm.win32.host-x64.target-arm64.cmake
+++ b/llama/profiles/llvm.win32.host-x64.target-arm64.cmake
@@ -1,5 +1,7 @@
 set(PROGRAMFILES "$ENV{ProgramFiles}")
 set(PROGRAMFILES_X86 "$ENV{ProgramFiles\(x86\)}")
+file(TO_CMAKE_PATH "${PROGRAMFILES}" PROGRAMFILES)
+file(TO_CMAKE_PATH "${PROGRAMFILES_X86}" PROGRAMFILES_X86)
 set(PROGRAMFILES_PATHS
     "${PROGRAMFILES}"
     "${PROGRAMFILES_X86}"
diff --git a/llama/profiles/llvm.win32.host-x64.target-x64.cmake b/llama/profiles/llvm.win32.host-x64.target-x64.cmake
index f02b0e8e..ed0e2e71 100644
--- a/llama/profiles/llvm.win32.host-x64.target-x64.cmake
+++ b/llama/profiles/llvm.win32.host-x64.target-x64.cmake
@@ -1,5 +1,7 @@
 set(PROGRAMFILES "$ENV{ProgramFiles}")
 set(PROGRAMFILES_X86 "$ENV{ProgramFiles\(x86\)}")
+file(TO_CMAKE_PATH "${PROGRAMFILES}" PROGRAMFILES)
+file(TO_CMAKE_PATH "${PROGRAMFILES_X86}" PROGRAMFILES_X86)
 set(PROGRAMFILES_PATHS
     "${PROGRAMFILES}"
     "${PROGRAMFILES_X86}"
diff --git a/llama/toolchains/llvm.win32.host-x64.target-x64.cmake b/llama/toolchains/llvm.win32.host-x64.target-x64.cmake
index ec4d495c..63e3bd52 100644
--- a/llama/toolchains/llvm.win32.host-x64.target-x64.cmake
+++ b/llama/toolchains/llvm.win32.host-x64.target-x64.cmake
@@ -9,6 +9,8 @@ set(LLVM_INSTALLATION_URL "https://github.com/llvm/llvm-project/releases/tag/llv
 
 set(PROGRAMFILES "$ENV{ProgramFiles}")
 set(PROGRAMFILES_X86 "$ENV{ProgramFiles\(x86\)}")
+file(TO_CMAKE_PATH "${PROGRAMFILES}" PROGRAMFILES)
+file(TO_CMAKE_PATH "${PROGRAMFILES_X86}" PROGRAMFILES_X86)
 set(PROGRAMFILES_PATHS
     "${PROGRAMFILES}"
     "${PROGRAMFILES_X86}"
diff --git a/llama/toolchains/win32.host-arm64.target-arm64.cmake b/llama/toolchains/win32.host-arm64.target-arm64.cmake
index 9e7c94ef..6eaabd72 100644
--- a/llama/toolchains/win32.host-arm64.target-arm64.cmake
+++ b/llama/toolchains/win32.host-arm64.target-arm64.cmake
@@ -14,6 +14,9 @@ set(LLVM_INSTALLATION_URL "https://github.com/llvm/llvm-project/releases/tag/llv
 set(PROGRAMFILES_ARM64 "$ENV{ProgramFiles\(Arm\)}")
 set(PROGRAMFILES "$ENV{ProgramFiles}")
 set(PROGRAMFILES_X86 "$ENV{ProgramFiles\(x86\)}")
+file(TO_CMAKE_PATH "${PROGRAMFILES_ARM64}" PROGRAMFILES_ARM64)
+file(TO_CMAKE_PATH "${PROGRAMFILES}" PROGRAMFILES)
+file(TO_CMAKE_PATH "${PROGRAMFILES_X86}" PROGRAMFILES_X86)
 set(PROGRAMFILES_PATHS
     "${PROGRAMFILES_ARM64}"
     "${PROGRAMFILES}"
diff --git a/llama/toolchains/win32.host-x64.target-arm64.cmake b/llama/toolchains/win32.host-x64.target-arm64.cmake
index da82b61b..243d1906 100644
--- a/llama/toolchains/win32.host-x64.target-arm64.cmake
+++ b/llama/toolchains/win32.host-x64.target-arm64.cmake
@@ -13,6 +13,8 @@ set(LLVM_INSTALLATION_URL "https://github.com/llvm/llvm-project/releases/tag/llv
 
 set(PROGRAMFILES "$ENV{ProgramFiles}")
 set(PROGRAMFILES_X86 "$ENV{ProgramFiles\(x86\)}")
+file(TO_CMAKE_PATH "${PROGRAMFILES}" PROGRAMFILES)
+file(TO_CMAKE_PATH "${PROGRAMFILES_X86}" PROGRAMFILES_X86)
 set(PROGRAMFILES_PATHS
     "${PROGRAMFILES}"
     "${PROGRAMFILES_X86}"

From 8ba59389c1a54e95b3b3c00c529a1d3707e87b65 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Wed, 1 Jan 2025 19:38:13 +0200
Subject: [PATCH 31/73] fix: Windows cmake build

---
 llama/cmake/win32.ensureNinjaPath.cmake       | 54 +++++++++++
 llama/cmake/win32.ensureNodeLib.cmake         | 34 +++++++
 .../win32.llvmApplyGnuModeAdaptations.cmake   | 12 +++
 llama/cmake/win32.llvmEnsureCmakeAr.cmake     | 34 +++++++
 .../cmake/win32.llvmUseGnuModeCompilers.cmake | 76 +++++++++++++++
 llama/cmake/win32.programFilesPaths.cmake     | 29 ++++++
 .../llvm.win32.host-arm64.target-arm64.cmake  | 83 ++---------------
 .../llvm.win32.host-x64.target-arm64.cmake    | 79 ++--------------
 .../llvm.win32.host-x64.target-x64.cmake      | 79 ++--------------
 .../llvm.win32.host-x64.target-x64.cmake      | 90 ++----------------
 .../win32.host-arm64.target-arm64.cmake       | 92 ++-----------------
 .../win32.host-x64.target-arm64.cmake         | 88 ++----------------
 src/bindings/utils/compileLLamaCpp.ts         |  2 +-
 13 files changed, 285 insertions(+), 467 deletions(-)
 create mode 100644 llama/cmake/win32.ensureNinjaPath.cmake
 create mode 100644 llama/cmake/win32.ensureNodeLib.cmake
 create mode 100644 llama/cmake/win32.llvmApplyGnuModeAdaptations.cmake
 create mode 100644 llama/cmake/win32.llvmEnsureCmakeAr.cmake
 create mode 100644 llama/cmake/win32.llvmUseGnuModeCompilers.cmake
 create mode 100644 llama/cmake/win32.programFilesPaths.cmake

diff --git a/llama/cmake/win32.ensureNinjaPath.cmake b/llama/cmake/win32.ensureNinjaPath.cmake
new file mode 100644
index 00000000..b0d38f7c
--- /dev/null
+++ b/llama/cmake/win32.ensureNinjaPath.cmake
@@ -0,0 +1,54 @@
+function(ensureNinjaPath PROGRAMFILES_PATHS)
+    if ((NOT DEFINED CMAKE_MAKE_PROGRAM OR NOT EXISTS "${CMAKE_MAKE_PROGRAM}" OR NOT CMAKE_MAKE_PROGRAM) AND (CMAKE_GENERATOR STREQUAL "Ninja" OR CMAKE_GENERATOR STREQUAL "Ninja Multi-Config"))
+        find_program(NINJA_EXECUTABLE ninja)
+
+        set(CMAKE_MAKE_PROGRAM "")
+
+        if(NINJA_EXECUTABLE AND EXISTS "${NINJA_EXECUTABLE}")
+            set(CMAKE_MAKE_PROGRAM "${NINJA_EXECUTABLE}")
+        endif()
+
+        if (NOT CMAKE_MAKE_PROGRAM OR NOT EXISTS "${CMAKE_MAKE_PROGRAM}")
+            set(PROGRAMDATA_PATH "$ENV{ProgramData}")
+            file(TO_CMAKE_PATH "${PROGRAMDATA_PATH}" PROGRAMDATA_PATH)
+
+            if (PROGRAMDATA_PATH AND EXISTS "${PROGRAMDATA_PATH}")
+                file(GLOB_RECURSE FOUND_NINJA_EXE "${PROGRAMDATA_PATH}/chocolatey/bin/ninja.exe")
+
+                if(FOUND_NINJA_EXE)
+                    list(GET FOUND_NINJA_EXE 0 CMAKE_MAKE_PROGRAM)
+                endif()
+            else()
+        endif()
+
+        if (NOT CMAKE_MAKE_PROGRAM OR NOT EXISTS "${CMAKE_MAKE_PROGRAM}")
+            set(LOCALAPPDATA_PATH "$ENV{LOCALAPPDATA}")
+            file(TO_CMAKE_PATH "${LOCALAPPDATA_PATH}" LOCALAPPDATA_PATH)
+
+            if (LOCALAPPDATA_PATH AND EXISTS "${LOCALAPPDATA_PATH}")
+                file(GLOB_RECURSE FOUND_NINJA_EXE "${LOCALAPPDATA_PATH}/Microsoft/WinGet/Packages/Ninja-build.Ninja_Microsoft.Winget.*/ninja.exe")
+
+                if(FOUND_NINJA_EXE)
+                    list(GET FOUND_NINJA_EXE 0 CMAKE_MAKE_PROGRAM)
+                endif()
+            else()
+        endif()
+
+        if (NOT CMAKE_MAKE_PROGRAM OR NOT EXISTS "${CMAKE_MAKE_PROGRAM}")
+            foreach(PATH IN LISTS PROGRAMFILES_PATHS)
+                file(GLOB_RECURSE FOUND_NINJA_EXE
+                    "${PATH}/Microsoft Visual Studio/*/CMake/Ninja/ninja.exe"
+                    "${PATH}/Microsoft Visual Studio/**/*/CMake/Ninja/ninja.exe")
+
+                if(FOUND_NINJA_EXE)
+                    list(GET FOUND_NINJA_EXE 0 CMAKE_MAKE_PROGRAM)
+                    break()
+                endif()
+            endforeach()
+        endif()
+
+        if (NOT CMAKE_MAKE_PROGRAM OR NOT EXISTS "${CMAKE_MAKE_PROGRAM}")
+            message(FATAL_ERROR "Ninja build system not found. Please install Ninja or Visual Studio Build Tools.")
+        endif()
+    endif()
+endfunction()
diff --git a/llama/cmake/win32.ensureNodeLib.cmake b/llama/cmake/win32.ensureNodeLib.cmake
new file mode 100644
index 00000000..5908b32c
--- /dev/null
+++ b/llama/cmake/win32.ensureNodeLib.cmake
@@ -0,0 +1,34 @@
+function(ensureNodeLib PROGRAMFILES_PATHS HOST_ARCH TARGET_ARCH)
+    if (CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
+        if (NOT DEFINED NODE_LIB_CMAKE_AR)
+            foreach(PATH IN LISTS PROGRAMFILES_PATHS)
+                if(NODE_LIB_CMAKE_AR)
+                    break()
+                endif()
+
+                file(GLOB_RECURSE FOUND_LIB_EXE
+                    "${PATH}/Microsoft Visual Studio/*/VC/Tools/MSVC/*/bin/Host${HOST_ARCH}/${TARGET_ARCH}/lib.exe"
+                    "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/MSVC/*/bin/Host${HOST_ARCH}/${TARGET_ARCH}/lib.exe")
+
+                if(FOUND_LIB_EXE)
+                    list(GET FOUND_LIB_EXE 0 NODE_LIB_CMAKE_AR)
+                    break()
+                endif()
+            endforeach()
+        endif()
+
+        set(NODE_LIB_CMAKE_AR_MACHINE_FLAG "")
+        if (TARGET_ARCH STREQUAL "x64")
+            set(NODE_LIB_CMAKE_AR_MACHINE_FLAG "/MACHINE:X64")
+        elseif (TARGET_ARCH STREQUAL "arm64")
+            set(NODE_LIB_CMAKE_AR_MACHINE_FLAG "/MACHINE:ARM64")
+        endif()
+
+        if (EXISTS "${NODE_LIB_CMAKE_AR}")
+            # Generate node.lib
+            execute_process(COMMAND ${NODE_LIB_CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS} ${NODE_LIB_CMAKE_AR_MACHINE_FLAG} /nologo)
+        else()
+            message(FATAL_ERROR "Windows Resource Compiler (lib.exe) not found. Please install Visual Studio Build Tools.")
+        endif()
+    endif()
+endfunction()
diff --git a/llama/cmake/win32.llvmApplyGnuModeAdaptations.cmake b/llama/cmake/win32.llvmApplyGnuModeAdaptations.cmake
new file mode 100644
index 00000000..4be6a324
--- /dev/null
+++ b/llama/cmake/win32.llvmApplyGnuModeAdaptations.cmake
@@ -0,0 +1,12 @@
+function(llvmApplyGnuModeAdaptations)
+    # adapt cmake-js to work with llvm in GNU mode
+    if (NOT CMAKE_SHARED_LINKER_FLAGS MATCHES "-Xlinker /DELAYLOAD:NODE.EXE")
+    string(REPLACE "/DELAYLOAD:NODE.EXE" "-Xlinker /DELAYLOAD:NODE.EXE -Xlinker /defaultlib:delayimp"
+        CMAKE_SHARED_LINKER_FLAGS
+        "${CMAKE_SHARED_LINKER_FLAGS}")
+    endif()
+
+    set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Xclang --dependent-lib=msvcrt")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Xclang --dependent-lib=msvcrt")
+endfunction()
diff --git a/llama/cmake/win32.llvmEnsureCmakeAr.cmake b/llama/cmake/win32.llvmEnsureCmakeAr.cmake
new file mode 100644
index 00000000..7e93539f
--- /dev/null
+++ b/llama/cmake/win32.llvmEnsureCmakeAr.cmake
@@ -0,0 +1,34 @@
+function(llvmEnsureCmakeAr PROGRAMFILES_PATHS CURRENT_ARCH)
+    set (LLVM_DIR_ARCH_NAME "")
+    if (CURRENT_ARCH STREQUAL "x64")
+        set (LLVM_DIR_ARCH_NAME "x64")
+    elseif (CURRENT_ARCH STREQUAL "arm64")
+        set (LLVM_DIR_ARCH_NAME "ARM64")
+    endif()
+
+    if (NOT DEFINED CMAKE_AR OR NOT EXISTS "${CMAKE_AR}")
+        set(LLVM_INSTALL_PATHS "")
+        foreach(PATH IN LISTS PROGRAMFILES_PATHS)
+            list(APPEND LLVM_INSTALL_PATHS "${PATH}/LLVM")
+
+            file(GLOB_RECURSE FOUND_LLVM_ROOT
+                "${PATH}/Microsoft Visual Studio/*/VC/Tools/Llvm/${CURRENT_ARCH}"
+                "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/Llvm/${CURRENT_ARCH}")
+
+            if(FOUND_LLVM_ROOT)
+                list(APPEND LLVM_INSTALL_PATHS ${FOUND_LLVM_ROOT})
+            endif()
+        endforeach()
+
+        if(DEFINED LLVM_ROOT AND EXISTS "${LLVM_ROOT}")
+            list(INSERT LLVM_INSTALL_PATHS 0 "${LLVM_ROOT}")
+        endif()
+
+        foreach(PATH IN LISTS LLVM_INSTALL_PATHS)
+            if(EXISTS "${PATH}/bin/llvm-ar.exe" AND EXISTS "${PATH}/bin/llvm-ar.exe")
+                set(CMAKE_AR "${PATH}/bin/llvm-ar.exe")
+                break()
+            endif()
+        endforeach()
+    endif()
+endfunction()
diff --git a/llama/cmake/win32.llvmUseGnuModeCompilers.cmake b/llama/cmake/win32.llvmUseGnuModeCompilers.cmake
new file mode 100644
index 00000000..7851191a
--- /dev/null
+++ b/llama/cmake/win32.llvmUseGnuModeCompilers.cmake
@@ -0,0 +1,76 @@
+function(llvmUseGnuModeCompilers CURRENT_ARCH PROGRAMFILES_PATHS)
+    set(LLVM_INSTALLATION_URL "https://github.com/llvm/llvm-project/releases/tag/llvmorg-19.1.5")
+
+    set(CMAKE_C_COMPILER clang)
+    set(CMAKE_CXX_COMPILER clang++)
+    set(CMAKE_RC_COMPILER llvm-rc)
+
+
+    set (LLVM_DIR_ARCH_NAME "")
+    if (CURRENT_ARCH STREQUAL "x64")
+        set (LLVM_DIR_ARCH_NAME "x64")
+    elseif (CURRENT_ARCH STREQUAL "arm64")
+        set (LLVM_DIR_ARCH_NAME "ARM64")
+    endif()
+
+    set(LLVM_INSTALL_PATHS "")
+    foreach(PATH IN LISTS PROGRAMFILES_PATHS)
+        list(APPEND LLVM_INSTALL_PATHS "${PATH}/LLVM")
+
+        file(GLOB_RECURSE FOUND_LLVM_ROOT
+            "${PATH}/Microsoft Visual Studio/*/VC/Tools/Llvm/${LLVM_DIR_ARCH_NAME}"
+            "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/Llvm/${LLVM_DIR_ARCH_NAME}")
+
+        if(FOUND_LLVM_ROOT)
+            list(APPEND LLVM_INSTALL_PATHS ${FOUND_LLVM_ROOT})
+        endif()
+    endforeach()
+
+    if(DEFINED LLVM_ROOT AND EXISTS "${LLVM_ROOT}")
+        list(INSERT LLVM_INSTALL_PATHS 0 "${LLVM_ROOT}")
+    endif()
+
+    set(LLVM_ROOT "")
+    foreach(PATH IN LISTS LLVM_INSTALL_PATHS)
+        if(EXISTS "${PATH}/bin/clang.exe" AND EXISTS "${PATH}/bin/clang++.exe" AND EXISTS "${PATH}/bin/llvm-rc.exe")
+            set(LLVM_ROOT "${PATH}")
+            break()
+        endif()
+    endforeach()
+
+    if(LLVM_ROOT STREQUAL "")
+        if (CURRENT_ARCH STREQUAL "arm64")
+            message(FATAL_ERROR "LLVM installation was not found. Please install LLVM for WoA (Windows on Arm): ${LLVM_INSTALLATION_URL}")
+        else()
+            message(FATAL_ERROR "LLVM installation was not found. Please install LLVM: ${LLVM_INSTALLATION_URL}")
+        endif()
+    endif()
+
+    if (NOT EXISTS "${CMAKE_C_COMPILER}" OR NOT EXISTS "${CMAKE_CXX_COMPILER}" OR NOT EXISTS "${CMAKE_RC_COMPILER}")
+        set(CMAKE_C_COMPILER "${LLVM_ROOT}/bin/clang.exe")
+        set(CMAKE_CXX_COMPILER "${LLVM_ROOT}/bin/clang++.exe")
+        set(CMAKE_RC_COMPILER "${LLVM_ROOT}/bin/llvm-rc.exe")
+    endif()
+
+    if (NOT EXISTS "${CMAKE_C_COMPILER}")
+        if (CURRENT_ARCH STREQUAL "arm64")
+            message(FATAL_ERROR "Clang compiler not found at ${CMAKE_C_COMPILER}. Please reinstall LLVM for WoA (Windows on Arm): ${LLVM_INSTALLATION_URL}")
+        else()
+            message(FATAL_ERROR "Clang compiler not found at ${CMAKE_C_COMPILER}. Please reinstall LLVM: ${LLVM_INSTALLATION_URL}")
+        endif()
+    endif()
+    if (NOT EXISTS "${CMAKE_CXX_COMPILER}")
+        if (CURRENT_ARCH STREQUAL "arm64")
+            message(FATAL_ERROR "Clang++ compiler not found at ${CMAKE_CXX_COMPILER}. Please reinstall LLVM for WoA (Windows on Arm): ${LLVM_INSTALLATION_URL}")
+        else()
+            message(FATAL_ERROR "Clang++ compiler not found at ${CMAKE_CXX_COMPILER}. Please reinstall LLVM: ${LLVM_INSTALLATION_URL}")
+        endif()
+    endif()
+    if (NOT EXISTS "${CMAKE_RC_COMPILER}")
+        if (CURRENT_ARCH STREQUAL "arm64")
+            message(FATAL_ERROR "LLVM Resource Compiler not found at ${CMAKE_RC_COMPILER}. Please reinstall LLVM for WoA (Windows on Arm): ${LLVM_INSTALLATION_URL}")
+        else()
+            message(FATAL_ERROR "LLVM Resource Compiler not found at ${CMAKE_RC_COMPILER}. Please reinstall LLVM: ${LLVM_INSTALLATION_URL}")
+        endif()
+    endif()
+endfunction()
diff --git a/llama/cmake/win32.programFilesPaths.cmake b/llama/cmake/win32.programFilesPaths.cmake
new file mode 100644
index 00000000..a0732489
--- /dev/null
+++ b/llama/cmake/win32.programFilesPaths.cmake
@@ -0,0 +1,29 @@
+function(setProgramFilesPaths CURRENT_ARCH PROGRAMFILES_PATHS)
+    set(PROGRAMFILES "$ENV{ProgramFiles}")
+    set(PROGRAMFILES_X86 "$ENV{ProgramFiles\(x86\)}")
+    file(TO_CMAKE_PATH "${PROGRAMFILES}" PROGRAMFILES)
+    file(TO_CMAKE_PATH "${PROGRAMFILES_X86}" PROGRAMFILES_X86)
+
+    if(CURRENT_ARCH STREQUAL "arm64")
+        set(PROGRAMFILES_ARM64 "$ENV{ProgramFiles\(Arm\)}")
+        file(TO_CMAKE_PATH "${PROGRAMFILES_ARM64}" PROGRAMFILES_ARM64)
+
+        set(PROGRAMFILES_PATHS
+            "${PROGRAMFILES_ARM64}"
+            "${PROGRAMFILES}"
+            "${PROGRAMFILES_X86}"
+            "C:/Program Files (Arm)"
+            "C:/Program Files"
+            "C:/Program Files (x86)"
+            PARENT_SCOPE
+        )
+    else()
+        set(PROGRAMFILES_PATHS
+            "${PROGRAMFILES}"
+            "${PROGRAMFILES_X86}"
+            "C:/Program Files"
+            "C:/Program Files (x86)"
+            PARENT_SCOPE
+        )
+    endif()
+endfunction()
diff --git a/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake b/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake
index c1422b27..a59e7d09 100644
--- a/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake
+++ b/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake
@@ -1,78 +1,11 @@
-set(PROGRAMFILES_ARM64 "$ENV{ProgramFiles\(Arm\)}")
-set(PROGRAMFILES "$ENV{ProgramFiles}")
-set(PROGRAMFILES_X86 "$ENV{ProgramFiles\(x86\)}")
-file(TO_CMAKE_PATH "${PROGRAMFILES_ARM64}" PROGRAMFILES_ARM64)
-file(TO_CMAKE_PATH "${PROGRAMFILES}" PROGRAMFILES)
-file(TO_CMAKE_PATH "${PROGRAMFILES_X86}" PROGRAMFILES_X86)
-set(PROGRAMFILES_PATHS
-    "${PROGRAMFILES_ARM64}"
-    "${PROGRAMFILES}"
-    "${PROGRAMFILES_X86}"
-    "C:/Program Files (Arm)"
-    "C:/Program Files"
-    "C:/Program Files (x86)"
-)
+include(../cmake/win32.programFilesPaths.cmake)
+setProgramFilesPaths("arm64" PROGRAMFILES_PATHS)
 
-if (CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
-    if (NOT DEFINED NODE_LIB_CMAKE_AR)
-        foreach(PATH IN LISTS PROGRAMFILES_PATHS)
-            if(NODE_LIB_CMAKE_AR)
-                break()
-            endif()
+include(../cmake/win32.ensureNodeLib.cmake)
+ensureNodeLib(PROGRAMFILES_PATHS "arm64" "arm64")
 
-            file(GLOB_RECURSE FOUND_LIB_EXE
-                "${PATH}/Microsoft Visual Studio/*/VC/Tools/MSVC/*/bin/Hostarm64/arm64/lib.exe"
-                "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/MSVC/*/bin/Hostarm64/arm64/lib.exe")
+include(../cmake/win32.llvmApplyGnuModeAdaptations.cmake)
+llvmApplyGnuModeAdaptations()
 
-            if(FOUND_LIB_EXE)
-                list(GET FOUND_LIB_EXE 0 NODE_LIB_CMAKE_AR)
-                break()
-            endif()
-        endforeach()
-    endif()
-
-    if (EXISTS "${NODE_LIB_CMAKE_AR}")
-        # Generate node.lib
-        execute_process(COMMAND ${NODE_LIB_CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS} /MACHINE:ARM64 /nologo)
-    else()
-        message(FATAL_ERROR "Windows Resource Compiler (lib.exe) not found. Please install Visual Studio Build Tools.")
-    endif()
-endif()
-
-# adapt cmake-js to work with llvm in GNU mode
-if (NOT CMAKE_SHARED_LINKER_FLAGS MATCHES "-Xlinker /DELAYLOAD:NODE.EXE")
-    string(REPLACE "/DELAYLOAD:NODE.EXE" "-Xlinker /DELAYLOAD:NODE.EXE -Xlinker /defaultlib:delayimp"
-        CMAKE_SHARED_LINKER_FLAGS
-        "${CMAKE_SHARED_LINKER_FLAGS}")
-endif()
-
-set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Xclang --dependent-lib=msvcrt")
-set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Xclang --dependent-lib=msvcrt")
-
-# ensure CMAKE_AR is configured
-if (NOT DEFINED CMAKE_AR OR NOT EXISTS "${CMAKE_AR}")
-    set(LLVM_INSTALL_PATHS "")
-    foreach(PATH IN LISTS PROGRAMFILES_PATHS)
-        list(APPEND LLVM_INSTALL_PATHS "${PATH}/LLVM")
-
-        file(GLOB_RECURSE FOUND_LLVM_ROOT
-            "${PATH}/Microsoft Visual Studio/*/VC/Tools/Llvm/ARM64"
-            "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/Llvm/ARM64")
-
-        if(FOUND_LLVM_ROOT)
-            list(APPEND LLVM_INSTALL_PATHS ${FOUND_LLVM_ROOT})
-        endif()
-    endforeach()
-
-    if(DEFINED LLVM_ROOT AND EXISTS "${LLVM_ROOT}")
-        list(INSERT LLVM_INSTALL_PATHS 0 "${LLVM_ROOT}")
-    endif()
-
-    foreach(PATH IN LISTS LLVM_INSTALL_PATHS)
-        if(EXISTS "${PATH}/bin/llvm-ar.exe" AND EXISTS "${PATH}/bin/llvm-ar.exe")
-            set(CMAKE_AR "${PATH}/bin/llvm-ar.exe")
-            break()
-        endif()
-    endforeach()
-endif()
\ No newline at end of file
+include(../cmake/win32.llvmEnsureCmakeAr.cmake)
+llvmEnsureCmakeAr(PROGRAMFILES_PATHS "arm64")
diff --git a/llama/profiles/llvm.win32.host-x64.target-arm64.cmake b/llama/profiles/llvm.win32.host-x64.target-arm64.cmake
index e9796e02..77d0ba2d 100644
--- a/llama/profiles/llvm.win32.host-x64.target-arm64.cmake
+++ b/llama/profiles/llvm.win32.host-x64.target-arm64.cmake
@@ -1,74 +1,11 @@
-set(PROGRAMFILES "$ENV{ProgramFiles}")
-set(PROGRAMFILES_X86 "$ENV{ProgramFiles\(x86\)}")
-file(TO_CMAKE_PATH "${PROGRAMFILES}" PROGRAMFILES)
-file(TO_CMAKE_PATH "${PROGRAMFILES_X86}" PROGRAMFILES_X86)
-set(PROGRAMFILES_PATHS
-    "${PROGRAMFILES}"
-    "${PROGRAMFILES_X86}"
-    "C:/Program Files"
-    "C:/Program Files (x86)"
-)
+include(../cmake/win32.programFilesPaths.cmake)
+setProgramFilesPaths("x64" PROGRAMFILES_PATHS)
 
-if (CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
-    if (NOT DEFINED NODE_LIB_CMAKE_AR)
-        foreach(PATH IN LISTS PROGRAMFILES_PATHS)
-            if(NODE_LIB_CMAKE_AR)
-                break()
-            endif()
+include(../cmake/win32.ensureNodeLib.cmake)
+ensureNodeLib(PROGRAMFILES_PATHS "x64" "arm64")
 
-            file(GLOB_RECURSE FOUND_LIB_EXE
-                "${PATH}/Microsoft Visual Studio/*/VC/Tools/MSVC/*/bin/Hostx64/arm64/lib.exe"
-                "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/MSVC/*/bin/Hostx64/arm64/lib.exe")
+include(../cmake/win32.llvmApplyGnuModeAdaptations.cmake)
+llvmApplyGnuModeAdaptations()
 
-            if(FOUND_LIB_EXE)
-                list(GET FOUND_LIB_EXE 0 NODE_LIB_CMAKE_AR)
-                break()
-            endif()
-        endforeach()
-    endif()
-
-    if (EXISTS "${NODE_LIB_CMAKE_AR}")
-        # Generate node.lib
-        execute_process(COMMAND ${NODE_LIB_CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS} /MACHINE:ARM64 /nologo)
-    else()
-        message(FATAL_ERROR "Windows Resource Compiler (lib.exe) not found. Please install Visual Studio Build Tools.")
-    endif()
-endif()
-
-# adapt cmake-js to work with llvm in GNU mode
-if (NOT CMAKE_SHARED_LINKER_FLAGS MATCHES "-Xlinker /DELAYLOAD:NODE.EXE")
-    string(REPLACE "/DELAYLOAD:NODE.EXE" "-Xlinker /DELAYLOAD:NODE.EXE -Xlinker /defaultlib:delayimp"
-        CMAKE_SHARED_LINKER_FLAGS
-        "${CMAKE_SHARED_LINKER_FLAGS}")
-endif()
-
-set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Xclang --dependent-lib=msvcrt")
-set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Xclang --dependent-lib=msvcrt")
-
-# ensure CMAKE_AR is configured
-if (NOT DEFINED CMAKE_AR OR NOT EXISTS "${CMAKE_AR}")
-    set(LLVM_INSTALL_PATHS "")
-    foreach(PATH IN LISTS PROGRAMFILES_PATHS)
-        list(APPEND LLVM_INSTALL_PATHS "${PATH}/LLVM")
-
-        file(GLOB_RECURSE FOUND_LLVM_ROOT
-            "${PATH}/Microsoft Visual Studio/*/VC/Tools/Llvm/x64"
-            "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/Llvm/x64")
-
-        if(FOUND_LLVM_ROOT)
-            list(APPEND LLVM_INSTALL_PATHS ${FOUND_LLVM_ROOT})
-        endif()
-    endforeach()
-
-    if(DEFINED LLVM_ROOT AND EXISTS "${LLVM_ROOT}")
-        list(INSERT LLVM_INSTALL_PATHS 0 "${LLVM_ROOT}")
-    endif()
-
-    foreach(PATH IN LISTS LLVM_INSTALL_PATHS)
-        if(EXISTS "${PATH}/bin/llvm-ar.exe" AND EXISTS "${PATH}/bin/llvm-ar.exe")
-            set(CMAKE_AR "${PATH}/bin/llvm-ar.exe")
-            break()
-        endif()
-    endforeach()
-endif()
\ No newline at end of file
+include(../cmake/win32.llvmEnsureCmakeAr.cmake)
+llvmEnsureCmakeAr(PROGRAMFILES_PATHS "x64")
diff --git a/llama/profiles/llvm.win32.host-x64.target-x64.cmake b/llama/profiles/llvm.win32.host-x64.target-x64.cmake
index ed0e2e71..3e71388f 100644
--- a/llama/profiles/llvm.win32.host-x64.target-x64.cmake
+++ b/llama/profiles/llvm.win32.host-x64.target-x64.cmake
@@ -1,74 +1,11 @@
-set(PROGRAMFILES "$ENV{ProgramFiles}")
-set(PROGRAMFILES_X86 "$ENV{ProgramFiles\(x86\)}")
-file(TO_CMAKE_PATH "${PROGRAMFILES}" PROGRAMFILES)
-file(TO_CMAKE_PATH "${PROGRAMFILES_X86}" PROGRAMFILES_X86)
-set(PROGRAMFILES_PATHS
-    "${PROGRAMFILES}"
-    "${PROGRAMFILES_X86}"
-    "C:/Program Files"
-    "C:/Program Files (x86)"
-)
+include(../cmake/win32.programFilesPaths.cmake)
+setProgramFilesPaths("x64" PROGRAMFILES_PATHS)
 
-if (CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
-    if (NOT DEFINED NODE_LIB_CMAKE_AR)
-        foreach(PATH IN LISTS PROGRAMFILES_PATHS)
-            if(NODE_LIB_CMAKE_AR)
-                break()
-            endif()
+include(../cmake/win32.ensureNodeLib.cmake)
+ensureNodeLib(PROGRAMFILES_PATHS "x64" "x64")
 
-            file(GLOB_RECURSE FOUND_LIB_EXE
-                "${PATH}/Microsoft Visual Studio/*/VC/Tools/MSVC/*/bin/Hostx64/x64/lib.exe"
-                "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/MSVC/*/bin/Hostx64/x64/lib.exe")
+include(../cmake/win32.llvmApplyGnuModeAdaptations.cmake)
+llvmApplyGnuModeAdaptations()
 
-            if(FOUND_LIB_EXE)
-                list(GET FOUND_LIB_EXE 0 NODE_LIB_CMAKE_AR)
-                break()
-            endif()
-        endforeach()
-    endif()
-
-    if (EXISTS "${NODE_LIB_CMAKE_AR}")
-        # Generate node.lib
-        execute_process(COMMAND ${NODE_LIB_CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS} /MACHINE:X64 /nologo)
-    else()
-        message(FATAL_ERROR "Windows Resource Compiler (lib.exe) not found. Please install Visual Studio Build Tools.")
-    endif()
-endif()
-
-# adapt cmake-js to work with llvm in GNU mode
-if (NOT CMAKE_SHARED_LINKER_FLAGS MATCHES "-Xlinker /DELAYLOAD:NODE.EXE")
-    string(REPLACE "/DELAYLOAD:NODE.EXE" "-Xlinker /DELAYLOAD:NODE.EXE -Xlinker /defaultlib:delayimp"
-        CMAKE_SHARED_LINKER_FLAGS
-        "${CMAKE_SHARED_LINKER_FLAGS}")
-endif()
-
-set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Xclang --dependent-lib=msvcrt")
-set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Xclang --dependent-lib=msvcrt")
-
-# ensure CMAKE_AR is configured
-if (NOT DEFINED CMAKE_AR OR NOT EXISTS "${CMAKE_AR}")
-    set(LLVM_INSTALL_PATHS "")
-    foreach(PATH IN LISTS PROGRAMFILES_PATHS)
-        list(APPEND LLVM_INSTALL_PATHS "${PATH}/LLVM")
-
-        file(GLOB_RECURSE FOUND_LLVM_ROOT
-            "${PATH}/Microsoft Visual Studio/*/VC/Tools/Llvm/x64"
-            "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/Llvm/x64")
-
-        if(FOUND_LLVM_ROOT)
-            list(APPEND LLVM_INSTALL_PATHS ${FOUND_LLVM_ROOT})
-        endif()
-    endforeach()
-
-    if(DEFINED LLVM_ROOT AND EXISTS "${LLVM_ROOT}")
-        list(INSERT LLVM_INSTALL_PATHS 0 "${LLVM_ROOT}")
-    endif()
-
-    foreach(PATH IN LISTS LLVM_INSTALL_PATHS)
-        if(EXISTS "${PATH}/bin/llvm-ar.exe" AND EXISTS "${PATH}/bin/llvm-ar.exe")
-            set(CMAKE_AR "${PATH}/bin/llvm-ar.exe")
-            break()
-        endif()
-    endforeach()
-endif()
\ No newline at end of file
+include(../cmake/win32.llvmEnsureCmakeAr.cmake)
+llvmEnsureCmakeAr(PROGRAMFILES_PATHS "x64")
diff --git a/llama/toolchains/llvm.win32.host-x64.target-x64.cmake b/llama/toolchains/llvm.win32.host-x64.target-x64.cmake
index 63e3bd52..86d1ebbe 100644
--- a/llama/toolchains/llvm.win32.host-x64.target-x64.cmake
+++ b/llama/toolchains/llvm.win32.host-x64.target-x64.cmake
@@ -1,92 +1,20 @@
 set(CMAKE_SYSTEM_NAME Windows)
 set(CMAKE_SYSTEM_PROCESSOR x86_64)
 
-set(CMAKE_C_COMPILER clang)
-set(CMAKE_CXX_COMPILER clang++)
-set(CMAKE_RC_COMPILER llvm-rc)
+set(target x86_64-pc-windows-msvc)
+set(CMAKE_C_COMPILER_TARGET ${target})
+set(CMAKE_CXX_COMPILER_TARGET ${target})
 
-set(LLVM_INSTALLATION_URL "https://github.com/llvm/llvm-project/releases/tag/llvmorg-19.1.5")
+include(../cmake/win32.programFilesPaths.cmake)
+setProgramFilesPaths("x64" PROGRAMFILES_PATHS)
 
-set(PROGRAMFILES "$ENV{ProgramFiles}")
-set(PROGRAMFILES_X86 "$ENV{ProgramFiles\(x86\)}")
-file(TO_CMAKE_PATH "${PROGRAMFILES}" PROGRAMFILES)
-file(TO_CMAKE_PATH "${PROGRAMFILES_X86}" PROGRAMFILES_X86)
-set(PROGRAMFILES_PATHS
-    "${PROGRAMFILES}"
-    "${PROGRAMFILES_X86}"
-    "C:/Program Files"
-    "C:/Program Files (x86)"
-)
+include(../cmake/win32.llvmUseGnuModeCompilers.cmake)
+llvmUseGnuModeCompilers("x64" PROGRAMFILES_PATHS)
 
-set(LLVM_INSTALL_PATHS "")
-foreach(PATH IN LISTS PROGRAMFILES_PATHS)
-    list(APPEND LLVM_INSTALL_PATHS "${PATH}/LLVM")
-
-    file(GLOB_RECURSE FOUND_LLVM_ROOT
-        "${PATH}/Microsoft Visual Studio/*/VC/Tools/Llvm/x64"
-        "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/Llvm/x64")
-
-    if(FOUND_LLVM_ROOT)
-        list(APPEND LLVM_INSTALL_PATHS ${FOUND_LLVM_ROOT})
-    endif()
-endforeach()
-
-if(DEFINED LLVM_ROOT AND EXISTS "${LLVM_ROOT}")
-    list(INSERT LLVM_INSTALL_PATHS 0 "${LLVM_ROOT}")
-endif()
-
-set(LLVM_ROOT "")
-foreach(PATH IN LISTS LLVM_INSTALL_PATHS)
-    if(EXISTS "${PATH}/bin/clang.exe" AND EXISTS "${PATH}/bin/clang++.exe" AND EXISTS "${PATH}/bin/llvm-rc.exe")
-        set(LLVM_ROOT "${PATH}")
-        break()
-    endif()
-endforeach()
-
-if(LLVM_ROOT STREQUAL "")
-    message(FATAL_ERROR "LLVM installation was not found. Please install LLVM: ${LLVM_INSTALLATION_URL}")
-endif()
-
-if (NOT EXISTS "${CMAKE_C_COMPILER}" OR NOT EXISTS "${CMAKE_CXX_COMPILER}" OR NOT EXISTS "${CMAKE_RC_COMPILER}")
-    set(CMAKE_C_COMPILER "${LLVM_ROOT}/bin/clang.exe")
-    set(CMAKE_CXX_COMPILER "${LLVM_ROOT}/bin/clang++.exe")
-    set(CMAKE_RC_COMPILER "${LLVM_ROOT}/bin/llvm-rc.exe")
-endif()
-
-if (NOT EXISTS "${CMAKE_C_COMPILER}")
-    message(FATAL_ERROR "Clang compiler not found at ${CMAKE_C_COMPILER}. Please reinstall LLVM: ${LLVM_INSTALLATION_URL}")
-endif()
-if (NOT EXISTS "${CMAKE_CXX_COMPILER}")
-    message(FATAL_ERROR "Clang++ compiler not found at ${CMAKE_CXX_COMPILER}. Please reinstall LLVM: ${LLVM_INSTALLATION_URL}")
-endif()
-if (NOT EXISTS "${CMAKE_RC_COMPILER}")
-    message(FATAL_ERROR "LLVM Resource Compiler not found at ${CMAKE_RC_COMPILER}. Please reinstall LLVM: ${LLVM_INSTALLATION_URL}")
-endif()
+include(../cmake/win32.ensureNinjaPath.cmake)
+ensureNinjaPath(PROGRAMFILES_PATHS)
 
 set(arch_c_flags "-march=native")
 
 set(CMAKE_C_FLAGS_INIT "${arch_c_flags}")
 set(CMAKE_CXX_FLAGS_INIT "${arch_c_flags}")
-
-if ((NOT DEFINED CMAKE_MAKE_PROGRAM OR NOT EXISTS CMAKE_MAKE_PROGRAM) AND (CMAKE_GENERATOR STREQUAL "Ninja" OR CMAKE_GENERATOR STREQUAL "Ninja Multi-Config"))
-    find_program(NINJA_EXECUTABLE ninja)
-
-    if(NINJA_EXECUTABLE AND EXISTS "${NINJA_EXECUTABLE}")
-        set(CMAKE_MAKE_PROGRAM "${NINJA_EXECUTABLE}")
-    else()
-        foreach(PATH IN LISTS PROGRAMFILES_PATHS)
-            file(GLOB_RECURSE FOUND_NINJA_EXE
-                "${PATH}/Microsoft Visual Studio/*/CMake/Ninja/ninja.exe"
-                "${PATH}/Microsoft Visual Studio/**/*/CMake/Ninja/ninja.exe")
-
-            if(FOUND_NINJA_EXE)
-                list(GET FOUND_NINJA_EXE 0 CMAKE_MAKE_PROGRAM)
-                break()
-            endif()
-        endforeach()
-    endif()
-
-    if (NOT CMAKE_MAKE_PROGRAM OR NOT EXISTS "${CMAKE_MAKE_PROGRAM}")
-        message(FATAL_ERROR "Ninja build system not found. Please install Ninja or Visual Studio Build Tools.")
-    endif()
-endif()
diff --git a/llama/toolchains/win32.host-arm64.target-arm64.cmake b/llama/toolchains/win32.host-arm64.target-arm64.cmake
index 6eaabd72..3a546402 100644
--- a/llama/toolchains/win32.host-arm64.target-arm64.cmake
+++ b/llama/toolchains/win32.host-arm64.target-arm64.cmake
@@ -5,97 +5,17 @@ set(target arm64-pc-windows-msvc)
 set(CMAKE_C_COMPILER_TARGET ${target})
 set(CMAKE_CXX_COMPILER_TARGET ${target})
 
-set(CMAKE_C_COMPILER clang)
-set(CMAKE_CXX_COMPILER clang++)
-set(CMAKE_RC_COMPILER llvm-rc)
+include(../cmake/win32.programFilesPaths.cmake)
+setProgramFilesPaths("arm64" PROGRAMFILES_PATHS)
 
-set(LLVM_INSTALLATION_URL "https://github.com/llvm/llvm-project/releases/tag/llvmorg-19.1.5")
+include(../cmake/win32.llvmUseGnuModeCompilers.cmake)
+llvmUseGnuModeCompilers("arm64" PROGRAMFILES_PATHS)
 
-set(PROGRAMFILES_ARM64 "$ENV{ProgramFiles\(Arm\)}")
-set(PROGRAMFILES "$ENV{ProgramFiles}")
-set(PROGRAMFILES_X86 "$ENV{ProgramFiles\(x86\)}")
-file(TO_CMAKE_PATH "${PROGRAMFILES_ARM64}" PROGRAMFILES_ARM64)
-file(TO_CMAKE_PATH "${PROGRAMFILES}" PROGRAMFILES)
-file(TO_CMAKE_PATH "${PROGRAMFILES_X86}" PROGRAMFILES_X86)
-set(PROGRAMFILES_PATHS
-    "${PROGRAMFILES_ARM64}"
-    "${PROGRAMFILES}"
-    "${PROGRAMFILES_X86}"
-    "C:/Program Files (Arm)"
-    "C:/Program Files"
-    "C:/Program Files (x86)"
-)
-
-set(LLVM_INSTALL_PATHS "")
-foreach(PATH IN LISTS PROGRAMFILES_PATHS)
-    list(APPEND LLVM_INSTALL_PATHS "${PATH}/LLVM")
-
-    file(GLOB_RECURSE FOUND_LLVM_ROOT
-        "${PATH}/Microsoft Visual Studio/*/VC/Tools/Llvm/ARM64"
-        "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/Llvm/ARM64")
-
-    if(FOUND_LLVM_ROOT)
-        list(APPEND LLVM_INSTALL_PATHS ${FOUND_LLVM_ROOT})
-    endif()
-endforeach()
-
-if(DEFINED LLVM_ROOT AND EXISTS "${LLVM_ROOT}")
-    list(INSERT LLVM_INSTALL_PATHS 0 "${LLVM_ROOT}")
-endif()
-
-set(LLVM_ROOT "")
-foreach(PATH IN LISTS LLVM_INSTALL_PATHS)
-    if(EXISTS "${PATH}/bin/clang.exe" AND EXISTS "${PATH}/bin/clang++.exe" AND EXISTS "${PATH}/bin/llvm-rc.exe")
-        set(LLVM_ROOT "${PATH}")
-        break()
-    endif()
-endforeach()
-
-if(LLVM_ROOT STREQUAL "")
-    message(FATAL_ERROR "LLVM installation was not found. Please install LLVM for WoA (Windows on Arm): ${LLVM_INSTALLATION_URL}")
-endif()
-
-if (NOT EXISTS "${CMAKE_C_COMPILER}" OR NOT EXISTS "${CMAKE_CXX_COMPILER}" OR NOT EXISTS "${CMAKE_RC_COMPILER}")
-    set(CMAKE_C_COMPILER "${LLVM_ROOT}/bin/clang.exe")
-    set(CMAKE_CXX_COMPILER "${LLVM_ROOT}/bin/clang++.exe")
-    set(CMAKE_RC_COMPILER "${LLVM_ROOT}/bin/llvm-rc.exe")
-endif()
-
-if (NOT EXISTS "${CMAKE_C_COMPILER}")
-    message(FATAL_ERROR "Clang compiler not found at ${CMAKE_C_COMPILER}. Please reinstall LLVM for WoA (Windows on Arm): ${LLVM_INSTALLATION_URL}")
-endif()
-if (NOT EXISTS "${CMAKE_CXX_COMPILER}")
-    message(FATAL_ERROR "Clang++ compiler not found at ${CMAKE_CXX_COMPILER}. Please reinstall LLVM for WoA (Windows on Arm): ${LLVM_INSTALLATION_URL}")
-endif()
-if (NOT EXISTS "${CMAKE_RC_COMPILER}")
-    message(FATAL_ERROR "LLVM Resource Compiler not found at ${CMAKE_RC_COMPILER}. Please reinstall LLVM for WoA (Windows on Arm): ${LLVM_INSTALLATION_URL}")
-endif()
+include(../cmake/win32.ensureNinjaPath.cmake)
+ensureNinjaPath(PROGRAMFILES_PATHS)
 
 set(arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only")
 set(warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments")
 
 set(CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}")
 set(CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}")
-
-if ((NOT DEFINED CMAKE_MAKE_PROGRAM OR NOT EXISTS CMAKE_MAKE_PROGRAM) AND (CMAKE_GENERATOR STREQUAL "Ninja" OR CMAKE_GENERATOR STREQUAL "Ninja Multi-Config"))
-    find_program(NINJA_EXECUTABLE ninja)
-
-    if(NINJA_EXECUTABLE AND EXISTS "${NINJA_EXECUTABLE}")
-        set(CMAKE_MAKE_PROGRAM "${NINJA_EXECUTABLE}")
-    else()
-        foreach(PATH IN LISTS PROGRAMFILES_PATHS)
-            file(GLOB_RECURSE FOUND_NINJA_EXE
-                "${PATH}/Microsoft Visual Studio/*/CMake/Ninja/ninja.exe"
-                "${PATH}/Microsoft Visual Studio/**/*/CMake/Ninja/ninja.exe")
-
-            if(FOUND_NINJA_EXE)
-                list(GET FOUND_NINJA_EXE 0 CMAKE_MAKE_PROGRAM)
-                break()
-            endif()
-        endforeach()
-    endif()
-
-    if (NOT CMAKE_MAKE_PROGRAM OR NOT EXISTS "${CMAKE_MAKE_PROGRAM}")
-        message(FATAL_ERROR "Ninja build system not found. Please install Ninja or Visual Studio Build Tools.")
-    endif()
-endif()
diff --git a/llama/toolchains/win32.host-x64.target-arm64.cmake b/llama/toolchains/win32.host-x64.target-arm64.cmake
index 243d1906..eae10f8d 100644
--- a/llama/toolchains/win32.host-x64.target-arm64.cmake
+++ b/llama/toolchains/win32.host-x64.target-arm64.cmake
@@ -5,93 +5,17 @@ set(target arm64-pc-windows-msvc)
 set(CMAKE_C_COMPILER_TARGET ${target})
 set(CMAKE_CXX_COMPILER_TARGET ${target})
 
-set(CMAKE_C_COMPILER clang)
-set(CMAKE_CXX_COMPILER clang++)
-set(CMAKE_RC_COMPILER llvm-rc)
+include(../cmake/win32.programFilesPaths.cmake)
+setProgramFilesPaths("x64" PROGRAMFILES_PATHS)
 
-set(LLVM_INSTALLATION_URL "https://github.com/llvm/llvm-project/releases/tag/llvmorg-19.1.5")
+include(../cmake/win32.llvmUseGnuModeCompilers.cmake)
+llvmUseGnuModeCompilers("x64" PROGRAMFILES_PATHS)
 
-set(PROGRAMFILES "$ENV{ProgramFiles}")
-set(PROGRAMFILES_X86 "$ENV{ProgramFiles\(x86\)}")
-file(TO_CMAKE_PATH "${PROGRAMFILES}" PROGRAMFILES)
-file(TO_CMAKE_PATH "${PROGRAMFILES_X86}" PROGRAMFILES_X86)
-set(PROGRAMFILES_PATHS
-    "${PROGRAMFILES}"
-    "${PROGRAMFILES_X86}"
-    "C:/Program Files"
-    "C:/Program Files (x86)"
-)
-
-set(LLVM_INSTALL_PATHS "")
-foreach(PATH IN LISTS PROGRAMFILES_PATHS)
-    list(APPEND LLVM_INSTALL_PATHS "${PATH}/LLVM")
-
-    file(GLOB_RECURSE FOUND_LLVM_ROOT
-        "${PATH}/Microsoft Visual Studio/*/VC/Tools/Llvm/x64"
-        "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/Llvm/x64")
-
-    if(FOUND_LLVM_ROOT)
-        list(APPEND LLVM_INSTALL_PATHS ${FOUND_LLVM_ROOT})
-    endif()
-endforeach()
-
-if(DEFINED LLVM_ROOT AND EXISTS "${LLVM_ROOT}")
-    list(INSERT LLVM_INSTALL_PATHS 0 "${LLVM_ROOT}")
-endif()
-
-set(LLVM_ROOT "")
-foreach(PATH IN LISTS LLVM_INSTALL_PATHS)
-    if(EXISTS "${PATH}/bin/clang.exe" AND EXISTS "${PATH}/bin/clang++.exe" AND EXISTS "${PATH}/bin/llvm-rc.exe")
-        set(LLVM_ROOT "${PATH}")
-        break()
-    endif()
-endforeach()
-
-if(LLVM_ROOT STREQUAL "")
-    message(FATAL_ERROR "LLVM installation was not found. Please install LLVM: ${LLVM_INSTALLATION_URL}")
-endif()
-
-if (NOT EXISTS "${CMAKE_C_COMPILER}" OR NOT EXISTS "${CMAKE_CXX_COMPILER}" OR NOT EXISTS "${CMAKE_RC_COMPILER}")
-    set(CMAKE_C_COMPILER "${LLVM_ROOT}/bin/clang.exe")
-    set(CMAKE_CXX_COMPILER "${LLVM_ROOT}/bin/clang++.exe")
-    set(CMAKE_RC_COMPILER "${LLVM_ROOT}/bin/llvm-rc.exe")
-endif()
-
-if (NOT EXISTS "${CMAKE_C_COMPILER}")
-    message(FATAL_ERROR "Clang compiler not found at ${CMAKE_C_COMPILER}. Please reinstall LLVM: ${LLVM_INSTALLATION_URL}")
-endif()
-if (NOT EXISTS "${CMAKE_CXX_COMPILER}")
-    message(FATAL_ERROR "Clang++ compiler not found at ${CMAKE_CXX_COMPILER}. Please reinstall LLVM: ${LLVM_INSTALLATION_URL}")
-endif()
-if (NOT EXISTS "${CMAKE_RC_COMPILER}")
-    message(FATAL_ERROR "LLVM Resource Compiler not found at ${CMAKE_RC_COMPILER}. Please reinstall LLVM: ${LLVM_INSTALLATION_URL}")
-endif()
+include(../cmake/win32.ensureNinjaPath.cmake)
+ensureNinjaPath(PROGRAMFILES_PATHS)
 
 set(arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only")
 set(warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments")
 
 set(CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}")
 set(CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}")
-
-if ((NOT DEFINED CMAKE_MAKE_PROGRAM OR NOT EXISTS CMAKE_MAKE_PROGRAM) AND (CMAKE_GENERATOR STREQUAL "Ninja" OR CMAKE_GENERATOR STREQUAL "Ninja Multi-Config"))
-    find_program(NINJA_EXECUTABLE ninja)
-
-    if(NINJA_EXECUTABLE AND EXISTS "${NINJA_EXECUTABLE}")
-        set(CMAKE_MAKE_PROGRAM "${NINJA_EXECUTABLE}")
-    else()
-        foreach(PATH IN LISTS PROGRAMFILES_PATHS)
-            file(GLOB_RECURSE FOUND_NINJA_EXE
-                "${PATH}/Microsoft Visual Studio/*/CMake/Ninja/ninja.exe"
-                "${PATH}/Microsoft Visual Studio/**/*/CMake/Ninja/ninja.exe")
-
-            if(FOUND_NINJA_EXE)
-                list(GET FOUND_NINJA_EXE 0 CMAKE_MAKE_PROGRAM)
-                break()
-            endif()
-        endforeach()
-    endif()
-
-    if (NOT CMAKE_MAKE_PROGRAM OR NOT EXISTS "${CMAKE_MAKE_PROGRAM}")
-        message(FATAL_ERROR "Ninja build system not found. Please install Ninja or Visual Studio Build Tools.")
-    endif()
-endif()
diff --git a/src/bindings/utils/compileLLamaCpp.ts b/src/bindings/utils/compileLLamaCpp.ts
index 7f90aa4f..4540fe89 100644
--- a/src/bindings/utils/compileLLamaCpp.ts
+++ b/src/bindings/utils/compileLLamaCpp.ts
@@ -310,7 +310,7 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                 chalk.yellow("To resolve errors related to Vulkan compilation, see the Vulkan guide: ") +
                 documentationPageUrls.Vulkan
             );
-        else if (useWindowsLlvm) {
+        else if (useWindowsLlvm && !ciMode) {
             if (buildOptions.progressLogs)
                 console.info(getConsoleLogPrefix(true) + "Trying to compile again without LLVM");
 

From 9d859eb73172209e9b83cc42a4be0e90668d9b3c Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Wed, 1 Jan 2025 20:07:37 +0200
Subject: [PATCH 32/73] fix: Windows cmake build

---
 llama/cmake/win32.ensureNinjaPath.cmake            |  2 +-
 llama/cmake/win32.ensureNodeLib.cmake              |  2 +-
 llama/cmake/win32.llvmEnsureCmakeAr.cmake          |  2 +-
 llama/cmake/win32.llvmUseGnuModeCompilers.cmake    |  2 +-
 llama/cmake/win32.programFilesPaths.cmake          |  2 +-
 .../llvm.win32.host-arm64.target-arm64.cmake       | 14 +++++++-------
 .../llvm.win32.host-x64.target-arm64.cmake         | 14 +++++++-------
 .../profiles/llvm.win32.host-x64.target-x64.cmake  | 14 +++++++-------
 .../llvm.win32.host-x64.target-x64.cmake           | 12 ++++++------
 .../toolchains/win32.host-arm64.target-arm64.cmake | 12 ++++++------
 llama/toolchains/win32.host-x64.target-arm64.cmake | 12 ++++++------
 11 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/llama/cmake/win32.ensureNinjaPath.cmake b/llama/cmake/win32.ensureNinjaPath.cmake
index b0d38f7c..6b0169e5 100644
--- a/llama/cmake/win32.ensureNinjaPath.cmake
+++ b/llama/cmake/win32.ensureNinjaPath.cmake
@@ -1,4 +1,4 @@
-function(ensureNinjaPath PROGRAMFILES_PATHS)
+function(ensureNinjaPath)
     if ((NOT DEFINED CMAKE_MAKE_PROGRAM OR NOT EXISTS "${CMAKE_MAKE_PROGRAM}" OR NOT CMAKE_MAKE_PROGRAM) AND (CMAKE_GENERATOR STREQUAL "Ninja" OR CMAKE_GENERATOR STREQUAL "Ninja Multi-Config"))
         find_program(NINJA_EXECUTABLE ninja)
 
diff --git a/llama/cmake/win32.ensureNodeLib.cmake b/llama/cmake/win32.ensureNodeLib.cmake
index 5908b32c..a4ce0261 100644
--- a/llama/cmake/win32.ensureNodeLib.cmake
+++ b/llama/cmake/win32.ensureNodeLib.cmake
@@ -1,4 +1,4 @@
-function(ensureNodeLib PROGRAMFILES_PATHS HOST_ARCH TARGET_ARCH)
+function(ensureNodeLib HOST_ARCH TARGET_ARCH)
     if (CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
         if (NOT DEFINED NODE_LIB_CMAKE_AR)
             foreach(PATH IN LISTS PROGRAMFILES_PATHS)
diff --git a/llama/cmake/win32.llvmEnsureCmakeAr.cmake b/llama/cmake/win32.llvmEnsureCmakeAr.cmake
index 7e93539f..b8ad4acf 100644
--- a/llama/cmake/win32.llvmEnsureCmakeAr.cmake
+++ b/llama/cmake/win32.llvmEnsureCmakeAr.cmake
@@ -1,4 +1,4 @@
-function(llvmEnsureCmakeAr PROGRAMFILES_PATHS CURRENT_ARCH)
+function(llvmEnsureCmakeAr CURRENT_ARCH)
     set (LLVM_DIR_ARCH_NAME "")
     if (CURRENT_ARCH STREQUAL "x64")
         set (LLVM_DIR_ARCH_NAME "x64")
diff --git a/llama/cmake/win32.llvmUseGnuModeCompilers.cmake b/llama/cmake/win32.llvmUseGnuModeCompilers.cmake
index 7851191a..5eb2299b 100644
--- a/llama/cmake/win32.llvmUseGnuModeCompilers.cmake
+++ b/llama/cmake/win32.llvmUseGnuModeCompilers.cmake
@@ -1,4 +1,4 @@
-function(llvmUseGnuModeCompilers CURRENT_ARCH PROGRAMFILES_PATHS)
+function(llvmUseGnuModeCompilers CURRENT_ARCH)
     set(LLVM_INSTALLATION_URL "https://github.com/llvm/llvm-project/releases/tag/llvmorg-19.1.5")
 
     set(CMAKE_C_COMPILER clang)
diff --git a/llama/cmake/win32.programFilesPaths.cmake b/llama/cmake/win32.programFilesPaths.cmake
index a0732489..949d8c96 100644
--- a/llama/cmake/win32.programFilesPaths.cmake
+++ b/llama/cmake/win32.programFilesPaths.cmake
@@ -1,4 +1,4 @@
-function(setProgramFilesPaths CURRENT_ARCH PROGRAMFILES_PATHS)
+function(setProgramFilesPaths CURRENT_ARCH)
     set(PROGRAMFILES "$ENV{ProgramFiles}")
     set(PROGRAMFILES_X86 "$ENV{ProgramFiles\(x86\)}")
     file(TO_CMAKE_PATH "${PROGRAMFILES}" PROGRAMFILES)
diff --git a/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake b/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake
index a59e7d09..2d44dc1a 100644
--- a/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake
+++ b/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake
@@ -1,11 +1,11 @@
-include(../cmake/win32.programFilesPaths.cmake)
-setProgramFilesPaths("arm64" PROGRAMFILES_PATHS)
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.programFilesPaths.cmake")
+setProgramFilesPaths("arm64")
 
-include(../cmake/win32.ensureNodeLib.cmake)
-ensureNodeLib(PROGRAMFILES_PATHS "arm64" "arm64")
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.ensureNodeLib.cmake")
+ensureNodeLib("arm64" "arm64")
 
-include(../cmake/win32.llvmApplyGnuModeAdaptations.cmake)
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.llvmApplyGnuModeAdaptations.cmake")
 llvmApplyGnuModeAdaptations()
 
-include(../cmake/win32.llvmEnsureCmakeAr.cmake)
-llvmEnsureCmakeAr(PROGRAMFILES_PATHS "arm64")
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.llvmEnsureCmakeAr.cmake")
+llvmEnsureCmakeAr("arm64")
diff --git a/llama/profiles/llvm.win32.host-x64.target-arm64.cmake b/llama/profiles/llvm.win32.host-x64.target-arm64.cmake
index 77d0ba2d..9171a6ec 100644
--- a/llama/profiles/llvm.win32.host-x64.target-arm64.cmake
+++ b/llama/profiles/llvm.win32.host-x64.target-arm64.cmake
@@ -1,11 +1,11 @@
-include(../cmake/win32.programFilesPaths.cmake)
-setProgramFilesPaths("x64" PROGRAMFILES_PATHS)
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.programFilesPaths.cmake")
+setProgramFilesPaths("x64")
 
-include(../cmake/win32.ensureNodeLib.cmake)
-ensureNodeLib(PROGRAMFILES_PATHS "x64" "arm64")
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.ensureNodeLib.cmake")
+ensureNodeLib("x64" "arm64")
 
-include(../cmake/win32.llvmApplyGnuModeAdaptations.cmake)
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.llvmApplyGnuModeAdaptations.cmake")
 llvmApplyGnuModeAdaptations()
 
-include(../cmake/win32.llvmEnsureCmakeAr.cmake)
-llvmEnsureCmakeAr(PROGRAMFILES_PATHS "x64")
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.llvmEnsureCmakeAr.cmake")
+llvmEnsureCmakeAr("x64")
diff --git a/llama/profiles/llvm.win32.host-x64.target-x64.cmake b/llama/profiles/llvm.win32.host-x64.target-x64.cmake
index 3e71388f..e8e15664 100644
--- a/llama/profiles/llvm.win32.host-x64.target-x64.cmake
+++ b/llama/profiles/llvm.win32.host-x64.target-x64.cmake
@@ -1,11 +1,11 @@
-include(../cmake/win32.programFilesPaths.cmake)
-setProgramFilesPaths("x64" PROGRAMFILES_PATHS)
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.programFilesPaths.cmake")
+setProgramFilesPaths("x64")
 
-include(../cmake/win32.ensureNodeLib.cmake)
-ensureNodeLib(PROGRAMFILES_PATHS "x64" "x64")
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.ensureNodeLib.cmake")
+ensureNodeLib("x64" "x64")
 
-include(../cmake/win32.llvmApplyGnuModeAdaptations.cmake)
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.llvmApplyGnuModeAdaptations.cmake")
 llvmApplyGnuModeAdaptations()
 
-include(../cmake/win32.llvmEnsureCmakeAr.cmake)
-llvmEnsureCmakeAr(PROGRAMFILES_PATHS "x64")
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.llvmEnsureCmakeAr.cmake")
+llvmEnsureCmakeAr("x64")
diff --git a/llama/toolchains/llvm.win32.host-x64.target-x64.cmake b/llama/toolchains/llvm.win32.host-x64.target-x64.cmake
index 86d1ebbe..2107f8e1 100644
--- a/llama/toolchains/llvm.win32.host-x64.target-x64.cmake
+++ b/llama/toolchains/llvm.win32.host-x64.target-x64.cmake
@@ -5,14 +5,14 @@ set(target x86_64-pc-windows-msvc)
 set(CMAKE_C_COMPILER_TARGET ${target})
 set(CMAKE_CXX_COMPILER_TARGET ${target})
 
-include(../cmake/win32.programFilesPaths.cmake)
-setProgramFilesPaths("x64" PROGRAMFILES_PATHS)
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.programFilesPaths.cmake")
+setProgramFilesPaths("x64")
 
-include(../cmake/win32.llvmUseGnuModeCompilers.cmake)
-llvmUseGnuModeCompilers("x64" PROGRAMFILES_PATHS)
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.llvmUseGnuModeCompilers.cmake")
+llvmUseGnuModeCompilers("x64")
 
-include(../cmake/win32.ensureNinjaPath.cmake)
-ensureNinjaPath(PROGRAMFILES_PATHS)
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.ensureNinjaPath.cmake")
+ensureNinjaPath()
 
 set(arch_c_flags "-march=native")
 
diff --git a/llama/toolchains/win32.host-arm64.target-arm64.cmake b/llama/toolchains/win32.host-arm64.target-arm64.cmake
index 3a546402..2af386ea 100644
--- a/llama/toolchains/win32.host-arm64.target-arm64.cmake
+++ b/llama/toolchains/win32.host-arm64.target-arm64.cmake
@@ -5,14 +5,14 @@ set(target arm64-pc-windows-msvc)
 set(CMAKE_C_COMPILER_TARGET ${target})
 set(CMAKE_CXX_COMPILER_TARGET ${target})
 
-include(../cmake/win32.programFilesPaths.cmake)
-setProgramFilesPaths("arm64" PROGRAMFILES_PATHS)
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.programFilesPaths.cmake")
+setProgramFilesPaths("arm64")
 
-include(../cmake/win32.llvmUseGnuModeCompilers.cmake)
-llvmUseGnuModeCompilers("arm64" PROGRAMFILES_PATHS)
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.llvmUseGnuModeCompilers.cmake")
+llvmUseGnuModeCompilers("arm64")
 
-include(../cmake/win32.ensureNinjaPath.cmake)
-ensureNinjaPath(PROGRAMFILES_PATHS)
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.ensureNinjaPath.cmake")
+ensureNinjaPath()
 
 set(arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only")
 set(warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments")
diff --git a/llama/toolchains/win32.host-x64.target-arm64.cmake b/llama/toolchains/win32.host-x64.target-arm64.cmake
index eae10f8d..df99fd62 100644
--- a/llama/toolchains/win32.host-x64.target-arm64.cmake
+++ b/llama/toolchains/win32.host-x64.target-arm64.cmake
@@ -5,14 +5,14 @@ set(target arm64-pc-windows-msvc)
 set(CMAKE_C_COMPILER_TARGET ${target})
 set(CMAKE_CXX_COMPILER_TARGET ${target})
 
-include(../cmake/win32.programFilesPaths.cmake)
-setProgramFilesPaths("x64" PROGRAMFILES_PATHS)
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.programFilesPaths.cmake")
+setProgramFilesPaths("x64")
 
-include(../cmake/win32.llvmUseGnuModeCompilers.cmake)
-llvmUseGnuModeCompilers("x64" PROGRAMFILES_PATHS)
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.llvmUseGnuModeCompilers.cmake")
+llvmUseGnuModeCompilers("x64")
 
-include(../cmake/win32.ensureNinjaPath.cmake)
-ensureNinjaPath(PROGRAMFILES_PATHS)
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.ensureNinjaPath.cmake")
+ensureNinjaPath()
 
 set(arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only")
 set(warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments")

From 89d6442c8590b79068effa6f8587a8569add2eb1 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Wed, 1 Jan 2025 20:43:04 +0200
Subject: [PATCH 33/73] fix: Windows cmake build

---
 llama/cmake/win32.ensureNinjaPath.cmake       | 17 ++++++++-------
 .../win32.llvmApplyGnuModeAdaptations.cmake   | 13 ++++++------
 llama/cmake/win32.llvmEnsureCmakeAr.cmake     | 11 ++++++----
 .../cmake/win32.llvmUseGnuModeCompilers.cmake | 21 +++++++++++--------
 llama/cmake/win32.programFilesPaths.cmake     | 10 +++++----
 5 files changed, 42 insertions(+), 30 deletions(-)

diff --git a/llama/cmake/win32.ensureNinjaPath.cmake b/llama/cmake/win32.ensureNinjaPath.cmake
index 6b0169e5..fc9c5605 100644
--- a/llama/cmake/win32.ensureNinjaPath.cmake
+++ b/llama/cmake/win32.ensureNinjaPath.cmake
@@ -2,10 +2,10 @@ function(ensureNinjaPath)
     if ((NOT DEFINED CMAKE_MAKE_PROGRAM OR NOT EXISTS "${CMAKE_MAKE_PROGRAM}" OR NOT CMAKE_MAKE_PROGRAM) AND (CMAKE_GENERATOR STREQUAL "Ninja" OR CMAKE_GENERATOR STREQUAL "Ninja Multi-Config"))
         find_program(NINJA_EXECUTABLE ninja)
 
-        set(CMAKE_MAKE_PROGRAM "")
+        set(CMAKE_MAKE_PROGRAM "" PARENT_SCOPE)
 
         if(NINJA_EXECUTABLE AND EXISTS "${NINJA_EXECUTABLE}")
-            set(CMAKE_MAKE_PROGRAM "${NINJA_EXECUTABLE}")
+            set(CMAKE_MAKE_PROGRAM "${NINJA_EXECUTABLE}" PARENT_SCOPE)
         endif()
 
         if (NOT CMAKE_MAKE_PROGRAM OR NOT EXISTS "${CMAKE_MAKE_PROGRAM}")
@@ -16,9 +16,10 @@ function(ensureNinjaPath)
                 file(GLOB_RECURSE FOUND_NINJA_EXE "${PROGRAMDATA_PATH}/chocolatey/bin/ninja.exe")
 
                 if(FOUND_NINJA_EXE)
-                    list(GET FOUND_NINJA_EXE 0 CMAKE_MAKE_PROGRAM)
+                    list(GET FOUND_NINJA_EXE 0 FOUND_CMAKE_MAKE_PROGRAM)
+                    list(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}" PARENT_SCOPE)
                 endif()
-            else()
+            endif()
         endif()
 
         if (NOT CMAKE_MAKE_PROGRAM OR NOT EXISTS "${CMAKE_MAKE_PROGRAM}")
@@ -29,9 +30,10 @@ function(ensureNinjaPath)
                 file(GLOB_RECURSE FOUND_NINJA_EXE "${LOCALAPPDATA_PATH}/Microsoft/WinGet/Packages/Ninja-build.Ninja_Microsoft.Winget.*/ninja.exe")
 
                 if(FOUND_NINJA_EXE)
-                    list(GET FOUND_NINJA_EXE 0 CMAKE_MAKE_PROGRAM)
+                    list(GET FOUND_NINJA_EXE 0 FOUND_CMAKE_MAKE_PROGRAM)
+                    list(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}" PARENT_SCOPE)
                 endif()
-            else()
+            endif()
         endif()
 
         if (NOT CMAKE_MAKE_PROGRAM OR NOT EXISTS "${CMAKE_MAKE_PROGRAM}")
@@ -41,7 +43,8 @@ function(ensureNinjaPath)
                     "${PATH}/Microsoft Visual Studio/**/*/CMake/Ninja/ninja.exe")
 
                 if(FOUND_NINJA_EXE)
-                    list(GET FOUND_NINJA_EXE 0 CMAKE_MAKE_PROGRAM)
+                    list(GET FOUND_NINJA_EXE 0 FOUND_CMAKE_MAKE_PROGRAM)
+                    list(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}" PARENT_SCOPE)
                     break()
                 endif()
             endforeach()
diff --git a/llama/cmake/win32.llvmApplyGnuModeAdaptations.cmake b/llama/cmake/win32.llvmApplyGnuModeAdaptations.cmake
index 4be6a324..7d9f4c04 100644
--- a/llama/cmake/win32.llvmApplyGnuModeAdaptations.cmake
+++ b/llama/cmake/win32.llvmApplyGnuModeAdaptations.cmake
@@ -1,12 +1,13 @@
 function(llvmApplyGnuModeAdaptations)
     # adapt cmake-js to work with llvm in GNU mode
     if (NOT CMAKE_SHARED_LINKER_FLAGS MATCHES "-Xlinker /DELAYLOAD:NODE.EXE")
-    string(REPLACE "/DELAYLOAD:NODE.EXE" "-Xlinker /DELAYLOAD:NODE.EXE -Xlinker /defaultlib:delayimp"
-        CMAKE_SHARED_LINKER_FLAGS
-        "${CMAKE_SHARED_LINKER_FLAGS}")
+        string(REPLACE "/DELAYLOAD:NODE.EXE" "-Xlinker /DELAYLOAD:NODE.EXE -Xlinker /defaultlib:delayimp"
+            UPDATED_CMAKE_SHARED_LINKER_FLAGS
+            "${CMAKE_SHARED_LINKER_FLAGS}")
+        set(CMAKE_SHARED_LINKER_FLAGS "${UPDATED_CMAKE_SHARED_LINKER_FLAGS}" PARENT_SCOPE)
     endif()
 
-    set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Xclang --dependent-lib=msvcrt")
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Xclang --dependent-lib=msvcrt")
+    set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON PARENT_SCOPE)
+    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Xclang --dependent-lib=msvcrt" PARENT_SCOPE)
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Xclang --dependent-lib=msvcrt" PARENT_SCOPE)
 endfunction()
diff --git a/llama/cmake/win32.llvmEnsureCmakeAr.cmake b/llama/cmake/win32.llvmEnsureCmakeAr.cmake
index b8ad4acf..02d9c884 100644
--- a/llama/cmake/win32.llvmEnsureCmakeAr.cmake
+++ b/llama/cmake/win32.llvmEnsureCmakeAr.cmake
@@ -11,9 +11,10 @@ function(llvmEnsureCmakeAr CURRENT_ARCH)
         foreach(PATH IN LISTS PROGRAMFILES_PATHS)
             list(APPEND LLVM_INSTALL_PATHS "${PATH}/LLVM")
 
-            file(GLOB_RECURSE FOUND_LLVM_ROOT
-                "${PATH}/Microsoft Visual Studio/*/VC/Tools/Llvm/${CURRENT_ARCH}"
-                "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/Llvm/${CURRENT_ARCH}")
+            file(GLOB_RECURSE FOUND_LLVM_ROOT LIST_DIRECTORIES true
+                "${PATH}/Microsoft Visual Studio/*/VC/Tools/Llvm/${LLVM_DIR_ARCH_NAME}"
+                "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/Llvm/${LLVM_DIR_ARCH_NAME}")
+            list(FILTER FOUND_LLVM_ROOT INCLUDE REGEX "VC/Tools/Llvm/${LLVM_DIR_ARCH_NAME}$")
 
             if(FOUND_LLVM_ROOT)
                 list(APPEND LLVM_INSTALL_PATHS ${FOUND_LLVM_ROOT})
@@ -24,9 +25,11 @@ function(llvmEnsureCmakeAr CURRENT_ARCH)
             list(INSERT LLVM_INSTALL_PATHS 0 "${LLVM_ROOT}")
         endif()
 
+        list(REMOVE_DUPLICATES LLVM_INSTALL_PATHS)
+
         foreach(PATH IN LISTS LLVM_INSTALL_PATHS)
             if(EXISTS "${PATH}/bin/llvm-ar.exe" AND EXISTS "${PATH}/bin/llvm-ar.exe")
-                set(CMAKE_AR "${PATH}/bin/llvm-ar.exe")
+                set(CMAKE_AR "${PATH}/bin/llvm-ar.exe" PARENT_SCOPE)
                 break()
             endif()
         endforeach()
diff --git a/llama/cmake/win32.llvmUseGnuModeCompilers.cmake b/llama/cmake/win32.llvmUseGnuModeCompilers.cmake
index 5eb2299b..2e0702ba 100644
--- a/llama/cmake/win32.llvmUseGnuModeCompilers.cmake
+++ b/llama/cmake/win32.llvmUseGnuModeCompilers.cmake
@@ -1,9 +1,9 @@
 function(llvmUseGnuModeCompilers CURRENT_ARCH)
     set(LLVM_INSTALLATION_URL "https://github.com/llvm/llvm-project/releases/tag/llvmorg-19.1.5")
 
-    set(CMAKE_C_COMPILER clang)
-    set(CMAKE_CXX_COMPILER clang++)
-    set(CMAKE_RC_COMPILER llvm-rc)
+    set(CMAKE_C_COMPILER clang PARENT_SCOPE)
+    set(CMAKE_CXX_COMPILER clang++ PARENT_SCOPE)
+    set(CMAKE_RC_COMPILER llvm-rc PARENT_SCOPE)
 
 
     set (LLVM_DIR_ARCH_NAME "")
@@ -17,9 +17,10 @@ function(llvmUseGnuModeCompilers CURRENT_ARCH)
     foreach(PATH IN LISTS PROGRAMFILES_PATHS)
         list(APPEND LLVM_INSTALL_PATHS "${PATH}/LLVM")
 
-        file(GLOB_RECURSE FOUND_LLVM_ROOT
+        file(GLOB_RECURSE FOUND_LLVM_ROOT LIST_DIRECTORIES true
             "${PATH}/Microsoft Visual Studio/*/VC/Tools/Llvm/${LLVM_DIR_ARCH_NAME}"
             "${PATH}/Microsoft Visual Studio/**/*/VC/Tools/Llvm/${LLVM_DIR_ARCH_NAME}")
+        list(FILTER FOUND_LLVM_ROOT INCLUDE REGEX "VC/Tools/Llvm/${LLVM_DIR_ARCH_NAME}$")
 
         if(FOUND_LLVM_ROOT)
             list(APPEND LLVM_INSTALL_PATHS ${FOUND_LLVM_ROOT})
@@ -30,10 +31,12 @@ function(llvmUseGnuModeCompilers CURRENT_ARCH)
         list(INSERT LLVM_INSTALL_PATHS 0 "${LLVM_ROOT}")
     endif()
 
-    set(LLVM_ROOT "")
+    list(REMOVE_DUPLICATES LLVM_INSTALL_PATHS)
+
+    set(LLVM_ROOT "" PARENT_SCOPE)
     foreach(PATH IN LISTS LLVM_INSTALL_PATHS)
         if(EXISTS "${PATH}/bin/clang.exe" AND EXISTS "${PATH}/bin/clang++.exe" AND EXISTS "${PATH}/bin/llvm-rc.exe")
-            set(LLVM_ROOT "${PATH}")
+            set(LLVM_ROOT "${PATH}" PARENT_SCOPE)
             break()
         endif()
     endforeach()
@@ -47,9 +50,9 @@ function(llvmUseGnuModeCompilers CURRENT_ARCH)
     endif()
 
     if (NOT EXISTS "${CMAKE_C_COMPILER}" OR NOT EXISTS "${CMAKE_CXX_COMPILER}" OR NOT EXISTS "${CMAKE_RC_COMPILER}")
-        set(CMAKE_C_COMPILER "${LLVM_ROOT}/bin/clang.exe")
-        set(CMAKE_CXX_COMPILER "${LLVM_ROOT}/bin/clang++.exe")
-        set(CMAKE_RC_COMPILER "${LLVM_ROOT}/bin/llvm-rc.exe")
+        set(CMAKE_C_COMPILER "${LLVM_ROOT}/bin/clang.exe" PARENT_SCOPE)
+        set(CMAKE_CXX_COMPILER "${LLVM_ROOT}/bin/clang++.exe" PARENT_SCOPE)
+        set(CMAKE_RC_COMPILER "${LLVM_ROOT}/bin/llvm-rc.exe" PARENT_SCOPE)
     endif()
 
     if (NOT EXISTS "${CMAKE_C_COMPILER}")
diff --git a/llama/cmake/win32.programFilesPaths.cmake b/llama/cmake/win32.programFilesPaths.cmake
index 949d8c96..4d1325b2 100644
--- a/llama/cmake/win32.programFilesPaths.cmake
+++ b/llama/cmake/win32.programFilesPaths.cmake
@@ -8,22 +8,24 @@ function(setProgramFilesPaths CURRENT_ARCH)
         set(PROGRAMFILES_ARM64 "$ENV{ProgramFiles\(Arm\)}")
         file(TO_CMAKE_PATH "${PROGRAMFILES_ARM64}" PROGRAMFILES_ARM64)
 
-        set(PROGRAMFILES_PATHS
+        set(PROGRAMFILES_PATHS_LIST
             "${PROGRAMFILES_ARM64}"
             "${PROGRAMFILES}"
             "${PROGRAMFILES_X86}"
             "C:/Program Files (Arm)"
             "C:/Program Files"
             "C:/Program Files (x86)"
-            PARENT_SCOPE
         )
+        list(REMOVE_DUPLICATES PROGRAMFILES_PATHS_LIST)
+        set(PROGRAMFILES_PATHS ${PROGRAMFILES_PATHS_LIST} PARENT_SCOPE)
     else()
-        set(PROGRAMFILES_PATHS
+        set(PROGRAMFILES_PATHS_LIST
             "${PROGRAMFILES}"
             "${PROGRAMFILES_X86}"
             "C:/Program Files"
             "C:/Program Files (x86)"
-            PARENT_SCOPE
         )
+        list(REMOVE_DUPLICATES PROGRAMFILES_PATHS_LIST)
+        set(PROGRAMFILES_PATHS ${PROGRAMFILES_PATHS_LIST} PARENT_SCOPE)
     endif()
 endfunction()

From 3e16195b3c41187b73a0cc06fa6d7f09fcaeb243 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Wed, 1 Jan 2025 20:52:07 +0200
Subject: [PATCH 34/73] fix: Windows cmake build

---
 llama/cmake/win32.ensureNinjaPath.cmake         | 5 +++++
 llama/cmake/win32.llvmUseGnuModeCompilers.cmake | 8 ++++++++
 2 files changed, 13 insertions(+)

diff --git a/llama/cmake/win32.ensureNinjaPath.cmake b/llama/cmake/win32.ensureNinjaPath.cmake
index fc9c5605..7d86dc66 100644
--- a/llama/cmake/win32.ensureNinjaPath.cmake
+++ b/llama/cmake/win32.ensureNinjaPath.cmake
@@ -2,9 +2,11 @@ function(ensureNinjaPath)
     if ((NOT DEFINED CMAKE_MAKE_PROGRAM OR NOT EXISTS "${CMAKE_MAKE_PROGRAM}" OR NOT CMAKE_MAKE_PROGRAM) AND (CMAKE_GENERATOR STREQUAL "Ninja" OR CMAKE_GENERATOR STREQUAL "Ninja Multi-Config"))
         find_program(NINJA_EXECUTABLE ninja)
 
+        set(CMAKE_MAKE_PROGRAM "")
         set(CMAKE_MAKE_PROGRAM "" PARENT_SCOPE)
 
         if(NINJA_EXECUTABLE AND EXISTS "${NINJA_EXECUTABLE}")
+            set(CMAKE_MAKE_PROGRAM "${NINJA_EXECUTABLE}")
             set(CMAKE_MAKE_PROGRAM "${NINJA_EXECUTABLE}" PARENT_SCOPE)
         endif()
 
@@ -17,6 +19,7 @@ function(ensureNinjaPath)
 
                 if(FOUND_NINJA_EXE)
                     list(GET FOUND_NINJA_EXE 0 FOUND_CMAKE_MAKE_PROGRAM)
+                    list(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}")
                     list(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}" PARENT_SCOPE)
                 endif()
             endif()
@@ -31,6 +34,7 @@ function(ensureNinjaPath)
 
                 if(FOUND_NINJA_EXE)
                     list(GET FOUND_NINJA_EXE 0 FOUND_CMAKE_MAKE_PROGRAM)
+                    list(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}")
                     list(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}" PARENT_SCOPE)
                 endif()
             endif()
@@ -44,6 +48,7 @@ function(ensureNinjaPath)
 
                 if(FOUND_NINJA_EXE)
                     list(GET FOUND_NINJA_EXE 0 FOUND_CMAKE_MAKE_PROGRAM)
+                    list(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}")
                     list(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}" PARENT_SCOPE)
                     break()
                 endif()
diff --git a/llama/cmake/win32.llvmUseGnuModeCompilers.cmake b/llama/cmake/win32.llvmUseGnuModeCompilers.cmake
index 2e0702ba..4f824970 100644
--- a/llama/cmake/win32.llvmUseGnuModeCompilers.cmake
+++ b/llama/cmake/win32.llvmUseGnuModeCompilers.cmake
@@ -1,8 +1,11 @@
 function(llvmUseGnuModeCompilers CURRENT_ARCH)
     set(LLVM_INSTALLATION_URL "https://github.com/llvm/llvm-project/releases/tag/llvmorg-19.1.5")
 
+    set(CMAKE_C_COMPILER clang)
     set(CMAKE_C_COMPILER clang PARENT_SCOPE)
+    set(CMAKE_CXX_COMPILER clang++)
     set(CMAKE_CXX_COMPILER clang++ PARENT_SCOPE)
+    set(CMAKE_RC_COMPILER llvm-rc)
     set(CMAKE_RC_COMPILER llvm-rc PARENT_SCOPE)
 
 
@@ -33,9 +36,11 @@ function(llvmUseGnuModeCompilers CURRENT_ARCH)
 
     list(REMOVE_DUPLICATES LLVM_INSTALL_PATHS)
 
+    set(LLVM_ROOT "")
     set(LLVM_ROOT "" PARENT_SCOPE)
     foreach(PATH IN LISTS LLVM_INSTALL_PATHS)
         if(EXISTS "${PATH}/bin/clang.exe" AND EXISTS "${PATH}/bin/clang++.exe" AND EXISTS "${PATH}/bin/llvm-rc.exe")
+            set(LLVM_ROOT "${PATH}")
             set(LLVM_ROOT "${PATH}" PARENT_SCOPE)
             break()
         endif()
@@ -50,8 +55,11 @@ function(llvmUseGnuModeCompilers CURRENT_ARCH)
     endif()
 
     if (NOT EXISTS "${CMAKE_C_COMPILER}" OR NOT EXISTS "${CMAKE_CXX_COMPILER}" OR NOT EXISTS "${CMAKE_RC_COMPILER}")
+        set(CMAKE_C_COMPILER "${LLVM_ROOT}/bin/clang.exe")
         set(CMAKE_C_COMPILER "${LLVM_ROOT}/bin/clang.exe" PARENT_SCOPE)
+        set(CMAKE_CXX_COMPILER "${LLVM_ROOT}/bin/clang++.exe")
         set(CMAKE_CXX_COMPILER "${LLVM_ROOT}/bin/clang++.exe" PARENT_SCOPE)
+        set(CMAKE_RC_COMPILER "${LLVM_ROOT}/bin/llvm-rc.exe")
         set(CMAKE_RC_COMPILER "${LLVM_ROOT}/bin/llvm-rc.exe" PARENT_SCOPE)
     endif()
 

From e050cdf7b7f79dd6bc05d841f7019ead7245f308 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Wed, 1 Jan 2025 21:18:34 +0200
Subject: [PATCH 35/73] fix: Windows cmake build

---
 llama/CMakeLists.txt                    |  9 +++------
 llama/cmake/win32.ensureNinjaPath.cmake | 16 +++++++++-------
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/llama/CMakeLists.txt b/llama/CMakeLists.txt
index 7e9f18a1..f95932fc 100644
--- a/llama/CMakeLists.txt
+++ b/llama/CMakeLists.txt
@@ -2,15 +2,12 @@ cmake_minimum_required(VERSION 3.14)
 
 if (NLC_TARGET_PLATFORM STREQUAL "win-arm64" AND (CMAKE_GENERATOR STREQUAL "Ninja" OR CMAKE_GENERATOR STREQUAL "Ninja Multi-Config") AND NOT MINGW)
     if(NLC_CURRENT_PLATFORM STREQUAL "win-x64")
-        get_filename_component(INCLUDE_PROFILE_ABS "./profiles/llvm.win32.host-x64.target-arm64.cmake" ABSOLUTE)
-        include("${INCLUDE_PROFILE_ABS}")
+        include("./profiles/llvm.win32.host-x64.target-arm64.cmake")
     elseif(NLC_CURRENT_PLATFORM STREQUAL "win-arm64")
-        get_filename_component(INCLUDE_PROFILE_ABS "./profiles/llvm.win32.host-arm64.target-arm64.cmake" ABSOLUTE)
-        include("${INCLUDE_PROFILE_ABS}")
+        include("./profiles/llvm.win32.host-arm64.target-arm64.cmake")
     endif()
 elseif (NLC_CURRENT_PLATFORM STREQUAL "win-x64" AND NLC_TARGET_PLATFORM STREQUAL "win-x64" AND (CMAKE_GENERATOR STREQUAL "Ninja" OR CMAKE_GENERATOR STREQUAL "Ninja Multi-Config") AND NOT MINGW)
-    get_filename_component(INCLUDE_PROFILE_ABS "./profiles/llvm.win32.host-x64.target-x64.cmake" ABSOLUTE)
-    include("${INCLUDE_PROFILE_ABS}")
+    include("./profiles/llvm.win32.host-x64.target-x64.cmake")
 endif()
 
 project("llama-addon" C CXX)
diff --git a/llama/cmake/win32.ensureNinjaPath.cmake b/llama/cmake/win32.ensureNinjaPath.cmake
index 7d86dc66..b92c285f 100644
--- a/llama/cmake/win32.ensureNinjaPath.cmake
+++ b/llama/cmake/win32.ensureNinjaPath.cmake
@@ -19,8 +19,8 @@ function(ensureNinjaPath)
 
                 if(FOUND_NINJA_EXE)
                     list(GET FOUND_NINJA_EXE 0 FOUND_CMAKE_MAKE_PROGRAM)
-                    list(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}")
-                    list(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}" PARENT_SCOPE)
+                    set(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}")
+                    set(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}" PARENT_SCOPE)
                 endif()
             endif()
         endif()
@@ -34,8 +34,8 @@ function(ensureNinjaPath)
 
                 if(FOUND_NINJA_EXE)
                     list(GET FOUND_NINJA_EXE 0 FOUND_CMAKE_MAKE_PROGRAM)
-                    list(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}")
-                    list(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}" PARENT_SCOPE)
+                    set(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}")
+                    set(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}" PARENT_SCOPE)
                 endif()
             endif()
         endif()
@@ -44,12 +44,14 @@ function(ensureNinjaPath)
             foreach(PATH IN LISTS PROGRAMFILES_PATHS)
                 file(GLOB_RECURSE FOUND_NINJA_EXE
                     "${PATH}/Microsoft Visual Studio/*/CMake/Ninja/ninja.exe"
-                    "${PATH}/Microsoft Visual Studio/**/*/CMake/Ninja/ninja.exe")
+                    "${PATH}/Microsoft Visual Studio/**/*/CMake/Ninja/ninja.exe"
+                    "${PATH}/Microsoft Visual Studio/*/Common7/IDE/CommonExtensions/Microsoft/CMake/Ninja/ninja.exe"
+                    "${PATH}/Microsoft Visual Studio/**/*/Common7/IDE/CommonExtensions/Microsoft/CMake/Ninja/ninja.exe")
 
                 if(FOUND_NINJA_EXE)
                     list(GET FOUND_NINJA_EXE 0 FOUND_CMAKE_MAKE_PROGRAM)
-                    list(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}")
-                    list(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}" PARENT_SCOPE)
+                    set(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}")
+                    set(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}" PARENT_SCOPE)
                     break()
                 endif()
             endforeach()

From c24c6ff85d17503431b0ba066ad84247f0338373 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Wed, 1 Jan 2025 21:26:54 +0200
Subject: [PATCH 36/73] fix: Windows build

---
 llama/addon/globals/getMemoryInfo.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llama/addon/globals/getMemoryInfo.cpp b/llama/addon/globals/getMemoryInfo.cpp
index 50477caf..1825c425 100644
--- a/llama/addon/globals/getMemoryInfo.cpp
+++ b/llama/addon/globals/getMemoryInfo.cpp
@@ -48,9 +48,9 @@ Napi::Value getMemoryInfo(const Napi::CallbackInfo& info) {
         addonLlamaCppLogCallback(GGML_LOG_LEVEL_ERROR, std::string("Failed to get memory usage info").c_str(), nullptr);
     }
 #elif _WIN32
-    PROCESS_MEMORY_COUNTERS memCounters;
-    
-    if (GetProcessMemoryInfo(GetCurrentProcess(), &memCounters, sizeof(memCounters))) {
+    PROCESS_MEMORY_COUNTERS_EX memCounters;
+
+    if (GetProcessMemoryInfo(GetCurrentProcess(), (PROCESS_MEMORY_COUNTERS*)&memCounters, sizeof(memCounters))) {
         totalMemoryUsage = memCounters.PrivateUsage;
     } else {
         addonLlamaCppLogCallback(GGML_LOG_LEVEL_ERROR, std::string("Failed to get memory usage info").c_str(), nullptr);

From 1714685d2b5d08b555c72796e10c80e213140d96 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Wed, 1 Jan 2025 21:28:19 +0200
Subject: [PATCH 37/73] fix: Windows build

---
 llama/profiles/llvm.win32.host-arm64.target-arm64.cmake | 3 +++
 llama/profiles/llvm.win32.host-x64.target-arm64.cmake   | 3 +++
 llama/profiles/llvm.win32.host-x64.target-x64.cmake     | 3 +++
 3 files changed, 9 insertions(+)

diff --git a/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake b/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake
index 2d44dc1a..ead3b394 100644
--- a/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake
+++ b/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake
@@ -9,3 +9,6 @@ llvmApplyGnuModeAdaptations()
 
 include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.llvmEnsureCmakeAr.cmake")
 llvmEnsureCmakeAr("arm64")
+
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.ensureNinjaPath.cmake")
+ensureNinjaPath()
diff --git a/llama/profiles/llvm.win32.host-x64.target-arm64.cmake b/llama/profiles/llvm.win32.host-x64.target-arm64.cmake
index 9171a6ec..42cee481 100644
--- a/llama/profiles/llvm.win32.host-x64.target-arm64.cmake
+++ b/llama/profiles/llvm.win32.host-x64.target-arm64.cmake
@@ -9,3 +9,6 @@ llvmApplyGnuModeAdaptations()
 
 include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.llvmEnsureCmakeAr.cmake")
 llvmEnsureCmakeAr("x64")
+
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.ensureNinjaPath.cmake")
+ensureNinjaPath()
diff --git a/llama/profiles/llvm.win32.host-x64.target-x64.cmake b/llama/profiles/llvm.win32.host-x64.target-x64.cmake
index e8e15664..f47e337b 100644
--- a/llama/profiles/llvm.win32.host-x64.target-x64.cmake
+++ b/llama/profiles/llvm.win32.host-x64.target-x64.cmake
@@ -9,3 +9,6 @@ llvmApplyGnuModeAdaptations()
 
 include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.llvmEnsureCmakeAr.cmake")
 llvmEnsureCmakeAr("x64")
+
+include("${CMAKE_CURRENT_LIST_DIR}/../cmake/win32.ensureNinjaPath.cmake")
+ensureNinjaPath()

From 6357bdc91f77ddd11ece478f21479ea6f40ad6a9 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Wed, 1 Jan 2025 22:05:49 +0200
Subject: [PATCH 38/73] fix: Windows build

---
 src/bindings/utils/detectBuildTools.ts | 69 +++++++++++++++++---------
 src/utils/cmake.ts                     |  4 +-
 2 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/src/bindings/utils/detectBuildTools.ts b/src/bindings/utils/detectBuildTools.ts
index e10884cc..1af01a79 100644
--- a/src/bindings/utils/detectBuildTools.ts
+++ b/src/bindings/utils/detectBuildTools.ts
@@ -13,7 +13,10 @@ export async function getWindowsVisualStudioEditionPaths() {
     const platform = getPlatform();
 
     if (platform !== "win")
-        return [];
+        return {
+            vsEditionPaths: [],
+            programFilesPaths: []
+        };
 
     const programFilesPaths = await getWindowsProgramFilesPaths();
     const potentialVisualStudioPaths = programFilesPaths
@@ -22,8 +25,10 @@ export async function getWindowsVisualStudioEditionPaths() {
     const versionPaths = (await Promise.all(
         potentialVisualStudioPaths.map(async (vsPath) => {
             if (await fs.pathExists(vsPath)) {
-                const versions = await fs.readdir(vsPath);
+                const versions = await fs.readdir(vsPath, {withFileTypes: true});
                 return versions
+                    .filter((dirent) => dirent.isDirectory())
+                    .map((dirent) => dirent.name)
                     .sort((a, b) => {
                         const aNumber = parseInt(a);
                         const bNumber = parseInt(b);
@@ -44,22 +49,27 @@ export async function getWindowsVisualStudioEditionPaths() {
         })
     )).flat();
 
-    const editionPaths = (await Promise.all(
+    const vsEditionPaths = (await Promise.all(
         versionPaths.map(async (versionPath) => {
-            const editions = await fs.readdir(versionPath);
-            return editions.map((edition) => path.join(versionPath, edition));
+            const editions = await fs.readdir(versionPath, {withFileTypes: true});
+            return editions
+                .filter((dirent) => dirent.isDirectory())
+                .map((edition) => path.join(versionPath, edition.name));
         })
     )).flat();
 
-    return editionPaths;
+    return {
+        vsEditionPaths,
+        programFilesPaths
+    };
 }
 
 export async function detectWindowsBuildTools(targetArch: typeof process.arch = process.arch) {
     try {
         const currentArch = process.arch;
-        const editionPaths = await getWindowsVisualStudioEditionPaths();
+        const {vsEditionPaths, programFilesPaths} = await getWindowsVisualStudioEditionPaths();
 
-        if (editionPaths.length === 0)
+        if (vsEditionPaths.length === 0 && programFilesPaths.length === 0)
             return {
                 hasCmake: false,
                 hasNinja: false,
@@ -67,8 +77,12 @@ export async function detectWindowsBuildTools(targetArch: typeof process.arch =
                 hasLibExe: false
             };
 
+        const programDataPaths: string[] = [
+            process.env["ProgramData"]
+        ].filter((programDataPath) => programDataPath != null);
+
         const msvcPaths = (await Promise.all(
-            editionPaths.map(async (editionPath) => {
+            vsEditionPaths.map(async (editionPath) => {
                 const msvcVersionsPath = path.join(editionPath, "VC", "Tools", "MSVC");
 
                 if (await fs.pathExists(msvcVersionsPath)) {
@@ -94,20 +108,29 @@ export async function detectWindowsBuildTools(targetArch: typeof process.arch =
             })
         )).flat();
 
-        const potentialCmakePaths = editionPaths.map((editionPath) => (
-            path.join(editionPath, "Common7", "IDE", "CommonExtensions", "Microsoft", "CMake", "CMake", "bin", "cmake.exe")
-        ));
-        const potentialNinjaPaths = editionPaths.map((editionPath) => (
-            path.join(editionPath, "Common7", "IDE", "CommonExtensions", "Microsoft", "CMake", "Ninja", "ninja.exe")
-        ));
-        const potentialLlvmPaths = editionPaths.map((editionPath) => {
-            if (currentArch === "x64")
-                return path.join(editionPath, "VC", "Tools", "Llvm", "x64", "bin");
-            else if (currentArch === "arm64")
-                return path.join(editionPath, "VC", "Tools", "Llvm", "ARM64", "bin");
-
-            return path.join(editionPath, "VC", "Tools", "Llvm", "bin");
-        });
+        const potentialCmakePaths = [
+            ...programFilesPaths.map((programFilesPath) => path.join(programFilesPath, "CMake", "bin", "cmake.exe")),
+            ...vsEditionPaths.map((editionPath) => (
+                path.join(editionPath, "Common7", "IDE", "CommonExtensions", "Microsoft", "CMake", "CMake", "bin", "cmake.exe")
+            ))
+        ];
+        const potentialNinjaPaths = [
+            ...programDataPaths.map((programDataPath) => path.join(programDataPath, "chocolatey", "bin", "ninja.exe")),
+            ...vsEditionPaths.map((editionPath) => (
+                path.join(editionPath, "Common7", "IDE", "CommonExtensions", "Microsoft", "CMake", "Ninja", "ninja.exe")
+            ))
+        ];
+        const potentialLlvmPaths = [
+            ...programFilesPaths.map((programFilesPath) => path.join(programFilesPath, "LLVM", "bin")),
+            ...vsEditionPaths.map((editionPath) => {
+                if (currentArch === "x64")
+                    return path.join(editionPath, "VC", "Tools", "Llvm", "x64", "bin");
+                else if (currentArch === "arm64")
+                    return path.join(editionPath, "VC", "Tools", "Llvm", "ARM64", "bin");
+
+                return path.join(editionPath, "VC", "Tools", "Llvm", "bin");
+            })
+        ];
         const potentialLibExePaths = msvcPaths.map((msvcPath) => {
             const hostArchDirName = currentArch === "x64"
                 ? "Hostx64"
diff --git a/src/utils/cmake.ts b/src/utils/cmake.ts
index e72ba175..c6ca12b4 100644
--- a/src/utils/cmake.ts
+++ b/src/utils/cmake.ts
@@ -111,9 +111,9 @@ async function findExistingCmake() {
     const platform = getPlatform();
 
     if (platform === "win") {
-        const editionPaths = await getWindowsVisualStudioEditionPaths();
+        const {vsEditionPaths} = await getWindowsVisualStudioEditionPaths();
 
-        const potentialCmakePaths = editionPaths.map((editionPath) => (
+        const potentialCmakePaths = vsEditionPaths.map((editionPath) => (
             path.join(editionPath, "Common7", "IDE", "CommonExtensions", "Microsoft", "CMake", "CMake", "bin", "cmake.exe")
         ));
 

From 5337ede940daa0271f7c3712aaf28748a720da44 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Wed, 1 Jan 2025 22:28:24 +0200
Subject: [PATCH 39/73] fix: Windows build

---
 llama/cmake/win32.ensureNinjaPath.cmake | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llama/cmake/win32.ensureNinjaPath.cmake b/llama/cmake/win32.ensureNinjaPath.cmake
index b92c285f..9839dd99 100644
--- a/llama/cmake/win32.ensureNinjaPath.cmake
+++ b/llama/cmake/win32.ensureNinjaPath.cmake
@@ -7,6 +7,7 @@ function(ensureNinjaPath)
 
         if(NINJA_EXECUTABLE AND EXISTS "${NINJA_EXECUTABLE}")
             set(CMAKE_MAKE_PROGRAM "${NINJA_EXECUTABLE}")
+            set(CMAKE_MAKE_PROGRAM "${NINJA_EXECUTABLE}" CACHE FILEPATH "Make program")
             set(CMAKE_MAKE_PROGRAM "${NINJA_EXECUTABLE}" PARENT_SCOPE)
         endif()
 
@@ -20,6 +21,7 @@ function(ensureNinjaPath)
                 if(FOUND_NINJA_EXE)
                     list(GET FOUND_NINJA_EXE 0 FOUND_CMAKE_MAKE_PROGRAM)
                     set(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}")
+                    set(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}" CACHE FILEPATH "Make program")
                     set(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}" PARENT_SCOPE)
                 endif()
             endif()
@@ -35,6 +37,7 @@ function(ensureNinjaPath)
                 if(FOUND_NINJA_EXE)
                     list(GET FOUND_NINJA_EXE 0 FOUND_CMAKE_MAKE_PROGRAM)
                     set(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}")
+                    set(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}" CACHE FILEPATH "Make program")
                     set(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}" PARENT_SCOPE)
                 endif()
             endif()
@@ -51,6 +54,7 @@ function(ensureNinjaPath)
                 if(FOUND_NINJA_EXE)
                     list(GET FOUND_NINJA_EXE 0 FOUND_CMAKE_MAKE_PROGRAM)
                     set(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}")
+                    set(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}" CACHE FILEPATH "Make program")
                     set(CMAKE_MAKE_PROGRAM "${FOUND_CMAKE_MAKE_PROGRAM}" PARENT_SCOPE)
                     break()
                 endif()

From fa714853ea10331a208dac7d4f255d472996e655 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Wed, 1 Jan 2025 22:38:31 +0200
Subject: [PATCH 40/73] fix: Windows build

---
 llama/toolchains/llvm.win32.host-x64.target-x64.cmake | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llama/toolchains/llvm.win32.host-x64.target-x64.cmake b/llama/toolchains/llvm.win32.host-x64.target-x64.cmake
index 2107f8e1..7c9c5130 100644
--- a/llama/toolchains/llvm.win32.host-x64.target-x64.cmake
+++ b/llama/toolchains/llvm.win32.host-x64.target-x64.cmake
@@ -18,3 +18,7 @@ set(arch_c_flags "-march=native")
 
 set(CMAKE_C_FLAGS_INIT "${arch_c_flags}")
 set(CMAKE_CXX_FLAGS_INIT "${arch_c_flags}")
+
+# Set the CUDA flags to use the same compiler as the C compiler
+get_filename_component(C_COMPILER_DIR "${CMAKE_C_COMPILER}" DIRECTORY)
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-bindir=${C_COMPILER_DIR}")

From 904b4e2e66df14954b939f1a493327d4947c116c Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Thu, 2 Jan 2025 00:10:48 +0200
Subject: [PATCH 41/73] fix: perform a separate MSVC build on Windows

---
 src/bindings/utils/compileLLamaCpp.ts | 134 +++++++++++++++++++++-----
 1 file changed, 110 insertions(+), 24 deletions(-)

diff --git a/src/bindings/utils/compileLLamaCpp.ts b/src/bindings/utils/compileLLamaCpp.ts
index 4540fe89..b3ae1247 100644
--- a/src/bindings/utils/compileLLamaCpp.ts
+++ b/src/bindings/utils/compileLLamaCpp.ts
@@ -27,6 +27,11 @@ import {detectWindowsBuildTools} from "./detectBuildTools.js";
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 const buildConfigType: "Release" | "RelWithDebInfo" | "Debug" = "Release";
 
+const windowsMsvcOnlyBuildFlagsToTargets = new Map(
+    ["blas", "cann", "cuda", "hip", "kompute", "metal", "musa", "sycl", "vulkan", "opencl"]
+        .map((backend) => ["GGML_" + backend.toUpperCase(), "ggml-" + backend.toLowerCase()])
+);
+
 export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions: {
     nodeTarget?: string,
     updateLastBuildInfo?: boolean,
@@ -53,7 +58,11 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
     const finalBuildFolderName = includeBuildOptionsInBinaryFolderName
         ? buildFolderName.withCustomCmakeOptions
         : buildFolderName.withoutCustomCmakeOptions;
-    const useWindowsLlvm = (platform === "win" && !ignoreWorkarounds.includes("avoidWindowsLlvm"))
+    const useWindowsLlvm = (
+        platform === "win" &&
+        !ignoreWorkarounds.includes("avoidWindowsLlvm") &&
+        !buildOptions.customCmakeOptions.has("CMAKE_TOOLCHAIN_FILE")
+    )
         ? areWindowsBuildToolsCapableForLlvmBuild(await detectWindowsBuildTools())
         : false;
 
@@ -84,6 +93,8 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                 const toolchainFile = await getToolchainFileForArch(buildOptions.arch, useWindowsLlvm);
                 const runtimeVersion = nodeTarget.startsWith("v") ? nodeTarget.slice("v".length) : nodeTarget;
                 const cmakeCustomOptions = new Map(buildOptions.customCmakeOptions);
+                const cmakeToolchainOptions = new Map<string, string>();
+                const windowsSeparateMsvcCmakeOptions = new Map<string, string>();
 
                 cmakeCustomOptions.set("CMAKE_CONFIGURATION_TYPES", buildConfigType);
                 cmakeCustomOptions.set("NLC_CURRENT_PLATFORM", platform + "-" + process.arch);
@@ -104,7 +115,7 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                     cmakeCustomOptions.set("GGML_CCACHE", "OFF");
 
                 if (toolchainFile != null && !cmakeCustomOptions.has("CMAKE_TOOLCHAIN_FILE"))
-                    cmakeCustomOptions.set("CMAKE_TOOLCHAIN_FILE", toolchainFile);
+                    cmakeToolchainOptions.set("CMAKE_TOOLCHAIN_FILE", toolchainFile);
 
                 if (buildOptions.platform === "win" && buildOptions.arch === "arm64" && !cmakeCustomOptions.has("GGML_OPENMP"))
                     cmakeCustomOptions.set("GGML_OPENMP", "OFF");
@@ -124,6 +135,16 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                     }
                 }
 
+                if (useWindowsLlvm) {
+                    for (const [customFlag, customFlagValue] of [...cmakeCustomOptions.entries()]) {
+                        if (!windowsMsvcOnlyBuildFlagsToTargets.has(customFlag) || isCmakeValueOff(customFlagValue))
+                            continue;
+
+                        windowsSeparateMsvcCmakeOptions.set(customFlag, customFlagValue);
+                        cmakeCustomOptions.delete(customFlag);
+                    }
+                }
+
                 await fs.remove(outDirectory);
 
                 await spawnCommand(
@@ -152,7 +173,10 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                         ...cmakeGeneratorArgs,
                         ...cmakePathArgs,
                         ...(
-                            [...cmakeCustomOptions].map(([key, value]) => "--CD" + key + "=" + value)
+                            [
+                                ...cmakeCustomOptions,
+                                ...cmakeToolchainOptions
+                            ].map(([key, value]) => "--CD" + key + "=" + value)
                         )
                     ],
                     __dirname,
@@ -160,30 +184,51 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                     buildOptions.progressLogs
                 );
 
-                const binFilesDirPaths = [
-                    path.join(outDirectory, "bin"),
-                    path.join(outDirectory, "llama.cpp", "bin")
-                ];
-                const compiledResultDirPath = path.join(outDirectory, buildConfigType);
-
-                if (!await fs.pathExists(compiledResultDirPath))
-                    throw new Error(`Could not find ${buildConfigType} directory`);
-
-                for (const binFilesDirPath of binFilesDirPaths) {
-                    if (await fs.pathExists(binFilesDirPath)) {
-                        const itemNames = await fs.readdir(binFilesDirPath);
-
-                        await Promise.all(
-                            itemNames.map((itemName) => (
-                                fs.copy(path.join(binFilesDirPath, itemName), path.join(compiledResultDirPath, itemName), {
-                                    overwrite: false
-                                })
-                            ))
+                const compiledResultDirPath = await moveBuildFilesToResultDir(outDirectory);
+
+                // perform a separate MSVC build and combine the compiled backends into the final build
+                if (useWindowsLlvm && windowsSeparateMsvcCmakeOptions.size > 0) {
+                    const llvmResultDir = path.join(outDirectory, "_llvm" + buildConfigType);
+                    await fs.move(compiledResultDirPath, llvmResultDir);
+
+                    for (const [targetFlag, targetValue] of windowsSeparateMsvcCmakeOptions) {
+                        const targetName = windowsMsvcOnlyBuildFlagsToTargets.get(targetFlag);
+                        if (targetName == null)
+                            continue;
+
+                        console.info(getConsoleLogPrefix(true, false), "Building specialized GPU backends using MSVC: " + targetName);
+
+                        await fs.remove(compiledResultDirPath);
+                        await spawnCommand(
+                            "npm",
+                            [
+                                "run", "-s", "cmake-js-llama", "--", "compile",
+                                "--log-level", "warn",
+                                "--config", buildConfigType,
+                                "--arch=" + buildOptions.arch,
+                                "--out", path.relative(llamaDirectory, outDirectory),
+                                "--runtime-version=" + runtimeVersion,
+                                "--parallel=" + parallelBuildThreads,
+                                "--target", targetName,
+                                ...cmakePathArgs,
+                                ...(
+                                    [
+                                        ...cmakeCustomOptions,
+                                        [targetFlag, targetValue]
+                                    ].map(([key, value]) => "--CD" + key + "=" + value)
+                                )
+                            ],
+                            __dirname,
+                            envVars,
+                            buildOptions.progressLogs
                         );
+                        const targetCompileResultDir = await moveBuildFilesToResultDir(outDirectory);
+                        await mergeDirWithoutOverrides(targetCompileResultDir, compiledResultDirPath);
                     }
-                }
 
-                await applyResultDirFixes(compiledResultDirPath, path.join(outDirectory, "_temp"));
+                    await fs.remove(compiledResultDirPath);
+                    await fs.move(llvmResultDir, compiledResultDirPath);
+                }
 
                 await fs.writeFile(path.join(compiledResultDirPath, buildMetadataFileName), JSON.stringify({
                     buildOptions: convertBuildOptionsToBuildOptionsJSON(buildOptions)
@@ -399,6 +444,35 @@ export async function getPrebuiltBinaryBuildMetadata(folderPath: string, folderN
     return buildMetadata;
 }
 
+async function moveBuildFilesToResultDir(outDirectory: string) {
+    const binFilesDirPaths = [
+        path.join(outDirectory, "bin"),
+        path.join(outDirectory, "llama.cpp", "bin")
+    ];
+    const compiledResultDirPath = path.join(outDirectory, buildConfigType);
+
+    if (!await fs.pathExists(compiledResultDirPath))
+        throw new Error(`Could not find ${buildConfigType} directory`);
+
+    for (const binFilesDirPath of binFilesDirPaths) {
+        if (await fs.pathExists(binFilesDirPath)) {
+            const itemNames = await fs.readdir(binFilesDirPath);
+
+            await Promise.all(
+                itemNames.map((itemName) => (
+                    fs.copy(path.join(binFilesDirPath, itemName), path.join(compiledResultDirPath, itemName), {
+                        overwrite: false
+                    })
+                ))
+            );
+        }
+    }
+
+    await applyResultDirFixes(compiledResultDirPath, path.join(outDirectory, "_temp"));
+
+    return compiledResultDirPath;
+}
+
 async function applyResultDirFixes(resultDirPath: string, tempDirPath: string) {
     const releaseDirPath = path.join(resultDirPath, buildConfigType);
 
@@ -420,6 +494,18 @@ async function applyResultDirFixes(resultDirPath: string, tempDirPath: string) {
     }
 }
 
+async function mergeDirWithoutOverrides(sourceDirPath: string, targetDirPath: string) {
+    const itemNames = await fs.readdir(sourceDirPath);
+
+    await Promise.all(
+        itemNames.map((itemName) => (
+            fs.move(path.join(sourceDirPath, itemName), path.join(targetDirPath, itemName), {
+                overwrite: false
+            })
+        ))
+    );
+}
+
 async function resolvePrebuiltBinaryPath(prebuiltBinaryDirectoryPath: string) {
     const binaryPath = path.join(prebuiltBinaryDirectoryPath, "llama-addon.node");
     const buildMetadataFilePath = path.join(prebuiltBinaryDirectoryPath, buildMetadataFileName);

From 6a0d2cbb8a2da3eef7c96743891f0ccc66f9e025 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Thu, 2 Jan 2025 00:14:59 +0200
Subject: [PATCH 42/73] fix: Windows build

---
 llama/toolchains/llvm.win32.host-x64.target-x64.cmake | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/llama/toolchains/llvm.win32.host-x64.target-x64.cmake b/llama/toolchains/llvm.win32.host-x64.target-x64.cmake
index 7c9c5130..2107f8e1 100644
--- a/llama/toolchains/llvm.win32.host-x64.target-x64.cmake
+++ b/llama/toolchains/llvm.win32.host-x64.target-x64.cmake
@@ -18,7 +18,3 @@ set(arch_c_flags "-march=native")
 
 set(CMAKE_C_FLAGS_INIT "${arch_c_flags}")
 set(CMAKE_CXX_FLAGS_INIT "${arch_c_flags}")
-
-# Set the CUDA flags to use the same compiler as the C compiler
-get_filename_component(C_COMPILER_DIR "${CMAKE_C_COMPILER}" DIRECTORY)
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-bindir=${C_COMPILER_DIR}")

From 19eb89b3e7cf4cdc93dbdaa5493a13aa14fa4788 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Thu, 2 Jan 2025 00:18:23 +0200
Subject: [PATCH 43/73] fix: Windows build

---
 src/bindings/utils/compileLLamaCpp.ts | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/bindings/utils/compileLLamaCpp.ts b/src/bindings/utils/compileLLamaCpp.ts
index b3ae1247..bba42a42 100644
--- a/src/bindings/utils/compileLLamaCpp.ts
+++ b/src/bindings/utils/compileLLamaCpp.ts
@@ -136,6 +136,8 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                 }
 
                 if (useWindowsLlvm) {
+                    cmakeCustomOptions.set("GGML_OPENMP", "OFF");
+
                     for (const [customFlag, customFlagValue] of [...cmakeCustomOptions.entries()]) {
                         if (!windowsMsvcOnlyBuildFlagsToTargets.has(customFlag) || isCmakeValueOff(customFlagValue))
                             continue;

From bd0e954f426797641342eb52c5e4f10d23e3c419 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Thu, 2 Jan 2025 00:25:16 +0200
Subject: [PATCH 44/73] fix: Windows build

---
 src/bindings/utils/compileLLamaCpp.ts | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/src/bindings/utils/compileLLamaCpp.ts b/src/bindings/utils/compileLLamaCpp.ts
index bba42a42..97985056 100644
--- a/src/bindings/utils/compileLLamaCpp.ts
+++ b/src/bindings/utils/compileLLamaCpp.ts
@@ -5,6 +5,7 @@ import os from "os";
 import fs from "fs-extra";
 import chalk from "chalk";
 import which from "which";
+import filenamify from "filenamify";
 import {
     buildMetadataFileName, documentationPageUrls, llamaCppDirectory, llamaDirectory, llamaLocalBuildBinsDirectory,
     llamaPrebuiltBinsDirectory, llamaToolchainsDirectory
@@ -190,17 +191,16 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
 
                 // perform a separate MSVC build and combine the compiled backends into the final build
                 if (useWindowsLlvm && windowsSeparateMsvcCmakeOptions.size > 0) {
-                    const llvmResultDir = path.join(outDirectory, "_llvm" + buildConfigType);
-                    await fs.move(compiledResultDirPath, llvmResultDir);
-
                     for (const [targetFlag, targetValue] of windowsSeparateMsvcCmakeOptions) {
                         const targetName = windowsMsvcOnlyBuildFlagsToTargets.get(targetFlag);
                         if (targetName == null)
                             continue;
 
-                        console.info(getConsoleLogPrefix(true, false), "Building specialized GPU backends using MSVC: " + targetName);
+                        console.info(getConsoleLogPrefix(true, false), "Building specialized GPU backend using MSVC: " + targetName);
+
+                        const targetOutDir = path.join(outDirectory, "target-" + filenamify(targetName));
 
-                        await fs.remove(compiledResultDirPath);
+                        await fs.remove(targetOutDir);
                         await spawnCommand(
                             "npm",
                             [
@@ -208,7 +208,7 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                                 "--log-level", "warn",
                                 "--config", buildConfigType,
                                 "--arch=" + buildOptions.arch,
-                                "--out", path.relative(llamaDirectory, outDirectory),
+                                "--out", path.relative(llamaDirectory, targetOutDir),
                                 "--runtime-version=" + runtimeVersion,
                                 "--parallel=" + parallelBuildThreads,
                                 "--target", targetName,
@@ -224,12 +224,9 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                             envVars,
                             buildOptions.progressLogs
                         );
-                        const targetCompileResultDir = await moveBuildFilesToResultDir(outDirectory);
+                        const targetCompileResultDir = await moveBuildFilesToResultDir(targetOutDir);
                         await mergeDirWithoutOverrides(targetCompileResultDir, compiledResultDirPath);
                     }
-
-                    await fs.remove(compiledResultDirPath);
-                    await fs.move(llvmResultDir, compiledResultDirPath);
                 }
 
                 await fs.writeFile(path.join(compiledResultDirPath, buildMetadataFileName), JSON.stringify({

From beaefbf24d61deb97c2e8bff7ff7337bcea0b7fa Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Thu, 2 Jan 2025 00:48:18 +0200
Subject: [PATCH 45/73] fix: Windows build

---
 llama/CMakeLists.txt                  | 83 ++++++++++++++-------------
 src/bindings/utils/compileLLamaCpp.ts |  3 +-
 2 files changed, 45 insertions(+), 41 deletions(-)

diff --git a/llama/CMakeLists.txt b/llama/CMakeLists.txt
index f95932fc..ed8d39ab 100644
--- a/llama/CMakeLists.txt
+++ b/llama/CMakeLists.txt
@@ -27,11 +27,6 @@ endif()
 add_definitions(-DNAPI_VERSION=7)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
-execute_process(COMMAND node -p "require('node-addon-api').include.slice(1,-1)"
-                WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
-                OUTPUT_VARIABLE NODE_ADDON_API_DIR
-                OUTPUT_STRIP_TRAILING_WHITESPACE)
-
 set(LLAMA_BUILD_COMMON ON)
 
 if (MINGW)
@@ -53,53 +48,61 @@ else()
     set(CMAKE_BUILD_RPATH_USE_ORIGIN ON)
 endif()
 
-include_directories(${NODE_ADDON_API_DIR} ${CMAKE_JS_INC})
+if (NOT DEFINED NLC_SINGLE_LLAMA_CPP_TARGET)
+    execute_process(COMMAND node -p "require('node-addon-api').include.slice(1,-1)"
+        WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+        OUTPUT_VARIABLE NODE_ADDON_API_DIR
+        OUTPUT_STRIP_TRAILING_WHITESPACE)
+    include_directories(${NODE_ADDON_API_DIR} ${CMAKE_JS_INC})
+endif()
 
 add_subdirectory("llama.cpp")
 include_directories("llama.cpp")
 include_directories("./llama.cpp/common")
 
-unset(GPU_INFO_HEADERS)
-unset(GPU_INFO_SOURCES)
-unset(GPU_INFO_EXTRA_LIBS)
-
-if (GGML_VULKAN OR GGML_KOMPUTE)
-    find_package(Vulkan)
-    if (Vulkan_FOUND)
-        if (GGML_VULKAN)
-            message(STATUS "Using Vulkan for GPU info")
-        elseif (GGML_KOMPUTE)
-            message(STATUS "Using Vulkan for GPU info because Kompute is enabled")
-        endif()
+if (NOT DEFINED NLC_SINGLE_LLAMA_CPP_TARGET)
+    unset(GPU_INFO_HEADERS)
+    unset(GPU_INFO_SOURCES)
+    unset(GPU_INFO_EXTRA_LIBS)
 
-        list(APPEND GPU_INFO_HEADERS gpuInfo/vulkan-gpu-info.h)
-        list(APPEND GPU_INFO_SOURCES gpuInfo/vulkan-gpu-info.cpp)
+    if (GGML_VULKAN OR GGML_KOMPUTE)
+        find_package(Vulkan)
+        if (Vulkan_FOUND)
+            if (GGML_VULKAN)
+                message(STATUS "Using Vulkan for GPU info")
+            elseif (GGML_KOMPUTE)
+                message(STATUS "Using Vulkan for GPU info because Kompute is enabled")
+            endif()
 
-        add_compile_definitions(GPU_INFO_USE_VULKAN)
+            list(APPEND GPU_INFO_HEADERS gpuInfo/vulkan-gpu-info.h)
+            list(APPEND GPU_INFO_SOURCES gpuInfo/vulkan-gpu-info.cpp)
 
-        list(APPEND GPU_INFO_EXTRA_LIBS Vulkan::Vulkan)
-    else()
-        message(FATAL_ERROR "Vulkan was not found")
+            add_compile_definitions(GPU_INFO_USE_VULKAN)
+
+            list(APPEND GPU_INFO_EXTRA_LIBS Vulkan::Vulkan)
+        else()
+            message(FATAL_ERROR "Vulkan was not found")
+        endif()
     endif()
-endif()
 
-list(REMOVE_DUPLICATES GPU_INFO_HEADERS)
-list(REMOVE_DUPLICATES GPU_INFO_SOURCES)
-list(REMOVE_DUPLICATES GPU_INFO_EXTRA_LIBS)
+    list(REMOVE_DUPLICATES GPU_INFO_HEADERS)
+    list(REMOVE_DUPLICATES GPU_INFO_SOURCES)
+    list(REMOVE_DUPLICATES GPU_INFO_EXTRA_LIBS)
 
-file(GLOB SOURCE_FILES "addon/*.cpp" "addon/**/*.cpp" ${GPU_INFO_SOURCES})
+    file(GLOB SOURCE_FILES "addon/*.cpp" "addon/**/*.cpp" ${GPU_INFO_SOURCES})
 
-add_library(${PROJECT_NAME} SHARED ${SOURCE_FILES} ${CMAKE_JS_SRC} ${GPU_INFO_HEADERS})
-set_target_properties(${PROJECT_NAME} PROPERTIES PREFIX "" SUFFIX ".node")
-target_link_libraries(${PROJECT_NAME} ${CMAKE_JS_LIB})
-target_link_libraries(${PROJECT_NAME} "llama")
-target_link_libraries(${PROJECT_NAME} "common")
+    add_library(${PROJECT_NAME} SHARED ${SOURCE_FILES} ${CMAKE_JS_SRC} ${GPU_INFO_HEADERS})
+    set_target_properties(${PROJECT_NAME} PROPERTIES PREFIX "" SUFFIX ".node")
+    target_link_libraries(${PROJECT_NAME} ${CMAKE_JS_LIB})
+    target_link_libraries(${PROJECT_NAME} "llama")
+    target_link_libraries(${PROJECT_NAME} "common")
 
-if (DEFINED GPU_INFO_EXTRA_LIBS)
-    target_link_libraries(${PROJECT_NAME} ${GPU_INFO_EXTRA_LIBS})
-endif()
+    if (DEFINED GPU_INFO_EXTRA_LIBS)
+        target_link_libraries(${PROJECT_NAME} ${GPU_INFO_EXTRA_LIBS})
+    endif()
 
-if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
-  # Generate node.lib
-  execute_process(COMMAND ${CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS})
+    if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
+    # Generate node.lib
+    execute_process(COMMAND ${CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS})
+    endif()
 endif()
diff --git a/src/bindings/utils/compileLLamaCpp.ts b/src/bindings/utils/compileLLamaCpp.ts
index 97985056..2c59a2b1 100644
--- a/src/bindings/utils/compileLLamaCpp.ts
+++ b/src/bindings/utils/compileLLamaCpp.ts
@@ -216,7 +216,8 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                                 ...(
                                     [
                                         ...cmakeCustomOptions,
-                                        [targetFlag, targetValue]
+                                        [targetFlag, targetValue],
+                                        ["NLC_SINGLE_LLAMA_CPP_TARGET", "1"]
                                     ].map(([key, value]) => "--CD" + key + "=" + value)
                                 )
                             ],

From c1afcfd33f3bf0e813ba588f831f635067daa0d5 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Thu, 2 Jan 2025 00:56:37 +0200
Subject: [PATCH 46/73] test: fix tests

---
 test/modelDependent/llama3.1/tokenPredictor.test.ts | 3 ++-
 test/modelDependent/llama3/functions.test.ts        | 7 ++-----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/test/modelDependent/llama3.1/tokenPredictor.test.ts b/test/modelDependent/llama3.1/tokenPredictor.test.ts
index e1854d9d..ca91eb89 100644
--- a/test/modelDependent/llama3.1/tokenPredictor.test.ts
+++ b/test/modelDependent/llama3.1/tokenPredictor.test.ts
@@ -190,7 +190,8 @@ describe("llama 3.1", () => {
                 `);
             });
 
-            test("with evaluation", {timeout: 1000 * 60 * 60 * 2}, async () => {
+            // disabled for now due to flakiness
+            test.skip("with evaluation", {timeout: 1000 * 60 * 60 * 2}, async () => {
                 const modelPath = await getModelFile("Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf");
                 const llama = await getTestLlama();
 
diff --git a/test/modelDependent/llama3/functions.test.ts b/test/modelDependent/llama3/functions.test.ts
index e6c6f92c..07da0d94 100644
--- a/test/modelDependent/llama3/functions.test.ts
+++ b/test/modelDependent/llama3/functions.test.ts
@@ -187,12 +187,9 @@ describe("llama 3", () => {
                 }
             } as const;
 
-            const res = await chatSession.prompt("Is an apple more expensive than a banana?", promptOptions);
+            const res = await chatSession.prompt("Is an apple more expensive than a banana? Answer in Yes/No", promptOptions);
 
-            expect(res).to.be.satisfy((text: string) => [
-                "According to the information I have, an apple is more expensive than a banana.",
-                "Let me check the prices for you.  According to the prices I checked, an apple is more expensive than a banana. The apple costs $6, while the banana costs $4."
-            ].includes(text));
+            expect(res).to.toMatchInlineSnapshot('"Yes"');
         });
 
         test("Compare fruit prices with currency", {timeout: 1000 * 60 * 60 * 2}, async () => {

From 87171db685646c5dc0418be96148fad87bd378c6 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Thu, 2 Jan 2025 01:04:27 +0200
Subject: [PATCH 47/73] fix: Windows build

---
 llama/CMakeLists.txt                  | 78 +++++++++++++--------------
 src/bindings/utils/compileLLamaCpp.ts | 10 ++--
 2 files changed, 43 insertions(+), 45 deletions(-)

diff --git a/llama/CMakeLists.txt b/llama/CMakeLists.txt
index ed8d39ab..836e122d 100644
--- a/llama/CMakeLists.txt
+++ b/llama/CMakeLists.txt
@@ -48,61 +48,57 @@ else()
     set(CMAKE_BUILD_RPATH_USE_ORIGIN ON)
 endif()
 
-if (NOT DEFINED NLC_SINGLE_LLAMA_CPP_TARGET)
-    execute_process(COMMAND node -p "require('node-addon-api').include.slice(1,-1)"
-        WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
-        OUTPUT_VARIABLE NODE_ADDON_API_DIR
-        OUTPUT_STRIP_TRAILING_WHITESPACE)
-    include_directories(${NODE_ADDON_API_DIR} ${CMAKE_JS_INC})
-endif()
+execute_process(COMMAND node -p "require('node-addon-api').include.slice(1,-1)"
+    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+    OUTPUT_VARIABLE NODE_ADDON_API_DIR
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+include_directories(${NODE_ADDON_API_DIR} ${CMAKE_JS_INC})
 
 add_subdirectory("llama.cpp")
 include_directories("llama.cpp")
 include_directories("./llama.cpp/common")
 
-if (NOT DEFINED NLC_SINGLE_LLAMA_CPP_TARGET)
-    unset(GPU_INFO_HEADERS)
-    unset(GPU_INFO_SOURCES)
-    unset(GPU_INFO_EXTRA_LIBS)
-
-    if (GGML_VULKAN OR GGML_KOMPUTE)
-        find_package(Vulkan)
-        if (Vulkan_FOUND)
-            if (GGML_VULKAN)
-                message(STATUS "Using Vulkan for GPU info")
-            elseif (GGML_KOMPUTE)
-                message(STATUS "Using Vulkan for GPU info because Kompute is enabled")
-            endif()
+unset(GPU_INFO_HEADERS)
+unset(GPU_INFO_SOURCES)
+unset(GPU_INFO_EXTRA_LIBS)
+
+if (GGML_VULKAN OR GGML_KOMPUTE)
+    find_package(Vulkan)
+    if (Vulkan_FOUND)
+        if (GGML_VULKAN)
+            message(STATUS "Using Vulkan for GPU info")
+        elseif (GGML_KOMPUTE)
+            message(STATUS "Using Vulkan for GPU info because Kompute is enabled")
+        endif()
 
-            list(APPEND GPU_INFO_HEADERS gpuInfo/vulkan-gpu-info.h)
-            list(APPEND GPU_INFO_SOURCES gpuInfo/vulkan-gpu-info.cpp)
+        list(APPEND GPU_INFO_HEADERS gpuInfo/vulkan-gpu-info.h)
+        list(APPEND GPU_INFO_SOURCES gpuInfo/vulkan-gpu-info.cpp)
 
-            add_compile_definitions(GPU_INFO_USE_VULKAN)
+        add_compile_definitions(GPU_INFO_USE_VULKAN)
 
-            list(APPEND GPU_INFO_EXTRA_LIBS Vulkan::Vulkan)
-        else()
-            message(FATAL_ERROR "Vulkan was not found")
-        endif()
+        list(APPEND GPU_INFO_EXTRA_LIBS Vulkan::Vulkan)
+    else()
+        message(FATAL_ERROR "Vulkan was not found")
     endif()
+endif()
 
-    list(REMOVE_DUPLICATES GPU_INFO_HEADERS)
-    list(REMOVE_DUPLICATES GPU_INFO_SOURCES)
-    list(REMOVE_DUPLICATES GPU_INFO_EXTRA_LIBS)
+list(REMOVE_DUPLICATES GPU_INFO_HEADERS)
+list(REMOVE_DUPLICATES GPU_INFO_SOURCES)
+list(REMOVE_DUPLICATES GPU_INFO_EXTRA_LIBS)
 
-    file(GLOB SOURCE_FILES "addon/*.cpp" "addon/**/*.cpp" ${GPU_INFO_SOURCES})
+file(GLOB SOURCE_FILES "addon/*.cpp" "addon/**/*.cpp" ${GPU_INFO_SOURCES})
 
-    add_library(${PROJECT_NAME} SHARED ${SOURCE_FILES} ${CMAKE_JS_SRC} ${GPU_INFO_HEADERS})
-    set_target_properties(${PROJECT_NAME} PROPERTIES PREFIX "" SUFFIX ".node")
-    target_link_libraries(${PROJECT_NAME} ${CMAKE_JS_LIB})
-    target_link_libraries(${PROJECT_NAME} "llama")
-    target_link_libraries(${PROJECT_NAME} "common")
+add_library(${PROJECT_NAME} SHARED ${SOURCE_FILES} ${CMAKE_JS_SRC} ${GPU_INFO_HEADERS})
+set_target_properties(${PROJECT_NAME} PROPERTIES PREFIX "" SUFFIX ".node")
+target_link_libraries(${PROJECT_NAME} ${CMAKE_JS_LIB})
+target_link_libraries(${PROJECT_NAME} "llama")
+target_link_libraries(${PROJECT_NAME} "common")
 
-    if (DEFINED GPU_INFO_EXTRA_LIBS)
-        target_link_libraries(${PROJECT_NAME} ${GPU_INFO_EXTRA_LIBS})
-    endif()
+if (DEFINED GPU_INFO_EXTRA_LIBS)
+    target_link_libraries(${PROJECT_NAME} ${GPU_INFO_EXTRA_LIBS})
+endif()
 
-    if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
+if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
     # Generate node.lib
     execute_process(COMMAND ${CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS})
-    endif()
 endif()
diff --git a/src/bindings/utils/compileLLamaCpp.ts b/src/bindings/utils/compileLLamaCpp.ts
index 2c59a2b1..6d0abe11 100644
--- a/src/bindings/utils/compileLLamaCpp.ts
+++ b/src/bindings/utils/compileLLamaCpp.ts
@@ -216,8 +216,7 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                                 ...(
                                     [
                                         ...cmakeCustomOptions,
-                                        [targetFlag, targetValue],
-                                        ["NLC_SINGLE_LLAMA_CPP_TARGET", "1"]
+                                        [targetFlag, targetValue]
                                     ].map(([key, value]) => "--CD" + key + "=" + value)
                                 )
                             ],
@@ -225,7 +224,7 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                             envVars,
                             buildOptions.progressLogs
                         );
-                        const targetCompileResultDir = await moveBuildFilesToResultDir(targetOutDir);
+                        const targetCompileResultDir = await moveBuildFilesToResultDir(targetOutDir, true);
                         await mergeDirWithoutOverrides(targetCompileResultDir, compiledResultDirPath);
                     }
                 }
@@ -444,13 +443,16 @@ export async function getPrebuiltBinaryBuildMetadata(folderPath: string, folderN
     return buildMetadata;
 }
 
-async function moveBuildFilesToResultDir(outDirectory: string) {
+async function moveBuildFilesToResultDir(outDirectory: string, canCreateReleaseDir: boolean = false) {
     const binFilesDirPaths = [
         path.join(outDirectory, "bin"),
         path.join(outDirectory, "llama.cpp", "bin")
     ];
     const compiledResultDirPath = path.join(outDirectory, buildConfigType);
 
+    if (canCreateReleaseDir)
+        await fs.ensureDir(compiledResultDirPath);
+
     if (!await fs.pathExists(compiledResultDirPath))
         throw new Error(`Could not find ${buildConfigType} directory`);
 

From 08fab6b3d7a283f94a400dfa4af35a1e037364e8 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Thu, 2 Jan 2025 01:06:21 +0200
Subject: [PATCH 48/73] fix: Windows build

---
 src/bindings/utils/compileLLamaCpp.ts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/bindings/utils/compileLLamaCpp.ts b/src/bindings/utils/compileLLamaCpp.ts
index 6d0abe11..7e0e3618 100644
--- a/src/bindings/utils/compileLLamaCpp.ts
+++ b/src/bindings/utils/compileLLamaCpp.ts
@@ -196,6 +196,7 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                         if (targetName == null)
                             continue;
 
+                        console.info();
                         console.info(getConsoleLogPrefix(true, false), "Building specialized GPU backend using MSVC: " + targetName);
 
                         const targetOutDir = path.join(outDirectory, "target-" + filenamify(targetName));

From 7cc01a68c7e0703261ebe4a8f50c051d0aa248d5 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Thu, 2 Jan 2025 01:10:25 +0200
Subject: [PATCH 49/73] fix: Windows build

---
 src/bindings/utils/compileLLamaCpp.ts | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/bindings/utils/compileLLamaCpp.ts b/src/bindings/utils/compileLLamaCpp.ts
index 7e0e3618..f4612733 100644
--- a/src/bindings/utils/compileLLamaCpp.ts
+++ b/src/bindings/utils/compileLLamaCpp.ts
@@ -24,6 +24,7 @@ import {logDistroInstallInstruction} from "./logDistroInstallInstruction.js";
 import {testCmakeBinary} from "./testCmakeBinary.js";
 import {getCudaNvccPaths} from "./detectAvailableComputeLayers.js";
 import {detectWindowsBuildTools} from "./detectBuildTools.js";
+import {asyncSome} from "./asyncSome.js";
 
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 const buildConfigType: "Release" | "RelWithDebInfo" | "Debug" = "Release";
@@ -451,11 +452,15 @@ async function moveBuildFilesToResultDir(outDirectory: string, canCreateReleaseD
     ];
     const compiledResultDirPath = path.join(outDirectory, buildConfigType);
 
-    if (canCreateReleaseDir)
-        await fs.ensureDir(compiledResultDirPath);
-
-    if (!await fs.pathExists(compiledResultDirPath))
-        throw new Error(`Could not find ${buildConfigType} directory`);
+    if (!await fs.pathExists(compiledResultDirPath)) {
+        if (canCreateReleaseDir) {
+            if (await asyncSome(binFilesDirPaths.map((dirPath) => fs.pathExists(dirPath))))
+                await fs.ensureDir(compiledResultDirPath);
+            else
+                throw new Error(`Could not find ${buildConfigType} directory or any other output directory`);
+        } else
+            throw new Error(`Could not find ${buildConfigType} directory`);
+    }
 
     for (const binFilesDirPath of binFilesDirPaths) {
         if (await fs.pathExists(binFilesDirPath)) {

From 76519b65c3cb6b7193cf59547bab693bc2cbc271 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Thu, 2 Jan 2025 01:19:39 +0200
Subject: [PATCH 50/73] fix: Windows build

---
 src/bindings/utils/compileLLamaCpp.ts | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/src/bindings/utils/compileLLamaCpp.ts b/src/bindings/utils/compileLLamaCpp.ts
index f4612733..16d52a59 100644
--- a/src/bindings/utils/compileLLamaCpp.ts
+++ b/src/bindings/utils/compileLLamaCpp.ts
@@ -503,14 +503,29 @@ async function applyResultDirFixes(resultDirPath: string, tempDirPath: string) {
 }
 
 async function mergeDirWithoutOverrides(sourceDirPath: string, targetDirPath: string) {
-    const itemNames = await fs.readdir(sourceDirPath);
+    const itemNames = await fs.readdir(sourceDirPath, {withFileTypes: true});
 
     await Promise.all(
-        itemNames.map((itemName) => (
-            fs.move(path.join(sourceDirPath, itemName), path.join(targetDirPath, itemName), {
-                overwrite: false
-            })
-        ))
+        itemNames.map(async (item) => {
+            const targetItemPath = path.join(targetDirPath, item.name);
+
+            if (item.isDirectory()) {
+                if (await fs.pathExists(targetItemPath)) {
+                    if ((await fs.stat(targetItemPath)).isDirectory())
+                        await mergeDirWithoutOverrides(path.join(sourceDirPath, item.name), targetItemPath);
+                } else {
+                    await fs.ensureDir(targetItemPath);
+                    await mergeDirWithoutOverrides(path.join(sourceDirPath, item.name), targetItemPath);
+                }
+            } else {
+                if (await fs.pathExists(targetItemPath))
+                    return;
+
+                await fs.move(path.join(sourceDirPath, item.name), targetItemPath, {
+                    overwrite: false
+                });
+            }
+        })
     );
 }
 

From 7d76b61d2ba4e756237f2e84aba9208f60cfae9c Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Thu, 2 Jan 2025 01:49:31 +0200
Subject: [PATCH 51/73] fix: Windows build

---
 src/bindings/utils/compileLLamaCpp.ts | 101 +++-----------------------
 1 file changed, 11 insertions(+), 90 deletions(-)

diff --git a/src/bindings/utils/compileLLamaCpp.ts b/src/bindings/utils/compileLLamaCpp.ts
index 16d52a59..efa65838 100644
--- a/src/bindings/utils/compileLLamaCpp.ts
+++ b/src/bindings/utils/compileLLamaCpp.ts
@@ -5,7 +5,6 @@ import os from "os";
 import fs from "fs-extra";
 import chalk from "chalk";
 import which from "which";
-import filenamify from "filenamify";
 import {
     buildMetadataFileName, documentationPageUrls, llamaCppDirectory, llamaDirectory, llamaLocalBuildBinsDirectory,
     llamaPrebuiltBinsDirectory, llamaToolchainsDirectory
@@ -29,10 +28,8 @@ import {asyncSome} from "./asyncSome.js";
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 const buildConfigType: "Release" | "RelWithDebInfo" | "Debug" = "Release";
 
-const windowsMsvcOnlyBuildFlagsToTargets = new Map(
-    ["blas", "cann", "cuda", "hip", "kompute", "metal", "musa", "sycl", "vulkan", "opencl"]
-        .map((backend) => ["GGML_" + backend.toUpperCase(), "ggml-" + backend.toLowerCase()])
-);
+const requiresMsvcOnWindowsFlags = ["blas", "cann", "cuda", "hip", "kompute", "metal", "musa", "sycl", "vulkan", "opencl"]
+    .map((backend) => ("GGML_" + backend.toUpperCase()));
 
 export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions: {
     nodeTarget?: string,
@@ -62,8 +59,10 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
         : buildFolderName.withoutCustomCmakeOptions;
     const useWindowsLlvm = (
         platform === "win" &&
+        buildOptions.gpu === false &&
         !ignoreWorkarounds.includes("avoidWindowsLlvm") &&
-        !buildOptions.customCmakeOptions.has("CMAKE_TOOLCHAIN_FILE")
+        !buildOptions.customCmakeOptions.has("CMAKE_TOOLCHAIN_FILE") &&
+        !requiresMsvcOnWindowsFlags.some((flag) => buildOptions.customCmakeOptions.has(flag))
     )
         ? areWindowsBuildToolsCapableForLlvmBuild(await detectWindowsBuildTools())
         : false;
@@ -96,12 +95,14 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                 const runtimeVersion = nodeTarget.startsWith("v") ? nodeTarget.slice("v".length) : nodeTarget;
                 const cmakeCustomOptions = new Map(buildOptions.customCmakeOptions);
                 const cmakeToolchainOptions = new Map<string, string>();
-                const windowsSeparateMsvcCmakeOptions = new Map<string, string>();
 
                 cmakeCustomOptions.set("CMAKE_CONFIGURATION_TYPES", buildConfigType);
                 cmakeCustomOptions.set("NLC_CURRENT_PLATFORM", platform + "-" + process.arch);
                 cmakeCustomOptions.set("NLC_TARGET_PLATFORM", buildOptions.platform + "-" + buildOptions.arch);
 
+                if (toolchainFile != null && !cmakeCustomOptions.has("CMAKE_TOOLCHAIN_FILE"))
+                    cmakeToolchainOptions.set("CMAKE_TOOLCHAIN_FILE", toolchainFile);
+
                 if (buildOptions.gpu === "metal" && process.platform === "darwin" && !cmakeCustomOptions.has("GGML_METAL"))
                     cmakeCustomOptions.set("GGML_METAL", "1");
                 else if (!cmakeCustomOptions.has("GGML_METAL"))
@@ -116,12 +117,12 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                 if (!cmakeCustomOptions.has("GGML_CCACHE"))
                     cmakeCustomOptions.set("GGML_CCACHE", "OFF");
 
-                if (toolchainFile != null && !cmakeCustomOptions.has("CMAKE_TOOLCHAIN_FILE"))
-                    cmakeToolchainOptions.set("CMAKE_TOOLCHAIN_FILE", toolchainFile);
-
                 if (buildOptions.platform === "win" && buildOptions.arch === "arm64" && !cmakeCustomOptions.has("GGML_OPENMP"))
                     cmakeCustomOptions.set("GGML_OPENMP", "OFF");
 
+                if (useWindowsLlvm)
+                    cmakeCustomOptions.set("GGML_OPENMP", "OFF");
+
                 if (ciMode) {
                     if (!cmakeCustomOptions.has("GGML_OPENMP"))
                         cmakeCustomOptions.set("GGML_OPENMP", "OFF");
@@ -137,18 +138,6 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                     }
                 }
 
-                if (useWindowsLlvm) {
-                    cmakeCustomOptions.set("GGML_OPENMP", "OFF");
-
-                    for (const [customFlag, customFlagValue] of [...cmakeCustomOptions.entries()]) {
-                        if (!windowsMsvcOnlyBuildFlagsToTargets.has(customFlag) || isCmakeValueOff(customFlagValue))
-                            continue;
-
-                        windowsSeparateMsvcCmakeOptions.set(customFlag, customFlagValue);
-                        cmakeCustomOptions.delete(customFlag);
-                    }
-                }
-
                 await fs.remove(outDirectory);
 
                 await spawnCommand(
@@ -190,47 +179,6 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
 
                 const compiledResultDirPath = await moveBuildFilesToResultDir(outDirectory);
 
-                // perform a separate MSVC build and combine the compiled backends into the final build
-                if (useWindowsLlvm && windowsSeparateMsvcCmakeOptions.size > 0) {
-                    for (const [targetFlag, targetValue] of windowsSeparateMsvcCmakeOptions) {
-                        const targetName = windowsMsvcOnlyBuildFlagsToTargets.get(targetFlag);
-                        if (targetName == null)
-                            continue;
-
-                        console.info();
-                        console.info(getConsoleLogPrefix(true, false), "Building specialized GPU backend using MSVC: " + targetName);
-
-                        const targetOutDir = path.join(outDirectory, "target-" + filenamify(targetName));
-
-                        await fs.remove(targetOutDir);
-                        await spawnCommand(
-                            "npm",
-                            [
-                                "run", "-s", "cmake-js-llama", "--", "compile",
-                                "--log-level", "warn",
-                                "--config", buildConfigType,
-                                "--arch=" + buildOptions.arch,
-                                "--out", path.relative(llamaDirectory, targetOutDir),
-                                "--runtime-version=" + runtimeVersion,
-                                "--parallel=" + parallelBuildThreads,
-                                "--target", targetName,
-                                ...cmakePathArgs,
-                                ...(
-                                    [
-                                        ...cmakeCustomOptions,
-                                        [targetFlag, targetValue]
-                                    ].map(([key, value]) => "--CD" + key + "=" + value)
-                                )
-                            ],
-                            __dirname,
-                            envVars,
-                            buildOptions.progressLogs
-                        );
-                        const targetCompileResultDir = await moveBuildFilesToResultDir(targetOutDir, true);
-                        await mergeDirWithoutOverrides(targetCompileResultDir, compiledResultDirPath);
-                    }
-                }
-
                 await fs.writeFile(path.join(compiledResultDirPath, buildMetadataFileName), JSON.stringify({
                     buildOptions: convertBuildOptionsToBuildOptionsJSON(buildOptions)
                 } satisfies BuildMetadataFile), "utf8");
@@ -502,33 +450,6 @@ async function applyResultDirFixes(resultDirPath: string, tempDirPath: string) {
     }
 }
 
-async function mergeDirWithoutOverrides(sourceDirPath: string, targetDirPath: string) {
-    const itemNames = await fs.readdir(sourceDirPath, {withFileTypes: true});
-
-    await Promise.all(
-        itemNames.map(async (item) => {
-            const targetItemPath = path.join(targetDirPath, item.name);
-
-            if (item.isDirectory()) {
-                if (await fs.pathExists(targetItemPath)) {
-                    if ((await fs.stat(targetItemPath)).isDirectory())
-                        await mergeDirWithoutOverrides(path.join(sourceDirPath, item.name), targetItemPath);
-                } else {
-                    await fs.ensureDir(targetItemPath);
-                    await mergeDirWithoutOverrides(path.join(sourceDirPath, item.name), targetItemPath);
-                }
-            } else {
-                if (await fs.pathExists(targetItemPath))
-                    return;
-
-                await fs.move(path.join(sourceDirPath, item.name), targetItemPath, {
-                    overwrite: false
-                });
-            }
-        })
-    );
-}
-
 async function resolvePrebuiltBinaryPath(prebuiltBinaryDirectoryPath: string) {
     const binaryPath = path.join(prebuiltBinaryDirectoryPath, "llama-addon.node");
     const buildMetadataFilePath = path.join(prebuiltBinaryDirectoryPath, buildMetadataFileName);

From cff255c3b6e5011d8703e3f6aed82e2ea9973a1b Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Thu, 2 Jan 2025 01:52:28 +0200
Subject: [PATCH 52/73] fix: Windows build

---
 src/bindings/utils/compileLLamaCpp.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/bindings/utils/compileLLamaCpp.ts b/src/bindings/utils/compileLLamaCpp.ts
index efa65838..ad40b259 100644
--- a/src/bindings/utils/compileLLamaCpp.ts
+++ b/src/bindings/utils/compileLLamaCpp.ts
@@ -28,7 +28,7 @@ import {asyncSome} from "./asyncSome.js";
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 const buildConfigType: "Release" | "RelWithDebInfo" | "Debug" = "Release";
 
-const requiresMsvcOnWindowsFlags = ["blas", "cann", "cuda", "hip", "kompute", "metal", "musa", "sycl", "vulkan", "opencl"]
+const requiresMsvcOnWindowsFlags = ["blas", "cann", "cuda", "hip", "kompute", "musa", "sycl", "opencl"]
     .map((backend) => ("GGML_" + backend.toUpperCase()));
 
 export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions: {
@@ -59,7 +59,7 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
         : buildFolderName.withoutCustomCmakeOptions;
     const useWindowsLlvm = (
         platform === "win" &&
-        buildOptions.gpu === false &&
+        (buildOptions.gpu === false || buildOptions.gpu === "vulkan") &&
         !ignoreWorkarounds.includes("avoidWindowsLlvm") &&
         !buildOptions.customCmakeOptions.has("CMAKE_TOOLCHAIN_FILE") &&
         !requiresMsvcOnWindowsFlags.some((flag) => buildOptions.customCmakeOptions.has(flag))

From 80d9a2fb17dadd8a9128db1e0b2f951b6aa85eb1 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Thu, 2 Jan 2025 02:26:33 +0200
Subject: [PATCH 53/73] fix: Windows build

---
 llama/CMakeLists.txt                                | 4 ++++
 llama/cmake/win32.llvmApplyGnuModeAdaptations.cmake | 1 -
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/llama/CMakeLists.txt b/llama/CMakeLists.txt
index 836e122d..f2363083 100644
--- a/llama/CMakeLists.txt
+++ b/llama/CMakeLists.txt
@@ -1,5 +1,9 @@
 cmake_minimum_required(VERSION 3.14)
 
+if (NLC_CURRENT_PLATFORM STREQUAL "win-x64")
+    set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+endif()
+
 if (NLC_TARGET_PLATFORM STREQUAL "win-arm64" AND (CMAKE_GENERATOR STREQUAL "Ninja" OR CMAKE_GENERATOR STREQUAL "Ninja Multi-Config") AND NOT MINGW)
     if(NLC_CURRENT_PLATFORM STREQUAL "win-x64")
         include("./profiles/llvm.win32.host-x64.target-arm64.cmake")
diff --git a/llama/cmake/win32.llvmApplyGnuModeAdaptations.cmake b/llama/cmake/win32.llvmApplyGnuModeAdaptations.cmake
index 7d9f4c04..9163da3e 100644
--- a/llama/cmake/win32.llvmApplyGnuModeAdaptations.cmake
+++ b/llama/cmake/win32.llvmApplyGnuModeAdaptations.cmake
@@ -7,7 +7,6 @@ function(llvmApplyGnuModeAdaptations)
         set(CMAKE_SHARED_LINKER_FLAGS "${UPDATED_CMAKE_SHARED_LINKER_FLAGS}" PARENT_SCOPE)
     endif()
 
-    set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON PARENT_SCOPE)
     set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Xclang --dependent-lib=msvcrt" PARENT_SCOPE)
     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Xclang --dependent-lib=msvcrt" PARENT_SCOPE)
 endfunction()

From 144c9c48ad2b79ab8b23ce114eed88f09da4ccbe Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Thu, 2 Jan 2025 02:32:55 +0200
Subject: [PATCH 54/73] test: fix tests

---
 test/modelDependent/llama3.1/tokenPredictor.test.ts | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/modelDependent/llama3.1/tokenPredictor.test.ts b/test/modelDependent/llama3.1/tokenPredictor.test.ts
index ca91eb89..a549f39b 100644
--- a/test/modelDependent/llama3.1/tokenPredictor.test.ts
+++ b/test/modelDependent/llama3.1/tokenPredictor.test.ts
@@ -72,14 +72,14 @@ describe("llama 3.1", () => {
               ]
             `);
 
-            const text2Tokens = model.tokenize("can the");
+            const text2Tokens = model.tokenize("can be");
             predictor.pushTokens(text2Tokens);
 
             const predictedTokens4 = await predictor.predictTokens();
             expect(predictedTokens4.map((token) => model.detokenize([token], true))).toMatchInlineSnapshot(`
               [
-                " average",
-                " person",
+                " done",
+                " to",
               ]
             `);
         });

From b3ec8c26ab5e9713beb3af5f86f911cf030ad78b Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Thu, 2 Jan 2025 03:10:04 +0200
Subject: [PATCH 55/73] fix: Windows build

---
 llama/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama/CMakeLists.txt b/llama/CMakeLists.txt
index f2363083..c4698367 100644
--- a/llama/CMakeLists.txt
+++ b/llama/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.14)
 
-if (NLC_CURRENT_PLATFORM STREQUAL "win-x64")
+if (NLC_CURRENT_PLATFORM STREQUAL "win-x64" OR NLC_CURRENT_PLATFORM STREQUAL "win-arm64")
     set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
 endif()
 

From a08aaf15ae4097e823b745430f5137d3204d4456 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Thu, 2 Jan 2025 04:20:07 +0200
Subject: [PATCH 56/73] fix: prevent loading Vulkan if the device is
 unsupported

---
 llama/addon/addon.cpp                   |  1 +
 llama/addon/globals/getGpuInfo.cpp      | 11 +++++++++++
 llama/addon/globals/getGpuInfo.h        |  3 ++-
 llama/gpuInfo/vulkan-gpu-info.cpp       | 10 ++++++++++
 llama/gpuInfo/vulkan-gpu-info.h         |  1 +
 src/bindings/AddonTypes.ts              |  1 +
 src/bindings/Llama.ts                   |  2 ++
 src/bindings/utils/testBindingBinary.ts |  2 ++
 8 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/llama/addon/addon.cpp b/llama/addon/addon.cpp
index f2a6669c..943866c0 100644
--- a/llama/addon/addon.cpp
+++ b/llama/addon/addon.cpp
@@ -237,6 +237,7 @@ Napi::Object registerCallback(Napi::Env env, Napi::Object exports) {
         Napi::PropertyDescriptor::Function("getGpuVramInfo", getGpuVramInfo),
         Napi::PropertyDescriptor::Function("getGpuDeviceInfo", getGpuDeviceInfo),
         Napi::PropertyDescriptor::Function("getGpuType", getGpuType),
+        Napi::PropertyDescriptor::Function("ensureGpuDeviceIsSupported", ensureGpuDeviceIsSupported),
         Napi::PropertyDescriptor::Function("getSwapInfo", getSwapInfo),
         Napi::PropertyDescriptor::Function("getMemoryInfo", getMemoryInfo),
         Napi::PropertyDescriptor::Function("loadBackends", addonLoadBackends),
diff --git a/llama/addon/globals/getGpuInfo.cpp b/llama/addon/globals/getGpuInfo.cpp
index 61715c3c..172636ad 100644
--- a/llama/addon/globals/getGpuInfo.cpp
+++ b/llama/addon/globals/getGpuInfo.cpp
@@ -128,3 +128,14 @@ Napi::Value getGpuType(const Napi::CallbackInfo& info) {
 
     return info.Env().Undefined();
 }
+
+Napi::Value ensureGpuDeviceIsSupported(const Napi::CallbackInfo& info) {
+#ifdef GPU_INFO_USE_VULKAN
+    if (!checkIsVulkanEnvSupported()) {
+        Napi::Error::New(info.Env(), "Vulkan device is not supported").ThrowAsJavaScriptException();
+        return info.Env().Undefined();
+    }
+#endif
+
+    return info.Env().Undefined();
+}
diff --git a/llama/addon/globals/getGpuInfo.h b/llama/addon/globals/getGpuInfo.h
index ef8509ed..275f2e44 100644
--- a/llama/addon/globals/getGpuInfo.h
+++ b/llama/addon/globals/getGpuInfo.h
@@ -7,4 +7,5 @@
 Napi::Value getGpuVramInfo(const Napi::CallbackInfo& info);
 Napi::Value getGpuDeviceInfo(const Napi::CallbackInfo& info);
 std::pair<ggml_backend_dev_t, std::string> getGpuDevice();
-Napi::Value getGpuType(const Napi::CallbackInfo& info);
\ No newline at end of file
+Napi::Value getGpuType(const Napi::CallbackInfo& info);
+Napi::Value ensureGpuDeviceIsSupported(const Napi::CallbackInfo& info);
diff --git a/llama/gpuInfo/vulkan-gpu-info.cpp b/llama/gpuInfo/vulkan-gpu-info.cpp
index b47f92a8..52c30573 100644
--- a/llama/gpuInfo/vulkan-gpu-info.cpp
+++ b/llama/gpuInfo/vulkan-gpu-info.cpp
@@ -80,3 +80,13 @@ static bool enumerateVulkanDevices(size_t* total, size_t* used, size_t* unifiedM
 bool gpuInfoGetTotalVulkanDevicesInfo(size_t* total, size_t* used, size_t* unifiedMemorySize, gpuInfoVulkanWarningLogCallback_t warningLogCallback) {
     return enumerateVulkanDevices(total, used, unifiedMemorySize, false, nullptr, warningLogCallback);
 }
+
+bool checkIsVulkanEnvSupported() {
+    VkPhysicalDeviceVulkan11Features vk11_features;
+
+    if (!vk11_features.storageBuffer16BitAccess) {
+        return false;
+    }
+
+    return true;
+}
diff --git a/llama/gpuInfo/vulkan-gpu-info.h b/llama/gpuInfo/vulkan-gpu-info.h
index af03026e..01aeb9cf 100644
--- a/llama/gpuInfo/vulkan-gpu-info.h
+++ b/llama/gpuInfo/vulkan-gpu-info.h
@@ -6,3 +6,4 @@
 typedef void (*gpuInfoVulkanWarningLogCallback_t)(const char* message);
 
 bool gpuInfoGetTotalVulkanDevicesInfo(size_t* total, size_t* used, size_t* unifiedMemorySize, gpuInfoVulkanWarningLogCallback_t warningLogCallback);
+bool checkIsVulkanEnvSupported();
diff --git a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts
index d1ec4a12..5befaadb 100644
--- a/src/bindings/AddonTypes.ts
+++ b/src/bindings/AddonTypes.ts
@@ -72,6 +72,7 @@ export type BindingModule = {
         deviceNames: string[]
     },
     getGpuType(): "cuda" | "vulkan" | "metal" | false | undefined,
+    ensureGpuDeviceIsSupported(): void,
     getSwapInfo(): {
         total: number,
         maxSize: number,
diff --git a/src/bindings/Llama.ts b/src/bindings/Llama.ts
index dfeb5578..0ed6e7f8 100644
--- a/src/bindings/Llama.ts
+++ b/src/bindings/Llama.ts
@@ -108,6 +108,8 @@ export class Llama {
         if (loadedGpu == null || (loadedGpu === false && buildGpu !== false))
             bindings.loadBackends(path.dirname(bindingPath));
 
+        bindings.ensureGpuDeviceIsSupported();
+
         this._gpu = bindings.getGpuType() ?? false;
         this._supportsGpuOffloading = bindings.getSupportsGpuOffloading();
         this._supportsMmap = bindings.getSupportsMmap();
diff --git a/src/bindings/utils/testBindingBinary.ts b/src/bindings/utils/testBindingBinary.ts
index b85ac213..914acb34 100644
--- a/src/bindings/utils/testBindingBinary.ts
+++ b/src/bindings/utils/testBindingBinary.ts
@@ -211,6 +211,8 @@ if (process.env.TEST_BINDING_CP === "true" && (process.parentPort != null || pro
                 if (gpuType !== message.gpu)
                     throw new Error(`Binary GPU type mismatch. Expected: ${message.gpu}, got: ${gpuType}`);
 
+                binding.ensureGpuDeviceIsSupported();
+
                 sendMessage({type: "done"});
             } catch (err) {
                 console.error(err);

From 96987c01590efac45ddf92d50da52cbb79627e45 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Thu, 2 Jan 2025 04:35:12 +0200
Subject: [PATCH 57/73] fix: prevent loading Vulkan if the device is
 unsupported

---
 llama/addon/globals/getGpuInfo.cpp |  2 +-
 llama/gpuInfo/vulkan-gpu-info.cpp  | 35 +++++++++++++++++++++++-------
 llama/gpuInfo/vulkan-gpu-info.h    |  2 +-
 3 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/llama/addon/globals/getGpuInfo.cpp b/llama/addon/globals/getGpuInfo.cpp
index 172636ad..561c1896 100644
--- a/llama/addon/globals/getGpuInfo.cpp
+++ b/llama/addon/globals/getGpuInfo.cpp
@@ -131,7 +131,7 @@ Napi::Value getGpuType(const Napi::CallbackInfo& info) {
 
 Napi::Value ensureGpuDeviceIsSupported(const Napi::CallbackInfo& info) {
 #ifdef GPU_INFO_USE_VULKAN
-    if (!checkIsVulkanEnvSupported()) {
+    if (!checkIsVulkanEnvSupported(logVulkanWarning)) {
         Napi::Error::New(info.Env(), "Vulkan device is not supported").ThrowAsJavaScriptException();
         return info.Env().Undefined();
     }
diff --git a/llama/gpuInfo/vulkan-gpu-info.cpp b/llama/gpuInfo/vulkan-gpu-info.cpp
index 52c30573..9c192224 100644
--- a/llama/gpuInfo/vulkan-gpu-info.cpp
+++ b/llama/gpuInfo/vulkan-gpu-info.cpp
@@ -5,7 +5,7 @@
 
 typedef void (*gpuInfoVulkanWarningLogCallback_t)(const char* message);
 
-static bool enumerateVulkanDevices(size_t* total, size_t* used, size_t* unifiedMemorySize, bool addDeviceNames, std::vector<std::string> * deviceNames, gpuInfoVulkanWarningLogCallback_t warningLogCallback) {
+static bool enumerateVulkanDevices(size_t* total, size_t* used, size_t* unifiedMemorySize, bool addDeviceNames, std::vector<std::string> * deviceNames, gpuInfoVulkanWarningLogCallback_t warningLogCallback, bool * checkSupported) {
     vk::ApplicationInfo appInfo("node-llama-cpp GPU info", 1, "llama.cpp", 1, VK_API_VERSION_1_2);
     vk::InstanceCreateInfo createInfo(vk::InstanceCreateFlags(), &appInfo, {}, {});
     vk::Instance instance = vk::createInstance(createInfo);
@@ -56,6 +56,24 @@ static bool enumerateVulkanDevices(size_t* total, size_t* used, size_t* unifiedM
                     if (size > 0 && addDeviceNames) {
                         (*deviceNames).push_back(std::string(deviceProps.deviceName.data()));
                     }
+
+                    if (checkSupported != nullptr && checkSupported) {
+                        VkPhysicalDeviceFeatures2 device_features2;
+                        device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
+                        device_features2.pNext = nullptr;
+                        device_features2.features = (VkPhysicalDeviceFeatures)physicalDevice.getFeatures();
+
+                        VkPhysicalDeviceVulkan11Features vk11_features;
+                        vk11_features.pNext = nullptr;
+                        vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;
+                        device_features2.pNext = &vk11_features;
+
+                        vkGetPhysicalDeviceFeatures2(physicalDevice, &device_features2);
+
+                        if (!vk11_features.storageBuffer16BitAccess) {
+                            checkSupported = false;
+                        }
+                    }
                 }
             }
         } else {
@@ -78,15 +96,16 @@ static bool enumerateVulkanDevices(size_t* total, size_t* used, size_t* unifiedM
 }
 
 bool gpuInfoGetTotalVulkanDevicesInfo(size_t* total, size_t* used, size_t* unifiedMemorySize, gpuInfoVulkanWarningLogCallback_t warningLogCallback) {
-    return enumerateVulkanDevices(total, used, unifiedMemorySize, false, nullptr, warningLogCallback);
+    return enumerateVulkanDevices(total, used, unifiedMemorySize, false, nullptr, warningLogCallback, nullptr);
 }
 
-bool checkIsVulkanEnvSupported() {
-    VkPhysicalDeviceVulkan11Features vk11_features;
+bool checkIsVulkanEnvSupported(gpuInfoVulkanWarningLogCallback_t warningLogCallback) {
+    size_t total = 0;
+    size_t used = 0;
+    size_t unifiedMemorySize = 0;
 
-    if (!vk11_features.storageBuffer16BitAccess) {
-        return false;
-    }
+    bool isSupported = true;
+    enumerateVulkanDevices(&total, &used, &unifiedMemorySize, false, nullptr, warningLogCallback, &isSupported);
 
-    return true;
+    return isSupported;
 }
diff --git a/llama/gpuInfo/vulkan-gpu-info.h b/llama/gpuInfo/vulkan-gpu-info.h
index 01aeb9cf..09f63406 100644
--- a/llama/gpuInfo/vulkan-gpu-info.h
+++ b/llama/gpuInfo/vulkan-gpu-info.h
@@ -6,4 +6,4 @@
 typedef void (*gpuInfoVulkanWarningLogCallback_t)(const char* message);
 
 bool gpuInfoGetTotalVulkanDevicesInfo(size_t* total, size_t* used, size_t* unifiedMemorySize, gpuInfoVulkanWarningLogCallback_t warningLogCallback);
-bool checkIsVulkanEnvSupported();
+bool checkIsVulkanEnvSupported(gpuInfoVulkanWarningLogCallback_t warningLogCallback);

From 26fcb2aec8312f058877ccec234bd59a00351e49 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Thu, 2 Jan 2025 04:38:35 +0200
Subject: [PATCH 58/73] fix: bug

---
 llama/gpuInfo/vulkan-gpu-info.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama/gpuInfo/vulkan-gpu-info.cpp b/llama/gpuInfo/vulkan-gpu-info.cpp
index 9c192224..da915f4c 100644
--- a/llama/gpuInfo/vulkan-gpu-info.cpp
+++ b/llama/gpuInfo/vulkan-gpu-info.cpp
@@ -71,7 +71,7 @@ static bool enumerateVulkanDevices(size_t* total, size_t* used, size_t* unifiedM
                         vkGetPhysicalDeviceFeatures2(physicalDevice, &device_features2);
 
                         if (!vk11_features.storageBuffer16BitAccess) {
-                            checkSupported = false;
+                            *checkSupported = false;
                         }
                     }
                 }

From 1c8347613c2f2c9db83486e3ebe2c3ce67f0cfe8 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Thu, 2 Jan 2025 04:43:38 +0200
Subject: [PATCH 59/73] fix: bug

---
 llama/gpuInfo/vulkan-gpu-info.cpp | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/llama/gpuInfo/vulkan-gpu-info.cpp b/llama/gpuInfo/vulkan-gpu-info.cpp
index da915f4c..d9bf40f6 100644
--- a/llama/gpuInfo/vulkan-gpu-info.cpp
+++ b/llama/gpuInfo/vulkan-gpu-info.cpp
@@ -58,19 +58,17 @@ static bool enumerateVulkanDevices(size_t* total, size_t* used, size_t* unifiedM
                     }
 
                     if (checkSupported != nullptr && checkSupported) {
-                        VkPhysicalDeviceFeatures2 device_features2;
-                        device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
-                        device_features2.pNext = nullptr;
-                        device_features2.features = (VkPhysicalDeviceFeatures)physicalDevice.getFeatures();
+                        VkPhysicalDeviceFeatures2 features2 = {};
+                        features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
 
-                        VkPhysicalDeviceVulkan11Features vk11_features;
-                        vk11_features.pNext = nullptr;
-                        vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;
-                        device_features2.pNext = &vk11_features;
+                        VkPhysicalDeviceVulkan11Features vk11Features = {};
+                        vk11Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;
+                        features2.pNext = &vk11Features;
 
-                        vkGetPhysicalDeviceFeatures2(physicalDevice, &device_features2);
+                        vkGetPhysicalDeviceFeatures2(physicalDevice, &features2);
+                        VkPhysicalDeviceFeatures2 device_features2;
 
-                        if (!vk11_features.storageBuffer16BitAccess) {
+                        if (!vk11Features.storageBuffer16BitAccess) {
                             *checkSupported = false;
                         }
                     }

From 9b0139732bc6926f4cbf12744bb8d169faeb9290 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sat, 4 Jan 2025 18:29:02 +0200
Subject: [PATCH 60/73] fix: remove deprecated CLS token

---
 llama/addon/AddonModel.cpp             | 16 +++++-------
 llama/addon/AddonModel.h               |  1 -
 src/bindings/AddonTypes.ts             |  1 -
 src/evaluator/LlamaModel/LlamaModel.ts | 35 --------------------------
 src/utils/LlamaText.ts                 |  4 +--
 src/utils/tokenizerUtils.ts            |  2 +-
 6 files changed, 9 insertions(+), 50 deletions(-)

diff --git a/llama/addon/AddonModel.cpp b/llama/addon/AddonModel.cpp
index a76c2bbc..6ddf9705 100644
--- a/llama/addon/AddonModel.cpp
+++ b/llama/addon/AddonModel.cpp
@@ -515,7 +515,12 @@ Napi::Value AddonModel::TokenBos(const Napi::CallbackInfo& info) {
         return info.Env().Undefined();
     }
 
-    return getNapiControlToken(info, model, llama_token_bos(model));
+    auto token = llama_token_bos(model);
+    if (token == LLAMA_TOKEN_NULL) {
+        token = llama_token_cls(model);
+    }
+
+    return getNapiControlToken(info, model, token);
 }
 Napi::Value AddonModel::TokenEos(const Napi::CallbackInfo& info) {
     if (disposed) {
@@ -565,14 +570,6 @@ Napi::Value AddonModel::EotToken(const Napi::CallbackInfo& info) {
 
     return getNapiToken(info, model, llama_token_eot(model));
 }
-Napi::Value AddonModel::ClsToken(const Napi::CallbackInfo& info) {
-    if (disposed) {
-        Napi::Error::New(info.Env(), "Model is disposed").ThrowAsJavaScriptException();
-        return info.Env().Undefined();
-    }
-
-    return getNapiToken(info, model, llama_token_cls(model));
-}
 Napi::Value AddonModel::SepToken(const Napi::CallbackInfo& info) {
     if (disposed) {
         Napi::Error::New(info.Env(), "Model is disposed").ThrowAsJavaScriptException();
@@ -678,7 +675,6 @@ void AddonModel::init(Napi::Object exports) {
                 InstanceMethod("middleToken", &AddonModel::MiddleToken),
                 InstanceMethod("suffixToken", &AddonModel::SuffixToken),
                 InstanceMethod("eotToken", &AddonModel::EotToken),
-                InstanceMethod("clsToken", &AddonModel::ClsToken),
                 InstanceMethod("sepToken", &AddonModel::SepToken),
                 InstanceMethod("getTokenString", &AddonModel::GetTokenString),
                 InstanceMethod("getTokenAttributes", &AddonModel::GetTokenAttributes),
diff --git a/llama/addon/AddonModel.h b/llama/addon/AddonModel.h
index a0efd6a1..50d47fef 100644
--- a/llama/addon/AddonModel.h
+++ b/llama/addon/AddonModel.h
@@ -49,7 +49,6 @@ class AddonModel : public Napi::ObjectWrap<AddonModel> {
         Napi::Value MiddleToken(const Napi::CallbackInfo& info);
         Napi::Value SuffixToken(const Napi::CallbackInfo& info);
         Napi::Value EotToken(const Napi::CallbackInfo& info);
-        Napi::Value ClsToken(const Napi::CallbackInfo& info);
         Napi::Value SepToken(const Napi::CallbackInfo& info);
         Napi::Value GetTokenString(const Napi::CallbackInfo& info);
 
diff --git a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts
index 5befaadb..711eb41f 100644
--- a/src/bindings/AddonTypes.ts
+++ b/src/bindings/AddonTypes.ts
@@ -105,7 +105,6 @@ export type AddonModel = {
     middleToken(): Token,
     suffixToken(): Token,
     eotToken(): Token,
-    clsToken(): Token,
     sepToken(): Token,
     getTokenString(token: number): string,
     getTokenAttributes(token: Token): number,
diff --git a/src/evaluator/LlamaModel/LlamaModel.ts b/src/evaluator/LlamaModel/LlamaModel.ts
index 208c7a9c..6e8fac18 100644
--- a/src/evaluator/LlamaModel/LlamaModel.ts
+++ b/src/evaluator/LlamaModel/LlamaModel.ts
@@ -336,7 +336,6 @@ export class LlamaModel {
                 case "EOS": return this.tokens.eos == null ? [] : [this.tokens.eos];
                 case "NL": return this.tokens.nl == null ? [] : [this.tokens.nl];
                 case "EOT": return this.tokens.eot == null ? [] : [this.tokens.eot];
-                case "CLS": return this.tokens.cls == null ? [] : [this.tokens.cls];
                 case "SEP": return this.tokens.sep == null ? [] : [this.tokens.sep];
             }
 
@@ -761,13 +760,11 @@ export class LlamaModelTokens {
     /** @internal */ private _bosToken?: Token;
     /** @internal */ private _eosToken?: Token;
     /** @internal */ private _eotToken?: Token;
-    /** @internal */ private _clsToken?: Token;
     /** @internal */ private _sepToken?: Token;
     /** @internal */ private _nlToken?: Token;
     /** @internal */ private _bosString?: string;
     /** @internal */ private _eosString?: string;
     /** @internal */ private _eotString?: string;
-    /** @internal */ private _clsString?: string;
     /** @internal */ private _sepString?: string;
     /** @internal */ private _nlString?: string;
     /** @internal */ private _shouldPrependBosToken?: boolean;
@@ -835,21 +832,6 @@ export class LlamaModelTokens {
         return this._eotToken;
     }
 
-    /**
-     * @returns The CLS (Classification) token.
-     */
-    public get cls(): Token | null {
-        this._ensureNotDisposed();
-
-        if (this._clsToken == null)
-            this._clsToken = this._model.clsToken();
-
-        if (this._clsToken === -1)
-            return null;
-
-        return this._clsToken;
-    }
-
     /**
      * @returns The SEP (Sentence Separator) token.
      */
@@ -931,23 +913,6 @@ export class LlamaModelTokens {
         return this._eotString;
     }
 
-    /**
-     * @returns The CLS (Classification) token text representation.
-     */
-    public get clsString(): string | null {
-        this._ensureNotDisposed();
-
-        const clsToken = this.cls;
-
-        if (clsToken == null)
-            return null;
-
-        if (this._clsString == null)
-            this._clsString = this._model.getTokenString(clsToken);
-
-        return this._clsString;
-    }
-
     /**
      * @returns The SEP (Sentence Separator) token text representation.
      */
diff --git a/src/utils/LlamaText.ts b/src/utils/LlamaText.ts
index f7a2323c..50fa13db 100644
--- a/src/utils/LlamaText.ts
+++ b/src/utils/LlamaText.ts
@@ -515,7 +515,7 @@ export class SpecialTokensText {
     }
 }
 
-export type BuiltinSpecialTokenValue = "BOS" | "EOS" | "NL" | "EOT" | "CLS" | "SEP";
+export type BuiltinSpecialTokenValue = "BOS" | "EOS" | "NL" | "EOT" | "SEP";
 export class SpecialToken {
     public readonly value: BuiltinSpecialTokenValue;
 
@@ -568,7 +568,7 @@ export class SpecialToken {
 
     public static getTokenToValueMap(tokenizer: Tokenizer): ReadonlyMap<Token | undefined, BuiltinSpecialTokenValue> {
         const supportedValues = [
-            "BOS", "EOS", "NL", "EOT", "CLS", "SEP"
+            "BOS", "EOS", "NL", "EOT", "SEP"
         ] as const satisfies BuiltinSpecialTokenValue[];
         void (0 as any as BuiltinSpecialTokenValue satisfies typeof supportedValues[number]);
 
diff --git a/src/utils/tokenizerUtils.ts b/src/utils/tokenizerUtils.ts
index 987d45dd..71e9dbb5 100644
--- a/src/utils/tokenizerUtils.ts
+++ b/src/utils/tokenizerUtils.ts
@@ -10,7 +10,7 @@ export function resolveBeginningTokenToPrepend(vocabularyType: LlamaVocabularyTy
         return null;
 
     if (vocabularyType === LlamaVocabularyType.wpm)
-        return tokens.cls ?? tokens.bos; // https://github.com/ggerganov/llama.cpp/pull/10930
+        return tokens.bos;
 
     if (tokens.shouldPrependBosToken)
         return tokens.bos;

From f050fa453af81085d9d6e355d7b4c14e5a5b76f8 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sat, 4 Jan 2025 18:31:24 +0200
Subject: [PATCH 61/73] fix: bug

---
 src/evaluator/LlamaContext/LlamaContext.ts | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts
index ab579581..6caa6c4c 100644
--- a/src/evaluator/LlamaContext/LlamaContext.ts
+++ b/src/evaluator/LlamaContext/LlamaContext.ts
@@ -12,6 +12,7 @@ import {UnsupportedError} from "../../utils/UnsupportedError.js";
 import {ThreadsSplitterConsumer} from "../../utils/ThreadsSplitter.js";
 import {pushAll} from "../../utils/pushAll.js";
 import {safeEventCallback} from "../../utils/safeEventCallback.js";
+import {GgufArchitectureType} from "../../gguf/types/GgufMetadataTypes.js";
 import {
     BatchingOptions, BatchItem, ContextShiftOptions, ContextTokensDeleteRange, ControlledEvaluateIndexOutput, ControlledEvaluateInputItem,
     EvaluationPriority, LlamaContextOptions, LlamaContextSequenceRepeatPenalty, PrioritizedBatchItem, SequenceEvaluateOptions
@@ -1079,7 +1080,10 @@ export class LlamaContextSequence {
      * which incurs token evaluation of the shifted tokens.
      */
     public async adaptStateToTokens(tokens: Token[], allowShift: boolean = true) {
-        if (this.model.fileInsights.isRecurrent || !allowShift) {
+        const modelSupportsShifting = !this.model.fileInsights.isRecurrent &&
+            this.model.fileInfo.metadata?.general?.architecture !== GgufArchitectureType.deepseek2;
+
+        if (!modelSupportsShifting || !allowShift) {
             const {firstDifferentIndex} = this.compareContextTokens(tokens);
             if (firstDifferentIndex < this.nextTokenIndex)
                 await this._eraseContextTokenRanges([{

From 62ee6e3f19340f3bae0b462ce11ccc3ec800cac5 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sun, 5 Jan 2025 01:15:43 +0200
Subject: [PATCH 62/73] feat: `evaluateWithMetadata`, token confidence

---
 .config/typedoc.json                          |   2 +-
 docs/guide/low-level-api.md                   |  85 ++-
 llama/addon/AddonContext.cpp                  | 113 +++-
 src/bindings/AddonTypes.ts                    |   5 +-
 src/evaluator/LlamaContext/LlamaContext.ts    | 183 ++++--
 .../DraftSequenceTokenPredictor.ts            |  50 +-
 src/evaluator/LlamaContext/types.ts           |  87 ++-
 src/evaluator/utils/chunkDocument.ts          |   2 +-
 src/index.ts                                  |   6 +-
 src/utils/utilTypes.ts                        |  11 +
 .../llama3.1/controlledEvaluate.test.ts       | 163 ++++++
 .../llama3.1/evaluateWithMetadata.test.ts     | 553 ++++++++++++++++++
 .../llama3.1/tokenPredictor.test.ts           |  27 +-
 13 files changed, 1180 insertions(+), 107 deletions(-)
 create mode 100644 test/modelDependent/llama3.1/controlledEvaluate.test.ts
 create mode 100644 test/modelDependent/llama3.1/evaluateWithMetadata.test.ts

diff --git a/.config/typedoc.json b/.config/typedoc.json
index 3a4e2c22..6cae16a3 100644
--- a/.config/typedoc.json
+++ b/.config/typedoc.json
@@ -27,6 +27,6 @@
     "interfacePropertiesFormat": "list",
     "sort": ["source-order"],
     "docsRoot": "../docs",
-    "intentionallyNotExported": ["MergeOptionalUnionTypes", "GbnfJsonSchemaToTSType", "_LlamaText"],
+    "intentionallyNotExported": ["MergeOptionalUnionTypes", "PickOptions", "GbnfJsonSchemaToTSType", "_LlamaText"],
     "useHTMLEncodedBrackets": true
 }
diff --git a/docs/guide/low-level-api.md b/docs/guide/low-level-api.md
index a92104fc..bf478af1 100644
--- a/docs/guide/low-level-api.md
+++ b/docs/guide/low-level-api.md
@@ -38,13 +38,25 @@ and you can pass no sampling options to avoid making any adjustments to the prob
 It's best to avoid getting the full probabilities list unless you really need it,
 as passing it to the JavaScript side can be slow.
 
+### Context Shift {#context-shift}
+When the context sequence is full and you want to evaluate more tokens onto it,
+some tokens will have to be removed to make room for new ones to be added.
+
+Ideally, you'd want to do that on your logic level, so you can control which content to keep and which to remove.
+> All the high-level APIs of `node-llama-cpp` [automatically do that](./chat-context-shift.md).
+
+If you don't do that, `node-llama-cpp` will automatically remove the oldest tokens from the context sequence state to make room for new ones.
+
+You can customize the context shift strategy `node-llama-cpp` uses for the context sequence by configuring the [`contextShift`](../api/classes/LlamaContext.md#parameters) option when calling [`.getSequence(...)`](../api/classes/LlamaContext.md#getsequence),
+or by passing a customized the [`contextShift`](../api/type-aliases/SequenceEvaluateOptions#contextshift) option to the evaluation method you use.
+
 ## Simple Evaluation {#simple-evaluation}
-You can evaluate the given input tokens onto a context sequence using [`.evaluate`](../api/classes/LlamaContextSequence.md#evaluate)
+You can evaluate the given input tokens onto a context sequence using [`.evaluate(...)`](../api/classes/LlamaContextSequence.md#evaluate)
 and generate the next token for the last input token.
 
 On each iteration of the returned iterator, the generated token is then added to the context sequence state and the next token is generated for it, and so on.
 
-When using [`.evaluate`](../api/classes/LlamaContextSequence.md#evaluate), the configured [token predictor](./token-prediction.md) is used to speed up the generation process.
+When using [`.evaluate(...)`](../api/classes/LlamaContextSequence.md#evaluate), the configured [token predictor](./token-prediction.md) is used to speed up the generation process.
 
 ```typescript
 import {fileURLToPath} from "url";
@@ -130,9 +142,67 @@ console.log("Result: " + resText);
 ```
 > If you want to adjust the token probabilities when generating output, consider using [token bias](./token-bias.md) instead
 
+### With Metadata {#evaluation-with-metadata}
+You can use [`.evaluateWithMetadata(...)`](../api/classes/LlamaContextSequence.md#evaluatewithmetadata) to evaluate tokens onto the context sequence state like [`.evaluate(...)`](#simple-evaluation), but with metadata emitted for each token.
+
+```typescript
+import {fileURLToPath} from "url";
+import path from "path";
+import {getLlama, Token, SequenceEvaluateOptions} from "node-llama-cpp";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+const llama = await getLlama();
+const model = await llama.loadModel({
+    modelPath: path.join(__dirname, "models", "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf")
+});
+const context = await model.createContext();
+const sequence = context.getSequence();
+
+const input = "The best way to";
+const tokens = model.tokenize(input);
+const maxTokens = 10;
+const res: Array<{
+    token: Token,
+    confidence: number,
+    probabilities: Map<Token, number>
+}> = [];
+const metadataOptions = {
+    // configure which metadata should be returned
+    confidence: true,
+    probabilities: true
+} as const;
+const options: SequenceEvaluateOptions = {
+    temperature: 0.8
+};
+
+const iterator = sequence.evaluateWithMetadata(
+    tokens,
+    metadataOptions,
+    options
+);
+for await (const item of iterator) {
+    res.push({
+        token: item.token,
+        confidence: item.confidence,
+        probabilities: new Map(
+            // only keep the top 5 probabilities
+            [...item.probabilities.entries()].slice(0, 5)
+        )
+    });
+
+    if (res.length >= maxTokens)
+        break;
+}
+
+const resText = model.detokenize(res.map(({token}) => token));
+console.log("Result: " + resText);
+console.log("With metadata:", res);
+```
+
 ### No Generation {#evaluation-without-generation}
 To evaluate the input tokens onto a context sequence without generating new tokens,
-you can use [`.evaluateWithoutGeneratingNewTokens`](../api/classes/LlamaContextSequence.md#evaluatewithoutgeneratingnewtokens).
+you can use [`.evaluateWithoutGeneratingNewTokens(...)`](../api/classes/LlamaContextSequence.md#evaluatewithoutgeneratingnewtokens).
 
 ```typescript
 import {fileURLToPath} from "url";
@@ -154,7 +224,8 @@ await sequence.evaluateWithoutGeneratingNewTokens(tokens);
 ```
 
 ## Controlled Evaluation {#controlled-evaluation}
-To manually control for which of the input tokens to generate output, you can use [`.controlledEvaluate`](../api/classes/LlamaContextSequence.md#controlledevaluate).
+To manually control for which of the input tokens to generate output,
+you can use [`.controlledEvaluate(...)`](../api/classes/LlamaContextSequence.md#controlledevaluate).
 
 ```typescript
 import {fileURLToPath} from "url";
@@ -179,8 +250,8 @@ const lastToken = evaluateInput.pop() as Token;
 if (lastToken != null)
     evaluateInput.push([lastToken, {
         generateNext: {
-            singleToken: true,
-            probabilitiesList: true,
+            token: true,
+            probabilities: true,
             options: {
                 temperature: 0.8
             }
@@ -222,7 +293,7 @@ as it may lead to unexpected results.
 
 ### Erase State Ranges {#erase-state-ranges}
 To erase a range of tokens from the context sequence state,
-you can use [`.eraseContextTokenRanges`](../api/classes/LlamaContextSequence.md#erasecontexttokenranges).
+you can use [`.eraseContextTokenRanges(...)`](../api/classes/LlamaContextSequence.md#erasecontexttokenranges).
 
 ```typescript
 import {fileURLToPath} from "url";
diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp
index 3cfb9567..3fd2af2b 100644
--- a/llama/addon/AddonContext.cpp
+++ b/llama/addon/AddonContext.cpp
@@ -191,11 +191,13 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker {
         AddonContext* ctx;
         AddonSampler* sampler;
         bool arrayResult = false;
-        bool returnLogprobs = false;
-        bool has_logprobs = false;
-        size_t logprobs_size;
-        llama_token * logprobs_tokens;
-        float * logprobs_probs;
+        bool returnProbabilities = false;
+        bool returnConfidence = false;
+        float tokenConfidence = -1;
+        bool has_probabilities = false;
+        size_t probabilities_size;
+        llama_token * probabilities_tokens;
+        float * probabilities_probs;
         int32_t batchLogitIndex;
         llama_token result;
         bool no_output = false;
@@ -209,16 +211,17 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker {
             batchLogitIndex = info[0].As<Napi::Number>().Int32Value();
             sampler = Napi::ObjectWrap<AddonSampler>::Unwrap(info[1].As<Napi::Object>());
             arrayResult = info.Length() > 2 && info[2].IsBoolean();
-            returnLogprobs = arrayResult ? info[2].As<Napi::Boolean>().Value() : false;
+            returnProbabilities = arrayResult ? info[2].As<Napi::Boolean>().Value() : false;
+            returnConfidence = arrayResult && info.Length() > 3 && info[3].IsBoolean() ? info[3].As<Napi::Boolean>().Value() : false;
             sampler->Ref();
         }
         ~AddonContextSampleTokenWorker() {
             ctx->Unref();
             sampler->Unref();
 
-            if (has_logprobs) {
-                delete[] logprobs_tokens;
-                delete[] logprobs_probs;
+            if (has_probabilities) {
+                delete[] probabilities_tokens;
+                delete[] probabilities_probs;
             }
         }
 
@@ -264,32 +267,84 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker {
 
             llama_sampler_apply(sampler->chain, &cur_p);
 
-            if (returnLogprobs) {
+            if (!(cur_p.selected >= 0 && cur_p.selected < (int32_t)cur_p.size)) {
+                no_output = true;
+                return;
+            }
+
+            auto new_token_id = cur_p.data[cur_p.selected].id;
+
+            if (returnProbabilities || returnConfidence) {
                 if (!cur_p.sorted) {
                     std::sort(cur_p.data, cur_p.data + cur_p.size, [](const llama_token_data & a, const llama_token_data & b) {
                         return a.logit > b.logit;
                     });
                     cur_p.sorted = true;
+
+                    for (size_t i = 0; i < cur_p.size; i++) {
+                        if (cur_p.data[i].id == new_token_id) {
+                            cur_p.selected = i;
+                            break;
+                        }
+                    }
                 }
+            }
 
-                logprobs_size = cur_p.size;
-                logprobs_tokens = new llama_token[logprobs_size];
-                logprobs_probs = new float[logprobs_size];
+            if (returnProbabilities) {
+                probabilities_size = cur_p.size;
+                probabilities_tokens = new llama_token[probabilities_size];
+                probabilities_probs = new float[probabilities_size];
+                float maxLogit = cur_p.size > 0 ? cur_p.data[0].logit : -INFINITY;
 
                 for (size_t i = 0; i < cur_p.size; i++) {
-                    logprobs_tokens[i] = cur_p.data[i].id;
-                    logprobs_probs[i] = cur_p.data[i].logit;
+                    auto logit = cur_p.data[i].logit;
+
+                    probabilities_tokens[i] = cur_p.data[i].id;
+                    probabilities_probs[i] = logit;
+
+                    if (logit > maxLogit) {
+                        maxLogit = logit;
+                    }
+                }
+
+                if (probabilities_size > 0 && maxLogit != -INFINITY) {
+                    float sum = 0.0f;
+                    for (size_t i = 0; i < probabilities_size; i++) {
+                        float prob = expf(probabilities_probs[i] - maxLogit);
+                        probabilities_probs[i] = prob;
+                        sum += prob;
+                    }
+
+                    for (size_t i = 0; i < probabilities_size; i++) {
+                        probabilities_probs[i] /= sum;
+                    }
                 }
 
-                has_logprobs = true;
+                has_probabilities = true;
             }
 
-            if (!(cur_p.selected >= 0 && cur_p.selected < (int32_t)cur_p.size)) {
-                no_output = true;
-                return;
+            if (returnConfidence) {
+                if (has_probabilities && cur_p.selected < probabilities_size) {
+                    tokenConfidence = probabilities_probs[cur_p.selected];
+                } else {
+                    float maxLogit = cur_p.data[0].logit;
+                    float sum = 0.0f;
+                    for (size_t i = 0; i < cur_p.size; i++) {
+                        auto logit = cur_p.data[i].logit;
+
+                        if (logit > maxLogit) {
+                            maxLogit = logit;
+                        }
+                    }
+
+                    for (size_t i = 0; i < cur_p.size; i++) {
+                        sum += expf(cur_p.data[i].logit - maxLogit);
+                    }
+
+                    tokenConfidence = expf(cur_p.data[cur_p.selected].logit - maxLogit) / sum;
+                }
             }
 
-            auto new_token_id = cur_p.data[cur_p.selected].id;
             sampler->acceptToken(new_token_id);
             result = new_token_id;
         }
@@ -308,14 +363,18 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker {
 
             Napi::Array resultArray = Napi::Array::New(Env(), 2);
             resultArray.Set(Napi::Number::New(Env(), 0), resultToken);
-            
-            if (has_logprobs) {
-                Napi::Array logprobs = Napi::Array::New(Env(), logprobs_size * 2);
-                for (size_t i = 0; i < logprobs_size; i++) {
-                    logprobs.Set(i * 2, Napi::Number::New(Env(), logprobs_tokens[i]));
-                    logprobs.Set(i * 2 + 1, Napi::Number::New(Env(), logprobs_probs[i]));
+
+            if (has_probabilities) {
+                Napi::Array probabilities = Napi::Array::New(Env(), probabilities_size * 2);
+                for (size_t i = 0; i < probabilities_size; i++) {
+                    probabilities.Set(i * 2, Napi::Number::New(Env(), probabilities_tokens[i]));
+                    probabilities.Set(i * 2 + 1, Napi::Number::New(Env(), probabilities_probs[i]));
                 }
-                resultArray.Set(1, logprobs);
+                resultArray.Set(1, probabilities);
+            }
+
+            if (returnConfidence && tokenConfidence != -1) {
+                resultArray.Set(2, Napi::Number::New(Env(), tokenConfidence));
             }
 
             deferred.Resolve(resultArray);
diff --git a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts
index 711eb41f..7c575f88 100644
--- a/src/bindings/AddonTypes.ts
+++ b/src/bindings/AddonTypes.ts
@@ -131,8 +131,9 @@ export type AddonContext = {
     sampleToken(
         batchLogitIndex: BatchLogitIndex,
         sampler: AddonSampler,
-        logprobs: boolean
-    ): Promise<[Token | -1, (Token | number)[] | undefined]>,
+        probabilities: boolean,
+        confidence?: boolean
+    ): Promise<[token: Token | -1, probabilities: (Token | number)[] | undefined, confidence: number | undefined]>,
     disposeSequence(sequenceId: number): void,
 
     // startPos in inclusive, endPos is exclusive
diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts
index 6caa6c4c..9844b0db 100644
--- a/src/evaluator/LlamaContext/LlamaContext.ts
+++ b/src/evaluator/LlamaContext/LlamaContext.ts
@@ -15,7 +15,8 @@ import {safeEventCallback} from "../../utils/safeEventCallback.js";
 import {GgufArchitectureType} from "../../gguf/types/GgufMetadataTypes.js";
 import {
     BatchingOptions, BatchItem, ContextShiftOptions, ContextTokensDeleteRange, ControlledEvaluateIndexOutput, ControlledEvaluateInputItem,
-    EvaluationPriority, LlamaContextOptions, LlamaContextSequenceRepeatPenalty, PrioritizedBatchItem, SequenceEvaluateOptions
+    EvaluationPriority, LlamaContextOptions, LlamaContextSequenceRepeatPenalty, PrioritizedBatchItem, SequenceEvaluateMetadataOptions,
+    SequenceEvaluateOptions, SequenceEvaluateOutput
 } from "./types.js";
 import {resolveBatchItemsPrioritizationStrategy} from "./utils/resolveBatchItemsPrioritizationStrategy.js";
 import {LlamaSampler} from "./LlamaSampler.js";
@@ -918,7 +919,10 @@ export class LlamaContextSequence {
     /** @internal */ private _tokenPredictorOwner: {} = {};
     /** @internal */ public _contextTokens: Token[] = [];
     /** @internal */ private _nextTokenIndex: number = 0;
-    /** @internal */ private _loadedTokenPredictions: [input: Token, output: Token][] = [];
+    /** @internal */ private _loadedTokenPredictions: Array<[
+        input: Token,
+        output: [token: Token, probabilities: (Token | number)[] | undefined, confidence: number | undefined]
+    ]> = [];
     /** @internal */ private _usedTokenPredictions: number = 0;
     /** @internal */ private _unusedTokenPredictions: number = 0;
     /** @internal */ private _validatedTokenPredictions: number = 0;
@@ -1266,7 +1270,33 @@ export class LlamaContextSequence {
      *
      * This method uses the token predictor (when provided) to generate new tokens faster.
      */
-    public evaluate(tokens: Token[], options: SequenceEvaluateOptions = {}): AsyncGenerator<Token, void, void | Token | Token[]> {
+    public async *evaluate(tokens: Token[], options: SequenceEvaluateOptions = {}): AsyncGenerator<Token, void, void | Token | Token[]> {
+        const iterator = this.evaluateWithMetadata(tokens, {}, options);
+        let iterateInput: void | Token | Token[] = undefined;
+
+        try {
+            while (true) {
+                const {value, done} = await iterator.next(iterateInput);
+                if (done)
+                    return;
+
+                iterateInput = yield value.token;
+            }
+        } finally {
+            await iterator.return();
+        }
+    }
+
+    /**
+     * Like {@link evaluate `.evaluate(...)`}, but with additional metadata for each generated token.
+     *
+     * Configure the additional metadata options to choose which metadata to include.
+     */
+    public evaluateWithMetadata<const Metadata extends SequenceEvaluateMetadataOptions>(
+        tokens: Token[],
+        metadata: Metadata,
+        options: SequenceEvaluateOptions = {}
+    ): AsyncGenerator<SequenceEvaluateOutput<Metadata>, void, void | Token | Token[]> {
         const {
             temperature = 0,
             minP = 0,
@@ -1287,7 +1317,7 @@ export class LlamaContextSequence {
         } = options;
 
         if (this._tokenPredictor != null && !_noSampling && tokens.length > 0)
-            return this._speculativeEvaluate(tokens, {
+            return this._speculativeEvaluate(tokens, metadata, {
                 temperature,
                 minP,
                 topK,
@@ -1305,7 +1335,7 @@ export class LlamaContextSequence {
                 tokenPredictor: this._tokenPredictor
             });
 
-        return this._evaluate(tokens, {
+        return this._evaluate(tokens, metadata, {
             temperature,
             minP,
             topK,
@@ -1355,7 +1385,7 @@ export class LlamaContextSequence {
             _skipLock = false
         } = options;
 
-        const iterator = this._evaluate(tokens, {
+        const iterator = this._evaluate(tokens, {}, {
             generateNewTokens: false,
             evaluationPriority,
             contextShiftOptions: {
@@ -1387,6 +1417,8 @@ export class LlamaContextSequence {
             // Array.from doesn't work with async generators, so we have to iterate over the generator
         }
 
+        await iterator.return();
+
         if (predictorAlignmentPromise != null)
             await predictorAlignmentPromise;
     }
@@ -1453,7 +1485,7 @@ export class LlamaContextSequence {
             if (item instanceof Array) {
                 const [token, options] = item;
                 const generateNext = options?.generateNext ?? {};
-                if (generateNext.probabilitiesList || generateNext.singleToken === true || generateNext.singleToken != null)
+                if (generateNext.probabilities === true || generateNext.confidence === true || generateNext.token === true)
                     logitsArray[index] = true;
 
                 return token;
@@ -1478,8 +1510,9 @@ export class LlamaContextSequence {
                     const generateNext = inputOptions.generateNext;
 
                     if (generateNext == null || (
-                        (generateNext.probabilitiesList == null || !generateNext.probabilitiesList) &&
-                        (generateNext.singleToken == null || !generateNext.singleToken)
+                        (generateNext.probabilities == null || !generateNext.probabilities) &&
+                        (generateNext.token == null || !generateNext.token) &&
+                        (generateNext.confidence == null || !generateNext.confidence)
                     ))
                         return undefined;
 
@@ -1499,22 +1532,28 @@ export class LlamaContextSequence {
                             return undefined;
 
                         sampler.applyConfig(samplerConfig);
-                        const [token, probabilitiesList] = await this._context._ctx.sampleToken(
+                        const [token, probabilities, confidence] = await this._context._ctx.sampleToken(
                             batchLogitIndex,
                             sampler._sampler,
-                            !!generateNext.probabilitiesList
+                            !!generateNext.probabilities,
+                            !!generateNext.confidence
                         );
 
                         const output: ControlledEvaluateIndexOutput = {
-                            next: {
-                                token: generateNext.singleToken
-                                    ? token === -1
-                                        ? null
-                                        : (token ?? null)
-                                    : undefined,
-                                probabilities: reviveTokenProbabilities(probabilitiesList)
-                            }
+                            next: {}
                         };
+
+                        if (generateNext.token)
+                            output.next.token = token === -1
+                                ? null
+                                : (token ?? null);
+
+                        if (confidence != null)
+                            output.next.confidence = confidence;
+
+                        if (probabilities != null)
+                            output.next.probabilities = reviveTokenProbabilities(probabilities);
+
                         onTokenResult?.(tokenIndex, output);
 
                         return output;
@@ -1528,7 +1567,7 @@ export class LlamaContextSequence {
     }
 
     /** @internal */
-    private async *_evaluate(tokens: Token[], {
+    private async *_evaluate<const Metadata extends SequenceEvaluateMetadataOptions>(tokens: Token[], metadata: Metadata, {
         temperature,
         minP,
         topK,
@@ -1552,7 +1591,7 @@ export class LlamaContextSequence {
         yieldEogToken?: boolean,
         _noSampling?: boolean,
         _skipLock?: boolean
-    }): AsyncGenerator<Token, void, void | Token | Token[]> {
+    }): AsyncGenerator<SequenceEvaluateOutput<Metadata>, void, void | Token | Token[]> {
         this._ensureNotDisposed();
 
         let evalTokens = tokens;
@@ -1562,6 +1601,9 @@ export class LlamaContextSequence {
 
         await this._abortTokenPredictor(false, true);
 
+        const sampleProbabilities = metadata.probabilities === true;
+        const sampleConfidence = metadata.confidence === true;
+
         const sampler = new LlamaSampler(this.model);
         try {
             while (true) {
@@ -1570,6 +1612,7 @@ export class LlamaContextSequence {
                     ? undefined
                     : await acquireLock(this._lock, "evaluate");
                 let nextToken: Token | -1 | null | undefined;
+                const yieldRes: Partial<SequenceEvaluateOutput<{probabilities: true, confidence: true}>> = {};
 
                 try {
                     const logitsArray: (true | undefined)[] = [];
@@ -1604,12 +1647,32 @@ export class LlamaContextSequence {
                                     return null;
 
                                 sampler.applyConfig(samplerConfig);
-                                return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler);
+                                if (sampleProbabilities || sampleConfidence)
+                                    return this._context._ctx.sampleToken(
+                                        batchLogitIndex,
+                                        sampler._sampler,
+                                        sampleProbabilities,
+                                        sampleConfidence
+                                    );
+                                else
+                                    return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler);
                             });
                         }
                     );
 
-                    nextToken = decodeResult[evalTokens.length - 1];
+                    const lastDecodeResult = decodeResult[evalTokens.length - 1];
+
+                    if (lastDecodeResult instanceof Array) {
+                        const [token, probabilities, confidence] = lastDecodeResult;
+                        nextToken = token;
+
+                        if (probabilities != null)
+                            yieldRes.probabilities = reviveTokenProbabilities(probabilities);
+
+                        if (confidence != null)
+                            yieldRes.confidence = confidence;
+                    } else
+                        nextToken = lastDecodeResult;
 
                     if (nextToken === -1)
                         throw new Error("Failed to sample next token");
@@ -1624,7 +1687,9 @@ export class LlamaContextSequence {
                     evaluatorLock?.dispose();
                 }
 
-                const replacementToken = yield nextToken;
+                yieldRes.token = nextToken;
+
+                const replacementToken = yield yieldRes as SequenceEvaluateOutput<Metadata>;
 
                 // set the tokens for the next evaluation
                 if (replacementToken instanceof Array)
@@ -1640,7 +1705,7 @@ export class LlamaContextSequence {
     }
 
     /** @internal */
-    private async *_speculativeEvaluate(tokens: Token[], {
+    private async *_speculativeEvaluate<const Metadata extends SequenceEvaluateMetadataOptions>(tokens: Token[], metadata: Metadata, {
         temperature,
         minP,
         topK,
@@ -1659,7 +1724,7 @@ export class LlamaContextSequence {
         repeatPenalty?: LlamaContextSequenceRepeatPenalty, tokenBias?: TokenBias | (() => TokenBias),
         evaluationPriority?: EvaluationPriority, contextShiftOptions: Required<ContextShiftOptions>,
         yieldEogToken?: boolean, tokenPredictor: TokenPredictor
-    }): AsyncGenerator<Token, void, void | Token | Token[]> {
+    }): AsyncGenerator<SequenceEvaluateOutput<Metadata>, void, void | Token | Token[]> {
         this._ensureNotDisposed();
 
         let evalTokens = tokens.slice();
@@ -1671,6 +1736,9 @@ export class LlamaContextSequence {
         this._tokenPredictorOwner = tokenPredictorOwner;
         await this._abortTokenPredictor();
 
+        const sampleProbabilities = metadata.probabilities === true;
+        const sampleConfidence = metadata.confidence === true;
+
         let logitsArray: (true | undefined)[] = [];
         let logitsStartIndex = evalTokens.length - 1;
         const validatedTokens: [input: Token, output: Token][] = [];
@@ -1682,6 +1750,7 @@ export class LlamaContextSequence {
                 this._ensureNotDisposed();
                 const evaluatorLock = await acquireLock(this._lock, "evaluate");
                 let nextToken: Token | undefined;
+                const yieldRes: Partial<SequenceEvaluateOutput<{probabilities: true, confidence: true}>> = {};
 
                 try {
                     if (this._tokenPredictorOwner === tokenPredictorOwner &&
@@ -1689,7 +1758,16 @@ export class LlamaContextSequence {
                         evalTokens.length === 1 &&
                         evalTokens[0] === this._loadedTokenPredictions[0]?.[0]
                     ) {
-                        nextToken = this._loadedTokenPredictions.shift()![1];
+                        const [token, probabilities, confidence] = this._loadedTokenPredictions.shift()![1];
+                        nextToken = token;
+                        yieldRes.token = nextToken;
+
+                        if (probabilities != null)
+                            yieldRes.probabilities = reviveTokenProbabilities(probabilities);
+
+                        if (confidence != null)
+                            yieldRes.confidence = confidence;
+
                         const resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
                             ? grammarEvaluationState()
                             : grammarEvaluationState;
@@ -1799,13 +1877,24 @@ export class LlamaContextSequence {
                                         return null;
 
                                     sampler.applyConfig(samplerConfig);
-                                    return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler);
+                                    if (sampleProbabilities || sampleConfidence)
+                                        return this._context._ctx.sampleToken(
+                                            batchLogitIndex,
+                                            sampler._sampler,
+                                            sampleProbabilities,
+                                            sampleConfidence
+                                        );
+                                    else
+                                        return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler);
                                 });
                             }
                         );
 
                         for (let i = logitsStartIndex; i < evalTokens.length; i++) {
-                            const resultToken = decodeResult[i];
+                            const item = decodeResult[i];
+                            const [resultToken, probabilities, confidence] = item instanceof Array
+                                ? item
+                                : [item];
 
                             if (i === logitsStartIndex) {
                                 if (resultToken === -1)
@@ -1815,6 +1904,13 @@ export class LlamaContextSequence {
                                     return;
 
                                 nextToken = resultToken;
+                                yieldRes.token = nextToken;
+
+                                if (probabilities != null)
+                                    yieldRes.probabilities = reviveTokenProbabilities(probabilities);
+
+                                if (confidence != null)
+                                    yieldRes.confidence = confidence;
                             } else {
                                 if (resultToken === -1 || resultToken == null)
                                     break;
@@ -1823,7 +1919,7 @@ export class LlamaContextSequence {
                                     ? nextToken
                                     : validatedTokens.at(-1)?.[1];
                                 if (lastValidatedTokenOutput != null && lastValidatedTokenOutput === evalTokens[i]) {
-                                    this._loadedTokenPredictions.push([evalTokens[i]!, resultToken]);
+                                    this._loadedTokenPredictions.push([evalTokens[i]!, [resultToken, probabilities, confidence]]);
                                     this._validatedTokenPredictions++;
                                     this._unusedTokenPredictions++;
                                 } else {
@@ -1851,7 +1947,7 @@ export class LlamaContextSequence {
                     evaluatorLock.dispose();
                 }
 
-                const replacementToken = yield nextToken;
+                const replacementToken = yield yieldRes as SequenceEvaluateOutput<Metadata>;
 
                 // set the tokens for the next evaluation
                 if (replacementToken instanceof Array)
@@ -2135,34 +2231,19 @@ function getTokenBiasesForAddon(tokenBias: undefined | TokenBias | (() => TokenB
     };
 }
 
-function reviveTokenProbabilities(probabilitiesList?: (Token | number)[]) {
-    if (probabilitiesList == null)
+function reviveTokenProbabilities(probabilities?: (Token | number)[]) {
+    if (probabilities == null)
         return undefined;
 
     const res = new Map<Token, number>();
-    let maxLogit: number | undefined = undefined;
-
-    for (let i = 1; i < probabilitiesList.length; i += 2) {
-        const token = probabilitiesList[i - 1]! as Token;
-        const probability = probabilitiesList[i]! as number;
 
-        if (maxLogit == null || probability > maxLogit)
-            maxLogit = probability;
+    for (let i = 1; i < probabilities.length; i += 2) {
+        const token = probabilities[i - 1]! as Token;
+        const probability = probabilities[i]! as number;
 
         res.set(token, probability);
     }
 
-    if (maxLogit != null) {
-        for (const [token, logit] of res)
-            res.set(token, Math.exp(logit - maxLogit));
-
-        const sum = Array.from(res.values()).reduce((a, b) => a + b, 0);
-
-        // normalize the probabilities
-        for (const [token, logit] of res)
-            res.set(token, logit / sum);
-    }
-
     return res;
 }
 
diff --git a/src/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.ts b/src/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.ts
index 5c85d92c..d20e3522 100644
--- a/src/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.ts
+++ b/src/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.ts
@@ -3,13 +3,14 @@ import {Token} from "../../../types.js";
 import {LlamaGrammarEvaluationState} from "../../LlamaGrammarEvaluationState.js";
 import {pushAll} from "../../../utils/pushAll.js";
 import {getConsoleLogPrefix} from "../../../utils/getConsoleLogPrefix.js";
-import {SequenceEvaluateOptions} from "../types.js";
+import {SequenceEvaluateOptions, SequenceEvaluateOutput} from "../types.js";
 import {LlamaSampler} from "../LlamaSampler.js";
 import {LlamaContextSequence} from "../LlamaContext.js";
 import {TokenPredictor} from "../TokenPredictor.js";
 
 const defaultPredictionMinTokens = 0;
 const defaultPredictionMaxTokens = 16;
+const defaultPredictionMinConfidence = 0.6;
 
 /**
  * Predicts the next tokens by evaluating the current state of the target sequence
@@ -20,6 +21,7 @@ export class DraftSequenceTokenPredictor extends TokenPredictor {
     /** @internal */ private readonly _draftSequence: LlamaContextSequence;
     /** @internal */ private readonly _minTokens: number;
     /** @internal */ private readonly _maxTokens: number;
+    /** @internal */ private readonly _minConfidence?: number;
     /** @internal */ private _stateTokens: Token[] = [];
     /** @internal */ private _pendingEvalTokens: Token[] = [];
     /** @internal */ private _predictedTokens: Token[] = [];
@@ -32,7 +34,7 @@ export class DraftSequenceTokenPredictor extends TokenPredictor {
     /** @internal */ private _waitForPredictionExhaustion: boolean = false;
     /** @internal */ private _minTokensCallbacks: Array<() => void> = [];
     /** @internal */ private _resetPredictions: boolean = false;
-    /** @internal */ private _iterator?: AsyncGenerator<Token, void | Token>;
+    /** @internal */ private _iterator?: AsyncGenerator<SequenceEvaluateOutput<{readonly confidence: true}>, void | Token>;
     /** @internal */ private _active: boolean = false;
     /** @internal */ private _disposed: boolean = false;
 
@@ -56,7 +58,20 @@ export class DraftSequenceTokenPredictor extends TokenPredictor {
          *
          * You can override any of the options for the prediction here.
          */
-        evaluateOptions?: Pick<SequenceEvaluateOptions, "temperature" | "minP" | "topK" | "topP" | "seed" | "repeatPenalty" | "tokenBias" | "evaluationPriority" | "contextShift">
+        evaluateOptions?: Pick<SequenceEvaluateOptions, "temperature" | "minP" | "topK" | "topP" | "seed" | "repeatPenalty" | "tokenBias" | "evaluationPriority" | "contextShift">,
+
+        /**
+         * Minimum token confidence (probability of the token to be generated, assigned by the model) to consider the token as a prediction.
+         * When the generated token confidence is lower than this value, the prediction process will stop until all the predicted tokens
+         * are exhausted (either by a token that was not predicted being pushed, or all the generated predictions are consumed).
+         *
+         * A number between `0` and `1` representing the minimum probability of the token to be generated.
+         *
+         * Set to `0` to disable.
+         *
+         * Defaults to `0.6`.
+         */
+        minConfidence?: number
     } = {}) {
         super();
 
@@ -64,6 +79,7 @@ export class DraftSequenceTokenPredictor extends TokenPredictor {
         this._minTokens = Math.floor(Math.max(0, options?.minTokens ?? defaultPredictionMinTokens));
         this._maxTokens = Math.floor(Math.max(this._minTokens, options?.maxTokens ?? defaultPredictionMaxTokens));
         this._overrideEvaluateOptions = options.evaluateOptions ?? {};
+        this._minConfidence = Math.min(1, Math.max(0, options?.minConfidence ?? defaultPredictionMinConfidence));
 
         if (draftSequence.disposed)
             throw new Error("The draft sequence is disposed");
@@ -81,6 +97,10 @@ export class DraftSequenceTokenPredictor extends TokenPredictor {
         return this._maxTokens;
     }
 
+    public get minConfidence() {
+        return this._minConfidence;
+    }
+
     public async reset({targetSequence, stateTokens, evaluateOptions}: {
         targetSequence: LlamaContextSequence,
         stateTokens: Token[],
@@ -202,6 +222,10 @@ export class DraftSequenceTokenPredictor extends TokenPredictor {
         this._stopped = true;
         this._currentEvaluationAbortController.abort();
         this._currentEvaluationAbortController = new AbortController();
+
+        if (untilPredictionsExhausted)
+            this._waitForPredictionExhaustion = true;
+
         void withLock(this, "evaluate", async () => {
             this._iterator?.return();
             this._iterator = undefined;
@@ -254,7 +278,7 @@ export class DraftSequenceTokenPredictor extends TokenPredictor {
                 const createIterator = () => {
                     const tokens = this._pendingEvalTokens;
                     this._pendingEvalTokens = [];
-                    return this.draftSequence.evaluate(tokens, {
+                    return this.draftSequence.evaluateWithMetadata(tokens, {confidence: true}, {
                         ...this._evaluateOptions,
                         ...this._overrideEvaluateOptions,
                         grammarEvaluationState: this._getGrammarEvaluationStateWithTokens(tokens)
@@ -271,8 +295,20 @@ export class DraftSequenceTokenPredictor extends TokenPredictor {
                 this._iterator = iterator;
                 while (this._canIterate() && !abortSignal.aborted) {
                     const {value, done} = await iterator.next();
-                    if (value != null)
-                        this._predictedTokens.push(value);
+                    let shouldBreak = done;
+                    if (value != null) {
+                        const {token, confidence} = value;
+
+                        if (this._minConfidence != null && this._minConfidence !== 0 && this._minConfidence !== 1 &&
+                            confidence < this._minConfidence
+                        ) {
+                            this._iterator = undefined;
+                            await iterator.return();
+                            this._waitForPredictionExhaustion = true;
+                            shouldBreak = true;
+                        } else
+                            this._predictedTokens.push(token);
+                    }
 
                     if (this._resetPredictions && !abortSignal.aborted) {
                         await resetPredications();
@@ -286,7 +322,7 @@ export class DraftSequenceTokenPredictor extends TokenPredictor {
                             this._minTokensCallbacks.shift()?.();
                     }
 
-                    if (done) {
+                    if (shouldBreak) {
                         this._iterator = undefined;
                         await iterator.return();
                         this._waitForPredictionExhaustion = true;
diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts
index 5c78f7e2..78755247 100644
--- a/src/evaluator/LlamaContext/types.ts
+++ b/src/evaluator/LlamaContext/types.ts
@@ -1,3 +1,4 @@
+import {PickOptions} from "../../utils/utilTypes.js";
 import type {LlamaGrammarEvaluationState} from "../LlamaGrammarEvaluationState.js";
 import type {TokenBias} from "../TokenBias.js";
 import type {Token} from "../../types.js";
@@ -300,7 +301,11 @@ export type SequenceEvaluateOptions = {
      */
     evaluationPriority?: EvaluationPriority,
 
-    /** Override the sequence context shift options for this evaluation */
+    /**
+     * Override the sequence context shift options for this evaluation
+     *
+     * See {@link ContextShiftOptions} for more information.
+     */
     contextShift?: ContextShiftOptions,
 
     /**
@@ -314,23 +319,88 @@ export type SequenceEvaluateOptions = {
     _noSampling?: boolean
 };
 
+export type SequenceEvaluateMetadataOptions = {
+    /**
+     * Get the confidence (probability) of the selected token.
+     *
+     * Same as `probabilities.get(token)` from the output.
+     *
+     * If you need only this value, you can skip getting the full probabilities list to improve performance.
+     */
+    readonly confidence?: boolean,
+
+    /**
+     * Get the full probabilities list of tokens from the vocabulary to be the next token, after applying the given options.
+     *
+     * Only enable when needed, as it impacts the performance.
+     *
+     * Defaults to `false`.
+     */
+    readonly probabilities?: boolean
+};
+
+export type SequenceEvaluateOutput<
+    Options extends {
+        readonly confidence?: boolean,
+        readonly probabilities?: boolean
+    } = {
+        readonly confidence: true,
+        readonly probabilities: true
+    }
+> = PickOptions<{
+    /**
+     * The next token generated by the model and selected using the given options (such a temperature).
+     */
+    token: Token,
+
+    /**
+     * The confidence (probability) of the selected token.
+     *
+     * Same as `probabilities.get(token)`.
+     *
+     * If you need only this value, you can skip getting the full probabilities list to improve performance.
+     */
+    confidence: number,
+
+    /**
+     * The probabilities of the tokens from the vocabulary to be the next token.
+     *
+     * A probability is a number from `0` to `1`.
+     *
+     * The map is sorted by the probability of the tokens from the highest to the lowest,
+     * and is reflected in the order of the entries when iterating over the map.
+     * Use `.entries().next().value` to get the top probability pair
+     * ([learn more](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/entries)).
+     */
+    probabilities: Map<Token, number>
+}, Options & {token: true}>;
+
 export type ControlledEvaluateInputItem = Token | [token: Token, options: {
     generateNext?: {
         /**
          * Get the full probabilities list of tokens from the vocabulary to be the next token, after applying the given options.
          *
-         * Only enable when needed, since it impacts the performance.
+         * Only enable when needed, as it impacts the performance.
          *
          * Defaults to `false`.
          */
-        probabilitiesList?: boolean,
+        probabilities?: boolean,
+
+        /**
+         * Get the confidence (probability) of the selected token.
+         *
+         * Same as `next.probabilities.get(next.token)` from the output.
+         *
+         * If you need only this value, you can skip getting the full probabilities list to improve performance.
+         */
+        confidence?: boolean,
 
         /**
          * Generate the next token with the provided options using sampling.
          *
          * Setting this to `true` will generate probabilities for the next token and sample it.
          */
-        singleToken?: boolean,
+        token?: boolean,
 
         options?: {
             temperature?: number, minP?: number, topK?: number, topP?: number,
@@ -361,6 +431,15 @@ export type ControlledEvaluateIndexOutput = {
     next: {
         token?: Token | null,
 
+        /**
+         * The confidence (probability) of the selected token (the `token` field in this object).
+         *
+         * Same as `next.probabilities.get(next.token)`.
+         *
+         * If you need only this value, you can skip getting the full probabilities list to improve performance.
+         */
+        confidence?: number,
+
         /**
          * The probabilities of the tokens from the vocabulary to be the next token.
          *
diff --git a/src/evaluator/utils/chunkDocument.ts b/src/evaluator/utils/chunkDocument.ts
index 0d40be7a..ac39fa33 100644
--- a/src/evaluator/utils/chunkDocument.ts
+++ b/src/evaluator/utils/chunkDocument.ts
@@ -146,7 +146,7 @@ export async function experimentalChunkDocument(options: {
         const token = documentTokens[i]!;
         evaluateInput.push([token, {
             generateNext: {
-                probabilitiesList: true
+                probabilities: true
             }
         }]);
     }
diff --git a/src/index.ts b/src/index.ts
index 04f97ce8..59c6ef49 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -18,8 +18,8 @@ import {LlamaEmbedding, type LlamaEmbeddingOptions, type LlamaEmbeddingJSON} fro
 import {
     type LlamaContextOptions, type SequenceEvaluateOptions, type BatchingOptions, type LlamaContextSequenceRepeatPenalty,
     type CustomBatchingDispatchSchedule, type CustomBatchingPrioritizationStrategy, type BatchItem, type PrioritizedBatchItem,
-    type ContextShiftOptions, type ContextTokensDeleteRange, type EvaluationPriority, type ControlledEvaluateInputItem,
-    type ControlledEvaluateIndexOutput
+    type ContextShiftOptions, type ContextTokensDeleteRange, type EvaluationPriority, type SequenceEvaluateMetadataOptions,
+    type SequenceEvaluateOutput, type ControlledEvaluateInputItem, type ControlledEvaluateIndexOutput
 } from "./evaluator/LlamaContext/types.js";
 import {TokenBias} from "./evaluator/TokenBias.js";
 import {
@@ -144,6 +144,8 @@ export {
     type ContextShiftOptions,
     type ContextTokensDeleteRange,
     type EvaluationPriority,
+    type SequenceEvaluateMetadataOptions,
+    type SequenceEvaluateOutput,
     type LlamaContextSequenceRepeatPenalty,
     type ControlledEvaluateInputItem,
     type ControlledEvaluateIndexOutput,
diff --git a/src/utils/utilTypes.ts b/src/utils/utilTypes.ts
index 2bf7bdeb..b7b96191 100644
--- a/src/utils/utilTypes.ts
+++ b/src/utils/utilTypes.ts
@@ -1,3 +1,14 @@
 export type Writable<T> = {
     -readonly [P in keyof T]: T[P];
 };
+
+export type PickOptions<
+    Value extends Readonly<Record<string, any>>,
+    Options extends {readonly [key: string]: boolean | undefined}
+> = Pick<Value, {
+    [Key in keyof Value]: Key extends keyof Options
+        ? Options[Key] extends true
+            ? Key
+            : never
+        : never
+}[keyof Value]>;
diff --git a/test/modelDependent/llama3.1/controlledEvaluate.test.ts b/test/modelDependent/llama3.1/controlledEvaluate.test.ts
new file mode 100644
index 00000000..2f1202eb
--- /dev/null
+++ b/test/modelDependent/llama3.1/controlledEvaluate.test.ts
@@ -0,0 +1,163 @@
+import {describe, expect, test} from "vitest";
+import {Token, ControlledEvaluateInputItem} from "../../../src/index.js";
+import {getModelFile} from "../../utils/modelFiles.js";
+import {getTestLlama} from "../../utils/getTestLlama.js";
+
+describe("llama 3.1", () => {
+    describe("controlled evaluate", () => {
+        test("get probabilities for 3 tokens", {timeout: 1000 * 60 * 60 * 2}, async () => {
+            const modelPath = await getModelFile("Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf");
+            const llama = await getTestLlama();
+
+            const model = await llama.loadModel({
+                modelPath
+            });
+            const context = await model.createContext({
+                contextSize: 512
+            });
+            const sequence = context.getSequence();
+
+            const text = "The quick brown fox jumps over the lazy dog, but! the lazy dog is too lazy to care. " +
+                "The reason for this is that the lazy dog is too lazy to care about the quick brown fox.";
+
+            const inputTokens: ControlledEvaluateInputItem[] = model.tokenize(text);
+            expect(inputTokens.length).to.be.greaterThan(8);
+
+            inputTokens[2] = [inputTokens[2] as Token, {
+                generateNext: {
+                    token: true
+                }
+            }];
+            inputTokens[3] = [inputTokens[3] as Token, {
+                generateNext: {
+                    probabilities: true
+                }
+            }];
+            inputTokens[4] = [inputTokens[4] as Token, {
+                generateNext: {
+                    token: true,
+                    probabilities: true
+                }
+            }];
+
+            inputTokens[5] = [inputTokens[5] as Token, {
+                generateNext: {
+                    token: true,
+                    confidence: true
+                }
+            }];
+            inputTokens[6] = [inputTokens[6] as Token, {
+                generateNext: {
+                    probabilities: true,
+                    confidence: true
+                }
+            }];
+            inputTokens[7] = [inputTokens[7] as Token, {
+                generateNext: {
+                    token: true,
+                    probabilities: true,
+                    confidence: true
+                }
+            }];
+
+            const res = await sequence.controlledEvaluate(inputTokens);
+
+            const simplifiedRes = res.map((item) => {
+                if (item == null || item.next == null)
+                    return item;
+
+                // only keep the top 10 probabilities to not clutter the snapshot
+                if (item.next?.probabilities != null)
+                    item.next.probabilities = new Map([...item.next.probabilities.entries()].slice(0, 10));
+
+                return item;
+            });
+
+            expect(simplifiedRes).toMatchInlineSnapshot(`
+              [
+                ,
+                ,
+                {
+                  "next": {
+                    "token": 39935,
+                  },
+                },
+                {
+                  "next": {
+                    "probabilities": Map {
+                      35308 => 0.5214946269989014,
+                      27096 => 0.24320587515830994,
+                      11 => 0.022182414308190346,
+                      198 => 0.011944590136408806,
+                      374 => 0.008361410349607468,
+                      863 => 0.008360812440514565,
+                      1131 => 0.006834662053734064,
+                      25 => 0.006243313197046518,
+                      7940 => 0.00540389958769083,
+                      1 => 0.005168775096535683,
+                    },
+                  },
+                },
+                {
+                  "next": {
+                    "probabilities": Map {
+                      927 => 0.9811904430389404,
+                      198 => 0.0033848676830530167,
+                      6288 => 0.0032705331686884165,
+                      279 => 0.0006552835111506283,
+                      1633 => 0.00031841936288401484,
+                      1035 => 0.0003114044084213674,
+                      13 => 0.0002916732046287507,
+                      264 => 0.00028948261751793325,
+                      297 => 0.0002833220351021737,
+                      720 => 0.00024898265837691724,
+                    },
+                    "token": 927,
+                  },
+                },
+                {
+                  "next": {
+                    "confidence": 0.9306728839874268,
+                    "token": 279,
+                  },
+                },
+                {
+                  "next": {
+                    "confidence": 0.9597684741020203,
+                    "probabilities": Map {
+                      16053 => 0.9597684741020203,
+                      1208 => 0.004750591237097979,
+                      198 => 0.0031827085185796022,
+                      5679 => 0.0029162338469177485,
+                      65536 => 0.00197240780107677,
+                      6435 => 0.0009124248754233122,
+                      2697 => 0.0006706250132992864,
+                      720 => 0.0005979162524454296,
+                      21811 => 0.0005516768433153629,
+                      45363 => 0.0005495203076861799,
+                    },
+                  },
+                },
+                {
+                  "next": {
+                    "confidence": 0.9871460199356079,
+                    "probabilities": Map {
+                      5679 => 0.9871460199356079,
+                      21811 => 0.001438674982637167,
+                      198 => 0.0009368227329105139,
+                      8415 => 0.0007225279696285725,
+                      12875 => 0.00038032486918382347,
+                      4194 => 0.00034695648355409503,
+                      720 => 0.00028149448917247355,
+                      14588 => 0.00027612835401669145,
+                      9522 => 0.00024171460245270282,
+                      627 => 0.0002042166597675532,
+                    },
+                    "token": 5679,
+                  },
+                },
+              ]
+            `);
+        });
+    });
+});
diff --git a/test/modelDependent/llama3.1/evaluateWithMetadata.test.ts b/test/modelDependent/llama3.1/evaluateWithMetadata.test.ts
new file mode 100644
index 00000000..0c368053
--- /dev/null
+++ b/test/modelDependent/llama3.1/evaluateWithMetadata.test.ts
@@ -0,0 +1,553 @@
+import {describe, expect, test} from "vitest";
+import {Token} from "../../../src/index.js";
+import {getModelFile} from "../../utils/modelFiles.js";
+import {getTestLlama} from "../../utils/getTestLlama.js";
+import {SequenceEvaluateOutput} from "../../../src/evaluator/LlamaContext/types.js";
+
+describe("llama 3.1", () => {
+    describe("evaluate with metadata", () => {
+        const text = "The quick brown fox jumps over the lazy dog, but! the lazy dog is too lazy to care. " +
+            "The reason for this is that the lazy dog is too lazy to care about the quick brown fox.";
+
+        test("no options", {timeout: 1000 * 60 * 60 * 2}, async () => {
+            const modelPath = await getModelFile("Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf");
+            const llama = await getTestLlama();
+
+            const model = await llama.loadModel({
+                modelPath
+            });
+            const context = await model.createContext({
+                contextSize: 512
+            });
+            const sequence = context.getSequence();
+
+            const inputTokens = model.tokenize(text);
+            const maxTokens = 10;
+            const res: SequenceEvaluateOutput<{}>[] = [];
+            for await (const output of sequence.evaluateWithMetadata(inputTokens, {})) {
+                res.push(output);
+
+                if (res.length >= maxTokens)
+                    break;
+            }
+
+            expect(res).toMatchInlineSnapshot(`
+              [
+                {
+                  "token": 578,
+                },
+                {
+                  "token": 16053,
+                },
+                {
+                  "token": 5679,
+                },
+                {
+                  "token": 374,
+                },
+                {
+                  "token": 2288,
+                },
+                {
+                  "token": 16053,
+                },
+                {
+                  "token": 311,
+                },
+                {
+                  "token": 2512,
+                },
+                {
+                  "token": 922,
+                },
+                {
+                  "token": 279,
+                },
+              ]
+            `);
+        });
+
+        test("with probabilities", {timeout: 1000 * 60 * 60 * 2}, async () => {
+            const modelPath = await getModelFile("Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf");
+            const llama = await getTestLlama();
+
+            const model = await llama.loadModel({
+                modelPath
+            });
+            const context = await model.createContext({
+                contextSize: 512
+            });
+            const sequence = context.getSequence();
+
+            const inputTokens = model.tokenize(text);
+            const maxTokens = 10;
+            const res: SequenceEvaluateOutput<{readonly probabilities: true}>[] = [];
+            for await (const output of sequence.evaluateWithMetadata(inputTokens, {probabilities: true})) {
+                res.push(output);
+
+                if (res.length >= maxTokens)
+                    break;
+            }
+
+            simplifyRes(res);
+            expect(res).toMatchInlineSnapshot(`
+              [
+                {
+                  "probabilities": Map {
+                    578 => 0.43072959780693054,
+                    1115 => 0.13043756783008575,
+                    1102 => 0.05168525502085686,
+                    763 => 0.04288897663354874,
+                    1283 => 0.029397012665867805,
+                    2100 => 0.029378682374954224,
+                    15636 => 0.026268385350704193,
+                    2030 => 0.02184896357357502,
+                    320 => 0.01690298318862915,
+                    1628 => 0.011869494803249836,
+                  },
+                  "token": 578,
+                },
+                {
+                  "probabilities": Map {
+                    16053 => 0.42230042815208435,
+                    4062 => 0.30363598465919495,
+                    39935 => 0.060395680367946625,
+                    2944 => 0.0373028889298439,
+                    5679 => 0.023811226710677147,
+                    11914 => 0.016298197209835052,
+                    2144 => 0.014683431014418602,
+                    1121 => 0.006984429899603128,
+                    17571 => 0.005794311873614788,
+                    3446 => 0.004934381693601608,
+                  },
+                  "token": 16053,
+                },
+                {
+                  "probabilities": Map {
+                    5679 => 0.9981182813644409,
+                    12875 => 0.00015924211766105145,
+                    18964 => 0.00011538491526152939,
+                    39935 => 0.00011497695231810212,
+                    13 => 0.00010490677959751338,
+                    627 => 0.00009277161007048562,
+                    656 => 0.00006256866618059576,
+                    893 => 0.00005633986438624561,
+                    198 => 0.00005223735934123397,
+                    374 => 0.00005191291347728111,
+                  },
+                  "token": 5679,
+                },
+                {
+                  "probabilities": Map {
+                    374 => 0.8126624226570129,
+                    1587 => 0.04815267026424408,
+                    596 => 0.024733318015933037,
+                    1120 => 0.022302960976958275,
+                    3250 => 0.02154506742954254,
+                    706 => 0.01618366874754429,
+                    15849 => 0.00869414210319519,
+                    1053 => 0.005911793559789658,
+                    55064 => 0.0037806404288858175,
+                    11 => 0.0036655946169048548,
+                  },
+                  "token": 374,
+                },
+                {
+                  "probabilities": Map {
+                    2288 => 0.2757589817047119,
+                    1120 => 0.16664838790893555,
+                    539 => 0.15775153040885925,
+                    779 => 0.1334378868341446,
+                    264 => 0.05585397779941559,
+                    1101 => 0.029216185212135315,
+                    16053 => 0.017680438235402107,
+                    5042 => 0.015862826257944107,
+                    1193 => 0.014583030715584755,
+                    2744 => 0.014090186916291714,
+                  },
+                  "token": 2288,
+                },
+                {
+                  "probabilities": Map {
+                    16053 => 0.9066131114959717,
+                    13326 => 0.06362388283014297,
+                    19781 => 0.00715874508023262,
+                    17551 => 0.0020243648905307055,
+                    10968 => 0.0012676455080509186,
+                    11920 => 0.0011003530817106366,
+                    6435 => 0.0010086935944855213,
+                    34386 => 0.0007757442072033882,
+                    1208 => 0.0006099422462284565,
+                    25366 => 0.0005670536775141954,
+                  },
+                  "token": 16053,
+                },
+                {
+                  "probabilities": Map {
+                    311 => 0.9882798194885254,
+                    1524 => 0.006186165846884251,
+                    11 => 0.0025764326564967632,
+                    323 => 0.0005241178441792727,
+                    13 => 0.0003534558054525405,
+                    627 => 0.0003210459544789046,
+                    1606 => 0.00026416024775244296,
+                    2288 => 0.00025828619254752994,
+                    369 => 0.00012467413034755737,
+                    320 => 0.00010217254020972177,
+                  },
+                  "token": 311,
+                },
+                {
+                  "probabilities": Map {
+                    2512 => 0.7492305040359497,
+                    1524 => 0.09894875437021255,
+                    656 => 0.032405685633420944,
+                    636 => 0.024060513824224472,
+                    7940 => 0.014415031298995018,
+                    33586 => 0.010868093930184841,
+                    387 => 0.008681590668857098,
+                    1781 => 0.005856928415596485,
+                    1629 => 0.005487737245857716,
+                    3351 => 0.005111564416438341,
+                  },
+                  "token": 2512,
+                },
+                {
+                  "probabilities": Map {
+                    922 => 0.9521902799606323,
+                    1606 => 0.01500858273357153,
+                    11 => 0.014008254744112492,
+                    430 => 0.002968641696497798,
+                    627 => 0.002314596902579069,
+                    13 => 0.0018861450953409076,
+                    1524 => 0.0018014858942478895,
+                    369 => 0.001769029418937862,
+                    323 => 0.0009245016262866557,
+                    382 => 0.0008477734518237412,
+                  },
+                  "token": 922,
+                },
+                {
+                  "probabilities": Map {
+                    279 => 0.6508099436759949,
+                    4205 => 0.31288382411003113,
+                    1148 => 0.011366183869540691,
+                    1690 => 0.004424854647368193,
+                    904 => 0.0030378159135580063,
+                    1202 => 0.0026804644148796797,
+                    264 => 0.0011171377263963223,
+                    1790 => 0.0010860266629606485,
+                    813 => 0.0010579257505014539,
+                    1524 => 0.0007699796697124839,
+                  },
+                  "token": 279,
+                },
+              ]
+            `);
+        });
+
+        test("with confidence", {timeout: 1000 * 60 * 60 * 2}, async () => {
+            const modelPath = await getModelFile("Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf");
+            const llama = await getTestLlama();
+
+            const model = await llama.loadModel({
+                modelPath
+            });
+            const context = await model.createContext({
+                contextSize: 512
+            });
+            const sequence = context.getSequence();
+
+            const inputTokens = model.tokenize(text);
+            const maxTokens = 10;
+            const res: SequenceEvaluateOutput<{readonly confidence: true}>[] = [];
+            for await (const output of sequence.evaluateWithMetadata(inputTokens, {confidence: true})) {
+                res.push(output);
+
+                if (res.length >= maxTokens)
+                    break;
+            }
+
+            expect(res).toMatchInlineSnapshot(`
+              [
+                {
+                  "confidence": 0.43072959780693054,
+                  "token": 578,
+                },
+                {
+                  "confidence": 0.42230042815208435,
+                  "token": 16053,
+                },
+                {
+                  "confidence": 0.9981182813644409,
+                  "token": 5679,
+                },
+                {
+                  "confidence": 0.8126624226570129,
+                  "token": 374,
+                },
+                {
+                  "confidence": 0.2757589817047119,
+                  "token": 2288,
+                },
+                {
+                  "confidence": 0.9066131114959717,
+                  "token": 16053,
+                },
+                {
+                  "confidence": 0.9882798194885254,
+                  "token": 311,
+                },
+                {
+                  "confidence": 0.7492305040359497,
+                  "token": 2512,
+                },
+                {
+                  "confidence": 0.9521902799606323,
+                  "token": 922,
+                },
+                {
+                  "confidence": 0.6508099436759949,
+                  "token": 279,
+                },
+              ]
+            `);
+        });
+
+        test("with probabilities and confidence", {timeout: 1000 * 60 * 60 * 2}, async () => {
+            const modelPath = await getModelFile("Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf");
+            const llama = await getTestLlama();
+
+            const model = await llama.loadModel({
+                modelPath
+            });
+            const context = await model.createContext({
+                contextSize: 512
+            });
+            const sequence = context.getSequence();
+
+            const inputTokens = model.tokenize(text);
+            const maxTokens = 10;
+            const res: SequenceEvaluateOutput<{readonly probabilities: true, readonly confidence: true}>[] = [];
+            for await (const output of sequence.evaluateWithMetadata(inputTokens, {probabilities: true, confidence: true})) {
+                res.push(output);
+
+                if (res.length >= maxTokens)
+                    break;
+            }
+
+            simplifyRes(res);
+            expect(res).toMatchInlineSnapshot(`
+              [
+                {
+                  "confidence": 0.43072959780693054,
+                  "probabilities": Map {
+                    578 => 0.43072959780693054,
+                    1115 => 0.13043756783008575,
+                    1102 => 0.05168525502085686,
+                    763 => 0.04288897663354874,
+                    1283 => 0.029397012665867805,
+                    2100 => 0.029378682374954224,
+                    15636 => 0.026268385350704193,
+                    2030 => 0.02184896357357502,
+                    320 => 0.01690298318862915,
+                    1628 => 0.011869494803249836,
+                  },
+                  "token": 578,
+                },
+                {
+                  "confidence": 0.42230042815208435,
+                  "probabilities": Map {
+                    16053 => 0.42230042815208435,
+                    4062 => 0.30363598465919495,
+                    39935 => 0.060395680367946625,
+                    2944 => 0.0373028889298439,
+                    5679 => 0.023811226710677147,
+                    11914 => 0.016298197209835052,
+                    2144 => 0.014683431014418602,
+                    1121 => 0.006984429899603128,
+                    17571 => 0.005794311873614788,
+                    3446 => 0.004934381693601608,
+                  },
+                  "token": 16053,
+                },
+                {
+                  "confidence": 0.9981182813644409,
+                  "probabilities": Map {
+                    5679 => 0.9981182813644409,
+                    12875 => 0.00015924211766105145,
+                    18964 => 0.00011538491526152939,
+                    39935 => 0.00011497695231810212,
+                    13 => 0.00010490677959751338,
+                    627 => 0.00009277161007048562,
+                    656 => 0.00006256866618059576,
+                    893 => 0.00005633986438624561,
+                    198 => 0.00005223735934123397,
+                    374 => 0.00005191291347728111,
+                  },
+                  "token": 5679,
+                },
+                {
+                  "confidence": 0.8126624226570129,
+                  "probabilities": Map {
+                    374 => 0.8126624226570129,
+                    1587 => 0.04815267026424408,
+                    596 => 0.024733318015933037,
+                    1120 => 0.022302960976958275,
+                    3250 => 0.02154506742954254,
+                    706 => 0.01618366874754429,
+                    15849 => 0.00869414210319519,
+                    1053 => 0.005911793559789658,
+                    55064 => 0.0037806404288858175,
+                    11 => 0.0036655946169048548,
+                  },
+                  "token": 374,
+                },
+                {
+                  "confidence": 0.2757589817047119,
+                  "probabilities": Map {
+                    2288 => 0.2757589817047119,
+                    1120 => 0.16664838790893555,
+                    539 => 0.15775153040885925,
+                    779 => 0.1334378868341446,
+                    264 => 0.05585397779941559,
+                    1101 => 0.029216185212135315,
+                    16053 => 0.017680438235402107,
+                    5042 => 0.015862826257944107,
+                    1193 => 0.014583030715584755,
+                    2744 => 0.014090186916291714,
+                  },
+                  "token": 2288,
+                },
+                {
+                  "confidence": 0.9066131114959717,
+                  "probabilities": Map {
+                    16053 => 0.9066131114959717,
+                    13326 => 0.06362388283014297,
+                    19781 => 0.00715874508023262,
+                    17551 => 0.0020243648905307055,
+                    10968 => 0.0012676455080509186,
+                    11920 => 0.0011003530817106366,
+                    6435 => 0.0010086935944855213,
+                    34386 => 0.0007757442072033882,
+                    1208 => 0.0006099422462284565,
+                    25366 => 0.0005670536775141954,
+                  },
+                  "token": 16053,
+                },
+                {
+                  "confidence": 0.9882798194885254,
+                  "probabilities": Map {
+                    311 => 0.9882798194885254,
+                    1524 => 0.006186165846884251,
+                    11 => 0.0025764326564967632,
+                    323 => 0.0005241178441792727,
+                    13 => 0.0003534558054525405,
+                    627 => 0.0003210459544789046,
+                    1606 => 0.00026416024775244296,
+                    2288 => 0.00025828619254752994,
+                    369 => 0.00012467413034755737,
+                    320 => 0.00010217254020972177,
+                  },
+                  "token": 311,
+                },
+                {
+                  "confidence": 0.7492305040359497,
+                  "probabilities": Map {
+                    2512 => 0.7492305040359497,
+                    1524 => 0.09894875437021255,
+                    656 => 0.032405685633420944,
+                    636 => 0.024060513824224472,
+                    7940 => 0.014415031298995018,
+                    33586 => 0.010868093930184841,
+                    387 => 0.008681590668857098,
+                    1781 => 0.005856928415596485,
+                    1629 => 0.005487737245857716,
+                    3351 => 0.005111564416438341,
+                  },
+                  "token": 2512,
+                },
+                {
+                  "confidence": 0.9521902799606323,
+                  "probabilities": Map {
+                    922 => 0.9521902799606323,
+                    1606 => 0.01500858273357153,
+                    11 => 0.014008254744112492,
+                    430 => 0.002968641696497798,
+                    627 => 0.002314596902579069,
+                    13 => 0.0018861450953409076,
+                    1524 => 0.0018014858942478895,
+                    369 => 0.001769029418937862,
+                    323 => 0.0009245016262866557,
+                    382 => 0.0008477734518237412,
+                  },
+                  "token": 922,
+                },
+                {
+                  "confidence": 0.6508099436759949,
+                  "probabilities": Map {
+                    279 => 0.6508099436759949,
+                    4205 => 0.31288382411003113,
+                    1148 => 0.011366183869540691,
+                    1690 => 0.004424854647368193,
+                    904 => 0.0030378159135580063,
+                    1202 => 0.0026804644148796797,
+                    264 => 0.0011171377263963223,
+                    1790 => 0.0010860266629606485,
+                    813 => 0.0010579257505014539,
+                    1524 => 0.0007699796697124839,
+                  },
+                  "token": 279,
+                },
+              ]
+            `);
+        });
+
+        test("confidence alone matches probability alone", {timeout: 1000 * 60 * 60 * 2}, async () => {
+            const modelPath = await getModelFile("Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf");
+            const llama = await getTestLlama();
+
+            const model = await llama.loadModel({
+                modelPath
+            });
+            const context = await model.createContext({
+                contextSize: 512
+            });
+            const sequence = context.getSequence();
+
+            const inputTokens = model.tokenize(text);
+            const maxTokens = 10;
+
+            const probabilityRes: [token: Token, probability: number][] = [];
+            for await (const output of sequence.evaluateWithMetadata(inputTokens, {probabilities: true})) {
+                const tokenProbability = output.probabilities.get(output.token);
+                if (tokenProbability == null)
+                    throw new Error("Token probability not found");
+
+                probabilityRes.push([output.token, tokenProbability]);
+
+                if (probabilityRes.length >= maxTokens)
+                    break;
+            }
+
+            await sequence.clearHistory();
+
+            const confidenceRes: [token: Token, probability: number][] = [];
+            for await (const output of sequence.evaluateWithMetadata(inputTokens, {confidence: true})) {
+                confidenceRes.push([output.token, output.confidence]);
+
+                if (confidenceRes.length >= maxTokens)
+                    break;
+            }
+
+            expect(probabilityRes).toEqual(confidenceRes);
+        });
+    });
+});
+
+function simplifyRes<T extends SequenceEvaluateOutput<{readonly probabilities: true}>>(res: T[]) {
+    for (const item of res) {
+        if (item.probabilities != null)
+            item.probabilities = new Map([...item.probabilities.entries()].slice(0, 10));
+    }
+}
diff --git a/test/modelDependent/llama3.1/tokenPredictor.test.ts b/test/modelDependent/llama3.1/tokenPredictor.test.ts
index a549f39b..181f7bde 100644
--- a/test/modelDependent/llama3.1/tokenPredictor.test.ts
+++ b/test/modelDependent/llama3.1/tokenPredictor.test.ts
@@ -20,7 +20,8 @@ describe("llama 3.1", () => {
             const draftSequence = context.getSequence();
             const predictor = new DraftSequenceTokenPredictor(draftSequence, {
                 minTokens: 2,
-                maxTokens: 2
+                maxTokens: 2,
+                minConfidence: 0.2
             });
 
             const mainSequence = context.getSequence();
@@ -44,19 +45,19 @@ describe("llama 3.1", () => {
               ]
             `);
 
-            const textTokens = model.tokenize("! The");
+            const textTokens = model.tokenize("! How are");
             predictor.pushTokens(textTokens);
 
             const predictedTokens2 = await predictor.predictTokens();
             expect(predictedTokens2.map((token) => model.detokenize([token], true))).toMatchInlineSnapshot(`
               [
-                " weather",
-                " in",
+                " you",
+                " today",
               ]
             `);
 
 
-            await chatSession.preloadPrompt("What");
+            await chatSession.preloadPrompt("What ");
 
             await predictor.reset({
                 targetSequence: mainSequence,
@@ -82,6 +83,22 @@ describe("llama 3.1", () => {
                 " to",
               ]
             `);
+
+
+            await chatSession.preloadPrompt("If all");
+
+            await predictor.reset({
+                targetSequence: mainSequence,
+                stateTokens: mainSequence.contextTokens,
+                evaluateOptions: {}
+            });
+
+            const text3Tokens = model.tokenize("exquisite");
+            predictor.pushTokens(text3Tokens);
+
+            // no prediction with the given minimum confidence
+            const predictedTokens5 = await predictor.predictTokens();
+            expect(predictedTokens5.map((token) => model.detokenize([token], true))).to.eql([]);
         });
 
         describe("InputLookupTokenPredictor", () => {

From 72645e7645dcd3bb54507f3412194d3b43633cc7 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sun, 5 Jan 2025 01:52:39 +0200
Subject: [PATCH 63/73] docs: document `useMmap`

---
 src/evaluator/LlamaModel/LlamaModel.ts | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/evaluator/LlamaModel/LlamaModel.ts b/src/evaluator/LlamaModel/LlamaModel.ts
index 6e8fac18..08e4a9cd 100644
--- a/src/evaluator/LlamaModel/LlamaModel.ts
+++ b/src/evaluator/LlamaModel/LlamaModel.ts
@@ -63,9 +63,15 @@ export type LlamaModelOptions = {
     vocabOnly?: boolean,
 
     /**
-     * Use mmap if possible.
+     * Use mmap (memory-mapped file) to load the model.
      *
-     * Defaults to `true`.
+     * Using mmap allows the OS to load the model tensors directly from the file on the filesystem,
+     * and makes it easier for the system to manage memory.
+     *
+     * When using mmap, you might notice a delay the first time you actually use the model,
+     * which is caused by the OS itself loading the model into memory.
+     *
+     * Defaults to `true` if the current system supports it.
      */
     useMmap?: boolean,
 

From 854d902637459df815e3593626148da5232be14a Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sun, 5 Jan 2025 01:52:56 +0200
Subject: [PATCH 64/73] fix: add missing include

---
 llama/addon/AddonContext.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp
index 3fd2af2b..b65236c3 100644
--- a/llama/addon/AddonContext.cpp
+++ b/llama/addon/AddonContext.cpp
@@ -1,5 +1,6 @@
 #include <thread>
 #include <algorithm>
+#include <cmath>
 #include "common/common.h"
 #include "llama-grammar.h"
 #include "llama.h"

From e65b8394ceffe7c10a853f4dbecc58dcf66e50e1 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sun, 5 Jan 2025 02:01:21 +0200
Subject: [PATCH 65/73] docs: improve examples

---
 docs/guide/external-chat-state.md | 4 +++-
 docs/guide/token-prediction.md    | 5 ++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/docs/guide/external-chat-state.md b/docs/guide/external-chat-state.md
index 47c9deb3..cab29046 100644
--- a/docs/guide/external-chat-state.md
+++ b/docs/guide/external-chat-state.md
@@ -38,7 +38,9 @@ const llamaChat = new LlamaChat({
     contextSequence: context.getSequence()
 });
 
-let chatHistory = llamaChat.chatWrapper.generateInitialChatHistory();
+let chatHistory = llamaChat.chatWrapper.generateInitialChatHistory({
+    // systemPrompt: "You're a helpful assistant"
+});
 
 const prompt = "Hi there, how are you?";
 
diff --git a/docs/guide/token-prediction.md b/docs/guide/token-prediction.md
index 41b19100..99913fb8 100644
--- a/docs/guide/token-prediction.md
+++ b/docs/guide/token-prediction.md
@@ -119,7 +119,10 @@ const contextSequence = context.getSequence({
     tokenPredictor: new DraftSequenceTokenPredictor(draftContextSequence, {
         // try to change this value to `1` or more
         // and see the difference in response times
-        minTokens: 0
+        minTokens: 0,
+        
+        // the minimum probability of a toke prediction to be considered
+        minConfidence: 0.6
     })
 });
 

From d53a07e46810b9b7b523ad170743788485e2e98a Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Sun, 5 Jan 2025 02:18:56 +0200
Subject: [PATCH 66/73] test: fix tests

---
 .../llama3.1/controlledEvaluate.test.ts       |  95 ++--
 .../llama3.1/evaluateWithMetadata.test.ts     | 456 +++++++++---------
 2 files changed, 283 insertions(+), 268 deletions(-)

diff --git a/test/modelDependent/llama3.1/controlledEvaluate.test.ts b/test/modelDependent/llama3.1/controlledEvaluate.test.ts
index 2f1202eb..85a71811 100644
--- a/test/modelDependent/llama3.1/controlledEvaluate.test.ts
+++ b/test/modelDependent/llama3.1/controlledEvaluate.test.ts
@@ -68,7 +68,14 @@ describe("llama 3.1", () => {
 
                 // only keep the top 10 probabilities to not clutter the snapshot
                 if (item.next?.probabilities != null)
-                    item.next.probabilities = new Map([...item.next.probabilities.entries()].slice(0, 10));
+                    item.next.probabilities = new Map(
+                        [...item.next.probabilities.entries()]
+                            .slice(0, 10)
+                            .map(([token, probability]) => [token, parseFloat(probability.toFixed(7))])
+                    );
+
+                if (item.next?.confidence != null)
+                    item.next.confidence = parseFloat(item.next.confidence.toFixed(7));
 
                 return item;
             });
@@ -85,73 +92,73 @@ describe("llama 3.1", () => {
                 {
                   "next": {
                     "probabilities": Map {
-                      35308 => 0.5214946269989014,
-                      27096 => 0.24320587515830994,
-                      11 => 0.022182414308190346,
-                      198 => 0.011944590136408806,
-                      374 => 0.008361410349607468,
-                      863 => 0.008360812440514565,
-                      1131 => 0.006834662053734064,
-                      25 => 0.006243313197046518,
-                      7940 => 0.00540389958769083,
-                      1 => 0.005168775096535683,
+                      35308 => 0.5214946,
+                      27096 => 0.2432059,
+                      11 => 0.0221824,
+                      198 => 0.0119446,
+                      374 => 0.0083614,
+                      863 => 0.0083608,
+                      1131 => 0.0068347,
+                      25 => 0.0062433,
+                      7940 => 0.0054039,
+                      1 => 0.0051688,
                     },
                   },
                 },
                 {
                   "next": {
                     "probabilities": Map {
-                      927 => 0.9811904430389404,
-                      198 => 0.0033848676830530167,
-                      6288 => 0.0032705331686884165,
-                      279 => 0.0006552835111506283,
-                      1633 => 0.00031841936288401484,
-                      1035 => 0.0003114044084213674,
-                      13 => 0.0002916732046287507,
-                      264 => 0.00028948261751793325,
-                      297 => 0.0002833220351021737,
-                      720 => 0.00024898265837691724,
+                      927 => 0.9811904,
+                      198 => 0.0033849,
+                      6288 => 0.0032705,
+                      279 => 0.0006553,
+                      1633 => 0.0003184,
+                      1035 => 0.0003114,
+                      13 => 0.0002917,
+                      264 => 0.0002895,
+                      297 => 0.0002833,
+                      720 => 0.000249,
                     },
                     "token": 927,
                   },
                 },
                 {
                   "next": {
-                    "confidence": 0.9306728839874268,
+                    "confidence": 0.9306729,
                     "token": 279,
                   },
                 },
                 {
                   "next": {
-                    "confidence": 0.9597684741020203,
+                    "confidence": 0.9597685,
                     "probabilities": Map {
-                      16053 => 0.9597684741020203,
-                      1208 => 0.004750591237097979,
-                      198 => 0.0031827085185796022,
-                      5679 => 0.0029162338469177485,
-                      65536 => 0.00197240780107677,
-                      6435 => 0.0009124248754233122,
-                      2697 => 0.0006706250132992864,
-                      720 => 0.0005979162524454296,
-                      21811 => 0.0005516768433153629,
-                      45363 => 0.0005495203076861799,
+                      16053 => 0.9597685,
+                      1208 => 0.0047506,
+                      198 => 0.0031827,
+                      5679 => 0.0029162,
+                      65536 => 0.0019724,
+                      6435 => 0.0009124,
+                      2697 => 0.0006706,
+                      720 => 0.0005979,
+                      21811 => 0.0005517,
+                      45363 => 0.0005495,
                     },
                   },
                 },
                 {
                   "next": {
-                    "confidence": 0.9871460199356079,
+                    "confidence": 0.987146,
                     "probabilities": Map {
-                      5679 => 0.9871460199356079,
-                      21811 => 0.001438674982637167,
-                      198 => 0.0009368227329105139,
-                      8415 => 0.0007225279696285725,
-                      12875 => 0.00038032486918382347,
-                      4194 => 0.00034695648355409503,
-                      720 => 0.00028149448917247355,
-                      14588 => 0.00027612835401669145,
-                      9522 => 0.00024171460245270282,
-                      627 => 0.0002042166597675532,
+                      5679 => 0.987146,
+                      21811 => 0.0014387,
+                      198 => 0.0009368,
+                      8415 => 0.0007225,
+                      12875 => 0.0003803,
+                      4194 => 0.000347,
+                      720 => 0.0002815,
+                      14588 => 0.0002761,
+                      9522 => 0.0002417,
+                      627 => 0.0002042,
                     },
                     "token": 5679,
                   },
diff --git a/test/modelDependent/llama3.1/evaluateWithMetadata.test.ts b/test/modelDependent/llama3.1/evaluateWithMetadata.test.ts
index 0c368053..f638f053 100644
--- a/test/modelDependent/llama3.1/evaluateWithMetadata.test.ts
+++ b/test/modelDependent/llama3.1/evaluateWithMetadata.test.ts
@@ -1,8 +1,7 @@
 import {describe, expect, test} from "vitest";
-import {Token} from "../../../src/index.js";
+import {Token, SequenceEvaluateOutput} from "../../../src/index.js";
 import {getModelFile} from "../../utils/modelFiles.js";
 import {getTestLlama} from "../../utils/getTestLlama.js";
-import {SequenceEvaluateOutput} from "../../../src/evaluator/LlamaContext/types.js";
 
 describe("llama 3.1", () => {
     describe("evaluate with metadata", () => {
@@ -31,6 +30,7 @@ describe("llama 3.1", () => {
                     break;
             }
 
+            simplifyRes(res);
             expect(res).toMatchInlineSnapshot(`
               [
                 {
@@ -94,151 +94,151 @@ describe("llama 3.1", () => {
               [
                 {
                   "probabilities": Map {
-                    578 => 0.43072959780693054,
-                    1115 => 0.13043756783008575,
-                    1102 => 0.05168525502085686,
-                    763 => 0.04288897663354874,
-                    1283 => 0.029397012665867805,
-                    2100 => 0.029378682374954224,
-                    15636 => 0.026268385350704193,
-                    2030 => 0.02184896357357502,
-                    320 => 0.01690298318862915,
-                    1628 => 0.011869494803249836,
+                    578 => 0.4307296,
+                    1115 => 0.1304376,
+                    1102 => 0.0516853,
+                    763 => 0.042889,
+                    1283 => 0.029397,
+                    2100 => 0.0293787,
+                    15636 => 0.0262684,
+                    2030 => 0.021849,
+                    320 => 0.016903,
+                    1628 => 0.0118695,
                   },
                   "token": 578,
                 },
                 {
                   "probabilities": Map {
-                    16053 => 0.42230042815208435,
-                    4062 => 0.30363598465919495,
-                    39935 => 0.060395680367946625,
-                    2944 => 0.0373028889298439,
-                    5679 => 0.023811226710677147,
-                    11914 => 0.016298197209835052,
-                    2144 => 0.014683431014418602,
-                    1121 => 0.006984429899603128,
-                    17571 => 0.005794311873614788,
-                    3446 => 0.004934381693601608,
+                    16053 => 0.4223004,
+                    4062 => 0.303636,
+                    39935 => 0.0603957,
+                    2944 => 0.0373029,
+                    5679 => 0.0238112,
+                    11914 => 0.0162982,
+                    2144 => 0.0146834,
+                    1121 => 0.0069844,
+                    17571 => 0.0057943,
+                    3446 => 0.0049344,
                   },
                   "token": 16053,
                 },
                 {
                   "probabilities": Map {
-                    5679 => 0.9981182813644409,
-                    12875 => 0.00015924211766105145,
-                    18964 => 0.00011538491526152939,
-                    39935 => 0.00011497695231810212,
-                    13 => 0.00010490677959751338,
-                    627 => 0.00009277161007048562,
-                    656 => 0.00006256866618059576,
-                    893 => 0.00005633986438624561,
-                    198 => 0.00005223735934123397,
-                    374 => 0.00005191291347728111,
+                    5679 => 0.9981183,
+                    12875 => 0.0001592,
+                    18964 => 0.0001154,
+                    39935 => 0.000115,
+                    13 => 0.0001049,
+                    627 => 0.0000928,
+                    656 => 0.0000626,
+                    893 => 0.0000563,
+                    198 => 0.0000522,
+                    374 => 0.0000519,
                   },
                   "token": 5679,
                 },
                 {
                   "probabilities": Map {
-                    374 => 0.8126624226570129,
-                    1587 => 0.04815267026424408,
-                    596 => 0.024733318015933037,
-                    1120 => 0.022302960976958275,
-                    3250 => 0.02154506742954254,
-                    706 => 0.01618366874754429,
-                    15849 => 0.00869414210319519,
-                    1053 => 0.005911793559789658,
-                    55064 => 0.0037806404288858175,
-                    11 => 0.0036655946169048548,
+                    374 => 0.8126624,
+                    1587 => 0.0481527,
+                    596 => 0.0247333,
+                    1120 => 0.022303,
+                    3250 => 0.0215451,
+                    706 => 0.0161837,
+                    15849 => 0.0086941,
+                    1053 => 0.0059118,
+                    55064 => 0.0037806,
+                    11 => 0.0036656,
                   },
                   "token": 374,
                 },
                 {
                   "probabilities": Map {
-                    2288 => 0.2757589817047119,
-                    1120 => 0.16664838790893555,
-                    539 => 0.15775153040885925,
-                    779 => 0.1334378868341446,
-                    264 => 0.05585397779941559,
-                    1101 => 0.029216185212135315,
-                    16053 => 0.017680438235402107,
-                    5042 => 0.015862826257944107,
-                    1193 => 0.014583030715584755,
-                    2744 => 0.014090186916291714,
+                    2288 => 0.275759,
+                    1120 => 0.1666484,
+                    539 => 0.1577515,
+                    779 => 0.1334379,
+                    264 => 0.055854,
+                    1101 => 0.0292162,
+                    16053 => 0.0176804,
+                    5042 => 0.0158628,
+                    1193 => 0.014583,
+                    2744 => 0.0140902,
                   },
                   "token": 2288,
                 },
                 {
                   "probabilities": Map {
-                    16053 => 0.9066131114959717,
-                    13326 => 0.06362388283014297,
-                    19781 => 0.00715874508023262,
-                    17551 => 0.0020243648905307055,
-                    10968 => 0.0012676455080509186,
-                    11920 => 0.0011003530817106366,
-                    6435 => 0.0010086935944855213,
-                    34386 => 0.0007757442072033882,
-                    1208 => 0.0006099422462284565,
-                    25366 => 0.0005670536775141954,
+                    16053 => 0.9066131,
+                    13326 => 0.0636239,
+                    19781 => 0.0071587,
+                    17551 => 0.0020244,
+                    10968 => 0.0012676,
+                    11920 => 0.0011004,
+                    6435 => 0.0010087,
+                    34386 => 0.0007757,
+                    1208 => 0.0006099,
+                    25366 => 0.0005671,
                   },
                   "token": 16053,
                 },
                 {
                   "probabilities": Map {
-                    311 => 0.9882798194885254,
-                    1524 => 0.006186165846884251,
-                    11 => 0.0025764326564967632,
-                    323 => 0.0005241178441792727,
-                    13 => 0.0003534558054525405,
-                    627 => 0.0003210459544789046,
-                    1606 => 0.00026416024775244296,
-                    2288 => 0.00025828619254752994,
-                    369 => 0.00012467413034755737,
-                    320 => 0.00010217254020972177,
+                    311 => 0.9882798,
+                    1524 => 0.0061862,
+                    11 => 0.0025764,
+                    323 => 0.0005241,
+                    13 => 0.0003535,
+                    627 => 0.000321,
+                    1606 => 0.0002642,
+                    2288 => 0.0002583,
+                    369 => 0.0001247,
+                    320 => 0.0001022,
                   },
                   "token": 311,
                 },
                 {
                   "probabilities": Map {
-                    2512 => 0.7492305040359497,
-                    1524 => 0.09894875437021255,
-                    656 => 0.032405685633420944,
-                    636 => 0.024060513824224472,
-                    7940 => 0.014415031298995018,
-                    33586 => 0.010868093930184841,
-                    387 => 0.008681590668857098,
-                    1781 => 0.005856928415596485,
-                    1629 => 0.005487737245857716,
-                    3351 => 0.005111564416438341,
+                    2512 => 0.7492305,
+                    1524 => 0.0989488,
+                    656 => 0.0324057,
+                    636 => 0.0240605,
+                    7940 => 0.014415,
+                    33586 => 0.0108681,
+                    387 => 0.0086816,
+                    1781 => 0.0058569,
+                    1629 => 0.0054877,
+                    3351 => 0.0051116,
                   },
                   "token": 2512,
                 },
                 {
                   "probabilities": Map {
-                    922 => 0.9521902799606323,
-                    1606 => 0.01500858273357153,
-                    11 => 0.014008254744112492,
-                    430 => 0.002968641696497798,
-                    627 => 0.002314596902579069,
-                    13 => 0.0018861450953409076,
-                    1524 => 0.0018014858942478895,
-                    369 => 0.001769029418937862,
-                    323 => 0.0009245016262866557,
-                    382 => 0.0008477734518237412,
+                    922 => 0.9521903,
+                    1606 => 0.0150086,
+                    11 => 0.0140083,
+                    430 => 0.0029686,
+                    627 => 0.0023146,
+                    13 => 0.0018861,
+                    1524 => 0.0018015,
+                    369 => 0.001769,
+                    323 => 0.0009245,
+                    382 => 0.0008478,
                   },
                   "token": 922,
                 },
                 {
                   "probabilities": Map {
-                    279 => 0.6508099436759949,
-                    4205 => 0.31288382411003113,
-                    1148 => 0.011366183869540691,
-                    1690 => 0.004424854647368193,
-                    904 => 0.0030378159135580063,
-                    1202 => 0.0026804644148796797,
-                    264 => 0.0011171377263963223,
-                    1790 => 0.0010860266629606485,
-                    813 => 0.0010579257505014539,
-                    1524 => 0.0007699796697124839,
+                    279 => 0.6508099,
+                    4205 => 0.3128838,
+                    1148 => 0.0113662,
+                    1690 => 0.0044249,
+                    904 => 0.0030378,
+                    1202 => 0.0026805,
+                    264 => 0.0011171,
+                    1790 => 0.001086,
+                    813 => 0.0010579,
+                    1524 => 0.00077,
                   },
                   "token": 279,
                 },
@@ -268,46 +268,47 @@ describe("llama 3.1", () => {
                     break;
             }
 
+            simplifyRes(res);
             expect(res).toMatchInlineSnapshot(`
               [
                 {
-                  "confidence": 0.43072959780693054,
+                  "confidence": 0.4307296,
                   "token": 578,
                 },
                 {
-                  "confidence": 0.42230042815208435,
+                  "confidence": 0.4223004,
                   "token": 16053,
                 },
                 {
-                  "confidence": 0.9981182813644409,
+                  "confidence": 0.9981183,
                   "token": 5679,
                 },
                 {
-                  "confidence": 0.8126624226570129,
+                  "confidence": 0.8126624,
                   "token": 374,
                 },
                 {
-                  "confidence": 0.2757589817047119,
+                  "confidence": 0.275759,
                   "token": 2288,
                 },
                 {
-                  "confidence": 0.9066131114959717,
+                  "confidence": 0.9066131,
                   "token": 16053,
                 },
                 {
-                  "confidence": 0.9882798194885254,
+                  "confidence": 0.9882798,
                   "token": 311,
                 },
                 {
-                  "confidence": 0.7492305040359497,
+                  "confidence": 0.7492305,
                   "token": 2512,
                 },
                 {
-                  "confidence": 0.9521902799606323,
+                  "confidence": 0.9521903,
                   "token": 922,
                 },
                 {
-                  "confidence": 0.6508099436759949,
+                  "confidence": 0.6508099,
                   "token": 279,
                 },
               ]
@@ -340,162 +341,162 @@ describe("llama 3.1", () => {
             expect(res).toMatchInlineSnapshot(`
               [
                 {
-                  "confidence": 0.43072959780693054,
+                  "confidence": 0.4307296,
                   "probabilities": Map {
-                    578 => 0.43072959780693054,
-                    1115 => 0.13043756783008575,
-                    1102 => 0.05168525502085686,
-                    763 => 0.04288897663354874,
-                    1283 => 0.029397012665867805,
-                    2100 => 0.029378682374954224,
-                    15636 => 0.026268385350704193,
-                    2030 => 0.02184896357357502,
-                    320 => 0.01690298318862915,
-                    1628 => 0.011869494803249836,
+                    578 => 0.4307296,
+                    1115 => 0.1304376,
+                    1102 => 0.0516853,
+                    763 => 0.042889,
+                    1283 => 0.029397,
+                    2100 => 0.0293787,
+                    15636 => 0.0262684,
+                    2030 => 0.021849,
+                    320 => 0.016903,
+                    1628 => 0.0118695,
                   },
                   "token": 578,
                 },
                 {
-                  "confidence": 0.42230042815208435,
+                  "confidence": 0.4223004,
                   "probabilities": Map {
-                    16053 => 0.42230042815208435,
-                    4062 => 0.30363598465919495,
-                    39935 => 0.060395680367946625,
-                    2944 => 0.0373028889298439,
-                    5679 => 0.023811226710677147,
-                    11914 => 0.016298197209835052,
-                    2144 => 0.014683431014418602,
-                    1121 => 0.006984429899603128,
-                    17571 => 0.005794311873614788,
-                    3446 => 0.004934381693601608,
+                    16053 => 0.4223004,
+                    4062 => 0.303636,
+                    39935 => 0.0603957,
+                    2944 => 0.0373029,
+                    5679 => 0.0238112,
+                    11914 => 0.0162982,
+                    2144 => 0.0146834,
+                    1121 => 0.0069844,
+                    17571 => 0.0057943,
+                    3446 => 0.0049344,
                   },
                   "token": 16053,
                 },
                 {
-                  "confidence": 0.9981182813644409,
+                  "confidence": 0.9981183,
                   "probabilities": Map {
-                    5679 => 0.9981182813644409,
-                    12875 => 0.00015924211766105145,
-                    18964 => 0.00011538491526152939,
-                    39935 => 0.00011497695231810212,
-                    13 => 0.00010490677959751338,
-                    627 => 0.00009277161007048562,
-                    656 => 0.00006256866618059576,
-                    893 => 0.00005633986438624561,
-                    198 => 0.00005223735934123397,
-                    374 => 0.00005191291347728111,
+                    5679 => 0.9981183,
+                    12875 => 0.0001592,
+                    18964 => 0.0001154,
+                    39935 => 0.000115,
+                    13 => 0.0001049,
+                    627 => 0.0000928,
+                    656 => 0.0000626,
+                    893 => 0.0000563,
+                    198 => 0.0000522,
+                    374 => 0.0000519,
                   },
                   "token": 5679,
                 },
                 {
-                  "confidence": 0.8126624226570129,
+                  "confidence": 0.8126624,
                   "probabilities": Map {
-                    374 => 0.8126624226570129,
-                    1587 => 0.04815267026424408,
-                    596 => 0.024733318015933037,
-                    1120 => 0.022302960976958275,
-                    3250 => 0.02154506742954254,
-                    706 => 0.01618366874754429,
-                    15849 => 0.00869414210319519,
-                    1053 => 0.005911793559789658,
-                    55064 => 0.0037806404288858175,
-                    11 => 0.0036655946169048548,
+                    374 => 0.8126624,
+                    1587 => 0.0481527,
+                    596 => 0.0247333,
+                    1120 => 0.022303,
+                    3250 => 0.0215451,
+                    706 => 0.0161837,
+                    15849 => 0.0086941,
+                    1053 => 0.0059118,
+                    55064 => 0.0037806,
+                    11 => 0.0036656,
                   },
                   "token": 374,
                 },
                 {
-                  "confidence": 0.2757589817047119,
+                  "confidence": 0.275759,
                   "probabilities": Map {
-                    2288 => 0.2757589817047119,
-                    1120 => 0.16664838790893555,
-                    539 => 0.15775153040885925,
-                    779 => 0.1334378868341446,
-                    264 => 0.05585397779941559,
-                    1101 => 0.029216185212135315,
-                    16053 => 0.017680438235402107,
-                    5042 => 0.015862826257944107,
-                    1193 => 0.014583030715584755,
-                    2744 => 0.014090186916291714,
+                    2288 => 0.275759,
+                    1120 => 0.1666484,
+                    539 => 0.1577515,
+                    779 => 0.1334379,
+                    264 => 0.055854,
+                    1101 => 0.0292162,
+                    16053 => 0.0176804,
+                    5042 => 0.0158628,
+                    1193 => 0.014583,
+                    2744 => 0.0140902,
                   },
                   "token": 2288,
                 },
                 {
-                  "confidence": 0.9066131114959717,
+                  "confidence": 0.9066131,
                   "probabilities": Map {
-                    16053 => 0.9066131114959717,
-                    13326 => 0.06362388283014297,
-                    19781 => 0.00715874508023262,
-                    17551 => 0.0020243648905307055,
-                    10968 => 0.0012676455080509186,
-                    11920 => 0.0011003530817106366,
-                    6435 => 0.0010086935944855213,
-                    34386 => 0.0007757442072033882,
-                    1208 => 0.0006099422462284565,
-                    25366 => 0.0005670536775141954,
+                    16053 => 0.9066131,
+                    13326 => 0.0636239,
+                    19781 => 0.0071587,
+                    17551 => 0.0020244,
+                    10968 => 0.0012676,
+                    11920 => 0.0011004,
+                    6435 => 0.0010087,
+                    34386 => 0.0007757,
+                    1208 => 0.0006099,
+                    25366 => 0.0005671,
                   },
                   "token": 16053,
                 },
                 {
-                  "confidence": 0.9882798194885254,
+                  "confidence": 0.9882798,
                   "probabilities": Map {
-                    311 => 0.9882798194885254,
-                    1524 => 0.006186165846884251,
-                    11 => 0.0025764326564967632,
-                    323 => 0.0005241178441792727,
-                    13 => 0.0003534558054525405,
-                    627 => 0.0003210459544789046,
-                    1606 => 0.00026416024775244296,
-                    2288 => 0.00025828619254752994,
-                    369 => 0.00012467413034755737,
-                    320 => 0.00010217254020972177,
+                    311 => 0.9882798,
+                    1524 => 0.0061862,
+                    11 => 0.0025764,
+                    323 => 0.0005241,
+                    13 => 0.0003535,
+                    627 => 0.000321,
+                    1606 => 0.0002642,
+                    2288 => 0.0002583,
+                    369 => 0.0001247,
+                    320 => 0.0001022,
                   },
                   "token": 311,
                 },
                 {
-                  "confidence": 0.7492305040359497,
+                  "confidence": 0.7492305,
                   "probabilities": Map {
-                    2512 => 0.7492305040359497,
-                    1524 => 0.09894875437021255,
-                    656 => 0.032405685633420944,
-                    636 => 0.024060513824224472,
-                    7940 => 0.014415031298995018,
-                    33586 => 0.010868093930184841,
-                    387 => 0.008681590668857098,
-                    1781 => 0.005856928415596485,
-                    1629 => 0.005487737245857716,
-                    3351 => 0.005111564416438341,
+                    2512 => 0.7492305,
+                    1524 => 0.0989488,
+                    656 => 0.0324057,
+                    636 => 0.0240605,
+                    7940 => 0.014415,
+                    33586 => 0.0108681,
+                    387 => 0.0086816,
+                    1781 => 0.0058569,
+                    1629 => 0.0054877,
+                    3351 => 0.0051116,
                   },
                   "token": 2512,
                 },
                 {
-                  "confidence": 0.9521902799606323,
+                  "confidence": 0.9521903,
                   "probabilities": Map {
-                    922 => 0.9521902799606323,
-                    1606 => 0.01500858273357153,
-                    11 => 0.014008254744112492,
-                    430 => 0.002968641696497798,
-                    627 => 0.002314596902579069,
-                    13 => 0.0018861450953409076,
-                    1524 => 0.0018014858942478895,
-                    369 => 0.001769029418937862,
-                    323 => 0.0009245016262866557,
-                    382 => 0.0008477734518237412,
+                    922 => 0.9521903,
+                    1606 => 0.0150086,
+                    11 => 0.0140083,
+                    430 => 0.0029686,
+                    627 => 0.0023146,
+                    13 => 0.0018861,
+                    1524 => 0.0018015,
+                    369 => 0.001769,
+                    323 => 0.0009245,
+                    382 => 0.0008478,
                   },
                   "token": 922,
                 },
                 {
-                  "confidence": 0.6508099436759949,
+                  "confidence": 0.6508099,
                   "probabilities": Map {
-                    279 => 0.6508099436759949,
-                    4205 => 0.31288382411003113,
-                    1148 => 0.011366183869540691,
-                    1690 => 0.004424854647368193,
-                    904 => 0.0030378159135580063,
-                    1202 => 0.0026804644148796797,
-                    264 => 0.0011171377263963223,
-                    1790 => 0.0010860266629606485,
-                    813 => 0.0010579257505014539,
-                    1524 => 0.0007699796697124839,
+                    279 => 0.6508099,
+                    4205 => 0.3128838,
+                    1148 => 0.0113662,
+                    1690 => 0.0044249,
+                    904 => 0.0030378,
+                    1202 => 0.0026805,
+                    264 => 0.0011171,
+                    1790 => 0.001086,
+                    813 => 0.0010579,
+                    1524 => 0.00077,
                   },
                   "token": 279,
                 },
@@ -545,9 +546,16 @@ describe("llama 3.1", () => {
     });
 });
 
-function simplifyRes<T extends SequenceEvaluateOutput<{readonly probabilities: true}>>(res: T[]) {
+function simplifyRes<T extends Partial<SequenceEvaluateOutput<{readonly probabilities: true, readonly confidence: true}>>>(res: T[]) {
     for (const item of res) {
         if (item.probabilities != null)
-            item.probabilities = new Map([...item.probabilities.entries()].slice(0, 10));
+            item.probabilities = new Map(
+                [...item.probabilities.entries()]
+                    .slice(0, 10)
+                    .map(([token, probability]) => [token, parseFloat(probability.toFixed(7))])
+            );
+
+        if (item.confidence != null)
+            item.confidence = parseFloat(item.confidence.toFixed(7));
     }
 }

From 2527413e3e4679bce2ea23db712ecf3a07b3a970 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Mon, 6 Jan 2025 01:40:41 +0200
Subject: [PATCH 67/73] fix: pipe error logs in `inspect gpu` command

---
 src/bindings/Llama.ts                         |  2 +-
 src/bindings/getLlama.ts                      | 19 +++--
 src/bindings/utils/testBindingBinary.ts       | 82 +++++++++++++++++--
 .../inspect/commands/InspectGpuCommand.ts     |  3 +-
 4 files changed, 90 insertions(+), 16 deletions(-)

diff --git a/src/bindings/Llama.ts b/src/bindings/Llama.ts
index 0ed6e7f8..e894521d 100644
--- a/src/bindings/Llama.ts
+++ b/src/bindings/Llama.ts
@@ -14,7 +14,7 @@ import {BindingModule} from "./AddonTypes.js";
 import {BuildGpu, BuildMetadataFile, LlamaGpuType, LlamaLocks, LlamaLogLevel, LlamaLogLevelGreaterThanOrEqual} from "./types.js";
 import {MemoryOrchestrator, MemoryReservation} from "./utils/MemoryOrchestrator.js";
 
-const LlamaLogLevelToAddonLogLevel: ReadonlyMap<LlamaLogLevel, number> = new Map([
+export const LlamaLogLevelToAddonLogLevel: ReadonlyMap<LlamaLogLevel, number> = new Map([
     [LlamaLogLevel.disabled, 0],
     [LlamaLogLevel.fatal, 1],
     [LlamaLogLevel.error, 2],
diff --git a/src/bindings/getLlama.ts b/src/bindings/getLlama.ts
index c0dcffc6..62e5d95e 100644
--- a/src/bindings/getLlama.ts
+++ b/src/bindings/getLlama.ts
@@ -342,10 +342,12 @@ export async function getLlamaForOptions({
     debug = defaultLlamaCppDebugMode
 }: LlamaOptions, {
     updateLastBuildInfoOnCompile = false,
-    skipLlamaInit = false
+    skipLlamaInit = false,
+    pipeBinaryTestErrorLogs = false
 }: {
     updateLastBuildInfoOnCompile?: boolean,
-    skipLlamaInit?: boolean
+    skipLlamaInit?: boolean,
+    pipeBinaryTestErrorLogs?: boolean
 } = {}): Promise<Llama> {
     const platform = getPlatform();
     const arch = process.arch;
@@ -463,7 +465,8 @@ export async function getLlamaForOptions({
                             ? "falling back to building from source"
                             : null
                     ),
-                debug
+                debug,
+                pipeBinaryTestErrorLogs
             });
 
             if (llama != null)
@@ -556,7 +559,8 @@ async function loadExistingLlamaBinary({
     vramPadding,
     ramPadding,
     fallbackMessage,
-    debug
+    debug,
+    pipeBinaryTestErrorLogs
 }: {
     buildOptions: BuildOptions,
     canUsePrebuiltBinaries: boolean,
@@ -571,7 +575,8 @@ async function loadExistingLlamaBinary({
     vramPadding: Required<LlamaOptions>["vramPadding"],
     ramPadding: Required<LlamaOptions>["ramPadding"],
     fallbackMessage: string | null,
-    debug: boolean
+    debug: boolean,
+    pipeBinaryTestErrorLogs: boolean
 }) {
     const buildFolderName = await getBuildFolderNameForBuildOptions(buildOptions);
 
@@ -590,7 +595,7 @@ async function loadExistingLlamaBinary({
             });
             const resolvedBindingPath = await resolveActualBindingBinaryPath(localBuildBinPath);
             const binaryCompatible = shouldTestBinaryBeforeLoading
-                ? await testBindingBinary(resolvedBindingPath, buildOptions.gpu)
+                ? await testBindingBinary(resolvedBindingPath, buildOptions.gpu, undefined, pipeBinaryTestErrorLogs)
                 : true;
 
             if (binaryCompatible) {
@@ -649,7 +654,7 @@ async function loadExistingLlamaBinary({
                 });
                 const resolvedBindingPath = await resolveActualBindingBinaryPath(prebuiltBinDetails.binaryPath);
                 const binaryCompatible = shouldTestBinaryBeforeLoading
-                    ? await testBindingBinary(resolvedBindingPath, buildOptions.gpu)
+                    ? await testBindingBinary(resolvedBindingPath, buildOptions.gpu, undefined, pipeBinaryTestErrorLogs)
                     : true;
 
                 if (binaryCompatible) {
diff --git a/src/bindings/utils/testBindingBinary.ts b/src/bindings/utils/testBindingBinary.ts
index 914acb34..7151c702 100644
--- a/src/bindings/utils/testBindingBinary.ts
+++ b/src/bindings/utils/testBindingBinary.ts
@@ -4,7 +4,8 @@ import {createRequire} from "module";
 import path from "path";
 import {getConsoleLogPrefix} from "../../utils/getConsoleLogPrefix.js";
 import {runningInElectron} from "../../utils/runtime.js";
-import {BuildGpu} from "../types.js";
+import {BuildGpu, LlamaLogLevel} from "../types.js";
+import {LlamaLogLevelToAddonLogLevel} from "../Llama.js";
 import type {BindingModule} from "../AddonTypes.js";
 
 const require = createRequire(import.meta.url);
@@ -12,7 +13,12 @@ const __filename = fileURLToPath(import.meta.url);
 const detectedFileName = path.basename(__filename);
 const expectedFileName = "testBindingBinary";
 
-export async function testBindingBinary(bindingBinaryPath: string, gpu: BuildGpu, testTimeout: number = 1000 * 60 * 5): Promise<boolean> {
+export async function testBindingBinary(
+    bindingBinaryPath: string,
+    gpu: BuildGpu,
+    testTimeout: number = 1000 * 60 * 5,
+    pipeOutputOnNode: boolean = false
+): Promise<boolean> {
     if (!detectedFileName.startsWith(expectedFileName)) {
         console.warn(
             getConsoleLogPrefix() +
@@ -57,7 +63,8 @@ export async function testBindingBinary(bindingBinaryPath: string, gpu: BuildGpu
         onExit(code: number): void
     }): {
         sendMessage(message: ParentToChildMessage): void,
-        killProcess(): void
+        killProcess(): void,
+        pipeMessages(): void
     } {
         if (forkFunction.type === "electron") {
             let exited = false;
@@ -88,13 +95,18 @@ export async function testBindingBinary(bindingBinaryPath: string, gpu: BuildGpu
 
             return {
                 sendMessage: (message: ParentToChildMessage) => subProcess.postMessage(message),
-                killProcess: cleanupElectronFork
+                killProcess: cleanupElectronFork,
+                pipeMessages: () => void 0
             };
         }
 
+        let pipeSet = false;
         const subProcess = forkFunction.fork(__filename, [], {
             detached: false,
             silent: true,
+            stdio: pipeOutputOnNode
+                ? ["ignore", "pipe", "pipe", "ipc"]
+                : ["ignore", "ignore", "ignore", "ipc"],
             env: {
                 ...process.env,
                 TEST_BINDING_CP: "true"
@@ -102,6 +114,9 @@ export async function testBindingBinary(bindingBinaryPath: string, gpu: BuildGpu
         });
 
         function cleanupNodeFork() {
+            subProcess.stdout?.off("data", onStdout);
+            subProcess.stderr?.off("data", onStderr);
+
             if (subProcess.exitCode == null)
                 subProcess.kill("SIGKILL");
 
@@ -121,9 +136,36 @@ export async function testBindingBinary(bindingBinaryPath: string, gpu: BuildGpu
             onExit(subProcess.exitCode ?? -1);
         }
 
+        function onStdout(data: string) {
+            if (!pipeSet)
+                return;
+
+            process.stdout.write(data);
+        }
+
+        function onStderr(data: string) {
+            if (!pipeSet)
+                return;
+
+            process.stderr.write(data);
+        }
+
+        if (pipeOutputOnNode) {
+            subProcess.stdout?.on("data", onStdout);
+            subProcess.stderr?.on("data", onStderr);
+        }
+
+        function pipeMessages() {
+            if (!pipeOutputOnNode || pipeSet)
+                return;
+
+            pipeSet = true;
+        }
+
         return {
             sendMessage: (message: ParentToChildMessage) => subProcess.send(message),
-            killProcess: cleanupNodeFork
+            killProcess: cleanupNodeFork,
+            pipeMessages
         };
     }
 
@@ -169,6 +211,13 @@ export async function testBindingBinary(bindingBinaryPath: string, gpu: BuildGpu
                             bindingBinaryPath,
                             gpu
                         });
+                    } else if (message.type === "loaded") {
+                        subProcess!.pipeMessages(); // only start piping error logs if the binary loaded successfully
+                        subProcess!.sendMessage({
+                            type: "test",
+                            bindingBinaryPath,
+                            gpu
+                        });
                     } else if (message.type === "done") {
                         testPassed = true;
                         subProcess!.sendMessage({type: "exit"});
@@ -189,13 +238,28 @@ export async function testBindingBinary(bindingBinaryPath: string, gpu: BuildGpu
 }
 
 if (process.env.TEST_BINDING_CP === "true" && (process.parentPort != null || process.send != null)) {
+    let binding: BindingModule;
     const sendMessage = process.parentPort != null
         ? (message: ChildToParentMessage) => process.parentPort.postMessage(message)
         : (message: ChildToParentMessage) => process.send!(message);
     const onMessage = async (message: ParentToChildMessage) => {
         if (message.type === "start") {
             try {
-                const binding: BindingModule = require(message.bindingBinaryPath);
+                binding = require(message.bindingBinaryPath);
+
+                const errorLogLevel = LlamaLogLevelToAddonLogLevel.get(LlamaLogLevel.error);
+                if (errorLogLevel != null)
+                    binding.setLoggerLogLevel(errorLogLevel);
+
+                sendMessage({type: "loaded"});
+            } catch (err) {
+                console.error(err);
+                process.exit(1);
+            }
+        } else if (message.type === "test") {
+            try {
+                if (binding == null)
+                    throw new Error("Binding binary is not loaded");
 
                 binding.loadBackends();
                 const loadedGpu = binding.getGpuType();
@@ -235,9 +299,13 @@ type ParentToChildMessage = {
     type: "start",
     bindingBinaryPath: string,
     gpu: BuildGpu
+} | {
+    type: "test",
+    bindingBinaryPath: string,
+    gpu: BuildGpu
 } | {
     type: "exit"
 };
 type ChildToParentMessage = {
-    type: "ready" | "done"
+    type: "ready" | "loaded" | "done"
 };
diff --git a/src/cli/commands/inspect/commands/InspectGpuCommand.ts b/src/cli/commands/inspect/commands/InspectGpuCommand.ts
index 199b0e06..62c88bd7 100644
--- a/src/cli/commands/inspect/commands/InspectGpuCommand.ts
+++ b/src/cli/commands/inspect/commands/InspectGpuCommand.ts
@@ -172,7 +172,8 @@ async function getLlamaForGpu(gpu: BuildGpu) {
             logLevel: LlamaLogLevel.warn,
             vramPadding: 0
         }, {
-            skipLlamaInit: true
+            skipLlamaInit: true,
+            pipeBinaryTestErrorLogs: true
         });
     } catch (err) {
         return undefined;

From bb33a5de58dd686e46a691657530675384e7ead8 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Mon, 6 Jan 2025 02:04:31 +0200
Subject: [PATCH 68/73] test: fix tests

---
 src/evaluator/LlamaContext/types.ts           | 12 ++++++++++
 .../llama3.1/controlledEvaluate.test.ts       |  6 ++++-
 .../llama3.1/evaluateWithMetadata.test.ts     | 24 +++++++++++++++----
 3 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts
index 78755247..a68a72e3 100644
--- a/src/evaluator/LlamaContext/types.ts
+++ b/src/evaluator/LlamaContext/types.ts
@@ -326,6 +326,8 @@ export type SequenceEvaluateMetadataOptions = {
      * Same as `probabilities.get(token)` from the output.
      *
      * If you need only this value, you can skip getting the full probabilities list to improve performance.
+     *
+     * This value might be slightly different when evaluated on different GPUs and configurations.
      */
     readonly confidence?: boolean,
 
@@ -359,6 +361,8 @@ export type SequenceEvaluateOutput<
      * Same as `probabilities.get(token)`.
      *
      * If you need only this value, you can skip getting the full probabilities list to improve performance.
+     *
+     * This value might be slightly different when evaluated on different GPUs and configurations.
      */
     confidence: number,
 
@@ -367,6 +371,8 @@ export type SequenceEvaluateOutput<
      *
      * A probability is a number from `0` to `1`.
      *
+     * The probabilities might be slightly different when evaluated on different GPUs and configurations.
+     *
      * The map is sorted by the probability of the tokens from the highest to the lowest,
      * and is reflected in the order of the entries when iterating over the map.
      * Use `.entries().next().value` to get the top probability pair
@@ -392,6 +398,8 @@ export type ControlledEvaluateInputItem = Token | [token: Token, options: {
          * Same as `next.probabilities.get(next.token)` from the output.
          *
          * If you need only this value, you can skip getting the full probabilities list to improve performance.
+         *
+         * This value might be slightly different when evaluated on different GPUs and configurations.
          */
         confidence?: boolean,
 
@@ -437,6 +445,8 @@ export type ControlledEvaluateIndexOutput = {
          * Same as `next.probabilities.get(next.token)`.
          *
          * If you need only this value, you can skip getting the full probabilities list to improve performance.
+         *
+         * This value might be slightly different when evaluated on different GPUs and configurations.
          */
         confidence?: number,
 
@@ -445,6 +455,8 @@ export type ControlledEvaluateIndexOutput = {
          *
          * A probability is a number from `0` to `1`.
          *
+         * The probabilities might be slightly different when evaluated on different GPUs and configurations.
+         *
          * The map is sorted by the probability of the tokens from the highest to the lowest,
          * and is reflected in the order of the entries when iterating over the map.
          * Use `.entries().next().value` to get the top probability pair
diff --git a/test/modelDependent/llama3.1/controlledEvaluate.test.ts b/test/modelDependent/llama3.1/controlledEvaluate.test.ts
index 85a71811..c9dcde17 100644
--- a/test/modelDependent/llama3.1/controlledEvaluate.test.ts
+++ b/test/modelDependent/llama3.1/controlledEvaluate.test.ts
@@ -5,10 +5,14 @@ import {getTestLlama} from "../../utils/getTestLlama.js";
 
 describe("llama 3.1", () => {
     describe("controlled evaluate", () => {
-        test("get probabilities for 3 tokens", {timeout: 1000 * 60 * 60 * 2}, async () => {
+        test("get probabilities for 3 tokens", {timeout: 1000 * 60 * 60 * 2}, async (testContext) => {
             const modelPath = await getModelFile("Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf");
             const llama = await getTestLlama();
 
+            // the precise values are different for each GPU type, so we skip the test for GPUs other than metal
+            if (llama.gpu !== "metal")
+                testContext.skip();
+
             const model = await llama.loadModel({
                 modelPath
             });
diff --git a/test/modelDependent/llama3.1/evaluateWithMetadata.test.ts b/test/modelDependent/llama3.1/evaluateWithMetadata.test.ts
index f638f053..4b459bb0 100644
--- a/test/modelDependent/llama3.1/evaluateWithMetadata.test.ts
+++ b/test/modelDependent/llama3.1/evaluateWithMetadata.test.ts
@@ -67,10 +67,14 @@ describe("llama 3.1", () => {
             `);
         });
 
-        test("with probabilities", {timeout: 1000 * 60 * 60 * 2}, async () => {
+        test("with probabilities", {timeout: 1000 * 60 * 60 * 2}, async (testContext) => {
             const modelPath = await getModelFile("Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf");
             const llama = await getTestLlama();
 
+            // the precise values are different for each GPU type, so we skip the test for GPUs other than metal
+            if (llama.gpu !== "metal")
+                testContext.skip();
+
             const model = await llama.loadModel({
                 modelPath
             });
@@ -246,10 +250,14 @@ describe("llama 3.1", () => {
             `);
         });
 
-        test("with confidence", {timeout: 1000 * 60 * 60 * 2}, async () => {
+        test("with confidence", {timeout: 1000 * 60 * 60 * 2}, async (testContext) => {
             const modelPath = await getModelFile("Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf");
             const llama = await getTestLlama();
 
+            // the precise values are different for each GPU type, so we skip the test for GPUs other than metal
+            if (llama.gpu !== "metal")
+                testContext.skip();
+
             const model = await llama.loadModel({
                 modelPath
             });
@@ -315,10 +323,14 @@ describe("llama 3.1", () => {
             `);
         });
 
-        test("with probabilities and confidence", {timeout: 1000 * 60 * 60 * 2}, async () => {
+        test("with probabilities and confidence", {timeout: 1000 * 60 * 60 * 2}, async (testContext) => {
             const modelPath = await getModelFile("Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf");
             const llama = await getTestLlama();
 
+            // the precise values are different for each GPU type, so we skip the test for GPUs other than metal
+            if (llama.gpu !== "metal")
+                testContext.skip();
+
             const model = await llama.loadModel({
                 modelPath
             });
@@ -504,10 +516,14 @@ describe("llama 3.1", () => {
             `);
         });
 
-        test("confidence alone matches probability alone", {timeout: 1000 * 60 * 60 * 2}, async () => {
+        test("confidence alone matches probability alone", {timeout: 1000 * 60 * 60 * 2}, async (testContext) => {
             const modelPath = await getModelFile("Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf");
             const llama = await getTestLlama();
 
+            // the precise values are different for each GPU type, so we skip the test for GPUs other than metal
+            if (llama.gpu !== "metal")
+                testContext.skip();
+
             const model = await llama.loadModel({
                 modelPath
             });

From 4e1c676e6a32eb84aae39e653779b2c9ee8ec724 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Mon, 6 Jan 2025 04:24:05 +0200
Subject: [PATCH 69/73] feat: reranking (`LlamaRankingContext`)

---
 .vitepress/config/apiReferenceSidebar.ts     |   3 +-
 docs/guide/embedding.md                      |  53 ++++
 docs/index.md                                |   1 +
 llama/addon/AddonContext.cpp                 |   4 +
 src/bindings/AddonTypes.ts                   |   1 +
 src/evaluator/LlamaContext/LlamaContext.ts   |   4 +-
 src/evaluator/LlamaContext/types.ts          |   8 +-
 src/evaluator/LlamaModel/LlamaModel.ts       |  11 +
 src/evaluator/LlamaRankingContext.ts         | 251 +++++++++++++++++++
 src/index.ts                                 |   3 +
 test/modelDependent/bgeReranker/rank.test.ts | 205 +++++++++++++++
 test/utils/modelFiles.ts                     |   3 +-
 12 files changed, 543 insertions(+), 4 deletions(-)
 create mode 100644 src/evaluator/LlamaRankingContext.ts
 create mode 100644 test/modelDependent/bgeReranker/rank.test.ts

diff --git a/.vitepress/config/apiReferenceSidebar.ts b/.vitepress/config/apiReferenceSidebar.ts
index 5a63cd71..2dd87ae0 100644
--- a/.vitepress/config/apiReferenceSidebar.ts
+++ b/.vitepress/config/apiReferenceSidebar.ts
@@ -1,6 +1,6 @@
 import {DefaultTheme} from "vitepress";
 /* eslint import/no-unresolved: "off" */
-import typedocSidebar from "../../docs/api/typedoc-sidebar.json"; // if this import fails, run `npm run docs:generateTypedoc`
+import typedocSidebar from "../../docs/api/typedoc-sidebar.json";
 
 const categoryOrder = [
     "Functions",
@@ -28,6 +28,7 @@ const classesOrder = [
     "LlamaCompletion",
     "LlamaEmbeddingContext",
     "LlamaEmbedding",
+    "LlamaRankingContext",
     "LlamaGrammar",
     "LlamaJsonSchemaGrammar",
     "LlamaText",
diff --git a/docs/guide/embedding.md b/docs/guide/embedding.md
index b8a672a8..6ce591a5 100644
--- a/docs/guide/embedding.md
+++ b/docs/guide/embedding.md
@@ -138,6 +138,59 @@ const embedding = await context.getEmbeddingFor(text);
 console.log("Embedding vector:", embedding.vector);
 ```
 
+## Reranking Documents {#reranking}
+After you search for the most similar documents using embedding vectors,
+you can use inference to rerank (sort) the documents based on their relevance to the given query.
+
+Doing this allows you to combine the best of both worlds: the speed of embedding and the quality of inference.
+
+```typescript
+import {fileURLToPath} from "url";
+import path from "path";
+import {getLlama} from "node-llama-cpp";
+
+const __dirname = path.dirname(
+    fileURLToPath(import.meta.url)
+);
+
+const llama = await getLlama();
+const model = await llama.loadModel({
+    modelPath: path.join(__dirname, "bge-reranker-v2-m3-Q8_0.gguf")
+});
+const context = await model.createRankingContext();
+
+const documents = [
+    "The sky is clear and blue today",
+    "I love eating pizza with extra cheese",
+    "Dogs love to play fetch with their owners",
+    "The capital of France is Paris",
+    "Drinking water is important for staying hydrated",
+    "Mount Everest is the tallest mountain in the world",
+    "A warm cup of tea is perfect for a cold winter day",
+    "Painting is a form of creative expression",
+    "Not all the things that shine are made of gold",
+    "Cleaning the house is a good way to keep it tidy"
+];
+
+const query = "Tell me a goegraphical fact";
+const rankedDocuments = await context.rankAndSort(query, documents);
+
+const topDocument = rankedDocuments[0]!;
+const secondDocument = rankedDocuments[1]!;
+
+console.log("query:", query);
+console.log("Top document:", topDocument.document);
+console.log("Second document:", secondDocument.document);
+console.log("Ranked documents:", rankedDocuments);
+```
+> This example will produce this output:
+> ```
+> query: Tell me a goegraphical fact
+> Top document: Mount Everest is the tallest mountain in the world
+> Second document: The capital of France is Paris
+> ```
+> This example uses [bge-reranker-v2-m3-Q8_0.gguf](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF/blob/main/bge-reranker-v2-m3-Q8_0.gguf)
+
 ## Using External Databases
 When you have a large number of documents you want to use with embedding, it's often more efficient to store them with their embedding in an external database and search for the most similar embeddings there.
 
diff --git a/docs/index.md b/docs/index.md
index aa400999..1e1d3dd5 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -97,6 +97,7 @@ npx -y node-llama-cpp inspect gpu
 * [Remote GGUF reader](./api/functions/readGgufFileInfo.md)
 * [User input safety](./guide/llama-text.md#input-safety-in-node-llama-cpp)
 * [Token prediction](./guide/token-prediction.md)
+* [Reranking](./guide/embedding.md#reranking)
 
 </template>
 <template v-slot:simple-code>
diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp
index b65236c3..dbeb6ca9 100644
--- a/llama/addon/AddonContext.cpp
+++ b/llama/addon/AddonContext.cpp
@@ -415,6 +415,10 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap<Ad
             context_params.embeddings = options.Get("embeddings").As<Napi::Boolean>().Value();
         }
 
+        if (options.Has("ranking") && options.Get("ranking").As<Napi::Boolean>().Value()) {
+            context_params.pooling_type = LLAMA_POOLING_TYPE_RANK;
+        }
+
         if (options.Has("flashAttention")) {
             context_params.flash_attn = options.Get("flashAttention").As<Napi::Boolean>().Value();
         }
diff --git a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts
index 7c575f88..c88c1af7 100644
--- a/src/bindings/AddonTypes.ts
+++ b/src/bindings/AddonTypes.ts
@@ -26,6 +26,7 @@ export type BindingModule = {
             flashAttention?: boolean,
             logitsAll?: boolean,
             embeddings?: boolean,
+            ranking?: boolean,
             threads?: number,
             performanceTracking?: boolean
         }): AddonContext
diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts
index 9844b0db..1bd1e47f 100644
--- a/src/evaluator/LlamaContext/LlamaContext.ts
+++ b/src/evaluator/LlamaContext/LlamaContext.ts
@@ -80,7 +80,8 @@ export class LlamaContext {
             itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism"
         } = {},
         performanceTracking = false,
-        _embeddings
+        _embeddings,
+        _ranking
     }: LlamaContextOptions & {
         sequences: number,
         contextSize: number,
@@ -121,6 +122,7 @@ export class LlamaContext {
             flashAttention: this._flashAttention,
             threads: this._idealThreads,
             embeddings: _embeddings,
+            ranking: _ranking,
             performanceTracking: this._performanceTracking
         }));
         this._batchingOptions = {
diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts
index a68a72e3..16d17bce 100644
--- a/src/evaluator/LlamaContext/types.ts
+++ b/src/evaluator/LlamaContext/types.ts
@@ -171,7 +171,13 @@ export type LlamaContextOptions = {
      * embedding mode only
      * @internal
      */
-    _embeddings?: boolean
+    _embeddings?: boolean,
+
+    /**
+     * ranking mode
+     * @internal
+     */
+    _ranking?: boolean
 };
 export type LlamaContextSequenceRepeatPenalty = {
     /** Tokens to lower the predication probability of to be the next predicted token */
diff --git a/src/evaluator/LlamaModel/LlamaModel.ts b/src/evaluator/LlamaModel/LlamaModel.ts
index 08e4a9cd..ae0275ee 100644
--- a/src/evaluator/LlamaModel/LlamaModel.ts
+++ b/src/evaluator/LlamaModel/LlamaModel.ts
@@ -18,6 +18,7 @@ import {LlamaEmbeddingContext, LlamaEmbeddingContextOptions} from "../LlamaEmbed
 import {GgufArchitectureType, GgufMetadata} from "../../gguf/types/GgufMetadataTypes.js";
 import {OverridesObject} from "../../utils/OverridesObject.js";
 import {maxRecentDetokenizerTokens} from "../../consts.js";
+import {LlamaRankingContext, LlamaRankingContextOptions} from "../LlamaRankingContext.js";
 import {TokenAttribute, TokenAttributes} from "./utils/TokenAttributes.js";
 import type {Llama} from "../../bindings/Llama.js";
 import type {BuiltinSpecialTokenValue} from "../../utils/LlamaText.js";
@@ -532,6 +533,16 @@ export class LlamaModel {
         return await LlamaEmbeddingContext._create({_model: this}, options);
     }
 
+    /**
+     * @see [Reranking Documents](https://node-llama-cpp.withcat.ai/guide/embedding#reranking) tutorial
+     */
+    public async createRankingContext(options: LlamaRankingContextOptions = {}) {
+        if (this._vocabOnly)
+            throw new Error("Model is loaded in vocabOnly mode, so no context can be created");
+
+        return await LlamaRankingContext._create({_model: this}, options);
+    }
+
     /**
      * Get warnings about the model file that would affect its usage.
      *
diff --git a/src/evaluator/LlamaRankingContext.ts b/src/evaluator/LlamaRankingContext.ts
new file mode 100644
index 00000000..5bff9505
--- /dev/null
+++ b/src/evaluator/LlamaRankingContext.ts
@@ -0,0 +1,251 @@
+import {AsyncDisposeAggregator, EventRelay, withLock} from "lifecycle-utils";
+import {Token} from "../types.js";
+import {LlamaText} from "../utils/LlamaText.js";
+import {tokenizeInput} from "../utils/tokenizeInput.js";
+import type {LlamaModel} from "./LlamaModel/LlamaModel.js";
+import type {LlamaContext, LlamaContextSequence} from "./LlamaContext/LlamaContext.js";
+import type {GgufTensorInfo} from "../gguf/types/GgufTensorInfoTypes.js";
+
+export type LlamaRankingContextOptions = {
+    /**
+     * The number of tokens the model can see at once.
+     * - **`"auto"`** - adapt to the current VRAM state and attemp to set the context size as high as possible up to the size
+     * the model was trained on.
+     * - **`number`** - set the context size to a specific number of tokens.
+     * If there's not enough VRAM, an error will be thrown.
+     * Use with caution.
+     * - **`{min?: number, max?: number}`** - adapt to the current VRAM state and attemp to set the context size as high as possible
+     * up to the size the model was trained on, but at least `min` and at most `max`.
+     *
+     * Defaults to `"auto"`.
+     */
+    contextSize?: "auto" | number | {
+        min?: number,
+        max?: number
+    },
+
+    /** prompt processing batch size */
+    batchSize?: number,
+
+    /**
+     * number of threads to use to evaluate tokens.
+     * set to 0 to use the maximum threads supported by the current machine hardware
+     */
+    threads?: number,
+
+    /** An abort signal to abort the context creation */
+    createSignal?: AbortSignal,
+
+    /**
+     * Ignore insufficient memory errors and continue with the context creation.
+     * Can cause the process to crash if there's not enough VRAM for the new context.
+     *
+     * Defaults to `false`.
+     */
+    ignoreMemorySafetyChecks?: boolean
+};
+
+/**
+ * @see [Reranking Documents](https://node-llama-cpp.withcat.ai/guide/embedding#reranking) tutorial
+ */
+export class LlamaRankingContext {
+    /** @internal */ private readonly _llamaContext: LlamaContext;
+    /** @internal */ private readonly _sequence: LlamaContextSequence;
+    /** @internal */ private readonly _disposeAggregator = new AsyncDisposeAggregator();
+
+    public readonly onDispose = new EventRelay<void>();
+
+    private constructor({
+        _llamaContext
+    }: {
+        _llamaContext: LlamaContext
+    }) {
+        this._llamaContext = _llamaContext;
+        this._sequence = this._llamaContext.getSequence();
+
+        this._disposeAggregator.add(
+            this._llamaContext.onDispose.createListener(() => {
+                void this._disposeAggregator.dispose();
+            })
+        );
+        this._disposeAggregator.add(this.onDispose.dispatchEvent);
+        this._disposeAggregator.add(async () => {
+            await this._llamaContext.dispose();
+        });
+    }
+
+    /**
+     * Get the ranking score for a document for a query.
+     */
+    public async rank(query: Token[] | string | LlamaText, document: Token[] | string | LlamaText) {
+        if (this.model.tokens.bos == null || this.model.tokens.eos == null || this.model.tokens.sep == null)
+            throw new Error("Computing rankings is not supported for this model.");
+
+        const resolvedInput = this._getEvaluationInput(query, document);
+
+        if (resolvedInput.length > this._llamaContext.contextSize)
+            throw new Error(
+                "Input is longer than the context size. " +
+                "Try to increase the context size or use another model that supports longer contexts."
+            );
+        else if (resolvedInput.length === 0)
+            return -Infinity;
+
+        return this._evaluateRankingForInput(resolvedInput);
+    }
+
+    /**
+     * Get the ranking scores for all the given documents for a query.
+     */
+    public async rankAll(query: Token[] | string | LlamaText, documents: Array<Token[] | string | LlamaText>): Promise<number[]> {{}
+        const resolvedTokens = documents.map((document) => this._getEvaluationInput(query, document));
+
+        if (resolvedTokens.some((tokens) => tokens.length > this._llamaContext.contextSize))
+            throw new Error(
+                "The input of one of the document is longer than the context size. " +
+                "Try to increase the context size or use another model that supports longer contexts."
+            );
+        else if (resolvedTokens.length === 0)
+            return [];
+
+        return await Promise.all(
+            resolvedTokens.map((tokens) => {
+                if (tokens.length === 0)
+                    return -Infinity;
+
+                return this._evaluateRankingForInput(tokens);
+            })
+        );
+    }
+
+    /**
+     * Get the ranking scores for all the given documents for a query and sort them by score from highest to lowest.
+     */
+    public async rankAndSort<const T extends string>(query: Token[] | string | LlamaText, documents: T[]): Promise<Array<{
+        document: T,
+        score: number
+    }>> {
+        const scores = await this.rankAll(query, documents);
+
+        return documents
+            .map((document, index) => ({document: document as T, score: scores[index]!}))
+            .sort((a, b) => b.score - a.score);
+    }
+
+    public async dispose() {
+        await this._disposeAggregator.dispose();
+    }
+
+    /** @hidden */
+    public [Symbol.asyncDispose]() {
+        return this.dispose();
+    }
+
+    public get disposed() {
+        return this._llamaContext.disposed;
+    }
+
+    public get model() {
+        return this._llamaContext.model;
+    }
+
+    /** @internal */
+    private _getEvaluationInput(query: Token[] | string | LlamaText, document: Token[] | string | LlamaText) {
+        if (this.model.tokens.bos == null || this.model.tokens.eos == null || this.model.tokens.sep == null)
+            throw new Error("Computing rankings is not supported for this model.");
+
+        const resolvedQuery = tokenizeInput(query, this._llamaContext.model.tokenizer, "trimLeadingSpace", false);
+        const resolvedDocument = tokenizeInput(document, this._llamaContext.model.tokenizer, "trimLeadingSpace", false);
+
+        if (resolvedQuery.length === 0 && resolvedDocument.length === 0)
+            return [];
+
+        const resolvedInput = [
+            this.model.tokens.bos,
+            ...resolvedQuery,
+            this.model.tokens.eos,
+            this.model.tokens.sep,
+            ...resolvedDocument,
+            this.model.tokens.eos
+        ];
+
+        return resolvedInput;
+    }
+
+    /** @internal */
+    private _evaluateRankingForInput(input: Token[]): Promise<number> {
+        return withLock(this, "evaluate", async () => {
+            await this._sequence.eraseContextTokenRanges([{
+                start: 0,
+                end: this._sequence.nextTokenIndex
+            }]);
+
+            const iterator = this._sequence.evaluate(input, {_noSampling: true});
+            // eslint-disable-next-line @typescript-eslint/no-unused-vars
+            for await (const token of iterator) {
+                break; // only generate one token to get embeddings
+            }
+
+            const embedding = this._llamaContext._ctx.getEmbedding(input.length);
+            if (embedding.length === 0)
+                return 0;
+
+            return embedding[0]!;
+        });
+    }
+
+    /** @internal */
+    public static async _create({
+        _model
+    }: {
+        _model: LlamaModel
+    }, {
+        contextSize,
+        batchSize,
+        threads = 6,
+        createSignal,
+        ignoreMemorySafetyChecks
+    }: LlamaRankingContextOptions) {
+        const tensorInfo = _model.fileInfo.tensorInfo;
+
+        if (_model.tokens.bos == null || _model.tokens.eos == null || _model.tokens.sep == null)
+            throw new Error("Computing rankings is not supported for this model.");
+
+        // source: `append_pooling` in `llama.cpp`
+        if (findLayer(tensorInfo, "cls", "weight") == null || findLayer(tensorInfo, "cls", "bias") == null)
+            throw new Error("Computing rankings is not supported for this model.");
+
+        // source: `append_pooling` in `llama.cpp`
+        if (findLayer(tensorInfo, "cls.output", "weight") != null && findLayer(tensorInfo, "cls.output", "bias") == null)
+            throw new Error("Computing rankings is not supported for this model.");
+
+        if (_model.fileInsights.hasEncoder && _model.fileInsights.hasDecoder)
+            throw new Error("Computing rankings is not supported for encoder-decoder models.");
+
+        const llamaContext = await _model.createContext({
+            contextSize,
+            batchSize,
+            threads,
+            createSignal,
+            ignoreMemorySafetyChecks,
+            _embeddings: true,
+            _ranking: true
+        });
+
+        return new LlamaRankingContext({
+            _llamaContext: llamaContext
+        });
+    }
+}
+
+function findLayer(tensorInfo: GgufTensorInfo[] | undefined, name: string, suffix: string) {
+    if (tensorInfo == null)
+        return undefined;
+
+    for (const tensor of tensorInfo) {
+        if (tensor.name === name + "." + suffix)
+            return tensor;
+    }
+
+    return undefined;
+}
diff --git a/src/index.ts b/src/index.ts
index 59c6ef49..32313eae 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -15,6 +15,7 @@ import {LlamaGrammarEvaluationState, LlamaGrammarEvaluationStateOptions} from ".
 import {LlamaContext, LlamaContextSequence} from "./evaluator/LlamaContext/LlamaContext.js";
 import {LlamaEmbeddingContext, type LlamaEmbeddingContextOptions} from "./evaluator/LlamaEmbeddingContext.js";
 import {LlamaEmbedding, type LlamaEmbeddingOptions, type LlamaEmbeddingJSON} from "./evaluator/LlamaEmbedding.js";
+import {LlamaRankingContext, type LlamaRankingContextOptions} from "./evaluator/LlamaRankingContext.js";
 import {
     type LlamaContextOptions, type SequenceEvaluateOptions, type BatchingOptions, type LlamaContextSequenceRepeatPenalty,
     type CustomBatchingDispatchSchedule, type CustomBatchingPrioritizationStrategy, type BatchItem, type PrioritizedBatchItem,
@@ -155,6 +156,8 @@ export {
     LlamaEmbedding,
     type LlamaEmbeddingOptions,
     type LlamaEmbeddingJSON,
+    LlamaRankingContext,
+    type LlamaRankingContextOptions,
     LlamaChatSession,
     defineChatSessionFunction,
     type LlamaChatSessionOptions,
diff --git a/test/modelDependent/bgeReranker/rank.test.ts b/test/modelDependent/bgeReranker/rank.test.ts
new file mode 100644
index 00000000..82fd43ff
--- /dev/null
+++ b/test/modelDependent/bgeReranker/rank.test.ts
@@ -0,0 +1,205 @@
+import {describe, expect, test} from "vitest";
+import {getModelFile} from "../../utils/modelFiles.js";
+import {getTestLlama} from "../../utils/getTestLlama.js";
+
+describe("bgeReranker", () => {
+    describe("rank", () => {
+        test("simple ranking", {timeout: 1000 * 60 * 60 * 2}, async () => {
+            const modelPath = await getModelFile("bge-reranker-v2-m3-Q8_0.gguf");
+            const llama = await getTestLlama();
+
+            const model = await llama.loadModel({
+                modelPath
+            });
+            const rankingContext = await model.createRankingContext({
+                contextSize: 512
+            });
+
+            const documents = [
+                "The sky is clear and blue today",
+                "I love eating pizza with extra cheese",
+                "Dogs love to play fetch with their owners",
+                "The capital of France is Paris",
+                "Drinking water is important for staying hydrated",
+                "Mount Everest is the tallest mountain in the world",
+                "A warm cup of tea is perfect for a cold winter day",
+                "Painting is a form of creative expression",
+                "Not all the things that shine are made of gold",
+                "Cleaning the house is a good way to keep it tidy"
+            ];
+
+            const query = "Tell me a geographical fact";
+
+            const ranks = await Promise.all(
+                documents.map((doc) => rankingContext.rank(query, doc))
+            );
+
+            const highestRank = ranks.reduce((highest, rank) => Math.max(highest, rank));
+            const highestRankIndex = ranks.indexOf(highestRank);
+
+            const highestRankDocument = documents[highestRankIndex];
+            expect(highestRankDocument).to.eql("Mount Everest is the tallest mountain in the world",);
+
+            expect(simplifyRanks([highestRank])[0]).toMatchInlineSnapshot(`-4`);
+            expect(simplifyRanks(ranks)).toMatchInlineSnapshot(`
+              [
+                -11,
+                -11,
+                -11,
+                -5.6,
+                -11,
+                -4,
+                -11,
+                -11,
+                -11,
+                -11,
+              ]
+            `);
+        });
+
+        test("rank all", {timeout: 1000 * 60 * 60 * 2}, async () => {
+            const modelPath = await getModelFile("bge-reranker-v2-m3-Q8_0.gguf");
+            const llama = await getTestLlama();
+
+            const model = await llama.loadModel({
+                modelPath
+            });
+            const rankingContext = await model.createRankingContext({
+                contextSize: 512
+            });
+
+            const documents = [
+                "The sky is clear and blue today",
+                "I love eating pizza with extra cheese",
+                "Dogs love to play fetch with their owners",
+                "The capital of France is Paris",
+                "Drinking water is important for staying hydrated",
+                "Mount Everest is the tallest mountain in the world",
+                "A warm cup of tea is perfect for a cold winter day",
+                "Painting is a form of creative expression",
+                "Not all the things that shine are made of gold",
+                "Cleaning the house is a good way to keep it tidy"
+            ];
+
+            const query = "Tell me a geographical fact";
+
+            const ranks = await rankingContext.rankAll(query, documents);
+
+            const highestRank = ranks.reduce((highest, rank) => Math.max(highest, rank));
+            const highestRankIndex = ranks.indexOf(highestRank);
+
+            const highestRankDocument = documents[highestRankIndex];
+            expect(highestRankDocument).to.eql("Mount Everest is the tallest mountain in the world",);
+
+            expect(simplifyRanks([highestRank])[0]).toMatchInlineSnapshot(`-4`);
+            expect(simplifyRanks(ranks)).toMatchInlineSnapshot(`
+              [
+                -11,
+                -11,
+                -11,
+                -5.6,
+                -11,
+                -4,
+                -11,
+                -11,
+                -11,
+                -11,
+              ]
+            `);
+        });
+
+        test("rank and sort", {timeout: 1000 * 60 * 60 * 2}, async () => {
+            const modelPath = await getModelFile("bge-reranker-v2-m3-Q8_0.gguf");
+            const llama = await getTestLlama();
+
+            const model = await llama.loadModel({
+                modelPath
+            });
+            const rankingContext = await model.createRankingContext({
+                contextSize: 512
+            });
+
+            const documents = [
+                "The sky is clear and blue today",
+                "I love eating pizza with extra cheese",
+                "Dogs love to play fetch with their owners",
+                "The capital of France is Paris",
+                "Drinking water is important for staying hydrated",
+                "Mount Everest is the tallest mountain in the world",
+                "A warm cup of tea is perfect for a cold winter day",
+                "Not all the things that shine are made of gold",
+                "Cleaning the house is a good way to keep it tidy"
+            ];
+
+            const query = "Tell me a geographical fact";
+
+            const rankedDocuments = await rankingContext.rankAndSort(query, documents);
+
+            const topDocument = rankedDocuments[0]!;
+
+            expect(topDocument.document).to.eql("Mount Everest is the tallest mountain in the world",);
+
+            expect(simplifySortedRanks([topDocument])[0]).toMatchInlineSnapshot(`
+              {
+                "document": "Mount Everest is the tallest mountain in the world",
+                "score": -4,
+              }
+            `);
+            expect(simplifySortedRanks(rankedDocuments)).toMatchInlineSnapshot(`
+              [
+                {
+                  "document": "Mount Everest is the tallest mountain in the world",
+                  "score": -4,
+                },
+                {
+                  "document": "The capital of France is Paris",
+                  "score": -5.6,
+                },
+                {
+                  "document": "Not all the things that shine are made of gold",
+                  "score": -11,
+                },
+                {
+                  "document": "I love eating pizza with extra cheese",
+                  "score": -11,
+                },
+                {
+                  "document": "Dogs love to play fetch with their owners",
+                  "score": -11,
+                },
+                {
+                  "document": "The sky is clear and blue today",
+                  "score": -11,
+                },
+                {
+                  "document": "Drinking water is important for staying hydrated",
+                  "score": -11,
+                },
+                {
+                  "document": "Cleaning the house is a good way to keep it tidy",
+                  "score": -11,
+                },
+                {
+                  "document": "A warm cup of tea is perfect for a cold winter day",
+                  "score": -11,
+                },
+              ]
+            `);
+        });
+    });
+});
+
+function simplifyRanks<const T extends number[]>(ranks: T): T {
+    return ranks.map((rank) => parseFloat(roundToPrecision(rank, 0.2).toFixed(1))) as T;
+}
+
+function simplifySortedRanks<const T extends {document: string, score: number}[]>(values: T): T {
+    return values.map((item) => ({
+        document: item.document,
+        score: parseFloat(roundToPrecision(item.score, 0.2).toFixed(1))
+    })) as T;
+}
+
+function roundToPrecision(value: number, precision: number): number {
+    return Math.round(value / precision) * precision;
+}
diff --git a/test/utils/modelFiles.ts b/test/utils/modelFiles.ts
index 0d357977..bcc6a6c0 100644
--- a/test/utils/modelFiles.ts
+++ b/test/utils/modelFiles.ts
@@ -19,7 +19,8 @@ const supportedModels = {
     "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf": "https://huggingface.co/mradermacher/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf?download=true",
     "codegemma-2b-Q4_K_M.gguf": "https://huggingface.co/bartowski/codegemma-2b-GGUF/resolve/main/codegemma-2b-Q4_K_M.gguf?download=true",
     "Llama-3.2-3B-Instruct.Q4_K_M.gguf": "https://huggingface.co/mradermacher/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct.Q4_K_M.gguf?download=true",
-    "nomic-embed-text-v1.5.Q4_K_M.gguf": "https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF/resolve/main/nomic-embed-text-v1.5.Q4_K_M.gguf?download=true"
+    "nomic-embed-text-v1.5.Q4_K_M.gguf": "https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF/resolve/main/nomic-embed-text-v1.5.Q4_K_M.gguf?download=true",
+    "bge-reranker-v2-m3-Q8_0.gguf": "https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF/resolve/main/bge-reranker-v2-m3-Q8_0.gguf?download=true"
 } as const;
 
 export async function getModelFile(modelName: keyof typeof supportedModels) {

From 38f56f4d7daa14a9c83342e4ea422498d87d1364 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Mon, 6 Jan 2025 04:33:39 +0200
Subject: [PATCH 70/73] docs: explain about reranking

---
 docs/guide/choosing-a-model.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/docs/guide/choosing-a-model.md b/docs/guide/choosing-a-model.md
index bae67a4c..6ece13ce 100644
--- a/docs/guide/choosing-a-model.md
+++ b/docs/guide/choosing-a-model.md
@@ -124,6 +124,20 @@ Here are a few concepts to be aware of when choosing a model:
   
   Many embedding models include terms like `embed` in their name.
 
+* **Reranking models** - models that are trained to rerank (sort) a list of documents
+  based on their relevance to a given query.
+  These models are usually smaller and faster than general-purpose models,
+  making them more efficient and practical for reranking tasks.
+  
+  Reranking models are often significantly smaller (sometimes as small as 500MB), faster,
+  and consume less memory than general-purpose models, making them more efficient and practical.
+
+  While general-purpose models can also be used for reranking,
+  doing this requires prompting the model, which is more cumbersome and inefficient than
+  using a specialized model with a [ranking context](./embedding.md#reranking) for this task.
+  
+  Many reranking models include terms like `rerank` or `reranker` in their name.
+
 ### How much data do you plan to feed the model at once with?
 If you plan to feed the model with a lot of data at once, you'll need a model that supports a large context size.
 The larger the context size is, the more data the model can process at once.

From bd7f6bf6a01968b58587f6279303ef28619dc62a Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Mon, 6 Jan 2025 04:37:00 +0200
Subject: [PATCH 71/73] style: lint

---
 src/evaluator/LlamaChat/LlamaChat.ts         |  1 -
 src/evaluator/LlamaRankingContext.ts         |  2 +-
 test/modelDependent/bgeReranker/rank.test.ts | 10 +++++-----
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/evaluator/LlamaChat/LlamaChat.ts b/src/evaluator/LlamaChat/LlamaChat.ts
index d76a0f14..92414d89 100644
--- a/src/evaluator/LlamaChat/LlamaChat.ts
+++ b/src/evaluator/LlamaChat/LlamaChat.ts
@@ -15,7 +15,6 @@ import {EvaluationPriority} from "../LlamaContext/types.js";
 import {maxRecentDetokenizerTokens, UNKNOWN_UNICODE_CHAR} from "../../consts.js";
 import {getQueuedTokensBeforeStopTrigger} from "../../utils/getQueuedTokensBeforeStopTrigger.js";
 import {resolveChatWrapper} from "../../chatWrappers/utils/resolveChatWrapper.js";
-import {GeneralChatWrapper} from "../../chatWrappers/GeneralChatWrapper.js";
 import {TokenBias} from "../TokenBias.js";
 import {safeEventCallback} from "../../utils/safeEventCallback.js";
 import {pushAll} from "../../utils/pushAll.js";
diff --git a/src/evaluator/LlamaRankingContext.ts b/src/evaluator/LlamaRankingContext.ts
index 5bff9505..4c9ed593 100644
--- a/src/evaluator/LlamaRankingContext.ts
+++ b/src/evaluator/LlamaRankingContext.ts
@@ -97,7 +97,7 @@ export class LlamaRankingContext {
     /**
      * Get the ranking scores for all the given documents for a query.
      */
-    public async rankAll(query: Token[] | string | LlamaText, documents: Array<Token[] | string | LlamaText>): Promise<number[]> {{}
+    public async rankAll(query: Token[] | string | LlamaText, documents: Array<Token[] | string | LlamaText>): Promise<number[]> {
         const resolvedTokens = documents.map((document) => this._getEvaluationInput(query, document));
 
         if (resolvedTokens.some((tokens) => tokens.length > this._llamaContext.contextSize))
diff --git a/test/modelDependent/bgeReranker/rank.test.ts b/test/modelDependent/bgeReranker/rank.test.ts
index 82fd43ff..0debe294 100644
--- a/test/modelDependent/bgeReranker/rank.test.ts
+++ b/test/modelDependent/bgeReranker/rank.test.ts
@@ -38,9 +38,9 @@ describe("bgeReranker", () => {
             const highestRankIndex = ranks.indexOf(highestRank);
 
             const highestRankDocument = documents[highestRankIndex];
-            expect(highestRankDocument).to.eql("Mount Everest is the tallest mountain in the world",);
+            expect(highestRankDocument).to.eql("Mount Everest is the tallest mountain in the world");
 
-            expect(simplifyRanks([highestRank])[0]).toMatchInlineSnapshot(`-4`);
+            expect(simplifyRanks([highestRank])[0]).toMatchInlineSnapshot("-4");
             expect(simplifyRanks(ranks)).toMatchInlineSnapshot(`
               [
                 -11,
@@ -89,9 +89,9 @@ describe("bgeReranker", () => {
             const highestRankIndex = ranks.indexOf(highestRank);
 
             const highestRankDocument = documents[highestRankIndex];
-            expect(highestRankDocument).to.eql("Mount Everest is the tallest mountain in the world",);
+            expect(highestRankDocument).to.eql("Mount Everest is the tallest mountain in the world");
 
-            expect(simplifyRanks([highestRank])[0]).toMatchInlineSnapshot(`-4`);
+            expect(simplifyRanks([highestRank])[0]).toMatchInlineSnapshot("-4");
             expect(simplifyRanks(ranks)).toMatchInlineSnapshot(`
               [
                 -11,
@@ -137,7 +137,7 @@ describe("bgeReranker", () => {
 
             const topDocument = rankedDocuments[0]!;
 
-            expect(topDocument.document).to.eql("Mount Everest is the tallest mountain in the world",);
+            expect(topDocument.document).to.eql("Mount Everest is the tallest mountain in the world");
 
             expect(simplifySortedRanks([topDocument])[0]).toMatchInlineSnapshot(`
               {

From 55f5b26ca24da2fca5bbcdb24d69a9f445b03798 Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Tue, 7 Jan 2025 00:25:42 +0200
Subject: [PATCH 72/73] test: fix tests

---
 test/modelDependent/bgeReranker/rank.test.ts | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/test/modelDependent/bgeReranker/rank.test.ts b/test/modelDependent/bgeReranker/rank.test.ts
index 0debe294..5966b214 100644
--- a/test/modelDependent/bgeReranker/rank.test.ts
+++ b/test/modelDependent/bgeReranker/rank.test.ts
@@ -124,7 +124,6 @@ describe("bgeReranker", () => {
                 "I love eating pizza with extra cheese",
                 "Dogs love to play fetch with their owners",
                 "The capital of France is Paris",
-                "Drinking water is important for staying hydrated",
                 "Mount Everest is the tallest mountain in the world",
                 "A warm cup of tea is perfect for a cold winter day",
                 "Not all the things that shine are made of gold",
@@ -171,10 +170,6 @@ describe("bgeReranker", () => {
                   "document": "The sky is clear and blue today",
                   "score": -11,
                 },
-                {
-                  "document": "Drinking water is important for staying hydrated",
-                  "score": -11,
-                },
                 {
                   "document": "Cleaning the house is a good way to keep it tidy",
                   "score": -11,

From 12bc47af31790c8763d43ef483315a6d4b0c81ae Mon Sep 17 00:00:00 2001
From: Gilad S <giladgd@gmail.com>
Date: Tue, 7 Jan 2025 00:59:34 +0200
Subject: [PATCH 73/73] fix: adapt to breaking `llama.cpp` changes

---
 llama/addon/AddonModel.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llama/addon/AddonModel.cpp b/llama/addon/AddonModel.cpp
index 6ddf9705..780a94d5 100644
--- a/llama/addon/AddonModel.cpp
+++ b/llama/addon/AddonModel.cpp
@@ -92,13 +92,13 @@ class AddonModelLoadModelWorker : public Napi::AsyncWorker {
 
         void Execute() {
             try {
-                model->model = llama_load_model_from_file(model->modelPath.c_str(), model->model_params);
+                model->model = llama_model_load_from_file(model->modelPath.c_str(), model->model_params);
 
                 model->modelLoaded = model->model != nullptr && model->model != NULL;
             } catch (const std::exception& e) {
                 SetError(e.what());
             } catch(...) {
-                SetError("Unknown error when calling \"llama_load_model_from_file\"");
+                SetError("Unknown error when calling \"llama_model_load_from_file\"");
             }
         }
         void OnOK() {
@@ -141,14 +141,14 @@ class AddonModelUnloadModelWorker : public Napi::AsyncWorker {
 
         void Execute() {
             try {
-                llama_free_model(model->model);
+                llama_model_free(model->model);
                 model->modelLoaded = false;
 
                 model->dispose();
             } catch (const std::exception& e) {
                 SetError(e.what());
             } catch(...) {
-                SetError("Unknown error when calling \"llama_free_model\"");
+                SetError("Unknown error when calling \"llama_model_free\"");
             }
         }
         void OnOK() {
@@ -359,7 +359,7 @@ void AddonModel::dispose() {
     disposed = true;
     if (modelLoaded) {
         modelLoaded = false;
-        llama_free_model(model);
+        llama_model_free(model);
 
         adjustNapiExternalMemorySubtract(Env(), loadedModelSize);
         loadedModelSize = 0;