diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index 004e4b94..4e2032b4 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -35,11 +35,10 @@ body: id: steps attributes: label: Steps to reproduce - description: >- + description: |- Your bug can be investigated much faster if your code can be run without any dependencies other than `node-llama-cpp`. Issues without reproduction steps or code examples may be closed as not actionable. - Please try to provide a Minimal, Complete, and Verifiable example ([link](http://stackoverflow.com/help/mcve)). - Please include a link to the model file you used if possible. + Please try to provide a Minimal, Complete, and Verifiable example ([link](http://stackoverflow.com/help/mcve)), including a link to the model file you used if possible. Also, please enable debug logs by using `getLlama({debug: true})` to get more information. placeholder: >- Please try to provide a Minimal, Complete, and Verifiable example. @@ -50,10 +49,9 @@ body: id: env attributes: label: My Environment - description: >- + description: |- Please include the result of the command `npx --yes node-llama-cpp inspect gpu`. - Please also add any other relevant dependencies to this table at the end. - For example: Electron, Bun, Webpack. + Please also add any other relevant dependencies to this table at the end. For example: Electron, Bun, Webpack. value: | | Dependency | Version | | --- | --- | diff --git a/.github/ISSUE_TEMPLATE/documentation-issue.yml b/.github/ISSUE_TEMPLATE/documentation-issue.yml index 118756bd..53e74a4f 100644 --- a/.github/ISSUE_TEMPLATE/documentation-issue.yml +++ b/.github/ISSUE_TEMPLATE/documentation-issue.yml @@ -13,7 +13,7 @@ body: id: details attributes: label: What was unclear or otherwise insufficient? - description: >- + description: |- If relevant, please be clear about the documentation URL, as well as the location within the page. Add a link to the relevant documentation you're referring to.
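As a usage note for the updated bug report template above, enabling the debug logs it asks for is a one-liner; a minimal sketch using the `debug` option referenced in the template text:

```typescript
import {getLlama} from "node-llama-cpp";

// print llama.cpp logs directly to the console to collect more information for a bug report
const llama = await getLlama({debug: true});
```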
placeholder: >- diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml index 59ec39fd..c0ec58c2 100644 --- a/.github/ISSUE_TEMPLATE/feature-request.yml +++ b/.github/ISSUE_TEMPLATE/feature-request.yml @@ -51,8 +51,12 @@ body: required: false - label: CUDA support required: false + - label: Vulkan support + required: false - label: Grammar required: false + - label: Function calling + required: false - type: dropdown id: pr attributes: diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index bfe9a3e4..2814133f 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -383,7 +383,7 @@ jobs: model-dependent-tests: name: Model dependent tests - runs-on: macos-13 + runs-on: macos-12 env: NODE_LLAMA_CPP_GPU: false needs: @@ -417,6 +417,9 @@ jobs: - name: Build binary run: node ./dist/cli/cli.js source build --noUsageExample + - name: Inspect hardware + run: node ./dist/cli/cli.js inspect gpu + - name: Cache models id: cache-test-models uses: actions/cache@v4 diff --git a/.vitepress/config.ts b/.vitepress/config.ts index 422b79a2..b51b0940 100644 --- a/.vitepress/config.ts +++ b/.vitepress/config.ts @@ -34,7 +34,8 @@ const packageVersion = env.get("DOCS_PACKAGE_VERSION") .default(packageJson.version) .asString(); -const hostname = "https://node-llama-cpp.withcat.ai/"; +const hostname = "https://node-llama-cpp.withcat.ai/" +const buildDate = new Date(); const socialPosterLink = hostname + "social.poster.jpg"; const defaultPageTitle = "node-llama-cpp - node.js bindings for llama.cpp"; @@ -90,7 +91,7 @@ export default defineConfig({ base: urlBase, sitemap: { hostname, - transformItems(items) { + async transformItems(items) { function priorityMatch(a: {url: string}, b: {url: string}, matchers: ((url: string) => boolean)[]): number { for (const matcher of matchers) { const aMatch = matcher(a.url); @@ -105,13 +106,38 @@ export default defineConfig({ return 0; } + const blogPosts = await createContentLoader("blog/*.md", { + excerpt: true, + render: true + }) + .load(); + const blogPostMap = new Map(); + for (const blogPost of blogPosts) { + let url = blogPost.url; + if (url.startsWith("/")) + url = url.slice("/".length); + + blogPostMap.set(url, blogPost); + } + return items .map((item) => { - if (item.url.startsWith("api/") || item.url.startsWith("cli/")) { + if (item.url === "" || item.url === "blog/") { + item.lastmod = new Date(buildDate); + } else if (item.url.startsWith("api/") || item.url.startsWith("cli/")) { item = { ...item, - lastmod: undefined + lastmod: new Date(buildDate) }; + } else if (item.lastmod == null && item.url.startsWith("blog/")) { + const postDate = blogPostMap.get(item.url)?.frontmatter.date; + if (postDate != null) { + const parsedDate = new Date(postDate); + if (Number.isFinite(parsedDate.getTime())) + item.lastmod = parsedDate; + } + } else if (item.lastmod == null) { + item.lastmod = new Date(buildDate); } return item; diff --git a/README.md b/README.md index faacd427..569f7990 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@
- node-llama-cpp Logo + node-llama-cpp Logo

node-llama-cpp

Run AI models locally on your machine

Pre-built bindings are provided with a fallback to building from source with cmake diff --git a/docs/guide/chat-session.md b/docs/guide/chat-session.md index 3f8c3cb5..dce8ecd1 100644 --- a/docs/guide/chat-session.md +++ b/docs/guide/chat-session.md @@ -671,3 +671,34 @@ await new Promise(resolve => setTimeout(resolve, 1500)); const cachedCompletion = completionEngine.complete("Hi there! How"); console.log("Cached completion:", cachedCompletion); ``` + +## Response Prefix {#response-prefix} +You can force the model response to start with a specific prefix, +to make the model follow a certain direction in its response. + +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import {getLlama, LlamaChatSession, GeneralChatWrapper} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf") +}); +const context = await model.createContext(); +const session = new LlamaChatSession({ + contextSequence: context.getSequence(), + chatWrapper: new GeneralChatWrapper() +}); + + +const q1 = "Hi there, how are you?"; +console.log("User: " + q1); + +const a1 = await session.prompt(q1, { + responsePrefix: "The weather today is" +}); +console.log("AI: " + a1); +``` diff --git a/docs/guide/electron.md b/docs/guide/electron.md index 1e2204c8..dc1fc2aa 100644 --- a/docs/guide/electron.md +++ b/docs/guide/electron.md @@ -37,3 +37,27 @@ so that `node-llama-cpp` can find them. Cross packaging from one platform to another is not supported, since binaries for other platforms are not downloaded to your machine when you run `npm install`. Packaging an `arm64` app on an `x64` machine is supported, but packaging an `x64` app on an `arm64` machine is not. + +## Bundling +When bundling your code for Electron using [Electron Vite](https://electron-vite.org) or Webpack, +ensure that `node-llama-cpp` is not bundled, and is instead treated as an external module. + +Marking `node-llama-cpp` as an external module will prevent its code from being bundled with your application code, +and instead, it'll be loaded from the `node_modules` directory at runtime (which should be packed into a `.asar` archive). + +The file structure of `node-llama-cpp` is crucial for it to function correctly, +so bundling it will break its functionality. +Moreover, since `node-llama-cpp` includes prebuilt binaries (and also local builds from source), +those files must be retained in their original structure for it to work. + +Electron has [its own bundling solution called ASAR](https://www.electronjs.org/docs/latest/tutorial/asar-archives) that is designed to work with node modules. +ASAR retains the original file structure of node modules by packing all the files into a single `.asar` archive file that Electron will read from at runtime like it would from the file system. +This method ensures node modules work as intended in Electron applications, even though they are bundled into a single file. + +Using ASAR is the recommended way to bundle `node-llama-cpp` in your Electron app. + +If you're using the scaffolded Electron app, this is already taken care of. + +::: tip NOTE +We recommend using [Electron Vite](https://electron-vite.org) over Webpack for your Electron app due to Vite's speed and Webpack's lack of proper ESM support in the output bundle, which complicates the bundling process.
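To make the bundling guidance above concrete, here is a minimal configuration sketch that treats `node-llama-cpp` as an external module when bundling with Electron Vite; the file name and config shape follow standard Electron Vite/Rollup conventions and are not taken from this diff:

```typescript
// electron.vite.config.ts - a minimal sketch, assuming a standard Electron Vite setup
import {defineConfig} from "electron-vite";

export default defineConfig({
    main: {
        build: {
            rollupOptions: {
                // keep node-llama-cpp out of the bundle so it's loaded from
                // node_modules (packed into the .asar archive) at runtime
                external: ["node-llama-cpp"]
            }
        }
    }
});
```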
+::: diff --git a/docs/guide/tips-and-tricks.md b/docs/guide/tips-and-tricks.md index d8d1eea6..df3949e3 100644 --- a/docs/guide/tips-and-tricks.md +++ b/docs/guide/tips-and-tricks.md @@ -85,3 +85,37 @@ npx --no node-llama-cpp source download ``` Now, just use `node-llama-cpp` as you normally would. + +## Intel AMX {#intel-amx} +> Intel AMX (Advanced Matrix Extensions) is a dedicated hardware block found on Intel Xeon processors +> that helps optimize and accelerate matrix multiplication operations. +> +> It's available on the 4th Gen and newer Intel Xeon processors. + +Intel AMX can improve CPU inference performance, providing [2x and up to even 14x](https://github.com/ggerganov/llama.cpp/pull/7707) faster inference times on supported CPUs (under specific conditions). + +If you're using a 4th Gen or newer Intel Xeon processor, +you might want to [build `llama.cpp` from source](./building-from-source.md) to utilize these hardware-specific optimizations. + +To do this, run this command inside your project on the machine you run your project on: +```shell +npx --no node-llama-cpp source download +``` + +Alternatively, you can force `node-llama-cpp` to not use its prebuilt binaries +and instead build from source when calling [`getLlama`](../api/functions/getLlama.md) for the first time on a Xeon CPU: + +```typescript +import os from "os"; +import {getLlama} from "node-llama-cpp"; + +const llama = await getLlama({ + usePrebuiltBinaries: !os.cpus().some((cpu) => ( + cpu.model.toLowerCase().includes("Xeon".toLowerCase()) + )) +}); +``` +::: info NOTE +Building from source can take some time (when using CUDA even up to an hour in extreme cases), +so ensure you dedicate some time for this as part of the deployment process. +::: diff --git a/llama/CMakeLists.txt b/llama/CMakeLists.txt index fc7c5504..90a424c2 100644 --- a/llama/CMakeLists.txt +++ b/llama/CMakeLists.txt @@ -22,6 +22,12 @@ execute_process(COMMAND node -p "require('node-addon-api').include.slice(1,-1)" OUTPUT_VARIABLE NODE_ADDON_API_DIR OUTPUT_STRIP_TRAILING_WHITESPACE) +set(LLAMA_BUILD_COMMON ON) + +if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + add_compile_options(-Wno-c++17-extensions) +endif() + include_directories(${NODE_ADDON_API_DIR} ${CMAKE_JS_INC}) add_subdirectory("llama.cpp") diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp index 93cbe413..21ccab8c 100644 --- a/llama/addon/AddonContext.cpp +++ b/llama/addon/AddonContext.cpp @@ -447,7 +447,7 @@ Napi::Value AddonContext::AddToBatch(const Napi::CallbackInfo& info) { GGML_ASSERT(batch.n_tokens + tokensLength <= batch_n_tokens); for (size_t i = 0; i < tokensLength; i++) { - llama_batch_add(batch, static_cast<llama_token>(tokens[i]), firstTokenContextIndex + i, { sequenceId }, false); + common_batch_add(batch, static_cast<llama_token>(tokens[i]), firstTokenContextIndex + i, { sequenceId }, false); } if (generateLogitAtTheEnd) { diff --git a/llama/addon/AddonModel.cpp b/llama/addon/AddonModel.cpp index 27340fa4..ec73c45b 100644 --- a/llama/addon/AddonModel.cpp +++ b/llama/addon/AddonModel.cpp @@ -426,7 +426,7 @@ Napi::Value AddonModel::Tokenize(const Napi::CallbackInfo& info) { std::string text = info[0].As<Napi::String>().Utf8Value(); bool specialTokens = info[1].As<Napi::Boolean>().Value(); - std::vector<llama_token> tokens = llama_tokenize(model, text, false, specialTokens); + std::vector<llama_token> tokens = common_tokenize(model, text, false, specialTokens); Napi::Uint32Array result = Napi::Uint32Array::New(info.Env(), tokens.size()); for (size_t i = 0; i < tokens.size(); ++i)
{ @@ -539,7 +539,7 @@ Napi::Value AddonModel::PrefixToken(const Napi::CallbackInfo& info) { return info.Env().Undefined(); } - return getNapiToken(info, model, llama_token_prefix(model)); + return getNapiToken(info, model, llama_token_fim_pre(model)); } Napi::Value AddonModel::MiddleToken(const Napi::CallbackInfo& info) { if (disposed) { @@ -547,7 +547,7 @@ Napi::Value AddonModel::MiddleToken(const Napi::CallbackInfo& info) { return info.Env().Undefined(); } - return getNapiToken(info, model, llama_token_middle(model)); + return getNapiToken(info, model, llama_token_fim_mid(model)); } Napi::Value AddonModel::SuffixToken(const Napi::CallbackInfo& info) { if (disposed) { @@ -555,7 +555,7 @@ Napi::Value AddonModel::SuffixToken(const Napi::CallbackInfo& info) { return info.Env().Undefined(); } - return getNapiToken(info, model, llama_token_suffix(model)); + return getNapiToken(info, model, llama_token_fim_suf(model)); } Napi::Value AddonModel::EotToken(const Napi::CallbackInfo& info) { if (disposed) { diff --git a/llama/addon/AddonSampler.cpp b/llama/addon/AddonSampler.cpp index 89d0b075..d84160d7 100644 --- a/llama/addon/AddonSampler.cpp +++ b/llama/addon/AddonSampler.cpp @@ -52,11 +52,6 @@ void AddonSampler::dispose() { topPSampler = nullptr; } - if (softmaxSampler != nullptr) { - llama_sampler_free(softmaxSampler); - softmaxSampler = nullptr; - } - if (seedSampler != nullptr) { llama_sampler_free(seedSampler); seedSampler = nullptr; @@ -135,10 +130,6 @@ void AddonSampler::rebuildChainIfNeeded() { llama_sampler_chain_add(chain, temperatureSampler); } - if (softmaxSampler != nullptr) { - llama_sampler_chain_add(chain, softmaxSampler); - } - if (seedSampler != nullptr) { llama_sampler_chain_add(chain, seedSampler); } @@ -206,10 +197,6 @@ Napi::Value AddonSampler::ApplyConfig(const Napi::CallbackInfo& info) { } } - if (softmaxSampler == nullptr) { - softmaxSampler = llama_sampler_init_softmax(); - } - if (config.Has("minP")) { auto minP = config.Get("minP").As().FloatValue(); if (minP != minPSampler_minP) { diff --git a/llama/addon/AddonSampler.h b/llama/addon/AddonSampler.h index 942d03d2..33114b49 100644 --- a/llama/addon/AddonSampler.h +++ b/llama/addon/AddonSampler.h @@ -25,8 +25,6 @@ class AddonSampler : public Napi::ObjectWrap { llama_sampler * topPSampler = nullptr; float topPSampler_topP = 0.0f; // Top p sampling >=1.0 = disabled - - llama_sampler * softmaxSampler = nullptr; llama_sampler * seedSampler = nullptr; uint32_t seedSampler_seed = 0; diff --git a/llama/addon/addon.cpp b/llama/addon/addon.cpp index 16393618..5c2d1c52 100644 --- a/llama/addon/addon.cpp +++ b/llama/addon/addon.cpp @@ -8,6 +8,7 @@ #include "globals/addonLog.h" #include "globals/addonProgress.h" #include "globals/getGpuInfo.h" +#include "globals/getSwapInfo.h" bool backendInitialized = false; bool backendDisposed = false; @@ -203,6 +204,7 @@ Napi::Object registerCallback(Napi::Env env, Napi::Object exports) { Napi::PropertyDescriptor::Function("getGpuVramInfo", getGpuVramInfo), Napi::PropertyDescriptor::Function("getGpuDeviceInfo", getGpuDeviceInfo), Napi::PropertyDescriptor::Function("getGpuType", getGpuType), + Napi::PropertyDescriptor::Function("getSwapInfo", getSwapInfo), Napi::PropertyDescriptor::Function("init", addonInit), Napi::PropertyDescriptor::Function("dispose", addonDispose), }); diff --git a/llama/addon/globals/getGpuInfo.cpp b/llama/addon/globals/getGpuInfo.cpp index f3a67185..ef51c1cd 100644 --- a/llama/addon/globals/getGpuInfo.cpp +++ b/llama/addon/globals/getGpuInfo.cpp @@ -26,6 +26,7 @@ void 
logVulkanWarning(const char* message) { Napi::Value getGpuVramInfo(const Napi::CallbackInfo& info) { uint64_t total = 0; uint64_t used = 0; + uint64_t unifiedVramSize = 0; #ifdef GPU_INFO_USE_CUDA size_t cudaDeviceTotal = 0; @@ -41,26 +42,31 @@ Napi::Value getGpuVramInfo(const Napi::CallbackInfo& info) { #ifdef GPU_INFO_USE_VULKAN uint64_t vulkanDeviceTotal = 0; uint64_t vulkanDeviceUsed = 0; - const bool vulkanDeviceSupportsMemoryBudgetExtension = gpuInfoGetTotalVulkanDevicesInfo(&vulkanDeviceTotal, &vulkanDeviceUsed, logVulkanWarning); + uint64_t vulkanDeviceUnifiedVramSize = 0; + const bool vulkanDeviceSupportsMemoryBudgetExtension = gpuInfoGetTotalVulkanDevicesInfo(&vulkanDeviceTotal, &vulkanDeviceUsed, &vulkanDeviceUnifiedVramSize, logVulkanWarning); if (vulkanDeviceSupportsMemoryBudgetExtension) { total += vulkanDeviceTotal; used += vulkanDeviceUsed; + unifiedVramSize += vulkanDeviceUnifiedVramSize; } #endif #ifdef GPU_INFO_USE_METAL uint64_t metalDeviceTotal = 0; uint64_t metalDeviceUsed = 0; - getMetalGpuInfo(&metalDeviceTotal, &metalDeviceUsed); + uint64_t metalDeviceUnifiedVramSize = 0; + getMetalGpuInfo(&metalDeviceTotal, &metalDeviceUsed, &metalDeviceUnifiedVramSize); total += metalDeviceTotal; used += metalDeviceUsed; + unifiedVramSize += metalDeviceUnifiedVramSize; #endif Napi::Object result = Napi::Object::New(info.Env()); result.Set("total", Napi::Number::From(info.Env(), total)); result.Set("used", Napi::Number::From(info.Env(), used)); + result.Set("unifiedSize", Napi::Number::From(info.Env(), unifiedVramSize)); return result; } diff --git a/llama/addon/globals/getSwapInfo.cpp b/llama/addon/globals/getSwapInfo.cpp new file mode 100644 index 00000000..bae94612 --- /dev/null +++ b/llama/addon/globals/getSwapInfo.cpp @@ -0,0 +1,69 @@ +#include "getSwapInfo.h" +#include "addonLog.h" + +#ifdef __APPLE__ +#include +#include +#include +#elif __linux__ +#include +#include +#elif _WIN32 +#include +#include +#include +#endif + + +Napi::Value getSwapInfo(const Napi::CallbackInfo& info) { + uint64_t totalSwap = 0; + uint64_t freeSwap = 0; + uint64_t maxSize = 0; + bool maxSizeSet = true; + +#ifdef __APPLE__ + struct xsw_usage swapInfo; + size_t size = sizeof(swapInfo); + + if (sysctlbyname("vm.swapusage", &swapInfo, &size, NULL, 0) == 0) { + totalSwap = swapInfo.xsu_total; + freeSwap = swapInfo.xsu_avail; + maxSizeSet = false; + } else { + addonLlamaCppLogCallback(GGML_LOG_LEVEL_ERROR, std::string("Failed to get swap info").c_str(), nullptr); + } +#elif __linux__ + struct sysinfo sysInfo; + + if (sysinfo(&sysInfo) == 0) { + totalSwap = sysInfo.totalswap; + freeSwap = sysInfo.freeswap; + maxSize = sysInfo.totalswap; + } else { + addonLlamaCppLogCallback(GGML_LOG_LEVEL_ERROR, std::string("Failed to get swap info").c_str(), nullptr); + } +#elif _WIN32 + MEMORYSTATUSEX memInfo; + memInfo.dwLength = sizeof(MEMORYSTATUSEX); + + if (GlobalMemoryStatusEx(&memInfo)) { + PERFORMANCE_INFORMATION perfInfo; + perfInfo.cb = sizeof(PERFORMANCE_INFORMATION); + if (GetPerformanceInfo(&perfInfo, sizeof(perfInfo))) { + totalSwap = memInfo.ullTotalPageFile; + freeSwap = memInfo.ullAvailPageFile; + maxSize = perfInfo.CommitLimit * perfInfo.PageSize; + } else { + addonLlamaCppLogCallback(GGML_LOG_LEVEL_ERROR, std::string("Failed to get max pagefile size").c_str(), nullptr); + } + } else { + addonLlamaCppLogCallback(GGML_LOG_LEVEL_ERROR, std::string("Failed to get pagefile info").c_str(), nullptr); + } +#endif + + Napi::Object obj = Napi::Object::New(info.Env()); + obj.Set("total", 
Napi::Number::New(info.Env(), totalSwap)); + obj.Set("free", Napi::Number::New(info.Env(), freeSwap)); + obj.Set("maxSize", maxSizeSet ? Napi::Number::New(info.Env(), maxSize) : Napi::Number::New(info.Env(), -1)); + return obj; +} diff --git a/llama/addon/globals/getSwapInfo.h b/llama/addon/globals/getSwapInfo.h new file mode 100644 index 00000000..dd265c60 --- /dev/null +++ b/llama/addon/globals/getSwapInfo.h @@ -0,0 +1,4 @@ +#pragma once +#include "napi.h" + +Napi::Value getSwapInfo(const Napi::CallbackInfo& info); diff --git a/llama/gpuInfo/metal-gpu-info.h b/llama/gpuInfo/metal-gpu-info.h index 30056ce7..9a199bee 100644 --- a/llama/gpuInfo/metal-gpu-info.h +++ b/llama/gpuInfo/metal-gpu-info.h @@ -4,5 +4,5 @@ #include #include -void getMetalGpuInfo(uint64_t * total, uint64_t * used); +void getMetalGpuInfo(uint64_t * total, uint64_t * used, uint64_t * unifiedMemorySize); void getMetalGpuDeviceNames(std::vector * deviceNames); \ No newline at end of file diff --git a/llama/gpuInfo/metal-gpu-info.mm b/llama/gpuInfo/metal-gpu-info.mm index 7bfd6bce..46ac0b18 100644 --- a/llama/gpuInfo/metal-gpu-info.mm +++ b/llama/gpuInfo/metal-gpu-info.mm @@ -3,15 +3,22 @@ #include #import -void getMetalGpuInfo(uint64_t * total, uint64_t * used) { +void getMetalGpuInfo(uint64_t * total, uint64_t * used, uint64_t * unifiedMemorySize) { id device = MTLCreateSystemDefaultDevice(); if (device) { *total = device.recommendedMaxWorkingSetSize; *used = device.currentAllocatedSize; + + if (device.hasUnifiedMemory) { + *unifiedMemorySize = device.recommendedMaxWorkingSetSize; + } else { + *unifiedMemorySize = 0; + } } else { *total = 0; *used = 0; + *unifiedMemorySize = 0; } [device release]; diff --git a/llama/gpuInfo/vulkan-gpu-info.cpp b/llama/gpuInfo/vulkan-gpu-info.cpp index 0b9a6556..25356546 100644 --- a/llama/gpuInfo/vulkan-gpu-info.cpp +++ b/llama/gpuInfo/vulkan-gpu-info.cpp @@ -5,7 +5,7 @@ typedef void (*gpuInfoVulkanWarningLogCallback_t)(const char* message); -static bool enumerateVulkanDevices(size_t* total, size_t* used, bool addDeviceNames, std::vector * deviceNames, gpuInfoVulkanWarningLogCallback_t warningLogCallback) { +static bool enumerateVulkanDevices(size_t* total, size_t* used, size_t* unifiedMemorySize, bool addDeviceNames, std::vector * deviceNames, gpuInfoVulkanWarningLogCallback_t warningLogCallback) { vk::ApplicationInfo appInfo("node-llama-cpp GPU info", 1, "llama.cpp", 1, VK_API_VERSION_1_2); vk::InstanceCreateInfo createInfo(vk::InstanceCreateFlags(), &appInfo, {}, {}); vk::Instance instance = vk::createInstance(createInfo); @@ -14,6 +14,7 @@ static bool enumerateVulkanDevices(size_t* total, size_t* used, bool addDeviceNa size_t usedMem = 0; size_t totalMem = 0; + size_t totalUnifiedMemorySize = 0; for (size_t i = 0; i < physicalDevices.size(); i++) { vk::PhysicalDevice physicalDevice = physicalDevices[i]; @@ -41,16 +42,20 @@ static bool enumerateVulkanDevices(size_t* total, size_t* used, bool addDeviceNa physicalDevice.getMemoryProperties2(&memProps2); for (uint32_t i = 0; i < memProps.memoryHeapCount; ++i) { - if (memProps.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) { + const auto flags = memProps.memoryHeaps[i].flags; + + if (flags & vk::MemoryHeapFlagBits::eDeviceLocal) { const auto size = memProps.memoryHeaps[i].size; totalMem += size; usedMem += memoryBudgetProperties.heapUsage[i]; + if (flags & vk::MemoryHeapFlagBits::eMultiInstance) { + totalUnifiedMemorySize += size; + } + if (size > 0 && addDeviceNames) { 
(*deviceNames).push_back(std::string(deviceProps.deviceName.data())); } - - break; } } } else { @@ -58,9 +63,8 @@ static bool enumerateVulkanDevices(size_t* total, size_t* used, bool addDeviceNa warningLogCallback( ( "Vulkan VK_EXT_memory_budget extension not supported for device \"" + - std::string(deviceProps.deviceName.data()) + "\", so VRAM info cannot be determained for it" - ) - .c_str() + std::string(deviceProps.deviceName.data()) + "\", so VRAM info cannot be determined for it" + ).c_str() ); return false; } @@ -68,16 +72,19 @@ static bool enumerateVulkanDevices(size_t* total, size_t* used, bool addDeviceNa *total = totalMem; *used = usedMem; + *unifiedMemorySize = totalUnifiedMemorySize; + return true; } -bool gpuInfoGetTotalVulkanDevicesInfo(size_t* total, size_t* used, gpuInfoVulkanWarningLogCallback_t warningLogCallback) { - return enumerateVulkanDevices(total, used, false, nullptr, warningLogCallback); +bool gpuInfoGetTotalVulkanDevicesInfo(size_t* total, size_t* used, size_t* unifiedMemorySize, gpuInfoVulkanWarningLogCallback_t warningLogCallback) { + return enumerateVulkanDevices(total, used, unifiedMemorySize, false, nullptr, warningLogCallback); } bool gpuInfoGetVulkanDeviceNames(std::vector * deviceNames, gpuInfoVulkanWarningLogCallback_t warningLogCallback) { size_t vulkanDeviceTotal = 0; size_t vulkanDeviceUsed = 0; + size_t unifiedMemorySize = 0; - return enumerateVulkanDevices(&vulkanDeviceTotal, &vulkanDeviceUsed, true, deviceNames, warningLogCallback); + return enumerateVulkanDevices(&vulkanDeviceTotal, &vulkanDeviceUsed, &unifiedMemorySize, true, deviceNames, warningLogCallback); } diff --git a/llama/gpuInfo/vulkan-gpu-info.h b/llama/gpuInfo/vulkan-gpu-info.h index d2457f10..f8eb0527 100644 --- a/llama/gpuInfo/vulkan-gpu-info.h +++ b/llama/gpuInfo/vulkan-gpu-info.h @@ -5,5 +5,5 @@ typedef void (*gpuInfoVulkanWarningLogCallback_t)(const char* message); -bool gpuInfoGetTotalVulkanDevicesInfo(size_t* total, size_t* used, gpuInfoVulkanWarningLogCallback_t warningLogCallback); +bool gpuInfoGetTotalVulkanDevicesInfo(size_t* total, size_t* used, size_t* unifiedMemorySize, gpuInfoVulkanWarningLogCallback_t warningLogCallback); bool gpuInfoGetVulkanDeviceNames(std::vector * deviceNames, gpuInfoVulkanWarningLogCallback_t warningLogCallback); \ No newline at end of file diff --git a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts index 2422c16b..891d9df4 100644 --- a/src/bindings/AddonTypes.ts +++ b/src/bindings/AddonTypes.ts @@ -63,12 +63,18 @@ export type BindingModule = { setLoggerLogLevel(level: number): void, getGpuVramInfo(): { total: number, - used: number + used: number, + unifiedSize: number }, getGpuDeviceInfo(): { deviceNames: string[] }, getGpuType(): "cuda" | "vulkan" | "metal" | undefined, + getSwapInfo(): { + total: number, + maxSize: number, + free: number + }, init(): Promise, dispose(): Promise }; diff --git a/src/bindings/Llama.ts b/src/bindings/Llama.ts index 50831cc9..a30395a2 100644 --- a/src/bindings/Llama.ts +++ b/src/bindings/Llama.ts @@ -1,3 +1,4 @@ +import os from "os"; import chalk from "chalk"; import {DisposedError, EventRelay, withLock} from "lifecycle-utils"; import {getConsoleLogPrefix} from "../utils/getConsoleLogPrefix.js"; @@ -34,6 +35,9 @@ export class Llama { /** @internal */ public readonly _consts: ReturnType; /** @internal */ public readonly _vramOrchestrator: MemoryOrchestrator; /** @internal */ public readonly _vramPadding: MemoryReservation; + /** @internal */ public readonly _ramOrchestrator: MemoryOrchestrator; + /** 
@internal */ public readonly _ramPadding: MemoryReservation; + /** @internal */ public readonly _swapOrchestrator: MemoryOrchestrator; /** @internal */ public readonly _debug: boolean; /** @internal */ public readonly _threadsSplitter: ThreadsSplitter; /** @internal */ private readonly _gpu: LlamaGpuType; @@ -61,7 +65,8 @@ export class Llama { public readonly onDispose = new EventRelay(); private constructor({ - bindings, logLevel, logger, buildType, cmakeOptions, llamaCppRelease, debug, gpu, maxThreads, vramOrchestrator, vramPadding + bindings, logLevel, logger, buildType, cmakeOptions, llamaCppRelease, debug, gpu, maxThreads, vramOrchestrator, vramPadding, + ramOrchestrator, ramPadding, swapOrchestrator }: { bindings: BindingModule, logLevel: LlamaLogLevel, @@ -76,7 +81,10 @@ export class Llama { gpu: BuildGpu, maxThreads?: number, vramOrchestrator: MemoryOrchestrator, - vramPadding: MemoryReservation + vramPadding: MemoryReservation, + ramOrchestrator: MemoryOrchestrator, + ramPadding: MemoryReservation, + swapOrchestrator: MemoryOrchestrator }) { this._bindings = bindings; this._gpu = gpu; @@ -88,6 +96,9 @@ export class Llama { this._debug = debug; this._vramOrchestrator = vramOrchestrator; this._vramPadding = vramPadding; + this._ramOrchestrator = ramOrchestrator; + this._ramPadding = ramPadding; + this._swapOrchestrator = swapOrchestrator; this._threadsSplitter = new ThreadsSplitter( maxThreads ?? ( this._gpu === false @@ -235,15 +246,60 @@ export class Llama { return this._vramPadding.size; } + /** + * The total amount of VRAM that is currently being used. + * + * `unifiedSize` represents the amount of VRAM that is shared between the CPU and GPU. + * On SoC devices, this is usually the same as `total`. + */ public async getVramState() { this._ensureNotDisposed(); - const {total, used} = this._bindings.getGpuVramInfo(); + const {total, used, unifiedSize} = this._bindings.getGpuVramInfo(); return { total, used, - free: Math.max(0, total - used) + free: Math.max(0, total - used), + unifiedSize + }; + } + + /** + * Get the state of the swap memory. + * + * **`maxSize`** - The maximum size of the swap memory that the system can allocate. + * If the swap size is dynamic (like on macOS), this will be `Infinity`. + * + * **`allocated`** - The total size allocated by the system for swap memory. + * + * **`used`** - The amount of swap memory that is currently being used from the `allocated` size. + * + * On Windows, this will return the info for the page file. + */ + public async getSwapState(): Promise<{ + /** + * The maximum size of the swap memory that the system can allocate. + * If the swap size is dynamic (like on macOS), this will be `Infinity` + */ + maxSize: number, + + /** The total size allocated by the system for swap memory */ + allocated: number, + + /** The amount of swap memory that is currently being used from the `allocated` size */ + used: number + }> { + this._ensureNotDisposed(); + + const {total, maxSize, free} = this._bindings.getSwapInfo(); + + return { + maxSize: maxSize === -1 + ? 
Infinity + : maxSize, + allocated: total, + used: total - free }; } @@ -383,7 +439,7 @@ export class Llama { /** @internal */ public static async _create({ - bindings, buildType, buildMetadata, logLevel, logger, vramPadding, maxThreads, skipLlamaInit = false, debug + bindings, buildType, buildMetadata, logLevel, logger, vramPadding, ramPadding, maxThreads, skipLlamaInit = false, debug }: { bindings: BindingModule, buildType: "localBuild" | "prebuilt", @@ -392,16 +448,45 @@ export class Llama { logger: (level: LlamaLogLevel, message: string) => void, maxThreads?: number, vramPadding: number | ((totalVram: number) => number), + ramPadding: number | ((totalRam: number) => number), skipLlamaInit?: boolean, debug: boolean }) { const gpu = bindings.getGpuType() ?? false; const vramOrchestrator = new MemoryOrchestrator(() => { - const {total, used} = bindings.getGpuVramInfo(); + const {total, used, unifiedSize} = bindings.getGpuVramInfo(); + + return { + total, + free: Math.max(0, total - used), + unifiedSize + }; + }); + const ramOrchestrator = new MemoryOrchestrator(() => { + const used = process.memoryUsage().rss; + const total = os.totalmem(); return { total, - free: Math.max(0, total - used) + free: Math.max(0, total - used), + unifiedSize: total + }; + }); + const swapOrchestrator = new MemoryOrchestrator(() => { + const {total, maxSize, free} = bindings.getSwapInfo(); + const used = total - free; + + if (maxSize === -1) + return { + total: Infinity, + free: Infinity, + unifiedSize: Infinity + }; + + return { + total: maxSize, + free: maxSize - used, + unifiedSize: maxSize }; }); @@ -413,6 +498,12 @@ export class Llama { else resolvedVramPadding = vramOrchestrator.reserveMemory(vramPadding); + let resolvedRamPadding: MemoryReservation; + if (ramPadding instanceof Function) + resolvedRamPadding = ramOrchestrator.reserveMemory(ramPadding((await ramOrchestrator.getMemoryState()).total)); + else + resolvedRamPadding = ramOrchestrator.reserveMemory(ramPadding); + const llama = new Llama({ bindings, buildType, @@ -427,7 +518,10 @@ export class Llama { gpu, vramOrchestrator, maxThreads, - vramPadding: resolvedVramPadding + vramPadding: resolvedVramPadding, + ramOrchestrator, + ramPadding: resolvedRamPadding, + swapOrchestrator }); if (!skipLlamaInit) diff --git a/src/bindings/getLlama.ts b/src/bindings/getLlama.ts index 9df05cba..d9e3255f 100644 --- a/src/bindings/getLlama.ts +++ b/src/bindings/getLlama.ts @@ -132,6 +132,17 @@ export type LlamaOptions = { */ vramPadding?: number | ((totalVram: number) => number), + /** + * Pad the available RAM for the memory size calculations, as these calculations are not always accurate. + * Recommended to ensure stability. + * + * Defaults to `25%` of the total RAM or 6GB (1GB on Linux), whichever is lower. + * Set to `0` to disable. + * + * > Since the OS also needs RAM to function, the default value can get up to 6GB on Windows and macOS, and 1GB on Linux. + */ + ramPadding?: number | ((totalRam: number) => number), + /** * Enable debug mode to find issues with llama.cpp. * Makes logs print directly to the console from `llama.cpp` and not through the provided logger. @@ -196,6 +207,17 @@ export type LastBuildOptions = { */ vramPadding?: number | ((totalVram: number) => number), + /** + * Pad the available RAM for the memory size calculations, as these calculations are not always accurate. + * Recommended to ensure stability. + * + * Defaults to `25%` of the total RAM or 6GB (1GB on Linux), whichever is lower. + * Set to `0` to disable. 
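As a usage-level sketch of the RAM and swap plumbing added in this diff, the snippet below passes a `ramPadding` value to `getLlama` and reads the extended `getVramState()` and the new `getSwapState()`; the padding value is purely illustrative:

```typescript
import {getLlama} from "node-llama-cpp";

// reserve 2GiB of RAM in the memory usage estimations (illustrative value)
const llama = await getLlama({
    ramPadding: 2 * 1024 * 1024 * 1024
});

const vramState = await llama.getVramState();
console.log("Unified VRAM size:", vramState.unifiedSize); // VRAM shared between the CPU and GPU

const swapState = await llama.getSwapState();
console.log("Swap used:", swapState.used, "out of", swapState.allocated);
console.log("Max swap size:", swapState.maxSize === Infinity ? "dynamic" : swapState.maxSize);
```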
+ * + * > Since the OS also needs RAM to function, the default value can get up to 6GB on Windows and macOS, and 1GB on Linux. + */ + ramPadding?: number | ((totalRam: number) => number), + /** * Enable debug mode to find issues with llama.cpp. * Makes logs print directly to the console from `llama.cpp` and not through the provided logger. @@ -210,6 +232,14 @@ export type LastBuildOptions = { export const getLlamaFunctionName = "getLlama"; export const defaultLlamaVramPadding = (totalVram: number) => Math.floor(Math.min(totalVram * 0.06, 1024 * 1024 * 1024)); +export const defaultLlamaRamPadding = (totalRam: number) => { + const platform = getPlatform(); + + if (platform === "linux") + return Math.floor(Math.min(totalRam * 0.25, 1024 * 1024 * 1024)); + + return Math.floor(Math.min(totalRam * 0.25, 1024 * 1024 * 1024 * 6)); +}; const defaultBuildOption: Exclude = runningInElectron ? "never" : "auto"; @@ -251,6 +281,7 @@ export async function getLlama(options?: LlamaOptions | "lastBuild", lastBuildOp skipDownload: lastBuildOptions?.skipDownload ?? defaultSkipDownload, maxThreads: lastBuildOptions?.maxThreads, vramPadding: lastBuildOptions?.vramPadding ?? defaultLlamaVramPadding, + ramPadding: lastBuildOptions?.ramPadding ?? defaultLlamaRamPadding, debug: lastBuildOptions?.debug ?? defaultLlamaCppDebugMode }; @@ -274,6 +305,7 @@ export async function getLlama(options?: LlamaOptions | "lastBuild", lastBuildOp logLevel: lastBuildOptions?.logLevel ?? defaultLlamaCppLogLevel, maxThreads: lastBuildOptions?.maxThreads, vramPadding: lastBuildOptions?.vramPadding ?? defaultLlamaVramPadding, + ramPadding: lastBuildOptions?.ramPadding ?? defaultLlamaRamPadding, debug: lastBuildOptions?.debug ?? defaultLlamaCppDebugMode }); } catch (err) { @@ -300,6 +332,7 @@ export async function getLlamaForOptions({ skipDownload = defaultSkipDownload, maxThreads, vramPadding = defaultLlamaVramPadding, + ramPadding = defaultLlamaRamPadding, debug = defaultLlamaCppDebugMode }: LlamaOptions, { updateLastBuildInfoOnCompile = false, @@ -320,6 +353,7 @@ export async function getLlamaForOptions({ if (progressLogs == null) progressLogs = true; if (skipDownload == null) skipDownload = defaultSkipDownload; if (vramPadding == null) vramPadding = defaultLlamaVramPadding; + if (ramPadding == null) ramPadding = defaultLlamaRamPadding; if (debug == null) debug = defaultLlamaCppDebugMode; const clonedLlamaCppRepoReleaseInfo = await getClonedLlamaCppRepoReleaseInfo(); @@ -376,6 +410,7 @@ export async function getLlamaForOptions({ skipLlamaInit, maxThreads, vramPadding, + ramPadding, fallbackMessage: !isLastItem ? 
`falling back to using ${getPrettyBuildGpuName(buildGpusToTry[i + 1])}` : ( @@ -437,6 +472,7 @@ export async function getLlamaForOptions({ updateLastBuildInfoOnCompile, maxThreads, vramPadding, + ramPadding, skipLlamaInit, debug }); @@ -473,6 +509,7 @@ async function loadExistingLlamaBinary({ skipLlamaInit, maxThreads, vramPadding, + ramPadding, fallbackMessage, debug }: { @@ -487,6 +524,7 @@ async function loadExistingLlamaBinary({ skipLlamaInit: boolean, maxThreads: number | undefined, vramPadding: Required["vramPadding"], + ramPadding: Required["ramPadding"], fallbackMessage: string | null, debug: boolean }) { @@ -520,6 +558,7 @@ async function loadExistingLlamaBinary({ logger, maxThreads, vramPadding, + ramPadding, skipLlamaInit, debug }); @@ -576,6 +615,7 @@ async function loadExistingLlamaBinary({ logger, maxThreads, vramPadding, + ramPadding, skipLlamaInit, debug }); @@ -630,6 +670,7 @@ async function buildAndLoadLlamaBinary({ updateLastBuildInfoOnCompile, maxThreads, vramPadding, + ramPadding, skipLlamaInit, debug }: { @@ -640,6 +681,7 @@ async function buildAndLoadLlamaBinary({ updateLastBuildInfoOnCompile: boolean, maxThreads: number | undefined, vramPadding: Required["vramPadding"], + ramPadding: Required["ramPadding"], skipLlamaInit: boolean, debug: boolean }) { @@ -671,6 +713,7 @@ async function buildAndLoadLlamaBinary({ logger, maxThreads, vramPadding, + ramPadding, skipLlamaInit, debug }); diff --git a/src/bindings/utils/MemoryOrchestrator.ts b/src/bindings/utils/MemoryOrchestrator.ts index 052651cf..992f336e 100644 --- a/src/bindings/utils/MemoryOrchestrator.ts +++ b/src/bindings/utils/MemoryOrchestrator.ts @@ -1,12 +1,12 @@ import {EventRelay} from "lifecycle-utils"; export class MemoryOrchestrator { - /** @internal */ private readonly _getMemoryState: () => {free: number, total: number}; + /** @internal */ private readonly _getMemoryState: () => {free: number, total: number, unifiedSize: number}; /** @internal */ private _reservedMemory: number = 0; public readonly onMemoryReservationRelease = new EventRelay(); - public constructor(getMemoryState: () => {free: number, total: number}) { + public constructor(getMemoryState: () => {free: number, total: number, unifiedSize: number}) { this._getMemoryState = getMemoryState; } @@ -20,11 +20,12 @@ export class MemoryOrchestrator { } public async getMemoryState() { - const {free, total} = this._getMemoryState(); + const {free, total, unifiedSize} = this._getMemoryState(); return { free: Math.max(0, free - this._reservedMemory), - total + total, + unifiedSize }; } } diff --git a/src/bindings/utils/compileLLamaCpp.ts b/src/bindings/utils/compileLLamaCpp.ts index ab61644f..1ff9f01d 100644 --- a/src/bindings/utils/compileLLamaCpp.ts +++ b/src/bindings/utils/compileLLamaCpp.ts @@ -1,6 +1,7 @@ import path from "path"; import {fileURLToPath} from "url"; import process from "process"; +import os from "os"; import fs from "fs-extra"; import chalk from "chalk"; import which from "which"; @@ -17,7 +18,7 @@ import {getModuleVersion} from "../../utils/getModuleVersion.js"; import {ensureLlamaCppRepoIsCloned, isLlamaCppRepoCloned} from "./cloneLlamaCppRepo.js"; import {getBuildFolderNameForBuildOptions} from "./getBuildFolderNameForBuildOptions.js"; import {setLastBuildInfo} from "./lastBuildInfo.js"; -import {getPlatform} from "./getPlatform.js"; +import {BinaryPlatform, getPlatform} from "./getPlatform.js"; import {logDistroInstallInstruction} from "./logDistroInstallInstruction.js"; import {testCmakeBinary} from "./testCmakeBinary.js"; 
import {getCudaNvccPaths} from "./detectAvailableComputeLayers.js"; @@ -45,6 +46,7 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions ciMode = false } = compileOptions; + const platform = getPlatform(); const buildFolderName = await getBuildFolderNameForBuildOptions(buildOptions); const finalBuildFolderName = includeBuildOptionsInBinaryFolderName ? buildFolderName.withCustomCmakeOptions @@ -94,6 +96,9 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions if (ciMode) { if (!cmakeCustomOptions.has("GGML_OPENMP")) cmakeCustomOptions.set("GGML_OPENMP", "OFF"); + + if (!cmakeCustomOptions.has("GGML_AMX")) + cmakeCustomOptions.set("GGML_AMX", "OFF"); } await fs.remove(outDirectory); @@ -120,6 +125,7 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions "--arch=" + buildOptions.arch, "--out", path.relative(llamaDirectory, outDirectory), "--runtime-version=" + runtimeVersion, + "--parallel=" + getParallelBuildThreadsToUse(platform), ...cmakePathArgs, ...( [...cmakeCustomOptions].map(([key, value]) => "--CD" + key + "=" + value) @@ -171,7 +177,6 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions } }); } catch (err) { - const platform = getPlatform(); if (platform === "linux" && await which("make", {nothrow: true}) == null) { console.info("\n" + getConsoleLogPrefix(true) + @@ -453,3 +458,15 @@ async function getToolchainFileForArch(targetArch: string) { return null; } + +function getParallelBuildThreadsToUse(platform: BinaryPlatform) { + const cpuCount = os.cpus().length; + + if (cpuCount <= 4) + return cpuCount; + + if (platform === "mac" && process.arch === "arm64") + return cpuCount - 1; + + return cpuCount - 2; +} diff --git a/src/bindings/utils/getLinuxDistroInfo.ts b/src/bindings/utils/getLinuxDistroInfo.ts index 7ac09bdd..ccea0c56 100644 --- a/src/bindings/utils/getLinuxDistroInfo.ts +++ b/src/bindings/utils/getLinuxDistroInfo.ts @@ -29,7 +29,7 @@ async function getOsReleaseInfo() { if (!(await fs.pathExists(osReleasePath))) continue; - const osReleaseFile = await fs.readFile(osReleasePath, "utf-8"); + const osReleaseFile = await fs.readFile(osReleasePath, "utf8"); const res = new Map(); for (const line of osReleaseFile.split("\n")) { diff --git a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts index dc5a2f4c..85156f8e 100644 --- a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts +++ b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts @@ -256,7 +256,6 @@ function logCompatibilityScore( title: "VRAM usage", value: () => bytes(compatibilityScore.resolvedValues.totalVramUsage) }, { - show: compatibilityScore.resolvedValues.totalRamUsage > 0, title: "RAM usage", value: () => bytes(compatibilityScore.resolvedValues.totalRamUsage) }, { diff --git a/src/cli/commands/inspect/commands/InspectGgufCommand.ts b/src/cli/commands/inspect/commands/InspectGgufCommand.ts index fa1ca7e3..49afe0ed 100644 --- a/src/cli/commands/inspect/commands/InspectGgufCommand.ts +++ b/src/cli/commands/inspect/commands/InspectGgufCommand.ts @@ -13,10 +13,12 @@ import {documentationPageUrls} from "../../../../config.js"; import withOra from "../../../../utils/withOra.js"; import {resolveModelDestination} from "../../../../utils/resolveModelDestination.js"; import {printModelDestination} from "../../../utils/printModelDestination.js"; +import {getGgufMetadataKeyValue} from 
"../../../../gguf/utils/getGgufMetadataKeyValue.js"; type InspectGgufCommand = { modelPath: string, header?: string[], + key?: string, noSplice: boolean, fullTensorInfo: boolean, fullMetadataArrays: boolean, @@ -46,6 +48,12 @@ export const InspectGgufCommand: CommandModule = { description: "Headers to use when reading a model file from a URL, in the format `key: value`. You can pass this option multiple times to add multiple headers.", group: "Optional:" }) + .option("key", { + alias: ["k"], + type: "string", + description: "A single metadata key to print the value of. If not provided, all metadata will be printed", + group: "Optional:" + }) .option("noSplice", { alias: "s", type: "boolean", @@ -80,7 +88,7 @@ export const InspectGgufCommand: CommandModule = { }); }, async handler({ - modelPath: ggufPath, header: headerArg, noSplice, fullTensorInfo, fullMetadataArrays, plainJson, outputToJsonFile + modelPath: ggufPath, header: headerArg, key, noSplice, fullTensorInfo, fullMetadataArrays, plainJson, outputToJsonFile }: InspectGgufCommand) { const resolvedModelDestination = resolveModelDestination(ggufPath); const resolvedGgufPath = resolvedModelDestination.type == "file" @@ -116,16 +124,30 @@ export const InspectGgufCommand: CommandModule = { const fileTypeName = getGgufFileTypeName(parsedMetadata.metadata.general?.file_type); if (plainJson || outputToJsonFile != null) { - const outputJson = JSON.stringify({ - splicedParts: parsedMetadata.splicedParts, - version: parsedMetadata.version, - fileType: fileTypeName, - tensorCount: parsedMetadata.totalTensorCount, - metadataSize: parsedMetadata.totalMetadataSize, - tensorInfoSize: parsedMetadata.totalTensorInfoSize, - metadata: parsedMetadata.metadata, - tensorInfo: parsedMetadata.fullTensorInfo - }, undefined, 4); + const getOutputJson = () => { + if (key != null) { + const keyValue = getGgufMetadataKeyValue(parsedMetadata.metadata, key); + if (keyValue === undefined) { + console.log(`Key not found: ${key}`); + process.exit(1); + } + + return JSON.stringify(keyValue, undefined, 4); + } + + return JSON.stringify({ + splicedParts: parsedMetadata.splicedParts, + version: parsedMetadata.version, + fileType: fileTypeName, + tensorCount: parsedMetadata.totalTensorCount, + metadataSize: parsedMetadata.totalMetadataSize, + tensorInfoSize: parsedMetadata.totalTensorInfoSize, + metadata: parsedMetadata.metadata, + tensorInfo: parsedMetadata.fullTensorInfo + }, undefined, 4); + }; + + const outputJson = getOutputJson(); if (outputToJsonFile != null) { const filePath = path.resolve(process.cwd(), outputToJsonFile); @@ -134,6 +156,27 @@ export const InspectGgufCommand: CommandModule = { } else { console.info(outputJson); } + } else if (key != null) { + const keyValue = getGgufMetadataKeyValue(parsedMetadata.metadata, key); + if (keyValue === undefined) { + console.log(`${chalk.red("Metadata key not found:")} ${key}`); + process.exit(1); + } + + const metadataPrettyPrintOptions: PrettyPrintObjectOptions = { + maxArrayValues: fullMetadataArrays + ? undefined + : 10, + useNumberGrouping: true, + maxArrayItemsWidth: process.stdout.columns - 1 + }; + + console.info(`${chalk.yellow("Metadata key:")} ${prettyPrintObject(key)}`); + console.info(`${chalk.yellow("Metadata:")} ${ + typeof keyValue === "string" + ? 
keyValue + : prettyPrintObject(keyValue, undefined, metadataPrettyPrintOptions) + }`); } else { const metadataPrettyPrintOptions: PrettyPrintObjectOptions = { maxArrayValues: fullMetadataArrays diff --git a/src/cli/commands/inspect/commands/InspectGpuCommand.ts b/src/cli/commands/inspect/commands/InspectGpuCommand.ts index c3a710c3..4e8e47ac 100644 --- a/src/cli/commands/inspect/commands/InspectGpuCommand.ts +++ b/src/cli/commands/inspect/commands/InspectGpuCommand.ts @@ -129,6 +129,9 @@ export const InspectGpuCommand: CommandModule = { } } + if (lastLlama == null) + await loadLlamaForGpu(false); + for (const gpu of gpusToLogVramUsageOf) { const llama = gpuToLlama.get(gpu); if (llama == null) @@ -140,6 +143,9 @@ export const InspectGpuCommand: CommandModule = { console.info(); await logRamUsage(lastLlama?.cpuMathCores); + + if (lastLlama != null) + await logSwapUsage(lastLlama); } }; @@ -162,14 +168,17 @@ async function getLlamaForGpu(gpu: BuildGpu) { async function logGpuVramUsage(gpu: BuildGpu, llama: Llama) { try { const gpuName = getPrettyBuildGpuName(gpu); - const vramStatus = await llama.getVramState(); + const vramState = await llama.getVramState(); const gpuDeviceNames = await llama.getGpuDeviceNames(); if (gpuDeviceNames.length > 0) console.info(`${chalk.yellow(`${gpuName} device${gpuDeviceNames.length > 1 ? "s" : ""}:`)} ${gpuDeviceNames.join(", ")}`); - console.info(`${chalk.yellow(`${gpuName} used VRAM:`)} ${getPercentageString(vramStatus.used, vramStatus.total)}% ${chalk.gray("(" + bytes(vramStatus.used) + "/" + bytes(vramStatus.total) + ")")}`); - console.info(`${chalk.yellow(`${gpuName} free VRAM:`)} ${getPercentageString(vramStatus.free, vramStatus.total)}% ${chalk.gray("(" + bytes(vramStatus.free) + "/" + bytes(vramStatus.total) + ")")}`); + console.info(`${chalk.yellow(`${gpuName} used VRAM:`)} ${getPercentageString(vramState.used, vramState.total)}% ${chalk.gray("(" + bytes(vramState.used) + "/" + bytes(vramState.total) + ")")}`); + console.info(`${chalk.yellow(`${gpuName} free VRAM:`)} ${getPercentageString(vramState.free, vramState.total)}% ${chalk.gray("(" + bytes(vramState.free) + "/" + bytes(vramState.total) + ")")}`); + + if (vramState.unifiedSize > 0) + console.info(`${chalk.yellow(`${gpuName} unified memory:`)} ${bytes(vramState.unifiedSize)} ${chalk.gray("(" + getPercentageString(vramState.unifiedSize, vramState.total) + "%)")}`); } catch (err) {} } @@ -195,6 +204,13 @@ async function logRamUsage(cpuMathCores?: number) { console.info(`${chalk.yellow("Free RAM:")} ${getPercentageString(freeMemory, totalMemory)}% ${chalk.gray("(" + bytes(freeMemory) + "/" + bytes(totalMemory) + ")")}`); } +async function logSwapUsage(llama: Llama) { + const swapState = await llama.getSwapState(); + + console.info(`${chalk.yellow("Used swap:")} ${getPercentageString(swapState.used, swapState.allocated)}% ${chalk.gray("(" + bytes(swapState.used) + "/" + bytes(swapState.allocated) + ")")}`); + console.info(`${chalk.yellow("Max swap size:")} ${swapState.maxSize === Infinity ? 
"dynamic" : bytes(swapState.maxSize)}`); +} + function getPercentageString(amount: number, total: number) { if (total === 0) return "0"; diff --git a/src/cli/utils/interactivelyAskForModel.ts b/src/cli/utils/interactivelyAskForModel.ts index 7dc264a2..9201e3e3 100644 --- a/src/cli/utils/interactivelyAskForModel.ts +++ b/src/cli/utils/interactivelyAskForModel.ts @@ -544,6 +544,9 @@ function renderRecommendedModelTechnicalInfo( show: canUseGpu, title: "VRAM usage", value: () => bytes(compatibilityScore.resolvedValues.totalVramUsage) + }, { + title: "RAM usage", + value: () => bytes(compatibilityScore.resolvedValues.totalRamUsage) }] }) ].join("\n"); diff --git a/src/evaluator/LlamaChat/LlamaChat.ts b/src/evaluator/LlamaChat/LlamaChat.ts index 15c8f1b3..7e265929 100644 --- a/src/evaluator/LlamaChat/LlamaChat.ts +++ b/src/evaluator/LlamaChat/LlamaChat.ts @@ -2166,21 +2166,24 @@ class GenerateResponseState 0) - firstDifferentIndex -= 1; - - this.tokens.splice(0, firstDifferentIndex); - - if (firstDifferentIndex < this.llamaChat.sequence.nextTokenIndex) { + if (this.tokens.length === 1 && this.llamaChat.sequence.nextTokenIndex !== 0) { await this.llamaChat.sequence.eraseContextTokenRanges([{ - start: firstDifferentIndex, + start: 0, end: this.llamaChat.sequence.nextTokenIndex }]); - this.ensureNotAborted(); + return; } + + const lastToken = this.tokens[this.tokens.length - 1]!; + + // we need to decode at least one token to generate a response + this.tokens.pop(); + await this.llamaChat.sequence.adaptStateToTokens(this.tokens, false); + this.tokens.push(lastToken); + this.ensureNotAborted(); + + const firstDifferentIndex = this.llamaChat.sequence.nextTokenIndex; + this.tokens.splice(0, firstDifferentIndex); } public async evaluateWithoutGeneratingNewTokens() { diff --git a/src/evaluator/LlamaChat/utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.ts b/src/evaluator/LlamaChat/utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.ts index 91f0bfc7..ad96d83b 100644 --- a/src/evaluator/LlamaChat/utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.ts +++ b/src/evaluator/LlamaChat/utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.ts @@ -30,6 +30,8 @@ export async function eraseFirstResponseAndKeepFirstSystemChatContextShiftStrate initialCharactersRemovalCount, tokenizer, chatWrapper, + failedCompressionErrorMessage: "Failed to compress chat history for context shift due to a too long prompt or system message that cannot be compressed without affecting the generation quality. 
" + + "Consider increasing the context size or shortening the long prompt or system message.", compressChatHistory({chatHistory, charactersToRemove, estimatedCharactersPerToken}) { const res = chatHistory.map(item => structuredClone(item)); let charactersLeftToRemove = charactersToRemove; @@ -66,6 +68,8 @@ export async function eraseFirstResponseAndKeepFirstSystemChatContextShiftStrate } function removeHistoryThatLedToModelResponseAtIndex(index: number) { + let removedItems = 0; + for (let i = index - 1; i >= 0; i--) { const historyItem = res[i]; @@ -79,13 +83,19 @@ export async function eraseFirstResponseAndKeepFirstSystemChatContextShiftStrate break; // keep the first system message if (historyItem.type === "user" || historyItem.type === "system") { - const newText = truncateLlamaTextAndRoundToWords(LlamaText.fromJSON(historyItem.text), charactersLeftToRemove); + const newText = truncateLlamaTextAndRoundToWords( + LlamaText.fromJSON(historyItem.text), + charactersLeftToRemove, + undefined, + false + ); const newTextString = newText.toString(); const historyItemString = LlamaText.fromJSON(historyItem.text).toString(); if (newText.values.length === 0) { res.splice(i, 1); i++; + removedItems++; charactersLeftToRemove -= historyItemString.length; } else if (newTextString.length < historyItemString.length) { charactersLeftToRemove -= historyItemString.length - newTextString.length; @@ -98,6 +108,66 @@ export async function eraseFirstResponseAndKeepFirstSystemChatContextShiftStrate void (historyItem satisfies never); } } + + return removedItems; + } + + function compressHistoryThatLedToModelResponseAtIndex(index: number, keepTokensCount: number = 0) { + let removedItems = 0; + let promptStartIndex: number | undefined = undefined; + + for (let i = index - 1; i >= 0; i--) { + const historyItem = res[i]; + + if (historyItem == null) + continue; + + if (historyItem.type === "model") { + promptStartIndex = i + 1; + break; + } + + if (i === 0 && historyItem.type === "system") { + promptStartIndex = i + 1; + break; // keep the first system message + } + } + + if (promptStartIndex == null || promptStartIndex >= index) + return 0; + + for (let i = promptStartIndex; i < index && charactersLeftToRemove > 0; i++) { + const historyItem = res[i]; + + if (historyItem == null || historyItem.type !== "user") + continue; + + let removeChars = Math.min(charactersLeftToRemove, historyItem.text.length); + if (keepTokensCount > 0) { + removeChars -= Math.floor(keepTokensCount * estimatedCharactersPerToken); + if (removeChars < 0) + removeChars = 0; + + keepTokensCount -= Math.min( + keepTokensCount, + Math.max(0, historyItem.text.length - removeChars) / estimatedCharactersPerToken + ); + } + + const newText = truncateTextAndRoundToWords(historyItem.text, removeChars, undefined, false); + if (newText.length === 0) { + res.splice(i, 1); + i--; + index--; + removedItems++; + charactersLeftToRemove -= historyItem.text.length; + } else { + charactersLeftToRemove -= historyItem.text.length - newText.length; + historyItem.text = newText; + } + } + + return removedItems; } function compressFirstModelResponse() { @@ -116,7 +186,7 @@ export async function eraseFirstResponseAndKeepFirstSystemChatContextShiftStrate continue; if (typeof item === "string") { - const newText = truncateTextAndRoundToWords(item, charactersLeftToRemove); + const newText = truncateTextAndRoundToWords(item, charactersLeftToRemove, undefined, true); if (newText === "") { historyItem.response.splice(t, 1); @@ -139,14 +209,14 @@ export async function 
eraseFirstResponseAndKeepFirstSystemChatContextShiftStrate if (historyItem.response.length === 0) { // if the model response is removed from the history, // the things that led to it are not important anymore - removeHistoryThatLedToModelResponseAtIndex(i); + i -= removeHistoryThatLedToModelResponseAtIndex(i); res.splice(i, 1); i--; } } } - function compressLastModelResponse(minCharactersToKeep: number = 20) { + function compressLastModelResponse(minCharactersToKeep: number = 60) { const lastHistoryItem = res[res.length - 1]; if (lastHistoryItem == null || lastHistoryItem.type !== "model") @@ -157,14 +227,27 @@ export async function eraseFirstResponseAndKeepFirstSystemChatContextShiftStrate if (lastResponseItem == null || typeof lastResponseItem !== "string") return; - const nextTextLength = lastResponseItem.length - charactersLeftToRemove; - const charactersToRemoveFromText = charactersLeftToRemove + Math.max(0, nextTextLength - minCharactersToKeep); - const newText = truncateTextAndRoundToWords(lastResponseItem, charactersToRemoveFromText); + compressHistoryThatLedToModelResponseAtIndex(res.length - 1, maxTokensCount / 4); + + if (charactersLeftToRemove <= 0) + return; + + const nextTextLength = Math.max( + Math.min(lastResponseItem.length, minCharactersToKeep), + lastResponseItem.length - charactersLeftToRemove + ); + const charactersToRemoveFromText = lastResponseItem.length - nextTextLength; + const newText = truncateTextAndRoundToWords(lastResponseItem, charactersToRemoveFromText, undefined, true); if (newText.length < lastResponseItem.length) { lastHistoryItem.response[lastHistoryItem.response.length - 1] = newText; charactersLeftToRemove -= lastResponseItem.length - newText.length; } + + if (charactersLeftToRemove <= 0) + return; + + compressHistoryThatLedToModelResponseAtIndex(res.length - 1); } compressFunctionCalls(); diff --git a/src/evaluator/LlamaChatSession/LlamaChatSession.ts b/src/evaluator/LlamaChatSession/LlamaChatSession.ts index 41e2f952..cc1d0563 100644 --- a/src/evaluator/LlamaChatSession/LlamaChatSession.ts +++ b/src/evaluator/LlamaChatSession/LlamaChatSession.ts @@ -148,6 +148,14 @@ export type LLamaChatPromptOptions 0) - firstDifferentIndex -= 1; - - inputTokens.splice(0, firstDifferentIndex); - - if (firstDifferentIndex < sequence.nextTokenIndex) { + if (inputTokens.length === 1 && sequence.nextTokenIndex !== 0) await sequence.eraseContextTokenRanges([{ - start: firstDifferentIndex, + start: 0, end: sequence.nextTokenIndex }]); + else { + const lastToken = inputTokens[inputTokens.length - 1]!; + + // we need to decode at least one token to generate a response + inputTokens.pop(); + await sequence.adaptStateToTokens(inputTokens, false); + inputTokens.push(lastToken); ensureNotAborted(); + + const firstDifferentIndex = sequence.nextTokenIndex; + inputTokens.splice(0, firstDifferentIndex); } const evaluationIterator = sequence.evaluate(inputTokens, removeNullFields({ diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts index 845efd2e..4a241635 100644 --- a/src/evaluator/LlamaContext/LlamaContext.ts +++ b/src/evaluator/LlamaContext/LlamaContext.ts @@ -702,19 +702,22 @@ export class LlamaContext { async function createContext(contextSize: number) { const batchSize = options.batchSize ?? 
getDefaultContextBatchSize({contextSize, sequences}); - const vramRequiredEstimate = _model.fileInsights.estimateContextResourceRequirements({ + const resourceRequirementsEstimation = _model.fileInsights.estimateContextResourceRequirements({ contextSize, sequences, isEmbeddingContext: options._embeddings, modelGpuLayers: _model.gpuLayers, batchSize, flashAttention - }).gpuVram; + }); const context = new LlamaContext({_model}, {...options, contextSize, batchSize, sequences, flashAttention}); - const contextCreationMemoryReservation = options.ignoreMemorySafetyChecks + const contextCreationVramReservation = options.ignoreMemorySafetyChecks + ? null + : _model._llama._vramOrchestrator.reserveMemory(resourceRequirementsEstimation.gpuVram); + const contextCreationRamReservation = options.ignoreMemorySafetyChecks ? null - : _model._llama._vramOrchestrator.reserveMemory(vramRequiredEstimate); + : _model._llama._vramOrchestrator.reserveMemory(resourceRequirementsEstimation.cpuRam); try { if (createSignal?.aborted) @@ -730,7 +733,8 @@ export class LlamaContext { } else if (!contextLoaded) throw new Error("Failed to create context"); - contextCreationMemoryReservation?.dispose?.(); + contextCreationVramReservation?.dispose?.(); + contextCreationRamReservation?.dispose?.(); if (loraOptions != null && loraOptions.adapters.length > 0) { let loadedAdapters = 0; @@ -768,7 +772,8 @@ export class LlamaContext { return context; } finally { - contextCreationMemoryReservation?.dispose?.(); + contextCreationVramReservation?.dispose?.(); + contextCreationRamReservation?.dispose?.(); } } @@ -904,6 +909,61 @@ export class LlamaContextSequence { }; } + /** + * Erase parts of the context state to align it with the given tokens. + * + * If the given tokens do not align with the current context state, the context state will be erased to align with the given tokens. + * + * To find the first different token index between the context state and the given tokens, access the `nextTokenIndex` property. + * + * If `allowShift` is `true` (the default), shifting tokens may happen to align the context state with the given tokens, + * which incurs token evaluation of the shifted tokens. + */ + public async adaptStateToTokens(tokens: Token[], allowShift: boolean = true) { + if (this.model.fileInsights.isRecurrent || !allowShift) { + const {firstDifferentIndex} = this.compareContextTokens(tokens); + if (firstDifferentIndex < this._nextTokenIndex) + await this.eraseContextTokenRanges([{ + start: firstDifferentIndex, + end: this._nextTokenIndex + }]); + + return; + } + + const eraseRanges: ContextTokensDeleteRange[] = []; + + let tokensIndex = 0; + let differentTokenIndex: number | undefined = undefined; + for (let i = 0; i < this._contextTokens.length && tokensIndex < tokens.length; i++) { + if (compareTokens(this._contextTokens[i], tokens[tokensIndex])) { + if (differentTokenIndex != null) { + eraseRanges.push({ + start: differentTokenIndex, + end: i + }); + + differentTokenIndex = undefined; + } + + tokensIndex++; + continue; + } + + if (differentTokenIndex == null) + differentTokenIndex = i; + } + + if (differentTokenIndex != null) + eraseRanges.push({ + start: differentTokenIndex, + end: this._nextTokenIndex + }); + + if (eraseRanges.length > 0) + await this.eraseContextTokenRanges(eraseRanges); + } + /** * Clear the history of the sequence. * If `prependBos` was enabled, the BOS token will be prepended to the sequence again. 
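The `adaptStateToTokens` method added above is public API on `LlamaContextSequence`; a hedged usage sketch (the model path and prompt are placeholders, error handling is omitted):

```ts
// Hedged usage sketch of the new LlamaContextSequence.adaptStateToTokens() API.
import {getLlama} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "./models/example.gguf"}); // placeholder path
const context = await model.createContext();
const sequence = context.getSequence();

const tokens = model.tokenize("Hello, world!");

// Align the sequence state with `tokens`. With `allowShift` left at its default
// (true), matching tokens may be shifted to preserve as much state as possible;
// for recurrent models (e.g. Mamba) or with `allowShift: false`, the state is
// erased from the first token that differs instead.
await sequence.adaptStateToTokens(tokens);

// `nextTokenIndex` now reflects how much of the previous state was kept.
console.log(sequence.nextTokenIndex);
```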
@@ -970,15 +1030,23 @@ export class LlamaContextSequence { if (deletionSuccessful) deletionSuccessful &&= this._context._ctx.removeTokenCellsFromSequence(this._sequenceId, range.start, range.end); - if (deletionSuccessful && lastDeleteRangeEndPos != null && removedTokens > 0 && lastDeleteRangeEndPos !== range.start) + if (deletionSuccessful && lastDeleteRangeEndPos != null && removedTokens > 0 && lastDeleteRangeEndPos !== range.start) { this._context._ctx.shiftSequenceTokenCells(this._sequenceId, lastDeleteRangeEndPos, range.start, -removedTokens); + const shiftedTokens = range.start - lastDeleteRangeEndPos; + this._tokenMeter.useTokens(shiftedTokens, "input"); + } removedTokens += range.end - range.start; lastDeleteRangeEndPos = range.end; } - if (deletionSuccessful && lastDeleteRangeEndPos != null && removedTokens > 0 && lastDeleteRangeEndPos !== this._nextTokenIndex) + if (deletionSuccessful && lastDeleteRangeEndPos != null && removedTokens > 0 && + lastDeleteRangeEndPos !== this._nextTokenIndex + ) { this._context._ctx.shiftSequenceTokenCells(this._sequenceId, lastDeleteRangeEndPos, this._nextTokenIndex, -removedTokens); + const shiftedTokens = this._nextTokenIndex - lastDeleteRangeEndPos; + this._tokenMeter.useTokens(shiftedTokens, "input"); + } this._nextTokenIndex -= removedTokens; diff --git a/src/evaluator/LlamaModel/LlamaModel.ts b/src/evaluator/LlamaModel/LlamaModel.ts index 7d0f31a3..8cdc5e52 100644 --- a/src/evaluator/LlamaModel/LlamaModel.ts +++ b/src/evaluator/LlamaModel/LlamaModel.ts @@ -684,7 +684,7 @@ export class LlamaModel { ignoreMemorySafetyChecks: modelOptions.ignoreMemorySafetyChecks, defaultContextFlashAttention: resolvedDefaultContextFlashAttention }); - const vramRequiredEstimate = ggufInsights.estimateModelResourceRequirements({gpuLayers: gpuLayers}).gpuVram; + const resourceRequirementsEstimation = ggufInsights.estimateModelResourceRequirements({gpuLayers: gpuLayers}); const model = new LlamaModel({...modelOptions, gpuLayers, useMmap}, { _fileInfo: fileInfo, @@ -694,9 +694,12 @@ export class LlamaModel { _flashAttentionSupported: flashAttentionSupported, _defaultContextFlashAttention: resolvedDefaultContextFlashAttention }); - const modelCreationMemoryReservation = modelOptions.ignoreMemorySafetyChecks + const modelCreationVramReservation = modelOptions.ignoreMemorySafetyChecks ? null - : _llama._vramOrchestrator.reserveMemory(vramRequiredEstimate); + : _llama._vramOrchestrator.reserveMemory(resourceRequirementsEstimation.gpuVram); + const modelCreationRamReservation = modelOptions.ignoreMemorySafetyChecks + ? 
null + : _llama._ramOrchestrator.reserveMemory(resourceRequirementsEstimation.cpuRam); const loggedWarnings = new Set(); function onAbort() { @@ -741,7 +744,8 @@ export class LlamaModel { return model; } finally { loadSignal?.removeEventListener("abort", onAbort); - modelCreationMemoryReservation?.dispose?.(); + modelCreationVramReservation?.dispose?.(); + modelCreationRamReservation?.dispose?.(); } } } diff --git a/src/gguf/insights/GgufInsights.ts b/src/gguf/insights/GgufInsights.ts index e6e4bca3..5833e5a4 100644 --- a/src/gguf/insights/GgufInsights.ts +++ b/src/gguf/insights/GgufInsights.ts @@ -104,6 +104,16 @@ export class GgufInsights { return true; } + public get isRecurrent() { + switch (this._ggufFileInfo.metadata?.general?.architecture) { + case GgufArchitectureType.mamba: + case GgufArchitectureType.rwkv6: + return true; + } + + return false; + } + public estimateModelResourceRequirements({gpuLayers}: {gpuLayers: number}): GgufInsightsResourceRequirements { const {cpu, gpu} = this._getTensorResourceSplit(gpuLayers); diff --git a/src/gguf/insights/GgufInsightsConfigurationResolver.ts b/src/gguf/insights/GgufInsightsConfigurationResolver.ts index edf56084..ce57ee1d 100644 --- a/src/gguf/insights/GgufInsightsConfigurationResolver.ts +++ b/src/gguf/insights/GgufInsightsConfigurationResolver.ts @@ -1,14 +1,16 @@ -import os from "os"; import {BuildGpu} from "../../bindings/types.js"; import {LlamaModelOptions} from "../../evaluator/LlamaModel/LlamaModel.js"; import {LlamaContextOptions} from "../../evaluator/LlamaContext/types.js"; import {getDefaultContextSequences} from "../../evaluator/LlamaContext/LlamaContext.js"; +import {InsufficientMemoryError} from "../../utils/InsufficientMemoryError.js"; import {resolveModelGpuLayersOption} from "./utils/resolveModelGpuLayersOption.js"; import {resolveContextContextSizeOption} from "./utils/resolveContextContextSizeOption.js"; import {scoreLevels} from "./utils/scoreLevels.js"; +import {getRamUsageFromUnifiedVram} from "./utils/getRamUsageFromUnifiedVram.js"; import type {GgufInsights} from "./GgufInsights.js"; export const defaultTrainContextSizeForEstimationPurposes = 4096; +const defaultContextSizeForUnfitContextSizeConfiguration = 2048; export class GgufInsightsConfigurationResolver { @@ -44,13 +46,15 @@ export class GgufInsightsConfigurationResolver { flashAttention?: boolean } = {}, { getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), - getRamState = (async () => ({total: os.totalmem(), free: os.freemem()})), + getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()), + getSwapState = (() => this._ggufInsights._llama._swapOrchestrator.getMemoryState()), llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize, llamaGpu = this._ggufInsights._llama.gpu, llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading }: { - getVramState?(): Promise<{total: number, free: number}>, + getVramState?(): Promise<{total: number, free: number, unifiedSize: number}>, getRamState?(): Promise<{total: number, free: number}>, + getSwapState?(): Promise<{total: number, free: number}>, llamaVramPaddingSize?: number, llamaGpu?: BuildGpu, llamaSupportsGpuOffloading?: boolean @@ -58,78 +62,18 @@ export class GgufInsightsConfigurationResolver { const compatibilityScore = await this.scoreModelConfigurationCompatibility({ flashAttention, contextSize: targetContextSize, - embeddingContext + embeddingContext, + forceGpuLayers: targetGpuLayers, + forceStrictContextSize: 
targetContextSize != null }, { getVramState, getRamState, + getSwapState, llamaVramPaddingSize, llamaGpu, llamaSupportsGpuOffloading }); - if (targetContextSize != null || targetGpuLayers != null) { - const vramState = await getVramState(); - const resolvedGpuLayers = await this.resolveModelGpuLayers( - targetGpuLayers == null - ? { - fitContext: { - contextSize: targetContextSize, - embeddingContext - } - } - : targetGpuLayers, - { - getVramState: async () => vramState, - defaultContextFlashAttention: flashAttention, - ignoreMemorySafetyChecks: targetGpuLayers != null, - llamaGpu, - llamaSupportsGpuOffloading, - llamaVramPaddingSize - } - ); - const estimatedModelResourceUsage = this._ggufInsights.estimateModelResourceRequirements({ - gpuLayers: resolvedGpuLayers - }); - - const resolvedContextSize = await this._ggufInsights.configurationResolver.resolveContextContextSize(targetContextSize ?? "auto", { - getVramState: async () => ({ - total: vramState.total, - free: Math.max(0, vramState.free - estimatedModelResourceUsage.gpuVram) - }), - isEmbeddingContext: embeddingContext, - modelGpuLayers: resolvedGpuLayers, - modelTrainContextSize: this._ggufInsights.trainContextSize ?? defaultTrainContextSizeForEstimationPurposes, - flashAttention, - ignoreMemorySafetyChecks: targetContextSize != null, - llamaGpu - }); - const estimatedContextResourceUsage = this._ggufInsights.estimateContextResourceRequirements({ - contextSize: resolvedContextSize, - isEmbeddingContext: embeddingContext, - modelGpuLayers: resolvedGpuLayers, - flashAttention - }); - - compatibilityScore.resolvedValues = { - gpuLayers: resolvedGpuLayers, - contextSize: resolvedContextSize, - - modelRamUsage: estimatedModelResourceUsage.cpuRam, - contextRamUsage: estimatedContextResourceUsage.cpuRam, - totalRamUsage: estimatedModelResourceUsage.cpuRam + estimatedContextResourceUsage.cpuRam, - - modelVramUsage: estimatedModelResourceUsage.gpuVram, - contextVramUsage: estimatedContextResourceUsage.gpuVram, - totalVramUsage: estimatedModelResourceUsage.gpuVram + estimatedContextResourceUsage.gpuVram - }; - - if (compatibilityScore.resolvedValues.totalVramUsage > vramState.total) { - compatibilityScore.compatibilityScore = 0; - compatibilityScore.bonusScore = 0; - compatibilityScore.totalScore = 0; - } - } - return compatibilityScore; } @@ -148,27 +92,46 @@ export class GgufInsightsConfigurationResolver { * Set this to any value higher than ` / contextSize`. * Defaults to `100`. * + * `maximumUnfitConfigurationResourceMultiplier` is used to improve the proportionality of the bonus score between unfit models. + * Set this to any value higher than ` / `. + * Defaults to `100`. + * * `contextSize` defaults to `4096` (if the model train context size is lower than this, the model train context size is used instead). */ public async scoreModelConfigurationCompatibility({ contextSize = Math.min(4096, this._ggufInsights.trainContextSize ?? 4096), embeddingContext = false, flashAttention = false, - maximumFittedContextSizeMultiplier = 100 + maximumFittedContextSizeMultiplier = 100, + maximumUnfitConfigurationResourceMultiplier = 100, + forceStrictContextSize = false, + forceGpuLayers }: { contextSize?: number, embeddingContext?: boolean, flashAttention?: boolean, - maximumFittedContextSizeMultiplier?: number + maximumFittedContextSizeMultiplier?: number, + maximumUnfitConfigurationResourceMultiplier?: number, + + /** + * Do not resolve a context size larger than the specified `contextSize`. + * + * Defaults to `false`. 
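As a rough sketch of how the new `forceStrictContextSize` and `forceGpuLayers` options might be exercised through a loaded model's `fileInsights` (the model path is a placeholder; the returned fields follow the shape visible in this diff):

```ts
// Hedged sketch: scoring a model configuration with the new options from this
// patch, via a loaded model's fileInsights. The GGUF path is a placeholder.
import {getLlama} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "./models/example.gguf"}); // placeholder path

const score = await model.fileInsights.configurationResolver.scoreModelConfigurationCompatibility({
    contextSize: 8192,
    flashAttention: false,

    // new in this patch: never resolve a context size larger than `contextSize`
    forceStrictContextSize: true,

    // new in this patch: score a fixed GPU layer split instead of resolving one
    forceGpuLayers: "max"
});

// An unfit configuration now gets a compatibilityScore of 0 and a small bonus
// score proportional to how far the requirements exceed the available resources.
console.log(score.compatibilityScore, score.bonusScore, score.totalScore);
```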
+ */ + forceStrictContextSize?: boolean, + + forceGpuLayers?: number | "max" } = {}, { getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), - getRamState = (async () => ({total: os.totalmem(), free: os.freemem()})), + getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()), + getSwapState = (() => this._ggufInsights._llama._swapOrchestrator.getMemoryState()), llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize, llamaGpu = this._ggufInsights._llama.gpu, llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading }: { - getVramState?(): Promise<{total: number, free: number}>, + getVramState?(): Promise<{total: number, free: number, unifiedSize: number}>, getRamState?(): Promise<{total: number, free: number}>, + getSwapState?(): Promise<{total: number, free: number}>, llamaVramPaddingSize?: number, llamaGpu?: BuildGpu, llamaSupportsGpuOffloading?: boolean @@ -207,39 +170,100 @@ export class GgufInsightsConfigurationResolver { }> { const [ vramState, - ramState + ramState, + swapState ] = await Promise.all([ getVramState(), - getRamState() + getRamState(), + getSwapState() ]); - const resolvedGpuLayers = await this.resolveModelGpuLayers( - embeddingContext - ? {fitContext: {embeddingContext: true}} - : "auto", - { - getVramState: async () => vramState, - llamaVramPaddingSize, - llamaGpu, - llamaSupportsGpuOffloading, - defaultContextFlashAttention: flashAttention - } - ); + let resolvedGpuLayers = (forceGpuLayers == null || forceGpuLayers == "max") + ? this.ggufInsights.totalLayers + : forceGpuLayers; + let gpuLayersFitMemory = false; + + try { + resolvedGpuLayers = await this.resolveModelGpuLayers( + forceGpuLayers != null + ? forceGpuLayers + : embeddingContext + ? { + fitContext: { + embeddingContext: true, + contextSize: forceStrictContextSize + ? contextSize + : undefined + } + } + : forceStrictContextSize != null + ? {fitContext: {contextSize}} + : "auto", + { + getVramState: async () => vramState, + llamaVramPaddingSize, + llamaGpu, + llamaSupportsGpuOffloading, + defaultContextFlashAttention: flashAttention, + ignoreMemorySafetyChecks: forceGpuLayers != null + } + ); + gpuLayersFitMemory = true; + } catch (err) { + if (!(err instanceof InsufficientMemoryError)) + throw err; + } + const canUseGpu = llamaSupportsGpuOffloading && llamaGpu !== false; const estimatedModelResourceUsage = this._ggufInsights.estimateModelResourceRequirements({ gpuLayers: resolvedGpuLayers }); - const resolvedContextSize = await this.resolveContextContextSize("auto", { - getVramState: async () => ({ - total: vramState.total, - free: Math.max(0, vramState.free - estimatedModelResourceUsage.gpuVram) - }), - llamaGpu, - isEmbeddingContext: embeddingContext, - modelGpuLayers: resolvedGpuLayers, - modelTrainContextSize: this._ggufInsights.trainContextSize ?? defaultTrainContextSizeForEstimationPurposes, - flashAttention - }); + let resolvedContextSize = Math.min( + this.ggufInsights.trainContextSize ?? 
defaultContextSizeForUnfitContextSizeConfiguration, + defaultContextSizeForUnfitContextSizeConfiguration + ); + let contextFitsMemory = false; + + try { + resolvedContextSize = await this.resolveContextContextSize("auto", { + getVramState: async () => ({ + total: vramState.total, + free: Math.max(0, vramState.free - estimatedModelResourceUsage.gpuVram), + unifiedSize: vramState.unifiedSize + }), + getRamState: async () => ({ + total: ramState.total, + free: Math.max( + 0, + ramState.free - estimatedModelResourceUsage.cpuRam + + (-getRamUsageFromUnifiedVram(estimatedModelResourceUsage.gpuVram, vramState)) + ) + }), + getSwapState: async () => ({ + total: swapState.total, + free: Math.max( + 0, + swapState.free - Math.max( + 0, + estimatedModelResourceUsage.cpuRam + + (-getRamUsageFromUnifiedVram(estimatedModelResourceUsage.gpuVram, vramState)) + + (-ramState.free) + ) + ) + }), + llamaGpu, + isEmbeddingContext: embeddingContext, + modelGpuLayers: resolvedGpuLayers, + modelTrainContextSize: this._ggufInsights.trainContextSize ?? defaultTrainContextSizeForEstimationPurposes, + ignoreMemorySafetyChecks: forceStrictContextSize, + flashAttention + }); + contextFitsMemory = true; + } catch (err) { + if (!(err instanceof InsufficientMemoryError)) + throw err; + } + const estimatedContextResourceUsage = this._ggufInsights.estimateContextResourceRequirements({ contextSize: resolvedContextSize, isEmbeddingContext: embeddingContext, @@ -252,7 +276,7 @@ export class GgufInsightsConfigurationResolver { allLayersAreOffloaded: 10, contextSize: 30, ramUsageFitsInRam: 10, - cpuOnlySmallModelSize: 60, // also defined inside `scoreModelSizeForCpuOnlyUsage` + cpuOnlySmallModelSize: 70, // also defined inside `scoreModelSizeForCpuOnlyUsage` bonusContextSize: 10 } as const; @@ -260,29 +284,37 @@ export class GgufInsightsConfigurationResolver { const allLayersAreOffloadedPoints = rankPoints.allLayersAreOffloaded * ( resolvedGpuLayers === this._ggufInsights.totalLayers ? 1 : 0 ); - const contextSizePoints = rankPoints.contextSize * Math.min(1, resolvedContextSize / contextSize); + const contextSizePoints = contextFitsMemory + ? rankPoints.contextSize * Math.min(1, resolvedContextSize / contextSize) + : 0; const ramUsageFitsInRamPoints = rankPoints.ramUsageFitsInRam * ( estimatedModelResourceUsage.cpuRam <= ramState.free ? 1 - : estimatedModelResourceUsage.cpuRam <= ramState.total - ? 0.5 - : ( - 0.5 - Math.min( - 0.5, - 0.5 * ( - (estimatedModelResourceUsage.cpuRam - ramState.total) / ramState.total + : estimatedModelResourceUsage.cpuRam <= ramState.free + swapState.free + ? 0.8 + : estimatedModelResourceUsage.cpuRam <= ramState.total + ? 0.5 + : ( + 0.5 - Math.min( + 0.5, + 0.5 * ( + (estimatedModelResourceUsage.cpuRam - ramState.total) / ramState.total + ) ) ) - ) - ); - const bonusContextSizePoints = 10 * Math.min( - 1, - ( - Math.max(0, resolvedContextSize - contextSize) / contextSize - ) / maximumFittedContextSizeMultiplier ); + const bonusContextSizePoints = contextFitsMemory + ? ( + 10 * Math.min( + 1, + ( + Math.max(0, resolvedContextSize - contextSize) / contextSize + ) / maximumFittedContextSizeMultiplier + ) + ) + : 0; - const compatibilityScore = canUseGpu + let compatibilityScore = canUseGpu ? 
( (gpuLayersPoints + allLayersAreOffloadedPoints + contextSizePoints + ramUsageFitsInRamPoints) / (rankPoints.gpuLayers + rankPoints.allLayersAreOffloaded + rankPoints.contextSize + rankPoints.ramUsageFitsInRam) @@ -290,7 +322,21 @@ export class GgufInsightsConfigurationResolver { : ( (contextSizePoints + ramUsageFitsInRamPoints + scoreModelSizeForCpuOnlyUsage(this._ggufInsights.modelSize)) / (rankPoints.contextSize + rankPoints.ramUsageFitsInRam + rankPoints.cpuOnlySmallModelSize)); - const bonusScore = bonusContextSizePoints / rankPoints.bonusContextSize; + let bonusScore = bonusContextSizePoints / rankPoints.bonusContextSize; + + if (!gpuLayersFitMemory || !contextFitsMemory || + estimatedModelResourceUsage.gpuVram + estimatedContextResourceUsage.gpuVram > vramState.total || + estimatedModelResourceUsage.cpuRam + estimatedContextResourceUsage.cpuRam > ramState.total + swapState.total + ) { + const totalVramRequirement = estimatedModelResourceUsage.gpuVram + estimatedContextResourceUsage.gpuVram; + const totalRamRequirement = estimatedModelResourceUsage.cpuRam + estimatedContextResourceUsage.cpuRam; + + compatibilityScore = 0; + bonusScore = ( + (1 - (totalVramRequirement / (vramState.total * maximumUnfitConfigurationResourceMultiplier))) + + (1 - (totalRamRequirement / ((ramState.total + swapState.total) * maximumUnfitConfigurationResourceMultiplier))) + ) / 2; + } return { compatibilityScore, @@ -333,12 +379,19 @@ export class GgufInsightsConfigurationResolver { }); } + /** + * Resolve a context size option for the given options and constraints. + * + * If there's no context size that can fit the available resources, an `InsufficientMemoryError` is thrown. + */ public async resolveContextContextSize(contextSize: LlamaContextOptions["contextSize"], { modelGpuLayers, batchSize, modelTrainContextSize, flashAttention = false, getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), + getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()), + getSwapState = (() => this._ggufInsights._llama._swapOrchestrator.getMemoryState()), llamaGpu = this._ggufInsights._llama.gpu, ignoreMemorySafetyChecks = false, isEmbeddingContext = false, @@ -349,7 +402,9 @@ export class GgufInsightsConfigurationResolver { flashAttention?: boolean, batchSize?: LlamaContextOptions["batchSize"], sequences?: number, - getVramState?(): Promise<{total: number, free: number}>, + getVramState?(): Promise<{total: number, free: number, unifiedSize: number}>, + getRamState?(): Promise<{total: number, free: number}>, + getSwapState?(): Promise<{total: number, free: number}>, llamaGpu?: BuildGpu, ignoreMemorySafetyChecks?: boolean, isEmbeddingContext?: boolean @@ -363,6 +418,8 @@ export class GgufInsightsConfigurationResolver { modelTrainContextSize, flashAttention, getVramState, + getRamState, + getSwapState, llamaGpu, ignoreMemorySafetyChecks, isEmbeddingContext @@ -377,16 +434,16 @@ export class GgufInsightsConfigurationResolver { function scoreModelSizeForCpuOnlyUsage(modelSize: number) { const s1GB = Math.pow(1024, 3); - return 60 - scoreLevels(modelSize, [{ + return 70 - scoreLevels(modelSize, [{ start: s1GB, end: s1GB * 2.5, - points: 40 + points: 46 }, { start: s1GB * 2.5, end: s1GB * 4, - points: 15 + points: 17 }, { start: s1GB * 4, - points: 5 + points: 7 }]); } diff --git a/src/gguf/insights/utils/getRamUsageFromUnifiedVram.ts b/src/gguf/insights/utils/getRamUsageFromUnifiedVram.ts new file mode 100644 index 00000000..7db8a9eb --- /dev/null +++ 
b/src/gguf/insights/utils/getRamUsageFromUnifiedVram.ts @@ -0,0 +1,8 @@ +export function getRamUsageFromUnifiedVram(vramUsage: number, vramState: {total: number, free: number, unifiedSize: number}) { + const onlyVramSize = vramState.total - vramState.unifiedSize; + const existingUsage = Math.max(0, vramState.total - vramState.free); + + const unifiedRamUsage = Math.min(vramState.unifiedSize, Math.max(0, vramUsage - Math.max(0, onlyVramSize - existingUsage))); + + return unifiedRamUsage; +} diff --git a/src/gguf/insights/utils/resolveContextContextSizeOption.ts b/src/gguf/insights/utils/resolveContextContextSizeOption.ts index c4bb5fcf..f800f712 100644 --- a/src/gguf/insights/utils/resolveContextContextSizeOption.ts +++ b/src/gguf/insights/utils/resolveContextContextSizeOption.ts @@ -3,10 +3,15 @@ import {GgufInsights} from "../GgufInsights.js"; import {BuildGpu} from "../../../bindings/types.js"; import {minAllowedContextSizeInCalculations} from "../../../config.js"; import {getDefaultContextBatchSize, getDefaultModelContextSize} from "../../../evaluator/LlamaContext/LlamaContext.js"; +import {InsufficientMemoryError} from "../../../utils/InsufficientMemoryError.js"; +import {getRamUsageFromUnifiedVram} from "./getRamUsageFromUnifiedVram.js"; + +const defaultMaxContextSizeSwapUse = 2048; export async function resolveContextContextSizeOption({ - contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, flashAttention, getVramState, llamaGpu, - ignoreMemorySafetyChecks = false, isEmbeddingContext = false + contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, flashAttention, + getVramState, getRamState, getSwapState, ignoreMemorySafetyChecks = false, isEmbeddingContext = false, + maxContextSizeSwapUse = defaultMaxContextSizeSwapUse }: { contextSize?: LlamaContextOptions["contextSize"], batchSize?: LlamaContextOptions["batchSize"], @@ -15,10 +20,13 @@ export async function resolveContextContextSizeOption({ modelGpuLayers: number, modelTrainContextSize: number, flashAttention: boolean, - getVramState(): Promise<{total: number, free: number}>, + getVramState(): Promise<{total: number, free: number, unifiedSize: number}>, + getRamState(): Promise<{total: number, free: number}>, + getSwapState(): Promise<{total: number, free: number}>, llamaGpu: BuildGpu, ignoreMemorySafetyChecks?: boolean, - isEmbeddingContext?: boolean + isEmbeddingContext?: boolean, + maxContextSizeSwapUse?: number }): Promise { if (contextSize == null) contextSize = "auto"; @@ -29,30 +37,42 @@ export async function resolveContextContextSizeOption({ if (ignoreMemorySafetyChecks) return resolvedContextSize; - const vramState = await getVramState(); - const contextVram = modelFileInsights.estimateContextResourceRequirements({ + const [ + vramState, + ramState, + swapState + ] = await Promise.all([ + getVramState(), + getRamState(), + getSwapState() + ]); + const contextResourceRequirements = modelFileInsights.estimateContextResourceRequirements({ contextSize: resolvedContextSize, batchSize: batchSize ?? getDefaultContextBatchSize({contextSize: resolvedContextSize, sequences}), modelGpuLayers: modelGpuLayers, sequences, flashAttention, isEmbeddingContext - }).gpuVram; + }); - if (contextVram > vramState.free) - throw new Error(`The context size of ${resolvedContextSize}${sequences > 1 ? 
` with ${sequences} sequences` : ""} is too large for the available VRAM`); + if (contextResourceRequirements.gpuVram > vramState.free) + throw new InsufficientMemoryError(`A context size of ${resolvedContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available VRAM`); + else if (contextResourceRequirements.cpuRam > ( + ramState.free + swapState.free - getRamUsageFromUnifiedVram(contextResourceRequirements.gpuVram, vramState) + )) + throw new InsufficientMemoryError(`A context size of ${resolvedContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available RAM${swapState.total > 0 ? " (including swap)" : ""}`); return resolvedContextSize; } else if (contextSize === "auto" || typeof contextSize === "object") { - if (llamaGpu === false) - return modelTrainContextSize; - - const vramState = await getVramState(); - - if (vramState.total === 0) - return modelTrainContextSize; - - const freeVram = vramState.free; + const [ + vramState, + ramState, + swapState + ] = await Promise.all([ + getVramState(), + getRamState(), + getSwapState() + ]); const maxContextSize = contextSize === "auto" ? getDefaultModelContextSize({trainContextSize: modelTrainContextSize}) @@ -71,17 +91,25 @@ export async function resolveContextContextSizeOption({ let highestCompatibleContextSize: number | null = null; let step = -Math.max(1, Math.floor((maxContextSize - minContextSize) / 4)); for (let testContextSize = maxContextSize; testContextSize >= minContextSize && testContextSize <= maxContextSize;) { - const contextVram = modelFileInsights.estimateContextResourceRequirements({ + const contextResourceRequirements = modelFileInsights.estimateContextResourceRequirements({ contextSize: testContextSize, batchSize: batchSize ?? getDefaultContextBatchSize({contextSize: testContextSize, sequences}), modelGpuLayers: modelGpuLayers, sequences, flashAttention, isEmbeddingContext - }).gpuVram; - - if (contextVram <= freeVram) { - if (highestCompatibleContextSize == null || testContextSize > highestCompatibleContextSize) { + }); + + if (contextResourceRequirements.gpuVram <= vramState.free && + contextResourceRequirements.cpuRam <= ( + ramState.free - getRamUsageFromUnifiedVram(contextResourceRequirements.gpuVram, vramState) + ( + testContextSize <= maxContextSizeSwapUse + ? swapState.free + : 0 + ) + ) + ) { + if (highestCompatibleContextSize == null || testContextSize >= highestCompatibleContextSize) { highestCompatibleContextSize = testContextSize; if (step === -1) @@ -111,7 +139,28 @@ export async function resolveContextContextSizeOption({ if (ignoreMemorySafetyChecks) return minContextSize; - throw new Error(`The available VRAM is too small to fit the context size of ${maxContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""}`); + const minContextSizeResourceRequirements = modelFileInsights.estimateContextResourceRequirements({ + contextSize: minContextSize, + batchSize: batchSize ?? getDefaultContextBatchSize({contextSize: minContextSize, sequences}), + modelGpuLayers: modelGpuLayers, + sequences, + flashAttention, + isEmbeddingContext + }); + + const unifiedRamUsage = getRamUsageFromUnifiedVram(minContextSizeResourceRequirements.gpuVram, vramState); + if (minContextSizeResourceRequirements.gpuVram > vramState.free && + minContextSizeResourceRequirements.cpuRam > ramState.free + swapState.free - unifiedRamUsage + ) + throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? 
` with ${sequences} sequences` : ""} is too large for the available VRAM and RAM${swapState.total > 0 ? " (including swap)" : ""}`); + else if (minContextSizeResourceRequirements.gpuVram > vramState.free) + throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available VRAM`); + else if (minContextSizeResourceRequirements.cpuRam > ramState.free + swapState.free - unifiedRamUsage) + throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available RAM${swapState.total > 0 ? " (including swap)" : ""}`); + else if (minContextSizeResourceRequirements.cpuRam > ramState.free - unifiedRamUsage) + throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available RAM`); + else + throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available resources`); } throw new Error(`Invalid context size: "${contextSize}"`); diff --git a/src/gguf/utils/getGgufMetadataKeyValue.ts b/src/gguf/utils/getGgufMetadataKeyValue.ts new file mode 100644 index 00000000..b6ce51df --- /dev/null +++ b/src/gguf/utils/getGgufMetadataKeyValue.ts @@ -0,0 +1,34 @@ +export function getGgufMetadataKeyValue(metadata: Record, key: string) { + return readMedataKey(metadata, key.split(".")); +} + +function readMedataKey(metadata: Record, keyParts: string[]): any { + for (const [metadataKey, value] of Object.entries(metadata)) { + const matchLength = checkMatchLength(metadataKey, keyParts); + if (matchLength === 0) + continue; + + if (matchLength === keyParts.length) + return value; + + const res = readMedataKey(value, keyParts.slice(matchLength)); + if (res !== undefined) + return res; + } + + return undefined; +} + +function checkMatchLength(metadataKey: string, keyParts: string[]) { + const metadataKeyParts = metadataKey.split("."); + + if (metadataKeyParts.length > keyParts.length) + return 0; + + for (let i = 0; i < metadataKeyParts.length; i++) { + if (metadataKeyParts[i] !== keyParts[i]) + return 0; + } + + return metadataKeyParts.length; +} diff --git a/src/utils/findCharacterRemovalCountToFitChatHistoryInContext.ts b/src/utils/findCharacterRemovalCountToFitChatHistoryInContext.ts index c0bd096c..27efd9b5 100644 --- a/src/utils/findCharacterRemovalCountToFitChatHistoryInContext.ts +++ b/src/utils/findCharacterRemovalCountToFitChatHistoryInContext.ts @@ -1,6 +1,8 @@ import {ChatHistoryItem, Tokenizer} from "../types.js"; import {ChatWrapper} from "../ChatWrapper.js"; +const maxSequentialUnhelpfulIterations = 100; + export async function findCharacterRemovalCountToFitChatHistoryInContext({ compressChatHistory, chatHistory, @@ -9,7 +11,8 @@ export async function findCharacterRemovalCountToFitChatHistoryInContext({ chatWrapper, initialCharactersRemovalCount = 0, estimatedCharactersPerToken = 5, - maxDecompressionAttempts = 2 + maxDecompressionAttempts = 2, + failedCompressionErrorMessage = "Failed to compress chat history. Consider increasing the context size." 
}: { compressChatHistory(options: { chatHistory: readonly ChatHistoryItem[], charactersToRemove: number, estimatedCharactersPerToken: number @@ -20,7 +23,8 @@ export async function findCharacterRemovalCountToFitChatHistoryInContext({ chatWrapper: ChatWrapper, initialCharactersRemovalCount?: number, estimatedCharactersPerToken?: number, - maxDecompressionAttempts?: number + maxDecompressionAttempts?: number, + failedCompressionErrorMessage?: string }): Promise<{ removedCharactersCount: number, compressedChatHistory: ChatHistoryItem[] @@ -55,6 +59,8 @@ export async function findCharacterRemovalCountToFitChatHistoryInContext({ let latestCompressionAttempt = await getResultForCharacterRemovalCount(initialCharactersRemovalCount); const firstCompressionAttempt = latestCompressionAttempt; + let latestCompressionAttemptTokensCount = latestCompressionAttempt.tokensCount; + let sameTokensCountRepetitions = 0; if (latestCompressionAttempt.tokensCount === tokensCountToFit || (latestCompressionAttempt.tokensCount < tokensCountToFit && latestCompressionAttempt.characterRemovalCount === 0) @@ -116,6 +122,19 @@ export async function findCharacterRemovalCountToFitChatHistoryInContext({ latestCompressionAttempt.characterRemovalCount < bestCompressionAttempt.characterRemovalCount )) bestCompressionAttempt = latestCompressionAttempt; + + if (latestCompressionAttempt.tokensCount === latestCompressionAttemptTokensCount) + sameTokensCountRepetitions++; + else { + latestCompressionAttemptTokensCount = latestCompressionAttempt.tokensCount; + sameTokensCountRepetitions = 0; + } + + if (decompressionAttempts === 0 && + compressionAttempts >= maxSequentialUnhelpfulIterations && + sameTokensCountRepetitions >= maxSequentialUnhelpfulIterations + ) + throw new Error(failedCompressionErrorMessage); } return { diff --git a/src/utils/truncateTextAndRoundToWords.ts b/src/utils/truncateTextAndRoundToWords.ts index 2f2d5c53..26288ef1 100644 --- a/src/utils/truncateTextAndRoundToWords.ts +++ b/src/utils/truncateTextAndRoundToWords.ts @@ -5,68 +5,132 @@ const truncatePrefix = "..."; /** * Truncate the given text starting from the specified index and try to round to the nearest word. * @param text - The text to truncate and round - * @param truncateStartIndex - The index to start truncating the text at + * @param truncateSize - The size of the text to truncate * @param maxRound - The maximum number of extra characters to delete to round to the nearest word + * @param truncateStart - Whether to truncate from the start of the text. If false, truncate from the end. 
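To make the reworked `truncateTextAndRoundToWords` parameters concrete, a hedged usage sketch (results depend on word boundaries, so no exact outputs are asserted; the relative import is illustrative since the helper is internal):

```ts
// Hedged usage sketch of the reworked truncateTextAndRoundToWords signature:
// the second argument is now a number of characters to remove rather than a
// start index, and `truncateStart` selects which end of the text is trimmed.
import {truncateTextAndRoundToWords} from "./truncateTextAndRoundToWords.js";

const text = "The quick brown fox jumps over the lazy dog";

// remove ~10 characters from the end (the default), rounding to a word boundary
const trimmedEnd = truncateTextAndRoundToWords(text, 10);

// remove ~10 characters from the start instead
const trimmedStart = truncateTextAndRoundToWords(text, 10, undefined, true);

console.log({trimmedEnd, trimmedStart});
```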
* @returns - The truncated and rounded text */ -export function truncateTextAndRoundToWords(text: string, truncateStartIndex: number, maxRound: number = 6): string { - const res = text.slice(truncateStartIndex); +export function truncateTextAndRoundToWords( + text: string, truncateSize: number, maxRound: number = 6, truncateStart: boolean = false +): string { + if (truncateStart) { + const res = text.slice(truncateSize); - if (res.length === 0) - return res; + if (res.length === 0) + return res; - if (truncateStartIndex === 0 || text[truncateStartIndex - 1] === " ") - return res; + if (truncateSize === 0 || text[truncateSize - 1] === " ") + return res; - const nextSpaceIndex = res.indexOf(" "); + const nextSpaceIndex = res.indexOf(" "); - if (nextSpaceIndex < 0) { - if (res.length <= maxRound || res.length < truncatePrefix.length) + if (nextSpaceIndex < 0) { + if (res.length <= maxRound || res.length < truncatePrefix.length) + return ""; + + return truncatePrefix + res.slice(truncatePrefix.length); + } + + if (nextSpaceIndex <= maxRound) + return res.slice(nextSpaceIndex + 1); + + if (res.length < truncatePrefix.length) return ""; return truncatePrefix + res.slice(truncatePrefix.length); - } + } else { + const res = text.slice(0, -truncateSize); - if (nextSpaceIndex <= maxRound) - return res.slice(nextSpaceIndex + 1); + if (res.length === 0) + return res; - if (res.length < truncatePrefix.length) - return ""; + if (truncateSize === 0 || (text.length === res.length || text[res.length] === " ")) + return res; - return truncatePrefix + res.slice(truncatePrefix.length); -} + const nextSpaceIndex = res.lastIndexOf(" "); -export function truncateLlamaTextAndRoundToWords(llamaText: LlamaText, truncateStartIndex: number, maxRound: number = 6): LlamaText { - if (truncateStartIndex <= 0) - return llamaText; + if (nextSpaceIndex < 0) { + if (res.length <= maxRound || res.length < truncatePrefix.length) + return ""; - for (let i = 0; i < llamaText.values.length; i++) { - const value = llamaText.values[i]; + return res.slice(truncatePrefix.length) + truncatePrefix; + } - if (value == null) - continue; + if (nextSpaceIndex <= maxRound) + return res.slice(0, nextSpaceIndex); - if (typeof value === "string") { - if (value.length > truncateStartIndex) { - return LlamaText([ - truncateTextAndRoundToWords(value, truncateStartIndex, maxRound), - ...llamaText.values.slice(i + 1) - ]); - } + if (res.length < truncatePrefix.length) + return ""; - truncateStartIndex -= value.length; - } else if (value instanceof SpecialToken) { - truncateStartIndex--; - if (truncateStartIndex <= 0) - return LlamaText(llamaText.values.slice(i + 1)); - } else { - void (value satisfies SpecialTokensText); + return res.slice(truncatePrefix.length) + truncatePrefix; + } +} - // SpecialTokensText shouldn't be truncated - if (value.value.length > truncateStartIndex) - return LlamaText(llamaText.values.slice(i + 1)); +export function truncateLlamaTextAndRoundToWords( + llamaText: LlamaText, truncateSize: number, maxRound: number = 6, truncateStart: boolean = false +): LlamaText { + if (truncateSize <= 0) + return llamaText; - truncateStartIndex -= value.value.length; + if (truncateStart) { + for (let i = 0; i < llamaText.values.length; i++) { + const value = llamaText.values[i]; + + if (value == null) + continue; + + if (typeof value === "string") { + if (value.length > truncateSize) { + return LlamaText([ + truncateTextAndRoundToWords(value, truncateSize, maxRound, true), + ...llamaText.values.slice(i + 1) + ]); + } + + truncateSize -= 
value.length; + } else if (value instanceof SpecialToken) { + truncateSize--; + if (truncateSize <= 0) + return LlamaText(llamaText.values.slice(i + 1)); + } else { + void (value satisfies SpecialTokensText); + + // SpecialTokensText shouldn't be truncated + if (value.value.length > truncateSize) + return LlamaText(llamaText.values.slice(i + 1)); + + truncateSize -= value.value.length; + } + } + } else { + for (let i = llamaText.values.length - 1; i >= 0; i--) { + const value = llamaText.values[i]; + + if (value == null) + continue; + + if (typeof value === "string") { + if (value.length > truncateSize) { + return LlamaText([ + ...llamaText.values.slice(0, i), + truncateTextAndRoundToWords(value, truncateSize, maxRound, false) + ]); + } + + truncateSize -= value.length; + } else if (value instanceof SpecialToken) { + truncateSize--; + if (truncateSize <= 0) + return LlamaText(llamaText.values.slice(0, i)); + } else { + void (value satisfies SpecialTokensText); + + // SpecialTokensText shouldn't be truncated + if (value.value.length > truncateSize) + return LlamaText(llamaText.values.slice(0, i)); + + truncateSize -= value.value.length; + } } } diff --git a/test/modelDependent/codegemma/completion.test.ts b/test/modelDependent/codegemma/completion.test.ts index 67deed2d..bed1afc2 100644 --- a/test/modelDependent/codegemma/completion.test.ts +++ b/test/modelDependent/codegemma/completion.test.ts @@ -19,13 +19,13 @@ describe("CodeGemma", () => { contextSequence: context.getSequence() }); - const res = await completion.generateCompletion("Here is a list of sweet fruits:\n* ", { - maxTokens: 10 + const res = await completion.generateCompletion("Sweet fruit names:\n* ", { + maxTokens: 10, + seed: 30 }); expect(res).toMatchInlineSnapshot(` - "🍎 - * 🍊 - * 🍋 + "1. Apple + * 2. 
Banana " `); }); diff --git a/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts b/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts index 88cfa250..4d5ebb5c 100644 --- a/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts +++ b/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts @@ -1,4 +1,4 @@ -import {describe, expect, it} from "vitest"; +import {describe, expect, it, test} from "vitest"; import {getModelFile} from "../../utils/modelFiles.js"; import {getTestLlama} from "../../utils/getTestLlama.js"; import {LlamaModelOptions, readGgufFileInfo} from "../../../src/index.js"; @@ -18,9 +18,15 @@ describe("functionary", () => { const s1GB = Math.pow(1024, 3); async function resolveGpuLayers(gpuLayers: LlamaModelOptions["gpuLayers"], { - totalVram, freeVram, ignoreMemorySafetyChecks = false, llamaGpu = "metal" + totalVram, freeVram, unifiedMemorySize = 0, + totalRam = 0, freeRam = 0, + totalSwap = 0, freeSwap = 0, + ignoreMemorySafetyChecks = false, llamaGpu = "metal" }: { - totalVram: number, freeVram: number, ignoreMemorySafetyChecks?: boolean, llamaGpu?: BuildGpu + totalVram: number, freeVram: number, unifiedMemorySize?: number, + totalRam?: number, freeRam?: number, + totalSwap?: number, freeSwap?: number, + ignoreMemorySafetyChecks?: boolean, llamaGpu?: BuildGpu }) { const resolvedGpuLayers = await ggufInsights.configurationResolver.resolveModelGpuLayers(gpuLayers, { ignoreMemorySafetyChecks, @@ -34,27 +40,31 @@ describe("functionary", () => { }); async function resolveAutoContextSize() { - const modelVram = ggufInsights.estimateModelResourceRequirements({ - gpuLayers: resolvedGpuLayers - }).gpuVram; + const resolvedConfig = await ggufInsights.configurationResolver.resolveAndScoreConfig({ + targetGpuLayers: resolvedGpuLayers + }, { + llamaGpu, + getVramState: async () => ({ + total: llamaGpu === false ? 0 : totalVram, + free: llamaGpu === false ? 0 : freeVram, + unifiedSize: unifiedMemorySize + }), + getRamState: async () => ({ + total: totalRam, + free: freeRam + }), + getSwapState: async () => ({ + total: totalSwap, + free: freeSwap + }), + llamaSupportsGpuOffloading: llamaGpu !== false, + llamaVramPaddingSize: defaultLlamaVramPadding(llamaGpu === false ? 0 : totalVram) + }); - try { - return await ggufInsights.configurationResolver.resolveContextContextSize("auto", { - batchSize: undefined, - sequences: 1, - modelGpuLayers: resolvedGpuLayers, - modelTrainContextSize: ggufInsights.trainContextSize ?? 4096, - getVramState: async () => ({ - total: llamaGpu === false ? 0 : totalVram, - free: llamaGpu === false ? 
0 : (freeVram - modelVram) - }), - llamaGpu, - ignoreMemorySafetyChecks: false, - isEmbeddingContext: false - }); - } catch (err) { + if (resolvedConfig.compatibilityScore === 0) return null; - } + + return resolvedConfig.resolvedValues.contextSize; } return { @@ -63,403 +73,792 @@ describe("functionary", () => { }; } - it("attempts to resolve 0 gpuLayers", async () => { - { - const res = await resolveGpuLayers(0, { - totalVram: s1GB * 6, - freeVram: s1GB * 1 - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers(0, { - totalVram: s1GB * 6, - freeVram: s1GB * 0 - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } + describe("attempts to resolve 0 gpuLayers", () => { + test("no RAM", async () => { + { + const res = await resolveGpuLayers(0, { + totalVram: s1GB * 6, + freeVram: s1GB * 1 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + { + const res = await resolveGpuLayers(0, { + totalVram: s1GB * 6, + freeVram: s1GB * 0 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } - { - const res = await resolveGpuLayers(0, { - totalVram: 0, - freeVram: 0, - llamaGpu: false - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } + { + const res = await resolveGpuLayers(0, { + totalVram: 0, + freeVram: 0, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + }); + + test("some RAM", async () => { + { + const res = await resolveGpuLayers(0, { + totalVram: s1GB * 6, + freeVram: s1GB * 1, + totalRam: s1GB * 6, + freeRam: s1GB * 6 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + { + const res = await resolveGpuLayers(0, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 6, + freeRam: s1GB * 0 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + + { + const res = await resolveGpuLayers(0, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 0, + freeRam: s1GB * 0, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + }); + + test("with swap", async () => { + { + const res = await resolveGpuLayers(0, { + totalVram: s1GB * 6, + freeVram: s1GB * 1, + totalRam: s1GB * 6, + freeRam: s1GB * 5, + totalSwap: s1GB * 6, + freeSwap: s1GB * 1 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + } + { + const res = await resolveGpuLayers(0, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 6, + freeRam: s1GB * 4, + totalSwap: s1GB * 6, + freeSwap: s1GB * 1 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + } + + { + const res = await resolveGpuLayers(0, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 0, + freeRam: s1GB * 0, + totalSwap: s1GB * 0, + freeSwap: s1GB * 0, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + }); }); - it("attempts to resolve 16 gpuLayers", async () => { - { - const res = await resolveGpuLayers(16, { - totalVram: s1GB * 6, - freeVram: s1GB * 3 - }); - expect(res.gpuLayers).to.eql(16); - 
expect(res.contextSize).to.toMatchInlineSnapshot("1924"); - } - try { - await resolveGpuLayers(16, { - totalVram: s1GB * 6, - freeVram: s1GB * 0 - }); - expect.unreachable("Should have thrown an error"); - } catch (err) { - expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); - } - try { - await resolveGpuLayers(16, { - totalVram: s1GB * 6, - freeVram: s1GB * 0.2 - }); - expect.unreachable("Should have thrown an error"); - } catch (err) { - expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); - } - { - const res = await resolveGpuLayers(16, { - totalVram: s1GB * 6, + describe("attempts to resolve 16 gpuLayers", () => { + test("no RAM", async () => { + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 3 + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + try { + await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 0 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + try { + await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, - // play with this number to make the test pass, it should be low enough so that there won't be any VRAM left - // to create a context - freeVram: s1GB * 0.2, + // play with this number to make the test pass, it should be low enough so that there won't be any VRAM left + // to create a context + freeVram: s1GB * 0.2, - ignoreMemorySafetyChecks: true - }); - expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.eql(null); - } + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.eql(null); + } - { - const res = await resolveGpuLayers(16, { - totalVram: 0, - freeVram: 0, - llamaGpu: false - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers(16, { - totalVram: 0, - freeVram: 0, - llamaGpu: false, - ignoreMemorySafetyChecks: true - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - }); + { + const res = await resolveGpuLayers(16, { + totalVram: 0, + freeVram: 0, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + { + const res = await resolveGpuLayers(16, { + totalVram: 0, + freeVram: 0, + llamaGpu: false, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + }); - it("attempts to resolve 32 gpuLayers", async () => { - { - const res = await resolveGpuLayers(32, { - totalVram: s1GB * 6, - freeVram: s1GB * 6 - }); - expect(res.gpuLayers).to.eql(32); - expect(res.contextSize).to.toMatchInlineSnapshot("7562"); - } - try { - await resolveGpuLayers(32, { - totalVram: s1GB * 6, - freeVram: s1GB * 0.2 - }); - expect.unreachable("Should have thrown an error"); - } catch (err) { - expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); - } 
- { - const res = await resolveGpuLayers(32, { - totalVram: s1GB * 6, - freeVram: s1GB * 0, - ignoreMemorySafetyChecks: true - }); - expect(res.gpuLayers).to.eql(32); - expect(res.contextSize).to.toMatchInlineSnapshot("null"); - } + test("some RAM", async () => { + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 3, + totalRam: s1GB * 3, + freeRam: s1GB * 2 + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.toMatchInlineSnapshot("1924"); + } + try { + await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 3, + freeRam: s1GB * 2 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + try { + await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2, + totalRam: s1GB * 3, + freeRam: s1GB * 2 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, - { - const res = await resolveGpuLayers(32, { - totalVram: 0, - freeVram: 0, - llamaGpu: false - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers(32, { - totalVram: 0, - freeVram: 0, - llamaGpu: false, - ignoreMemorySafetyChecks: true - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - }); + // play with this number to make the test pass, it should be low enough so that there won't be any VRAM left + // to create a context + freeVram: s1GB * 0.2, - it("attempts to resolve 33 gpuLayers", async () => { - { - const res = await resolveGpuLayers(33, { - totalVram: s1GB * 6, - freeVram: s1GB * 6 - }); - expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("7562"); - } - try { - await resolveGpuLayers(33, { - totalVram: s1GB * 6, - freeVram: s1GB * 0.2 - }); - expect.unreachable("Should have thrown an error"); - } catch (err) { - expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); - } - { - const res = await resolveGpuLayers(33, { - totalVram: s1GB * 6, - freeVram: s1GB * 0.2, - ignoreMemorySafetyChecks: true - }); - expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("null"); - } + totalRam: s1GB * 3, + freeRam: s1GB * 2, - { - const res = await resolveGpuLayers(33, { - totalVram: 0, - freeVram: 0, - llamaGpu: false - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers(33, { - totalVram: 0, - freeVram: 0, - llamaGpu: false, - ignoreMemorySafetyChecks: true - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.eql(null); + } + + + { + const res = await resolveGpuLayers(16, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 7, + freeRam: s1GB * 7, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers(16, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 7, + freeRam: s1GB * 6, + llamaGpu: 
false, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + }); + + test("some unified RAM", async () => { + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: s1GB * 6, + freeRam: s1GB * 6, + unifiedMemorySize: s1GB * 6 + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.toMatchInlineSnapshot("7411"); + } + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: s1GB * 6, + freeRam: s1GB * 5, + unifiedMemorySize: s1GB * 6 + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.toMatchInlineSnapshot("2168"); + } + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: s1GB * 6, + freeRam: s1GB * 5, + unifiedMemorySize: s1GB * 5 + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.toMatchInlineSnapshot("7411"); + } + try { + await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 3, + freeRam: s1GB * 2, + unifiedMemorySize: s1GB * 6 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + try { + await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2, + totalRam: s1GB * 3, + freeRam: s1GB * 2, + unifiedMemorySize: s1GB * 6 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, + + // play with this number to make the test pass, it should be low enough so that there won't be any VRAM left + // to create a context + freeVram: s1GB * 0.2, + + totalRam: s1GB * 3, + freeRam: s1GB * 2, + unifiedMemorySize: s1GB * 6, + + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.eql(null); + } + + + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 5.4, + unifiedMemorySize: s1GB * 6, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("4352"); + } + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 5, + unifiedMemorySize: s1GB * 6, + llamaGpu: false, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + } + }); + + test("with swap", async () => { + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 3, + totalRam: s1GB * 3, + freeRam: s1GB * 2, + totalSwap: s1GB * 6, + freeSwap: s1GB * 3 + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.toMatchInlineSnapshot("1924"); + } + try { + await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 3, + freeRam: s1GB * 2, + totalSwap: s1GB * 6, + freeSwap: s1GB * 3 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + try { + await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2, + totalRam: s1GB * 3, + freeRam: s1GB * 
2, + totalSwap: s1GB * 6, + freeSwap: s1GB * 3 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, + + // play with this number to make the test pass, it should be low enough so that there won't be any VRAM left + // to create a context + freeVram: s1GB * 0.2, + + totalRam: s1GB * 3, + freeRam: s1GB * 2, + totalSwap: s1GB * 6, + freeSwap: s1GB * 3, + + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.eql(null); + } + + + { + const res = await resolveGpuLayers(16, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 3, + totalSwap: s1GB * 6, + freeSwap: s1GB * 3, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + } + { + const res = await resolveGpuLayers(16, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 0, + totalSwap: s1GB * 6, + freeSwap: s1GB * 3, + llamaGpu: false, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + }); }); - it('attempts to resolve "max"', async () => { - try { - await resolveGpuLayers("max", { - totalVram: s1GB * 6, - freeVram: s1GB * 0 - }); - expect.unreachable("Should have thrown an error"); - } catch (err) { - expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); - } + describe("attempts to resolve 32 gpuLayers", () => { + it("no RAM", async () => { + { + const res = await resolveGpuLayers(32, { + totalVram: s1GB * 6, + freeVram: s1GB * 6 + }); + expect(res.gpuLayers).to.eql(32); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + try { + await resolveGpuLayers(32, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + { + const res = await resolveGpuLayers(32, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(32); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } - try { - await resolveGpuLayers("max", { - totalVram: s1GB * 6, - freeVram: s1GB * 0.2 - }); - expect.unreachable("Should have thrown an error"); - } catch (err) { - expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); - } + { + const res = await resolveGpuLayers(32, { + totalVram: 0, + freeVram: 0, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + { + const res = await resolveGpuLayers(32, { + totalVram: 0, + freeVram: 0, + llamaGpu: false, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + }); - try { - await resolveGpuLayers("max", { - totalVram: s1GB * 6, - freeVram: s1GB * 3.2 - }); - expect.unreachable("Should have thrown an error"); - } catch (err) { - expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); - } + it("some RAM", async () => { + { + const res = await resolveGpuLayers(32, { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: 
s1GB * 6, + freeRam: s1GB * 6 + }); + expect(res.gpuLayers).to.eql(32); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + try { + await resolveGpuLayers(32, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2, + totalRam: s1GB * 6, + freeRam: s1GB * 0.2 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + { + const res = await resolveGpuLayers(32, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 6, + freeRam: s1GB * 0, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(32); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } - { - const res = await resolveGpuLayers("max", { - totalVram: s1GB * 6, - freeVram: s1GB * 1.2, - ignoreMemorySafetyChecks: true - }); - expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("null"); - }{ - const res = await resolveGpuLayers("max", { - totalVram: s1GB * 6, - freeVram: s1GB * 4.7 - }); - expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("607"); - } - { - const res = await resolveGpuLayers("max", { - totalVram: s1GB * 6, - freeVram: s1GB * 4.8 - }); - expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("1142"); - } + { + const res = await resolveGpuLayers(32, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 5, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + }{ + const res = await resolveGpuLayers(32, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 4.8, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("1143"); + } + { + const res = await resolveGpuLayers(32, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 4, + llamaGpu: false, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + }); }); - it('attempts to resolve "auto"', async () => { - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: 0 - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 0.4 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 1.4 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("1"); - expect(res.contextSize).to.toMatchInlineSnapshot("5192"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 1.8 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("5164"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 2.4 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("6"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 3.1 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("11"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers("auto", { - 
totalVram: s1GB * 6, - freeVram: s1GB * 3.3 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("12"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 3.5 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("14"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 3.8 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 4 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 4.3 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 4.5 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); - expect(res.contextSize).to.toMatchInlineSnapshot("8076"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 4.8 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("23"); - expect(res.contextSize).to.toMatchInlineSnapshot("8140"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 5.2 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("3282"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 5.8 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("6492"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 6 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7562"); - } + describe("attempts to resolve 33 gpuLayers", () => { + test("no RAM", async () => { + { + const res = await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 6 + }); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + try { + await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + + { + const res = await resolveGpuLayers(33, { + totalVram: 0, + freeVram: 0, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: 0, + freeVram: 0, + llamaGpu: false, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + }); + + test("some RAM", async () => { + { + const res = await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: s1GB * 
6, + freeRam: s1GB * 6 + }); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: s1GB * 6, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: s1GB * 6, + freeRam: s1GB * 4 + }); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + try { + await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2, + totalRam: s1GB * 6, + freeRam: s1GB * 6 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2, + totalRam: s1GB * 6, + freeRam: s1GB * 6, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + + { + const res = await resolveGpuLayers(33, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 4, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 4.8, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("1143"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 5, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 6, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 6, + llamaGpu: false, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + }); + + test("some unified RAM", async () => { + { + const res = await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: s1GB * 6, + freeRam: s1GB * 6, + unifiedMemorySize: s1GB * 6 + }); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: s1GB * 6, + freeRam: s1GB * 5.4, + unifiedMemorySize: s1GB * 6 + }); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("4352"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: s1GB * 6, + freeRam: s1GB * 4.8, + unifiedMemorySize: s1GB * 6 + }); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("1142"); + } + try { + await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2, + totalRam: s1GB * 6, + freeRam: s1GB * 6, + unifiedMemorySize: s1GB * 6 + }); + expect.unreachable("Should have thrown an error"); + } 
catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2, + totalRam: s1GB * 6, + freeRam: s1GB * 6, + unifiedMemorySize: s1GB * 6, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + }); }); - it("attempts to resolve {min?: number, max?: number}", async () => { - { - const res = await resolveGpuLayers({max: 4}, { - totalVram: s1GB * 6, - freeVram: s1GB * 0 - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers({min: 0, max: 4}, { - totalVram: s1GB * 6, - freeVram: s1GB * 0 - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } + it('attempts to resolve "max"', async () => { try { - await resolveGpuLayers({min: 2}, { + await resolveGpuLayers("max", { totalVram: s1GB * 6, freeVram: s1GB * 0 }); @@ -467,137 +866,766 @@ describe("functionary", () => { } catch (err) { expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); } + try { - await resolveGpuLayers({min: 2, max: 4}, { + await resolveGpuLayers("max", { totalVram: s1GB * 6, - freeVram: s1GB * 0 + freeVram: s1GB * 0.2 }); expect.unreachable("Should have thrown an error"); } catch (err) { expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); } - { - const res = await resolveGpuLayers({max: 16}, { - totalVram: s1GB * 6, - freeVram: s1GB * 3.8 - }); - expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } try { - await resolveGpuLayers({min: 16}, { + await resolveGpuLayers("max", { totalVram: s1GB * 6, - freeVram: s1GB * 2 + freeVram: s1GB * 3.2 }); expect.unreachable("Should have thrown an error"); } catch (err) { expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); } + { - const res = await resolveGpuLayers({min: 16}, { + const res = await resolveGpuLayers("max", { totalVram: s1GB * 6, - freeVram: s1GB * 4 + freeVram: s1GB * 1.2, + ignoreMemorySafetyChecks: true }); - expect(res.gpuLayers).to.be.gte(16); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); } { - const res = await resolveGpuLayers({min: 16, max: 24}, { + const res = await resolveGpuLayers("max", { totalVram: s1GB * 6, - freeVram: s1GB * 4 + freeVram: s1GB * 4.7 }); - expect(res.gpuLayers).to.be.gte(16); - expect(res.gpuLayers).to.be.lte(24); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("607"); } { - const res = await resolveGpuLayers({min: 16, max: 24}, { + const res = await resolveGpuLayers("max", { totalVram: s1GB * 6, - freeVram: s1GB * 3 + freeVram: s1GB * 4.8 }); - expect(res.gpuLayers).to.be.gte(16); - expect(res.gpuLayers).to.be.lte(24); - expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("1924"); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("1142"); } }); - it("attempts to 
resolve {fitContext?: {contextSize?: number}}", async () => { - { - const contextSize = 4096; - const res = await resolveGpuLayers({fitContext: {contextSize}}, { - totalVram: 0, - freeVram: 0, - llamaGpu: false - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - expect(res.contextSize).to.be.gte(contextSize); - } - { - const contextSize = 4096; - const res = await resolveGpuLayers({fitContext: {contextSize}}, { - totalVram: s1GB * 6, - freeVram: s1GB * 4 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); - expect(res.contextSize).to.toMatchInlineSnapshot("5561"); - expect(res.contextSize).to.be.gte(contextSize); - } - { - const contextSize = 4096; - const res = await resolveGpuLayers({fitContext: {contextSize}}, { - totalVram: s1GB * 2, - freeVram: s1GB * 1.8 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("5164"); - expect(res.contextSize).to.be.gte(contextSize); - } - { - const contextSize = 8192; - const res = await resolveGpuLayers({fitContext: {contextSize}}, { - totalVram: s1GB * 6, - freeVram: s1GB * 4 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - expect(res.contextSize).to.be.gte(contextSize); - } - { - const contextSize = 8192; - const res = await resolveGpuLayers({fitContext: {contextSize}}, { - totalVram: s1GB * 1, - freeVram: s1GB * 1.8 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("2"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - expect(res.contextSize).to.be.gte(contextSize); - } - { - const contextSize = 8192; - const res = await resolveGpuLayers({fitContext: {contextSize}}, { - totalVram: s1GB * 0, - freeVram: s1GB * 0 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - expect(res.contextSize).to.be.gte(contextSize); - } - { + describe('attempts to resolve "auto"', () => { + test("8GB RAM", async () => { + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: 0, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 0.4, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 1.4, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("1"); + expect(res.contextSize).to.toMatchInlineSnapshot("5192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 1.8, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); + expect(res.contextSize).to.toMatchInlineSnapshot("5164"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 2.4, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("6"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 3.1, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + 
expect(res.gpuLayers).to.toMatchInlineSnapshot("11"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 3.3, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("12"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 3.5, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("14"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 3.8, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 4, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 4.3, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 4.5, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); + expect(res.contextSize).to.toMatchInlineSnapshot("8076"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 4.8, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("23"); + expect(res.contextSize).to.toMatchInlineSnapshot("8140"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 5.2, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); + expect(res.contextSize).to.toMatchInlineSnapshot("3282"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 5.8, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); + expect(res.contextSize).to.toMatchInlineSnapshot("6492"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + }); + + test("5GB RAM", async () => { + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: 0, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 0.4, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); + expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 1.4, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("1"); + 
expect(res.contextSize).to.toMatchInlineSnapshot("5192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 1.8, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); + expect(res.contextSize).to.toMatchInlineSnapshot("5164"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 2.4, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("6"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 3.1, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("11"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 3.3, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("12"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 3.5, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("14"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 3.8, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 4, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 4.3, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 4.5, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); + expect(res.contextSize).to.toMatchInlineSnapshot("8076"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 4.8, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("23"); + expect(res.contextSize).to.toMatchInlineSnapshot("8140"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 5.2, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); + expect(res.contextSize).to.toMatchInlineSnapshot("3282"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 5.8, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); + expect(res.contextSize).to.toMatchInlineSnapshot("6492"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + }); + }); + + describe("attempts to 
resolve {min?: number, max?: number}", () => { + test("8GB RAM", async () => { + { + const res = await resolveGpuLayers({max: 4}, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers({min: 0, max: 4}, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } try { - await resolveGpuLayers({min: 1, fitContext: {contextSize: 8192}}, { - totalVram: s1GB * 0.2, - freeVram: s1GB * 0 + await resolveGpuLayers({min: 2}, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 8, + freeRam: s1GB * 8 }); expect.unreachable("Should have thrown an error"); } catch (err) { expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); } - } + try { + await resolveGpuLayers({min: 2, max: 4}, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + + { + const res = await resolveGpuLayers({max: 16}, { + totalVram: s1GB * 6, + freeVram: s1GB * 3.8, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + try { + await resolveGpuLayers({min: 16}, { + totalVram: s1GB * 6, + freeVram: s1GB * 2, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + { + const res = await resolveGpuLayers({min: 16}, { + totalVram: s1GB * 6, + freeVram: s1GB * 4, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.be.gte(16); + expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers({min: 16, max: 24}, { + totalVram: s1GB * 6, + freeVram: s1GB * 4, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.be.gte(16); + expect(res.gpuLayers).to.be.lte(24); + expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers({min: 16, max: 24}, { + totalVram: s1GB * 6, + freeVram: s1GB * 3, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.be.gte(16); + expect(res.gpuLayers).to.be.lte(24); + expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); + expect(res.contextSize).to.toMatchInlineSnapshot("1924"); + } + }); + + test("5GB RAM", async () => { + { + const res = await resolveGpuLayers({max: 4}, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + } + { + const res = await resolveGpuLayers({min: 0, max: 4}, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + } + try { + await resolveGpuLayers({min: 2}, { + totalVram: s1GB * 6, + freeVram: s1GB * 
0, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + try { + await resolveGpuLayers({min: 2, max: 4}, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + + { + const res = await resolveGpuLayers({max: 16}, { + totalVram: s1GB * 6, + freeVram: s1GB * 3.8, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + try { + await resolveGpuLayers({min: 16}, { + totalVram: s1GB * 6, + freeVram: s1GB * 2, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + { + const res = await resolveGpuLayers({min: 16}, { + totalVram: s1GB * 6, + freeVram: s1GB * 4, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.be.gte(16); + expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers({min: 16, max: 24}, { + totalVram: s1GB * 6, + freeVram: s1GB * 4, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.be.gte(16); + expect(res.gpuLayers).to.be.lte(24); + expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers({min: 16, max: 24}, { + totalVram: s1GB * 6, + freeVram: s1GB * 3, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.be.gte(16); + expect(res.gpuLayers).to.be.lte(24); + expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); + expect(res.contextSize).to.toMatchInlineSnapshot("1924"); + } + }); + }); + + describe("attempts to resolve {fitContext?: {contextSize?: number}}", () => { + test("8GB RAM", async () => { + { + const contextSize = 4096; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: 0, + freeVram: 0, + llamaGpu: false, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + const contextSize = 4096; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: s1GB * 6, + freeVram: s1GB * 4, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); + expect(res.contextSize).to.toMatchInlineSnapshot("5561"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + const contextSize = 4096; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: s1GB * 2, + freeVram: s1GB * 1.8, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); + expect(res.contextSize).to.toMatchInlineSnapshot("5164"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + const contextSize = 8192; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: s1GB * 6, + freeVram: s1GB * 4, + totalRam: s1GB * 8, + freeRam: 
s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + const contextSize = 8192; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: s1GB * 2, + freeVram: s1GB * 1.9, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("2"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + const contextSize = 8192; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: s1GB * 0, + freeVram: s1GB * 0, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + try { + await resolveGpuLayers({min: 1, fitContext: {contextSize: 8192}}, { + totalVram: s1GB * 0.2, + freeVram: s1GB * 0, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + } + }); + + test("7GB RAM", async () => { + { + const contextSize = 4096; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: 0, + freeVram: 0, + llamaGpu: false, + totalRam: s1GB * 7, + freeRam: s1GB * 7 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + const contextSize = 4096; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: s1GB * 7, + freeVram: s1GB * 4, + totalRam: s1GB * 7, + freeRam: s1GB * 7 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); + expect(res.contextSize).to.toMatchInlineSnapshot("6548"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + const contextSize = 4096; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: s1GB * 2, + freeVram: s1GB * 1.8, + totalRam: s1GB * 7, + freeRam: s1GB * 7 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); + expect(res.contextSize).to.toMatchInlineSnapshot("5164"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + const contextSize = 8192; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: s1GB * 7, + freeVram: s1GB * 4, + totalRam: s1GB * 7, + freeRam: s1GB * 7 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + const contextSize = 8192; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: s1GB * 2, + freeVram: s1GB * 1.9, + totalRam: s1GB * 7, + freeRam: s1GB * 7 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("2"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + const contextSize = 8192; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: s1GB * 0, + freeVram: s1GB * 0, + totalRam: s1GB * 7, + freeRam: s1GB * 7 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + try { + await resolveGpuLayers({min: 1, 
fitContext: {contextSize: 8192}}, { + totalVram: s1GB * 0.2, + freeVram: s1GB * 0, + totalRam: s1GB * 7, + freeRam: s1GB * 7 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + } + }); }); }); }); diff --git a/test/modelDependent/llama3/chatSession.test.ts b/test/modelDependent/llama3/chatSession.test.ts index e083dc57..8252da13 100644 --- a/test/modelDependent/llama3/chatSession.test.ts +++ b/test/modelDependent/llama3/chatSession.test.ts @@ -108,6 +108,80 @@ describe("llama 3", () => { expect(completion).to.eql(" it is."); }); + test("prompt longer than context size incurs context shift", {timeout: 1000 * 60 * 60 * 2}, async () => { + const contextSize = 128; + + const modelPath = await getModelFile("Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"); + const llama = await getTestLlama(); + + const model = await llama.loadModel({ + modelPath + }); + const context = await model.createContext({ + contextSize + }); + const chatSession = new LlamaChatSession({ + contextSequence: context.getSequence(), + systemPrompt: "You are a helpful, respectful and honest biologist. " + + "Always answer as helpfully as possible with extensive detail." + }); + const prompt = "Describe the appearance of a llama and explain what it is. " + + "Include as much detail as possible with detailed examples and explanations, including its physical appearance, " + + "habitat, diet, social structure, and any other relevant information. " + + "Do not assume any prior knowledge on the part of the reader, and always provide detailed explanations as you describe the animal. " + + "Remember to be as helpful and detailed as possible in your response and your role in great importance in educating the reader. " + + "Assume that the reader is a student who is eager to learn and is looking to you for guidance and information, and always provide the best possible information you can. " + + "Do not provide any false or misleading information, and always be honest and respectful in your responses."; + + const initialContextState = chatSession.chatWrapper.generateContextState({ + chatHistory: [...chatSession.getChatHistory(), { + type: "user", + text: prompt + }, { + type: "model", + response: [""] + }] + }); + const initialContextStateTokens = initialContextState.contextText.tokenize(model.tokenizer); + + expect(initialContextStateTokens.length).to.be.gt(contextSize); + + const res = await chatSession.prompt(prompt, {maxTokens: contextSize}); + expect(res.length).to.be.gte(20); + + // ensure there's no repetition of the first part + const firstPart = res.slice(0, 12); + const firstPartOccurrences = res.split(firstPart).length - 1; + expect(firstPartOccurrences).to.eql(1); + }); + + test("using response prefix", {timeout: 1000 * 60 * 60 * 2}, async () => { + const modelPath = await getModelFile("Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"); + const llama = await getTestLlama(); + + const model = await llama.loadModel({ + modelPath + }); + const context = await model.createContext({ + contextSize: 2048 + }); + const chatSession = new LlamaChatSession({ + contextSequence: context.getSequence() + }); + + expect(chatSession.chatWrapper).to.be.an.instanceof(Llama3ChatWrapper); + + const prompt = "Describe the appearance of a llama"; + const responsePrefix = "Of course! 
A llama is"; + const res = await chatSession.prompt(prompt, { + responsePrefix, + maxTokens: 10 + }); + + expect(res.startsWith(responsePrefix)).to.eql(true); + expect(res).toMatchInlineSnapshot('"Of course! A llama is a domesticated mammal that belongs to the camel"'); + }); + // disabled due to getting timeout in the CI due to taking too long test.skip("context shift works correctly", {timeout: 1000 * 60 * 60 * 2}, async () => { const contextSize = 2048; diff --git a/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts b/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts index a8ea5834..13539a84 100644 --- a/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts +++ b/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts @@ -18,9 +18,15 @@ describe("stableCode", () => { const s1GB = Math.pow(1024, 3); async function resolveGpuLayers(gpuLayers: LlamaModelOptions["gpuLayers"], { - totalVram, freeVram, ignoreMemorySafetyChecks = false, llamaGpu = "metal" + totalVram, freeVram, unifiedMemorySize = 0, + totalRam = s1GB * 10, freeRam = s1GB * 10, // TODO: update all tests to test different RAM sizes + totalSwap = 0, freeSwap = 0, + ignoreMemorySafetyChecks = false, llamaGpu = "metal" }: { - totalVram: number, freeVram: number, ignoreMemorySafetyChecks?: boolean, llamaGpu?: BuildGpu + totalVram: number, freeVram: number, unifiedMemorySize?: number, + totalRam?: number, freeRam?: number, + totalSwap?: number, freeSwap?: number, + ignoreMemorySafetyChecks?: boolean, llamaGpu?: BuildGpu }) { const resolvedGpuLayers = await ggufInsights.configurationResolver.resolveModelGpuLayers(gpuLayers, { ignoreMemorySafetyChecks, @@ -34,27 +40,31 @@ describe("stableCode", () => { }); async function resolveAutoContextSize() { - const modelVram = ggufInsights.estimateModelResourceRequirements({ - gpuLayers: resolvedGpuLayers - }).gpuVram; + const resolvedConfig = await ggufInsights.configurationResolver.resolveAndScoreConfig({ + targetGpuLayers: resolvedGpuLayers + }, { + llamaGpu, + getVramState: async () => ({ + total: llamaGpu === false ? 0 : totalVram, + free: llamaGpu === false ? 0 : freeVram, + unifiedSize: unifiedMemorySize + }), + getRamState: async () => ({ + total: totalRam, + free: freeRam + }), + getSwapState: async () => ({ + total: totalSwap, + free: freeSwap + }), + llamaSupportsGpuOffloading: llamaGpu !== false, + llamaVramPaddingSize: defaultLlamaVramPadding(llamaGpu === false ? 0 : totalVram) + }); - try { - return await ggufInsights.configurationResolver.resolveContextContextSize("auto", { - batchSize: undefined, - sequences: 1, - modelGpuLayers: resolvedGpuLayers, - modelTrainContextSize: ggufInsights.trainContextSize ?? 4096, - getVramState: async () => ({ - total: llamaGpu === false ? 0 : totalVram, - free: llamaGpu === false ? 
0 : (freeVram - modelVram) - }), - llamaGpu, - ignoreMemorySafetyChecks: false, - isEmbeddingContext: false - }); - } catch (err) { + if (resolvedConfig.compatibilityScore === 0) return null; - } + + return resolvedConfig.resolvedValues.contextSize; } return { @@ -130,7 +140,7 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.eql(null); + expect(res.contextSize).to.toMatchInlineSnapshot("133"); } @@ -180,7 +190,7 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(32); - expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.contextSize).to.toMatchInlineSnapshot("94"); } { @@ -229,7 +239,7 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.contextSize).to.toMatchInlineSnapshot("94"); } { @@ -291,7 +301,7 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.contextSize).to.toMatchInlineSnapshot("94"); }{ const res = await resolveGpuLayers("max", { totalVram: s1GB * 6,