diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index 004e4b94..4e2032b4 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -35,11 +35,10 @@ body: id: steps attributes: label: Steps to reproduce - description: >- + description: |- Your bug can be investigated much faster if your code can be run without any dependencies other than `node-llama-cpp`. Issues without reproduction steps or code examples may be closed as not actionable. - Please try to provide a Minimal, Complete, and Verifiable example ([link](http://stackoverflow.com/help/mcve)). - Please include a link to the model file you used if possible. + Please try to provide a Minimal, Complete, and Verifiable example ([link](http://stackoverflow.com/help/mcve)), including a link to the model file you used if possible. Also, please enable debug logs by using `getLlama({debug: true})` to get more information. placeholder: >- Please try to provide a Minimal, Complete, and Verifiable example. @@ -50,10 +49,9 @@ body: id: env attributes: label: My Environment - description: >- + description: |- Please include the result of the command `npx --yes node-llama-cpp inspect gpu`. - Please also add any other relevant dependencies to this table at the end. - For example: Electron, Bun, Webpack. + Please also add any other relevant dependencies to this table at the end. For example: Electron, Bun, Webpack. value: | | Dependency | Version | | --- | --- | diff --git a/.github/ISSUE_TEMPLATE/documentation-issue.yml b/.github/ISSUE_TEMPLATE/documentation-issue.yml index 118756bd..53e74a4f 100644 --- a/.github/ISSUE_TEMPLATE/documentation-issue.yml +++ b/.github/ISSUE_TEMPLATE/documentation-issue.yml @@ -13,7 +13,7 @@ body: id: details attributes: label: What was unclear or otherwise insufficient? - description: >- + description: |- If relevant, please be clear about the documentation URL, as well as the location within the page. Add a link to the relevant documentation you're referring to.
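As a usage note for the updated bug report template above, enabling the debug logs it asks for is a one-liner; a minimal sketch using the `debug` option referenced in the template text:

```typescript
import {getLlama} from "node-llama-cpp";

// print llama.cpp logs directly to the console to collect more information for a bug report
const llama = await getLlama({debug: true});
```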
placeholder: >- diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml index 59ec39fd..c0ec58c2 100644 --- a/.github/ISSUE_TEMPLATE/feature-request.yml +++ b/.github/ISSUE_TEMPLATE/feature-request.yml @@ -51,8 +51,12 @@ body: required: false - label: CUDA support required: false + - label: Vulkan support + required: false - label: Grammar required: false + - label: Function calling + required: false - type: dropdown id: pr attributes: diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index bfe9a3e4..2814133f 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -383,7 +383,7 @@ jobs: model-dependent-tests: name: Model dependent tests - runs-on: macos-13 + runs-on: macos-12 env: NODE_LLAMA_CPP_GPU: false needs: @@ -417,6 +417,9 @@ jobs: - name: Build binary run: node ./dist/cli/cli.js source build --noUsageExample + - name: Inspect hardware + run: node ./dist/cli/cli.js inspect gpu + - name: Cache models id: cache-test-models uses: actions/cache@v4 diff --git a/.vitepress/config.ts b/.vitepress/config.ts index 422b79a2..b51b0940 100644 --- a/.vitepress/config.ts +++ b/.vitepress/config.ts @@ -34,7 +34,8 @@ const packageVersion = env.get("DOCS_PACKAGE_VERSION") .default(packageJson.version) .asString(); -const hostname = "https://node-llama-cpp.withcat.ai/"; +const hostname = "https://node-llama-cpp.withcat.ai/" +const buildDate = new Date(); const socialPosterLink = hostname + "social.poster.jpg"; const defaultPageTitle = "node-llama-cpp - node.js bindings for llama.cpp"; @@ -90,7 +91,7 @@ export default defineConfig({ base: urlBase, sitemap: { hostname, - transformItems(items) { + async transformItems(items) { function priorityMatch(a: {url: string}, b: {url: string}, matchers: ((url: string) => boolean)[]): number { for (const matcher of matchers) { const aMatch = matcher(a.url); @@ -105,13 +106,38 @@ export default defineConfig({ return 0; } + const blogPosts = await createContentLoader("blog/*.md", { + excerpt: true, + render: true + }) + .load(); + const blogPostMap = new Map(); + for (const blogPost of blogPosts) { + let url = blogPost.url; + if (url.startsWith("/")) + url = url.slice("/".length); + + blogPostMap.set(url, blogPost); + } + return items .map((item) => { - if (item.url.startsWith("api/") || item.url.startsWith("cli/")) { + if (item.url === "" || item.url === "blog/") { + item.lastmod = new Date(buildDate); + } else if (item.url.startsWith("api/") || item.url.startsWith("cli/")) { item = { ...item, - lastmod: undefined + lastmod: new Date(buildDate) }; + } else if (item.lastmod == null && item.url.startsWith("blog/")) { + const postDate = blogPostMap.get(item.url)?.frontmatter.date; + if (postDate != null) { + const parsedDate = new Date(postDate); + if (Number.isFinite(parsedDate.getTime())) + item.lastmod = parsedDate; + } + } else if (item.lastmod == null) { + item.lastmod = new Date(buildDate); } return item; diff --git a/README.md b/README.md index faacd427..569f7990 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@
- node-llama-cpp Logo + node-llama-cpp Logo

node-llama-cpp

Run AI models locally on your machine

Pre-built bindings are provided with a fallback to building from source with cmake diff --git a/docs/guide/chat-session.md b/docs/guide/chat-session.md index 3f8c3cb5..dce8ecd1 100644 --- a/docs/guide/chat-session.md +++ b/docs/guide/chat-session.md @@ -671,3 +671,34 @@ await new Promise(resolve => setTimeout(resolve, 1500)); const cachedCompletion = completionEngine.complete("Hi there! How"); console.log("Cached completion:", cachedCompletion); ``` + +## Response Prefix {#response-prefix} +You can force the model response to start with a specific prefix, +to make the model follow a certain direction in its response. + +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import {getLlama, LlamaChatSession, GeneralChatWrapper} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf") +}); +const context = await model.createContext(); +const session = new LlamaChatSession({ + contextSequence: context.getSequence(), + chatWrapper: new GeneralChatWrapper() +}); + + +const q1 = "Hi there, how are you?"; +console.log("User: " + q1); + +const a1 = await session.prompt(q1, { + responsePrefix: "The weather today is" +}); +console.log("AI: " + a1); +``` diff --git a/docs/guide/electron.md b/docs/guide/electron.md index 1e2204c8..dc1fc2aa 100644 --- a/docs/guide/electron.md +++ b/docs/guide/electron.md @@ -37,3 +37,27 @@ so that `node-llama-cpp` can find them. Cross packaging from one platform to another is not supported, since binaries for other platforms are not downloaded to your machine when you run `npm install`. Packaging an `arm64` app on an `x64` machine is supported, but packaging an `x64` app on an `arm64` machine is not. + +## Bundling +When bundling your code for Electron using [Electron Vite](https://electron-vite.org) or Webpack, +ensure that `node-llama-cpp` is not bundled, and is instead treated as an external module. + +Marking `node-llama-cpp` as an external module will prevent its code from being bundled with your application code, +and instead, it'll be loaded from the `node_modules` directory at runtime (which should be packed into a `.asar` archive). + +The file structure of `node-llama-cpp` is crucial for it to function correctly, +so bundling it will break its functionality. +Moreover, since `node-llama-cpp` includes prebuilt binaries (and also local builds from source), +those files must be retained in their original structure for it to work. + +Electron has [its own bundling solution called ASAR](https://www.electronjs.org/docs/latest/tutorial/asar-archives) that is designed to work with node modules. +ASAR retains the original file structure of node modules by packing all the files into a single `.asar` archive file that Electron will read from at runtime like it would from the file system. +This method ensures node modules work as intended in Electron applications, even though they are bundled into a single file. + +Using ASAR is the recommended way to bundle `node-llama-cpp` in your Electron app. + +If you're using the scaffolded Electron app, this is already taken care of. + +::: tip NOTE +We recommend using [Electron Vite](https://electron-vite.org) over Webpack for your Electron app due to Vite's speed and Webpack's lack of proper ESM support in the output bundle, which complicates the bundling process.
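To make the bundling guidance above concrete, here is a minimal configuration sketch that treats `node-llama-cpp` as an external module when bundling with Electron Vite; the file name and config shape follow standard Electron Vite/Rollup conventions and are not taken from this diff:

```typescript
// electron.vite.config.ts - a minimal sketch, assuming a standard Electron Vite setup
import {defineConfig} from "electron-vite";

export default defineConfig({
    main: {
        build: {
            rollupOptions: {
                // keep node-llama-cpp out of the bundle so it's loaded from
                // node_modules (packed into the .asar archive) at runtime
                external: ["node-llama-cpp"]
            }
        }
    }
});
```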
+::: diff --git a/docs/guide/tips-and-tricks.md b/docs/guide/tips-and-tricks.md index d8d1eea6..df3949e3 100644 --- a/docs/guide/tips-and-tricks.md +++ b/docs/guide/tips-and-tricks.md @@ -85,3 +85,37 @@ npx --no node-llama-cpp source download ``` Now, just use `node-llama-cpp` as you normally would. + +## Intel AMX {#intel-amx} +> Intel AMX (Advanced Matrix Extensions) is a dedicated hardware block found on Intel Xeon processors +> that helps optimize and accelerate matrix multiplication operations. +> +> It's available on the 4th Gen and newer Intel Xeon processors. + +Intel AMX can improve CPU inference performance, providing [2x and up to even 14x](https://github.com/ggerganov/llama.cpp/pull/7707) faster inference times on supported CPUs (under specific conditions). + +If you're using a 4th Gen or newer Intel Xeon processor, +you might want to [build `llama.cpp` from source](./building-from-source.md) to utilize these hardware-specific optimizations. + +To do this, run this command inside your project on the machine you run your project on: +```shell +npx --no node-llama-cpp source download +``` + +Alternatively, you can force `node-llama-cpp` to not use its prebuilt binaries +and instead build from source when calling [`getLlama`](../api/functions/getLlama.md) for the first time on a Xeon CPU: + +```typescript +import os from "os"; +import {getLlama} from "node-llama-cpp"; + +const llama = await getLlama({ + usePrebuiltBinaries: !os.cpus().some((cpu) => ( + cpu.model.toLowerCase().includes("Xeon".toLowerCase()) + )) +}); +``` +::: info NOTE +Building from source can take some time (when using CUDA even up to an hour in extreme cases), +so ensure you dedicate some time for this as part of the deployment process. +::: diff --git a/llama/CMakeLists.txt b/llama/CMakeLists.txt index fc7c5504..90a424c2 100644 --- a/llama/CMakeLists.txt +++ b/llama/CMakeLists.txt @@ -22,6 +22,12 @@ execute_process(COMMAND node -p "require('node-addon-api').include.slice(1,-1)" OUTPUT_VARIABLE NODE_ADDON_API_DIR OUTPUT_STRIP_TRAILING_WHITESPACE) +set(LLAMA_BUILD_COMMON ON) + +if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + add_compile_options(-Wno-c++17-extensions) +endif() + include_directories(${NODE_ADDON_API_DIR} ${CMAKE_JS_INC}) add_subdirectory("llama.cpp") diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp index 93cbe413..21ccab8c 100644 --- a/llama/addon/AddonContext.cpp +++ b/llama/addon/AddonContext.cpp @@ -447,7 +447,7 @@ Napi::Value AddonContext::AddToBatch(const Napi::CallbackInfo& info) { GGML_ASSERT(batch.n_tokens + tokensLength <= batch_n_tokens); for (size_t i = 0; i < tokensLength; i++) { - llama_batch_add(batch, static_cast<llama_token>(tokens[i]), firstTokenContextIndex + i, { sequenceId }, false); + common_batch_add(batch, static_cast<llama_token>(tokens[i]), firstTokenContextIndex + i, { sequenceId }, false); } if (generateLogitAtTheEnd) { diff --git a/llama/addon/AddonModel.cpp b/llama/addon/AddonModel.cpp index 27340fa4..ec73c45b 100644 --- a/llama/addon/AddonModel.cpp +++ b/llama/addon/AddonModel.cpp @@ -426,7 +426,7 @@ Napi::Value AddonModel::Tokenize(const Napi::CallbackInfo& info) { std::string text = info[0].As<Napi::String>().Utf8Value(); bool specialTokens = info[1].As<Napi::Boolean>().Value(); - std::vector<llama_token> tokens = llama_tokenize(model, text, false, specialTokens); + std::vector<llama_token> tokens = common_tokenize(model, text, false, specialTokens); Napi::Uint32Array result = Napi::Uint32Array::New(info.Env(), tokens.size()); for (size_t i = 0; i < tokens.size(); ++i)
{ @@ -539,7 +539,7 @@ Napi::Value AddonModel::PrefixToken(const Napi::CallbackInfo& info) { return info.Env().Undefined(); } - return getNapiToken(info, model, llama_token_prefix(model)); + return getNapiToken(info, model, llama_token_fim_pre(model)); } Napi::Value AddonModel::MiddleToken(const Napi::CallbackInfo& info) { if (disposed) { @@ -547,7 +547,7 @@ Napi::Value AddonModel::MiddleToken(const Napi::CallbackInfo& info) { return info.Env().Undefined(); } - return getNapiToken(info, model, llama_token_middle(model)); + return getNapiToken(info, model, llama_token_fim_mid(model)); } Napi::Value AddonModel::SuffixToken(const Napi::CallbackInfo& info) { if (disposed) { @@ -555,7 +555,7 @@ Napi::Value AddonModel::SuffixToken(const Napi::CallbackInfo& info) { return info.Env().Undefined(); } - return getNapiToken(info, model, llama_token_suffix(model)); + return getNapiToken(info, model, llama_token_fim_suf(model)); } Napi::Value AddonModel::EotToken(const Napi::CallbackInfo& info) { if (disposed) { diff --git a/llama/addon/AddonSampler.cpp b/llama/addon/AddonSampler.cpp index 89d0b075..d84160d7 100644 --- a/llama/addon/AddonSampler.cpp +++ b/llama/addon/AddonSampler.cpp @@ -52,11 +52,6 @@ void AddonSampler::dispose() { topPSampler = nullptr; } - if (softmaxSampler != nullptr) { - llama_sampler_free(softmaxSampler); - softmaxSampler = nullptr; - } - if (seedSampler != nullptr) { llama_sampler_free(seedSampler); seedSampler = nullptr; @@ -135,10 +130,6 @@ void AddonSampler::rebuildChainIfNeeded() { llama_sampler_chain_add(chain, temperatureSampler); } - if (softmaxSampler != nullptr) { - llama_sampler_chain_add(chain, softmaxSampler); - } - if (seedSampler != nullptr) { llama_sampler_chain_add(chain, seedSampler); } @@ -206,10 +197,6 @@ Napi::Value AddonSampler::ApplyConfig(const Napi::CallbackInfo& info) { } } - if (softmaxSampler == nullptr) { - softmaxSampler = llama_sampler_init_softmax(); - } - if (config.Has("minP")) { auto minP = config.Get("minP").As().FloatValue(); if (minP != minPSampler_minP) { diff --git a/llama/addon/AddonSampler.h b/llama/addon/AddonSampler.h index 942d03d2..33114b49 100644 --- a/llama/addon/AddonSampler.h +++ b/llama/addon/AddonSampler.h @@ -25,8 +25,6 @@ class AddonSampler : public Napi::ObjectWrap { llama_sampler * topPSampler = nullptr; float topPSampler_topP = 0.0f; // Top p sampling >=1.0 = disabled - - llama_sampler * softmaxSampler = nullptr; llama_sampler * seedSampler = nullptr; uint32_t seedSampler_seed = 0; diff --git a/llama/addon/addon.cpp b/llama/addon/addon.cpp index 16393618..5c2d1c52 100644 --- a/llama/addon/addon.cpp +++ b/llama/addon/addon.cpp @@ -8,6 +8,7 @@ #include "globals/addonLog.h" #include "globals/addonProgress.h" #include "globals/getGpuInfo.h" +#include "globals/getSwapInfo.h" bool backendInitialized = false; bool backendDisposed = false; @@ -203,6 +204,7 @@ Napi::Object registerCallback(Napi::Env env, Napi::Object exports) { Napi::PropertyDescriptor::Function("getGpuVramInfo", getGpuVramInfo), Napi::PropertyDescriptor::Function("getGpuDeviceInfo", getGpuDeviceInfo), Napi::PropertyDescriptor::Function("getGpuType", getGpuType), + Napi::PropertyDescriptor::Function("getSwapInfo", getSwapInfo), Napi::PropertyDescriptor::Function("init", addonInit), Napi::PropertyDescriptor::Function("dispose", addonDispose), }); diff --git a/llama/addon/globals/getGpuInfo.cpp b/llama/addon/globals/getGpuInfo.cpp index f3a67185..ef51c1cd 100644 --- a/llama/addon/globals/getGpuInfo.cpp +++ b/llama/addon/globals/getGpuInfo.cpp @@ -26,6 +26,7 @@ void 
logVulkanWarning(const char* message) { Napi::Value getGpuVramInfo(const Napi::CallbackInfo& info) { uint64_t total = 0; uint64_t used = 0; + uint64_t unifiedVramSize = 0; #ifdef GPU_INFO_USE_CUDA size_t cudaDeviceTotal = 0; @@ -41,26 +42,31 @@ Napi::Value getGpuVramInfo(const Napi::CallbackInfo& info) { #ifdef GPU_INFO_USE_VULKAN uint64_t vulkanDeviceTotal = 0; uint64_t vulkanDeviceUsed = 0; - const bool vulkanDeviceSupportsMemoryBudgetExtension = gpuInfoGetTotalVulkanDevicesInfo(&vulkanDeviceTotal, &vulkanDeviceUsed, logVulkanWarning); + uint64_t vulkanDeviceUnifiedVramSize = 0; + const bool vulkanDeviceSupportsMemoryBudgetExtension = gpuInfoGetTotalVulkanDevicesInfo(&vulkanDeviceTotal, &vulkanDeviceUsed, &vulkanDeviceUnifiedVramSize, logVulkanWarning); if (vulkanDeviceSupportsMemoryBudgetExtension) { total += vulkanDeviceTotal; used += vulkanDeviceUsed; + unifiedVramSize += vulkanDeviceUnifiedVramSize; } #endif #ifdef GPU_INFO_USE_METAL uint64_t metalDeviceTotal = 0; uint64_t metalDeviceUsed = 0; - getMetalGpuInfo(&metalDeviceTotal, &metalDeviceUsed); + uint64_t metalDeviceUnifiedVramSize = 0; + getMetalGpuInfo(&metalDeviceTotal, &metalDeviceUsed, &metalDeviceUnifiedVramSize); total += metalDeviceTotal; used += metalDeviceUsed; + unifiedVramSize += metalDeviceUnifiedVramSize; #endif Napi::Object result = Napi::Object::New(info.Env()); result.Set("total", Napi::Number::From(info.Env(), total)); result.Set("used", Napi::Number::From(info.Env(), used)); + result.Set("unifiedSize", Napi::Number::From(info.Env(), unifiedVramSize)); return result; } diff --git a/llama/addon/globals/getSwapInfo.cpp b/llama/addon/globals/getSwapInfo.cpp new file mode 100644 index 00000000..bae94612 --- /dev/null +++ b/llama/addon/globals/getSwapInfo.cpp @@ -0,0 +1,69 @@ +#include "getSwapInfo.h" +#include "addonLog.h" + +#ifdef __APPLE__ +#include +#include +#include +#elif __linux__ +#include +#include +#elif _WIN32 +#include +#include +#include +#endif + + +Napi::Value getSwapInfo(const Napi::CallbackInfo& info) { + uint64_t totalSwap = 0; + uint64_t freeSwap = 0; + uint64_t maxSize = 0; + bool maxSizeSet = true; + +#ifdef __APPLE__ + struct xsw_usage swapInfo; + size_t size = sizeof(swapInfo); + + if (sysctlbyname("vm.swapusage", &swapInfo, &size, NULL, 0) == 0) { + totalSwap = swapInfo.xsu_total; + freeSwap = swapInfo.xsu_avail; + maxSizeSet = false; + } else { + addonLlamaCppLogCallback(GGML_LOG_LEVEL_ERROR, std::string("Failed to get swap info").c_str(), nullptr); + } +#elif __linux__ + struct sysinfo sysInfo; + + if (sysinfo(&sysInfo) == 0) { + totalSwap = sysInfo.totalswap; + freeSwap = sysInfo.freeswap; + maxSize = sysInfo.totalswap; + } else { + addonLlamaCppLogCallback(GGML_LOG_LEVEL_ERROR, std::string("Failed to get swap info").c_str(), nullptr); + } +#elif _WIN32 + MEMORYSTATUSEX memInfo; + memInfo.dwLength = sizeof(MEMORYSTATUSEX); + + if (GlobalMemoryStatusEx(&memInfo)) { + PERFORMANCE_INFORMATION perfInfo; + perfInfo.cb = sizeof(PERFORMANCE_INFORMATION); + if (GetPerformanceInfo(&perfInfo, sizeof(perfInfo))) { + totalSwap = memInfo.ullTotalPageFile; + freeSwap = memInfo.ullAvailPageFile; + maxSize = perfInfo.CommitLimit * perfInfo.PageSize; + } else { + addonLlamaCppLogCallback(GGML_LOG_LEVEL_ERROR, std::string("Failed to get max pagefile size").c_str(), nullptr); + } + } else { + addonLlamaCppLogCallback(GGML_LOG_LEVEL_ERROR, std::string("Failed to get pagefile info").c_str(), nullptr); + } +#endif + + Napi::Object obj = Napi::Object::New(info.Env()); + obj.Set("total", 
Napi::Number::New(info.Env(), totalSwap)); + obj.Set("free", Napi::Number::New(info.Env(), freeSwap)); + obj.Set("maxSize", maxSizeSet ? Napi::Number::New(info.Env(), maxSize) : Napi::Number::New(info.Env(), -1)); + return obj; +} diff --git a/llama/addon/globals/getSwapInfo.h b/llama/addon/globals/getSwapInfo.h new file mode 100644 index 00000000..dd265c60 --- /dev/null +++ b/llama/addon/globals/getSwapInfo.h @@ -0,0 +1,4 @@ +#pragma once +#include "napi.h" + +Napi::Value getSwapInfo(const Napi::CallbackInfo& info); diff --git a/llama/gpuInfo/metal-gpu-info.h b/llama/gpuInfo/metal-gpu-info.h index 30056ce7..9a199bee 100644 --- a/llama/gpuInfo/metal-gpu-info.h +++ b/llama/gpuInfo/metal-gpu-info.h @@ -4,5 +4,5 @@ #include #include -void getMetalGpuInfo(uint64_t * total, uint64_t * used); +void getMetalGpuInfo(uint64_t * total, uint64_t * used, uint64_t * unifiedMemorySize); void getMetalGpuDeviceNames(std::vector * deviceNames); \ No newline at end of file diff --git a/llama/gpuInfo/metal-gpu-info.mm b/llama/gpuInfo/metal-gpu-info.mm index 7bfd6bce..46ac0b18 100644 --- a/llama/gpuInfo/metal-gpu-info.mm +++ b/llama/gpuInfo/metal-gpu-info.mm @@ -3,15 +3,22 @@ #include #import -void getMetalGpuInfo(uint64_t * total, uint64_t * used) { +void getMetalGpuInfo(uint64_t * total, uint64_t * used, uint64_t * unifiedMemorySize) { id device = MTLCreateSystemDefaultDevice(); if (device) { *total = device.recommendedMaxWorkingSetSize; *used = device.currentAllocatedSize; + + if (device.hasUnifiedMemory) { + *unifiedMemorySize = device.recommendedMaxWorkingSetSize; + } else { + *unifiedMemorySize = 0; + } } else { *total = 0; *used = 0; + *unifiedMemorySize = 0; } [device release]; diff --git a/llama/gpuInfo/vulkan-gpu-info.cpp b/llama/gpuInfo/vulkan-gpu-info.cpp index 0b9a6556..25356546 100644 --- a/llama/gpuInfo/vulkan-gpu-info.cpp +++ b/llama/gpuInfo/vulkan-gpu-info.cpp @@ -5,7 +5,7 @@ typedef void (*gpuInfoVulkanWarningLogCallback_t)(const char* message); -static bool enumerateVulkanDevices(size_t* total, size_t* used, bool addDeviceNames, std::vector * deviceNames, gpuInfoVulkanWarningLogCallback_t warningLogCallback) { +static bool enumerateVulkanDevices(size_t* total, size_t* used, size_t* unifiedMemorySize, bool addDeviceNames, std::vector * deviceNames, gpuInfoVulkanWarningLogCallback_t warningLogCallback) { vk::ApplicationInfo appInfo("node-llama-cpp GPU info", 1, "llama.cpp", 1, VK_API_VERSION_1_2); vk::InstanceCreateInfo createInfo(vk::InstanceCreateFlags(), &appInfo, {}, {}); vk::Instance instance = vk::createInstance(createInfo); @@ -14,6 +14,7 @@ static bool enumerateVulkanDevices(size_t* total, size_t* used, bool addDeviceNa size_t usedMem = 0; size_t totalMem = 0; + size_t totalUnifiedMemorySize = 0; for (size_t i = 0; i < physicalDevices.size(); i++) { vk::PhysicalDevice physicalDevice = physicalDevices[i]; @@ -41,16 +42,20 @@ static bool enumerateVulkanDevices(size_t* total, size_t* used, bool addDeviceNa physicalDevice.getMemoryProperties2(&memProps2); for (uint32_t i = 0; i < memProps.memoryHeapCount; ++i) { - if (memProps.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) { + const auto flags = memProps.memoryHeaps[i].flags; + + if (flags & vk::MemoryHeapFlagBits::eDeviceLocal) { const auto size = memProps.memoryHeaps[i].size; totalMem += size; usedMem += memoryBudgetProperties.heapUsage[i]; + if (flags & vk::MemoryHeapFlagBits::eMultiInstance) { + totalUnifiedMemorySize += size; + } + if (size > 0 && addDeviceNames) { 
(*deviceNames).push_back(std::string(deviceProps.deviceName.data())); } - - break; } } } else { @@ -58,9 +63,8 @@ static bool enumerateVulkanDevices(size_t* total, size_t* used, bool addDeviceNa warningLogCallback( ( "Vulkan VK_EXT_memory_budget extension not supported for device \"" + - std::string(deviceProps.deviceName.data()) + "\", so VRAM info cannot be determained for it" - ) - .c_str() + std::string(deviceProps.deviceName.data()) + "\", so VRAM info cannot be determined for it" + ).c_str() ); return false; } @@ -68,16 +72,19 @@ static bool enumerateVulkanDevices(size_t* total, size_t* used, bool addDeviceNa *total = totalMem; *used = usedMem; + *unifiedMemorySize = totalUnifiedMemorySize; + return true; } -bool gpuInfoGetTotalVulkanDevicesInfo(size_t* total, size_t* used, gpuInfoVulkanWarningLogCallback_t warningLogCallback) { - return enumerateVulkanDevices(total, used, false, nullptr, warningLogCallback); +bool gpuInfoGetTotalVulkanDevicesInfo(size_t* total, size_t* used, size_t* unifiedMemorySize, gpuInfoVulkanWarningLogCallback_t warningLogCallback) { + return enumerateVulkanDevices(total, used, unifiedMemorySize, false, nullptr, warningLogCallback); } bool gpuInfoGetVulkanDeviceNames(std::vector * deviceNames, gpuInfoVulkanWarningLogCallback_t warningLogCallback) { size_t vulkanDeviceTotal = 0; size_t vulkanDeviceUsed = 0; + size_t unifiedMemorySize = 0; - return enumerateVulkanDevices(&vulkanDeviceTotal, &vulkanDeviceUsed, true, deviceNames, warningLogCallback); + return enumerateVulkanDevices(&vulkanDeviceTotal, &vulkanDeviceUsed, &unifiedMemorySize, true, deviceNames, warningLogCallback); } diff --git a/llama/gpuInfo/vulkan-gpu-info.h b/llama/gpuInfo/vulkan-gpu-info.h index d2457f10..f8eb0527 100644 --- a/llama/gpuInfo/vulkan-gpu-info.h +++ b/llama/gpuInfo/vulkan-gpu-info.h @@ -5,5 +5,5 @@ typedef void (*gpuInfoVulkanWarningLogCallback_t)(const char* message); -bool gpuInfoGetTotalVulkanDevicesInfo(size_t* total, size_t* used, gpuInfoVulkanWarningLogCallback_t warningLogCallback); +bool gpuInfoGetTotalVulkanDevicesInfo(size_t* total, size_t* used, size_t* unifiedMemorySize, gpuInfoVulkanWarningLogCallback_t warningLogCallback); bool gpuInfoGetVulkanDeviceNames(std::vector * deviceNames, gpuInfoVulkanWarningLogCallback_t warningLogCallback); \ No newline at end of file diff --git a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts index 2422c16b..891d9df4 100644 --- a/src/bindings/AddonTypes.ts +++ b/src/bindings/AddonTypes.ts @@ -63,12 +63,18 @@ export type BindingModule = { setLoggerLogLevel(level: number): void, getGpuVramInfo(): { total: number, - used: number + used: number, + unifiedSize: number }, getGpuDeviceInfo(): { deviceNames: string[] }, getGpuType(): "cuda" | "vulkan" | "metal" | undefined, + getSwapInfo(): { + total: number, + maxSize: number, + free: number + }, init(): Promise, dispose(): Promise }; diff --git a/src/bindings/Llama.ts b/src/bindings/Llama.ts index 50831cc9..a30395a2 100644 --- a/src/bindings/Llama.ts +++ b/src/bindings/Llama.ts @@ -1,3 +1,4 @@ +import os from "os"; import chalk from "chalk"; import {DisposedError, EventRelay, withLock} from "lifecycle-utils"; import {getConsoleLogPrefix} from "../utils/getConsoleLogPrefix.js"; @@ -34,6 +35,9 @@ export class Llama { /** @internal */ public readonly _consts: ReturnType; /** @internal */ public readonly _vramOrchestrator: MemoryOrchestrator; /** @internal */ public readonly _vramPadding: MemoryReservation; + /** @internal */ public readonly _ramOrchestrator: MemoryOrchestrator; + /** 
@internal */ public readonly _ramPadding: MemoryReservation; + /** @internal */ public readonly _swapOrchestrator: MemoryOrchestrator; /** @internal */ public readonly _debug: boolean; /** @internal */ public readonly _threadsSplitter: ThreadsSplitter; /** @internal */ private readonly _gpu: LlamaGpuType; @@ -61,7 +65,8 @@ export class Llama { public readonly onDispose = new EventRelay(); private constructor({ - bindings, logLevel, logger, buildType, cmakeOptions, llamaCppRelease, debug, gpu, maxThreads, vramOrchestrator, vramPadding + bindings, logLevel, logger, buildType, cmakeOptions, llamaCppRelease, debug, gpu, maxThreads, vramOrchestrator, vramPadding, + ramOrchestrator, ramPadding, swapOrchestrator }: { bindings: BindingModule, logLevel: LlamaLogLevel, @@ -76,7 +81,10 @@ export class Llama { gpu: BuildGpu, maxThreads?: number, vramOrchestrator: MemoryOrchestrator, - vramPadding: MemoryReservation + vramPadding: MemoryReservation, + ramOrchestrator: MemoryOrchestrator, + ramPadding: MemoryReservation, + swapOrchestrator: MemoryOrchestrator }) { this._bindings = bindings; this._gpu = gpu; @@ -88,6 +96,9 @@ export class Llama { this._debug = debug; this._vramOrchestrator = vramOrchestrator; this._vramPadding = vramPadding; + this._ramOrchestrator = ramOrchestrator; + this._ramPadding = ramPadding; + this._swapOrchestrator = swapOrchestrator; this._threadsSplitter = new ThreadsSplitter( maxThreads ?? ( this._gpu === false @@ -235,15 +246,60 @@ export class Llama { return this._vramPadding.size; } + /** + * The total amount of VRAM that is currently being used. + * + * `unifiedSize` represents the amount of VRAM that is shared between the CPU and GPU. + * On SoC devices, this is usually the same as `total`. + */ public async getVramState() { this._ensureNotDisposed(); - const {total, used} = this._bindings.getGpuVramInfo(); + const {total, used, unifiedSize} = this._bindings.getGpuVramInfo(); return { total, used, - free: Math.max(0, total - used) + free: Math.max(0, total - used), + unifiedSize + }; + } + + /** + * Get the state of the swap memory. + * + * **`maxSize`** - The maximum size of the swap memory that the system can allocate. + * If the swap size is dynamic (like on macOS), this will be `Infinity`. + * + * **`allocated`** - The total size allocated by the system for swap memory. + * + * **`used`** - The amount of swap memory that is currently being used from the `allocated` size. + * + * On Windows, this will return the info for the page file. + */ + public async getSwapState(): Promise<{ + /** + * The maximum size of the swap memory that the system can allocate. + * If the swap size is dynamic (like on macOS), this will be `Infinity` + */ + maxSize: number, + + /** The total size allocated by the system for swap memory */ + allocated: number, + + /** The amount of swap memory that is currently being used from the `allocated` size */ + used: number + }> { + this._ensureNotDisposed(); + + const {total, maxSize, free} = this._bindings.getSwapInfo(); + + return { + maxSize: maxSize === -1 + ? 
Infinity + : maxSize, + allocated: total, + used: total - free }; } @@ -383,7 +439,7 @@ export class Llama { /** @internal */ public static async _create({ - bindings, buildType, buildMetadata, logLevel, logger, vramPadding, maxThreads, skipLlamaInit = false, debug + bindings, buildType, buildMetadata, logLevel, logger, vramPadding, ramPadding, maxThreads, skipLlamaInit = false, debug }: { bindings: BindingModule, buildType: "localBuild" | "prebuilt", @@ -392,16 +448,45 @@ export class Llama { logger: (level: LlamaLogLevel, message: string) => void, maxThreads?: number, vramPadding: number | ((totalVram: number) => number), + ramPadding: number | ((totalRam: number) => number), skipLlamaInit?: boolean, debug: boolean }) { const gpu = bindings.getGpuType() ?? false; const vramOrchestrator = new MemoryOrchestrator(() => { - const {total, used} = bindings.getGpuVramInfo(); + const {total, used, unifiedSize} = bindings.getGpuVramInfo(); + + return { + total, + free: Math.max(0, total - used), + unifiedSize + }; + }); + const ramOrchestrator = new MemoryOrchestrator(() => { + const used = process.memoryUsage().rss; + const total = os.totalmem(); return { total, - free: Math.max(0, total - used) + free: Math.max(0, total - used), + unifiedSize: total + }; + }); + const swapOrchestrator = new MemoryOrchestrator(() => { + const {total, maxSize, free} = bindings.getSwapInfo(); + const used = total - free; + + if (maxSize === -1) + return { + total: Infinity, + free: Infinity, + unifiedSize: Infinity + }; + + return { + total: maxSize, + free: maxSize - used, + unifiedSize: maxSize }; }); @@ -413,6 +498,12 @@ export class Llama { else resolvedVramPadding = vramOrchestrator.reserveMemory(vramPadding); + let resolvedRamPadding: MemoryReservation; + if (ramPadding instanceof Function) + resolvedRamPadding = ramOrchestrator.reserveMemory(ramPadding((await ramOrchestrator.getMemoryState()).total)); + else + resolvedRamPadding = ramOrchestrator.reserveMemory(ramPadding); + const llama = new Llama({ bindings, buildType, @@ -427,7 +518,10 @@ export class Llama { gpu, vramOrchestrator, maxThreads, - vramPadding: resolvedVramPadding + vramPadding: resolvedVramPadding, + ramOrchestrator, + ramPadding: resolvedRamPadding, + swapOrchestrator }); if (!skipLlamaInit) diff --git a/src/bindings/getLlama.ts b/src/bindings/getLlama.ts index 9df05cba..d9e3255f 100644 --- a/src/bindings/getLlama.ts +++ b/src/bindings/getLlama.ts @@ -132,6 +132,17 @@ export type LlamaOptions = { */ vramPadding?: number | ((totalVram: number) => number), + /** + * Pad the available RAM for the memory size calculations, as these calculations are not always accurate. + * Recommended to ensure stability. + * + * Defaults to `25%` of the total RAM or 6GB (1GB on Linux), whichever is lower. + * Set to `0` to disable. + * + * > Since the OS also needs RAM to function, the default value can get up to 6GB on Windows and macOS, and 1GB on Linux. + */ + ramPadding?: number | ((totalRam: number) => number), + /** * Enable debug mode to find issues with llama.cpp. * Makes logs print directly to the console from `llama.cpp` and not through the provided logger. @@ -196,6 +207,17 @@ export type LastBuildOptions = { */ vramPadding?: number | ((totalVram: number) => number), + /** + * Pad the available RAM for the memory size calculations, as these calculations are not always accurate. + * Recommended to ensure stability. + * + * Defaults to `25%` of the total RAM or 6GB (1GB on Linux), whichever is lower. + * Set to `0` to disable. 
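As a usage-level sketch of the RAM and swap plumbing added in this diff, the snippet below passes a `ramPadding` value to `getLlama` and reads the extended `getVramState()` and the new `getSwapState()`; the padding value is purely illustrative:

```typescript
import {getLlama} from "node-llama-cpp";

// reserve 2GiB of RAM in the memory usage estimations (illustrative value)
const llama = await getLlama({
    ramPadding: 2 * 1024 * 1024 * 1024
});

const vramState = await llama.getVramState();
console.log("Unified VRAM size:", vramState.unifiedSize); // VRAM shared between the CPU and GPU

const swapState = await llama.getSwapState();
console.log("Swap used:", swapState.used, "out of", swapState.allocated);
console.log("Max swap size:", swapState.maxSize === Infinity ? "dynamic" : swapState.maxSize);
```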
+ * + * > Since the OS also needs RAM to function, the default value can get up to 6GB on Windows and macOS, and 1GB on Linux. + */ + ramPadding?: number | ((totalRam: number) => number), + /** * Enable debug mode to find issues with llama.cpp. * Makes logs print directly to the console from `llama.cpp` and not through the provided logger. @@ -210,6 +232,14 @@ export type LastBuildOptions = { export const getLlamaFunctionName = "getLlama"; export const defaultLlamaVramPadding = (totalVram: number) => Math.floor(Math.min(totalVram * 0.06, 1024 * 1024 * 1024)); +export const defaultLlamaRamPadding = (totalRam: number) => { + const platform = getPlatform(); + + if (platform === "linux") + return Math.floor(Math.min(totalRam * 0.25, 1024 * 1024 * 1024)); + + return Math.floor(Math.min(totalRam * 0.25, 1024 * 1024 * 1024 * 6)); +}; const defaultBuildOption: Exclude = runningInElectron ? "never" : "auto"; @@ -251,6 +281,7 @@ export async function getLlama(options?: LlamaOptions | "lastBuild", lastBuildOp skipDownload: lastBuildOptions?.skipDownload ?? defaultSkipDownload, maxThreads: lastBuildOptions?.maxThreads, vramPadding: lastBuildOptions?.vramPadding ?? defaultLlamaVramPadding, + ramPadding: lastBuildOptions?.ramPadding ?? defaultLlamaRamPadding, debug: lastBuildOptions?.debug ?? defaultLlamaCppDebugMode }; @@ -274,6 +305,7 @@ export async function getLlama(options?: LlamaOptions | "lastBuild", lastBuildOp logLevel: lastBuildOptions?.logLevel ?? defaultLlamaCppLogLevel, maxThreads: lastBuildOptions?.maxThreads, vramPadding: lastBuildOptions?.vramPadding ?? defaultLlamaVramPadding, + ramPadding: lastBuildOptions?.ramPadding ?? defaultLlamaRamPadding, debug: lastBuildOptions?.debug ?? defaultLlamaCppDebugMode }); } catch (err) { @@ -300,6 +332,7 @@ export async function getLlamaForOptions({ skipDownload = defaultSkipDownload, maxThreads, vramPadding = defaultLlamaVramPadding, + ramPadding = defaultLlamaRamPadding, debug = defaultLlamaCppDebugMode }: LlamaOptions, { updateLastBuildInfoOnCompile = false, @@ -320,6 +353,7 @@ export async function getLlamaForOptions({ if (progressLogs == null) progressLogs = true; if (skipDownload == null) skipDownload = defaultSkipDownload; if (vramPadding == null) vramPadding = defaultLlamaVramPadding; + if (ramPadding == null) ramPadding = defaultLlamaRamPadding; if (debug == null) debug = defaultLlamaCppDebugMode; const clonedLlamaCppRepoReleaseInfo = await getClonedLlamaCppRepoReleaseInfo(); @@ -376,6 +410,7 @@ export async function getLlamaForOptions({ skipLlamaInit, maxThreads, vramPadding, + ramPadding, fallbackMessage: !isLastItem ? 
`falling back to using ${getPrettyBuildGpuName(buildGpusToTry[i + 1])}` : ( @@ -437,6 +472,7 @@ export async function getLlamaForOptions({ updateLastBuildInfoOnCompile, maxThreads, vramPadding, + ramPadding, skipLlamaInit, debug }); @@ -473,6 +509,7 @@ async function loadExistingLlamaBinary({ skipLlamaInit, maxThreads, vramPadding, + ramPadding, fallbackMessage, debug }: { @@ -487,6 +524,7 @@ async function loadExistingLlamaBinary({ skipLlamaInit: boolean, maxThreads: number | undefined, vramPadding: Required["vramPadding"], + ramPadding: Required["ramPadding"], fallbackMessage: string | null, debug: boolean }) { @@ -520,6 +558,7 @@ async function loadExistingLlamaBinary({ logger, maxThreads, vramPadding, + ramPadding, skipLlamaInit, debug }); @@ -576,6 +615,7 @@ async function loadExistingLlamaBinary({ logger, maxThreads, vramPadding, + ramPadding, skipLlamaInit, debug }); @@ -630,6 +670,7 @@ async function buildAndLoadLlamaBinary({ updateLastBuildInfoOnCompile, maxThreads, vramPadding, + ramPadding, skipLlamaInit, debug }: { @@ -640,6 +681,7 @@ async function buildAndLoadLlamaBinary({ updateLastBuildInfoOnCompile: boolean, maxThreads: number | undefined, vramPadding: Required["vramPadding"], + ramPadding: Required["ramPadding"], skipLlamaInit: boolean, debug: boolean }) { @@ -671,6 +713,7 @@ async function buildAndLoadLlamaBinary({ logger, maxThreads, vramPadding, + ramPadding, skipLlamaInit, debug }); diff --git a/src/bindings/utils/MemoryOrchestrator.ts b/src/bindings/utils/MemoryOrchestrator.ts index 052651cf..992f336e 100644 --- a/src/bindings/utils/MemoryOrchestrator.ts +++ b/src/bindings/utils/MemoryOrchestrator.ts @@ -1,12 +1,12 @@ import {EventRelay} from "lifecycle-utils"; export class MemoryOrchestrator { - /** @internal */ private readonly _getMemoryState: () => {free: number, total: number}; + /** @internal */ private readonly _getMemoryState: () => {free: number, total: number, unifiedSize: number}; /** @internal */ private _reservedMemory: number = 0; public readonly onMemoryReservationRelease = new EventRelay(); - public constructor(getMemoryState: () => {free: number, total: number}) { + public constructor(getMemoryState: () => {free: number, total: number, unifiedSize: number}) { this._getMemoryState = getMemoryState; } @@ -20,11 +20,12 @@ export class MemoryOrchestrator { } public async getMemoryState() { - const {free, total} = this._getMemoryState(); + const {free, total, unifiedSize} = this._getMemoryState(); return { free: Math.max(0, free - this._reservedMemory), - total + total, + unifiedSize }; } } diff --git a/src/bindings/utils/compileLLamaCpp.ts b/src/bindings/utils/compileLLamaCpp.ts index ab61644f..1ff9f01d 100644 --- a/src/bindings/utils/compileLLamaCpp.ts +++ b/src/bindings/utils/compileLLamaCpp.ts @@ -1,6 +1,7 @@ import path from "path"; import {fileURLToPath} from "url"; import process from "process"; +import os from "os"; import fs from "fs-extra"; import chalk from "chalk"; import which from "which"; @@ -17,7 +18,7 @@ import {getModuleVersion} from "../../utils/getModuleVersion.js"; import {ensureLlamaCppRepoIsCloned, isLlamaCppRepoCloned} from "./cloneLlamaCppRepo.js"; import {getBuildFolderNameForBuildOptions} from "./getBuildFolderNameForBuildOptions.js"; import {setLastBuildInfo} from "./lastBuildInfo.js"; -import {getPlatform} from "./getPlatform.js"; +import {BinaryPlatform, getPlatform} from "./getPlatform.js"; import {logDistroInstallInstruction} from "./logDistroInstallInstruction.js"; import {testCmakeBinary} from "./testCmakeBinary.js"; 
import {getCudaNvccPaths} from "./detectAvailableComputeLayers.js"; @@ -45,6 +46,7 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions ciMode = false } = compileOptions; + const platform = getPlatform(); const buildFolderName = await getBuildFolderNameForBuildOptions(buildOptions); const finalBuildFolderName = includeBuildOptionsInBinaryFolderName ? buildFolderName.withCustomCmakeOptions @@ -94,6 +96,9 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions if (ciMode) { if (!cmakeCustomOptions.has("GGML_OPENMP")) cmakeCustomOptions.set("GGML_OPENMP", "OFF"); + + if (!cmakeCustomOptions.has("GGML_AMX")) + cmakeCustomOptions.set("GGML_AMX", "OFF"); } await fs.remove(outDirectory); @@ -120,6 +125,7 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions "--arch=" + buildOptions.arch, "--out", path.relative(llamaDirectory, outDirectory), "--runtime-version=" + runtimeVersion, + "--parallel=" + getParallelBuildThreadsToUse(platform), ...cmakePathArgs, ...( [...cmakeCustomOptions].map(([key, value]) => "--CD" + key + "=" + value) @@ -171,7 +177,6 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions } }); } catch (err) { - const platform = getPlatform(); if (platform === "linux" && await which("make", {nothrow: true}) == null) { console.info("\n" + getConsoleLogPrefix(true) + @@ -453,3 +458,15 @@ async function getToolchainFileForArch(targetArch: string) { return null; } + +function getParallelBuildThreadsToUse(platform: BinaryPlatform) { + const cpuCount = os.cpus().length; + + if (cpuCount <= 4) + return cpuCount; + + if (platform === "mac" && process.arch === "arm64") + return cpuCount - 1; + + return cpuCount - 2; +} diff --git a/src/bindings/utils/getLinuxDistroInfo.ts b/src/bindings/utils/getLinuxDistroInfo.ts index 7ac09bdd..ccea0c56 100644 --- a/src/bindings/utils/getLinuxDistroInfo.ts +++ b/src/bindings/utils/getLinuxDistroInfo.ts @@ -29,7 +29,7 @@ async function getOsReleaseInfo() { if (!(await fs.pathExists(osReleasePath))) continue; - const osReleaseFile = await fs.readFile(osReleasePath, "utf-8"); + const osReleaseFile = await fs.readFile(osReleasePath, "utf8"); const res = new Map(); for (const line of osReleaseFile.split("\n")) { diff --git a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts index dc5a2f4c..85156f8e 100644 --- a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts +++ b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts @@ -256,7 +256,6 @@ function logCompatibilityScore( title: "VRAM usage", value: () => bytes(compatibilityScore.resolvedValues.totalVramUsage) }, { - show: compatibilityScore.resolvedValues.totalRamUsage > 0, title: "RAM usage", value: () => bytes(compatibilityScore.resolvedValues.totalRamUsage) }, { diff --git a/src/cli/commands/inspect/commands/InspectGgufCommand.ts b/src/cli/commands/inspect/commands/InspectGgufCommand.ts index fa1ca7e3..49afe0ed 100644 --- a/src/cli/commands/inspect/commands/InspectGgufCommand.ts +++ b/src/cli/commands/inspect/commands/InspectGgufCommand.ts @@ -13,10 +13,12 @@ import {documentationPageUrls} from "../../../../config.js"; import withOra from "../../../../utils/withOra.js"; import {resolveModelDestination} from "../../../../utils/resolveModelDestination.js"; import {printModelDestination} from "../../../utils/printModelDestination.js"; +import {getGgufMetadataKeyValue} from 
"../../../../gguf/utils/getGgufMetadataKeyValue.js"; type InspectGgufCommand = { modelPath: string, header?: string[], + key?: string, noSplice: boolean, fullTensorInfo: boolean, fullMetadataArrays: boolean, @@ -46,6 +48,12 @@ export const InspectGgufCommand: CommandModule = { description: "Headers to use when reading a model file from a URL, in the format `key: value`. You can pass this option multiple times to add multiple headers.", group: "Optional:" }) + .option("key", { + alias: ["k"], + type: "string", + description: "A single metadata key to print the value of. If not provided, all metadata will be printed", + group: "Optional:" + }) .option("noSplice", { alias: "s", type: "boolean", @@ -80,7 +88,7 @@ export const InspectGgufCommand: CommandModule = { }); }, async handler({ - modelPath: ggufPath, header: headerArg, noSplice, fullTensorInfo, fullMetadataArrays, plainJson, outputToJsonFile + modelPath: ggufPath, header: headerArg, key, noSplice, fullTensorInfo, fullMetadataArrays, plainJson, outputToJsonFile }: InspectGgufCommand) { const resolvedModelDestination = resolveModelDestination(ggufPath); const resolvedGgufPath = resolvedModelDestination.type == "file" @@ -116,16 +124,30 @@ export const InspectGgufCommand: CommandModule = { const fileTypeName = getGgufFileTypeName(parsedMetadata.metadata.general?.file_type); if (plainJson || outputToJsonFile != null) { - const outputJson = JSON.stringify({ - splicedParts: parsedMetadata.splicedParts, - version: parsedMetadata.version, - fileType: fileTypeName, - tensorCount: parsedMetadata.totalTensorCount, - metadataSize: parsedMetadata.totalMetadataSize, - tensorInfoSize: parsedMetadata.totalTensorInfoSize, - metadata: parsedMetadata.metadata, - tensorInfo: parsedMetadata.fullTensorInfo - }, undefined, 4); + const getOutputJson = () => { + if (key != null) { + const keyValue = getGgufMetadataKeyValue(parsedMetadata.metadata, key); + if (keyValue === undefined) { + console.log(`Key not found: ${key}`); + process.exit(1); + } + + return JSON.stringify(keyValue, undefined, 4); + } + + return JSON.stringify({ + splicedParts: parsedMetadata.splicedParts, + version: parsedMetadata.version, + fileType: fileTypeName, + tensorCount: parsedMetadata.totalTensorCount, + metadataSize: parsedMetadata.totalMetadataSize, + tensorInfoSize: parsedMetadata.totalTensorInfoSize, + metadata: parsedMetadata.metadata, + tensorInfo: parsedMetadata.fullTensorInfo + }, undefined, 4); + }; + + const outputJson = getOutputJson(); if (outputToJsonFile != null) { const filePath = path.resolve(process.cwd(), outputToJsonFile); @@ -134,6 +156,27 @@ export const InspectGgufCommand: CommandModule = { } else { console.info(outputJson); } + } else if (key != null) { + const keyValue = getGgufMetadataKeyValue(parsedMetadata.metadata, key); + if (keyValue === undefined) { + console.log(`${chalk.red("Metadata key not found:")} ${key}`); + process.exit(1); + } + + const metadataPrettyPrintOptions: PrettyPrintObjectOptions = { + maxArrayValues: fullMetadataArrays + ? undefined + : 10, + useNumberGrouping: true, + maxArrayItemsWidth: process.stdout.columns - 1 + }; + + console.info(`${chalk.yellow("Metadata key:")} ${prettyPrintObject(key)}`); + console.info(`${chalk.yellow("Metadata:")} ${ + typeof keyValue === "string" + ? 
keyValue + : prettyPrintObject(keyValue, undefined, metadataPrettyPrintOptions) + }`); } else { const metadataPrettyPrintOptions: PrettyPrintObjectOptions = { maxArrayValues: fullMetadataArrays diff --git a/src/cli/commands/inspect/commands/InspectGpuCommand.ts b/src/cli/commands/inspect/commands/InspectGpuCommand.ts index c3a710c3..4e8e47ac 100644 --- a/src/cli/commands/inspect/commands/InspectGpuCommand.ts +++ b/src/cli/commands/inspect/commands/InspectGpuCommand.ts @@ -129,6 +129,9 @@ export const InspectGpuCommand: CommandModule = { } } + if (lastLlama == null) + await loadLlamaForGpu(false); + for (const gpu of gpusToLogVramUsageOf) { const llama = gpuToLlama.get(gpu); if (llama == null) @@ -140,6 +143,9 @@ export const InspectGpuCommand: CommandModule = { console.info(); await logRamUsage(lastLlama?.cpuMathCores); + + if (lastLlama != null) + await logSwapUsage(lastLlama); } }; @@ -162,14 +168,17 @@ async function getLlamaForGpu(gpu: BuildGpu) { async function logGpuVramUsage(gpu: BuildGpu, llama: Llama) { try { const gpuName = getPrettyBuildGpuName(gpu); - const vramStatus = await llama.getVramState(); + const vramState = await llama.getVramState(); const gpuDeviceNames = await llama.getGpuDeviceNames(); if (gpuDeviceNames.length > 0) console.info(`${chalk.yellow(`${gpuName} device${gpuDeviceNames.length > 1 ? "s" : ""}:`)} ${gpuDeviceNames.join(", ")}`); - console.info(`${chalk.yellow(`${gpuName} used VRAM:`)} ${getPercentageString(vramStatus.used, vramStatus.total)}% ${chalk.gray("(" + bytes(vramStatus.used) + "/" + bytes(vramStatus.total) + ")")}`); - console.info(`${chalk.yellow(`${gpuName} free VRAM:`)} ${getPercentageString(vramStatus.free, vramStatus.total)}% ${chalk.gray("(" + bytes(vramStatus.free) + "/" + bytes(vramStatus.total) + ")")}`); + console.info(`${chalk.yellow(`${gpuName} used VRAM:`)} ${getPercentageString(vramState.used, vramState.total)}% ${chalk.gray("(" + bytes(vramState.used) + "/" + bytes(vramState.total) + ")")}`); + console.info(`${chalk.yellow(`${gpuName} free VRAM:`)} ${getPercentageString(vramState.free, vramState.total)}% ${chalk.gray("(" + bytes(vramState.free) + "/" + bytes(vramState.total) + ")")}`); + + if (vramState.unifiedSize > 0) + console.info(`${chalk.yellow(`${gpuName} unified memory:`)} ${bytes(vramState.unifiedSize)} ${chalk.gray("(" + getPercentageString(vramState.unifiedSize, vramState.total) + "%)")}`); } catch (err) {} } @@ -195,6 +204,13 @@ async function logRamUsage(cpuMathCores?: number) { console.info(`${chalk.yellow("Free RAM:")} ${getPercentageString(freeMemory, totalMemory)}% ${chalk.gray("(" + bytes(freeMemory) + "/" + bytes(totalMemory) + ")")}`); } +async function logSwapUsage(llama: Llama) { + const swapState = await llama.getSwapState(); + + console.info(`${chalk.yellow("Used swap:")} ${getPercentageString(swapState.used, swapState.allocated)}% ${chalk.gray("(" + bytes(swapState.used) + "/" + bytes(swapState.allocated) + ")")}`); + console.info(`${chalk.yellow("Max swap size:")} ${swapState.maxSize === Infinity ? 
"dynamic" : bytes(swapState.maxSize)}`); +} + function getPercentageString(amount: number, total: number) { if (total === 0) return "0"; diff --git a/src/cli/utils/interactivelyAskForModel.ts b/src/cli/utils/interactivelyAskForModel.ts index 7dc264a2..9201e3e3 100644 --- a/src/cli/utils/interactivelyAskForModel.ts +++ b/src/cli/utils/interactivelyAskForModel.ts @@ -544,6 +544,9 @@ function renderRecommendedModelTechnicalInfo( show: canUseGpu, title: "VRAM usage", value: () => bytes(compatibilityScore.resolvedValues.totalVramUsage) + }, { + title: "RAM usage", + value: () => bytes(compatibilityScore.resolvedValues.totalRamUsage) }] }) ].join("\n"); diff --git a/src/evaluator/LlamaChat/LlamaChat.ts b/src/evaluator/LlamaChat/LlamaChat.ts index 15c8f1b3..7e265929 100644 --- a/src/evaluator/LlamaChat/LlamaChat.ts +++ b/src/evaluator/LlamaChat/LlamaChat.ts @@ -2166,21 +2166,24 @@ class GenerateResponseState 0) - firstDifferentIndex -= 1; - - this.tokens.splice(0, firstDifferentIndex); - - if (firstDifferentIndex < this.llamaChat.sequence.nextTokenIndex) { + if (this.tokens.length === 1 && this.llamaChat.sequence.nextTokenIndex !== 0) { await this.llamaChat.sequence.eraseContextTokenRanges([{ - start: firstDifferentIndex, + start: 0, end: this.llamaChat.sequence.nextTokenIndex }]); - this.ensureNotAborted(); + return; } + + const lastToken = this.tokens[this.tokens.length - 1]!; + + // we need to decode at least one token to generate a response + this.tokens.pop(); + await this.llamaChat.sequence.adaptStateToTokens(this.tokens, false); + this.tokens.push(lastToken); + this.ensureNotAborted(); + + const firstDifferentIndex = this.llamaChat.sequence.nextTokenIndex; + this.tokens.splice(0, firstDifferentIndex); } public async evaluateWithoutGeneratingNewTokens() { diff --git a/src/evaluator/LlamaChat/utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.ts b/src/evaluator/LlamaChat/utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.ts index 91f0bfc7..ad96d83b 100644 --- a/src/evaluator/LlamaChat/utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.ts +++ b/src/evaluator/LlamaChat/utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.ts @@ -30,6 +30,8 @@ export async function eraseFirstResponseAndKeepFirstSystemChatContextShiftStrate initialCharactersRemovalCount, tokenizer, chatWrapper, + failedCompressionErrorMessage: "Failed to compress chat history for context shift due to a too long prompt or system message that cannot be compressed without affecting the generation quality. 
" + + "Consider increasing the context size or shortening the long prompt or system message.", compressChatHistory({chatHistory, charactersToRemove, estimatedCharactersPerToken}) { const res = chatHistory.map(item => structuredClone(item)); let charactersLeftToRemove = charactersToRemove; @@ -66,6 +68,8 @@ export async function eraseFirstResponseAndKeepFirstSystemChatContextShiftStrate } function removeHistoryThatLedToModelResponseAtIndex(index: number) { + let removedItems = 0; + for (let i = index - 1; i >= 0; i--) { const historyItem = res[i]; @@ -79,13 +83,19 @@ export async function eraseFirstResponseAndKeepFirstSystemChatContextShiftStrate break; // keep the first system message if (historyItem.type === "user" || historyItem.type === "system") { - const newText = truncateLlamaTextAndRoundToWords(LlamaText.fromJSON(historyItem.text), charactersLeftToRemove); + const newText = truncateLlamaTextAndRoundToWords( + LlamaText.fromJSON(historyItem.text), + charactersLeftToRemove, + undefined, + false + ); const newTextString = newText.toString(); const historyItemString = LlamaText.fromJSON(historyItem.text).toString(); if (newText.values.length === 0) { res.splice(i, 1); i++; + removedItems++; charactersLeftToRemove -= historyItemString.length; } else if (newTextString.length < historyItemString.length) { charactersLeftToRemove -= historyItemString.length - newTextString.length; @@ -98,6 +108,66 @@ export async function eraseFirstResponseAndKeepFirstSystemChatContextShiftStrate void (historyItem satisfies never); } } + + return removedItems; + } + + function compressHistoryThatLedToModelResponseAtIndex(index: number, keepTokensCount: number = 0) { + let removedItems = 0; + let promptStartIndex: number | undefined = undefined; + + for (let i = index - 1; i >= 0; i--) { + const historyItem = res[i]; + + if (historyItem == null) + continue; + + if (historyItem.type === "model") { + promptStartIndex = i + 1; + break; + } + + if (i === 0 && historyItem.type === "system") { + promptStartIndex = i + 1; + break; // keep the first system message + } + } + + if (promptStartIndex == null || promptStartIndex >= index) + return 0; + + for (let i = promptStartIndex; i < index && charactersLeftToRemove > 0; i++) { + const historyItem = res[i]; + + if (historyItem == null || historyItem.type !== "user") + continue; + + let removeChars = Math.min(charactersLeftToRemove, historyItem.text.length); + if (keepTokensCount > 0) { + removeChars -= Math.floor(keepTokensCount * estimatedCharactersPerToken); + if (removeChars < 0) + removeChars = 0; + + keepTokensCount -= Math.min( + keepTokensCount, + Math.max(0, historyItem.text.length - removeChars) / estimatedCharactersPerToken + ); + } + + const newText = truncateTextAndRoundToWords(historyItem.text, removeChars, undefined, false); + if (newText.length === 0) { + res.splice(i, 1); + i--; + index--; + removedItems++; + charactersLeftToRemove -= historyItem.text.length; + } else { + charactersLeftToRemove -= historyItem.text.length - newText.length; + historyItem.text = newText; + } + } + + return removedItems; } function compressFirstModelResponse() { @@ -116,7 +186,7 @@ export async function eraseFirstResponseAndKeepFirstSystemChatContextShiftStrate continue; if (typeof item === "string") { - const newText = truncateTextAndRoundToWords(item, charactersLeftToRemove); + const newText = truncateTextAndRoundToWords(item, charactersLeftToRemove, undefined, true); if (newText === "") { historyItem.response.splice(t, 1); @@ -139,14 +209,14 @@ export async function 
eraseFirstResponseAndKeepFirstSystemChatContextShiftStrate if (historyItem.response.length === 0) { // if the model response is removed from the history, // the things that led to it are not important anymore - removeHistoryThatLedToModelResponseAtIndex(i); + i -= removeHistoryThatLedToModelResponseAtIndex(i); res.splice(i, 1); i--; } } } - function compressLastModelResponse(minCharactersToKeep: number = 20) { + function compressLastModelResponse(minCharactersToKeep: number = 60) { const lastHistoryItem = res[res.length - 1]; if (lastHistoryItem == null || lastHistoryItem.type !== "model") @@ -157,14 +227,27 @@ export async function eraseFirstResponseAndKeepFirstSystemChatContextShiftStrate if (lastResponseItem == null || typeof lastResponseItem !== "string") return; - const nextTextLength = lastResponseItem.length - charactersLeftToRemove; - const charactersToRemoveFromText = charactersLeftToRemove + Math.max(0, nextTextLength - minCharactersToKeep); - const newText = truncateTextAndRoundToWords(lastResponseItem, charactersToRemoveFromText); + compressHistoryThatLedToModelResponseAtIndex(res.length - 1, maxTokensCount / 4); + + if (charactersLeftToRemove <= 0) + return; + + const nextTextLength = Math.max( + Math.min(lastResponseItem.length, minCharactersToKeep), + lastResponseItem.length - charactersLeftToRemove + ); + const charactersToRemoveFromText = lastResponseItem.length - nextTextLength; + const newText = truncateTextAndRoundToWords(lastResponseItem, charactersToRemoveFromText, undefined, true); if (newText.length < lastResponseItem.length) { lastHistoryItem.response[lastHistoryItem.response.length - 1] = newText; charactersLeftToRemove -= lastResponseItem.length - newText.length; } + + if (charactersLeftToRemove <= 0) + return; + + compressHistoryThatLedToModelResponseAtIndex(res.length - 1); } compressFunctionCalls(); diff --git a/src/evaluator/LlamaChatSession/LlamaChatSession.ts b/src/evaluator/LlamaChatSession/LlamaChatSession.ts index 41e2f952..cc1d0563 100644 --- a/src/evaluator/LlamaChatSession/LlamaChatSession.ts +++ b/src/evaluator/LlamaChatSession/LlamaChatSession.ts @@ -148,6 +148,14 @@ export type LLamaChatPromptOptions 0) - firstDifferentIndex -= 1; - - inputTokens.splice(0, firstDifferentIndex); - - if (firstDifferentIndex < sequence.nextTokenIndex) { + if (inputTokens.length === 1 && sequence.nextTokenIndex !== 0) await sequence.eraseContextTokenRanges([{ - start: firstDifferentIndex, + start: 0, end: sequence.nextTokenIndex }]); + else { + const lastToken = inputTokens[inputTokens.length - 1]!; + + // we need to decode at least one token to generate a response + inputTokens.pop(); + await sequence.adaptStateToTokens(inputTokens, false); + inputTokens.push(lastToken); ensureNotAborted(); + + const firstDifferentIndex = sequence.nextTokenIndex; + inputTokens.splice(0, firstDifferentIndex); } const evaluationIterator = sequence.evaluate(inputTokens, removeNullFields({ diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts index 845efd2e..4a241635 100644 --- a/src/evaluator/LlamaContext/LlamaContext.ts +++ b/src/evaluator/LlamaContext/LlamaContext.ts @@ -702,19 +702,22 @@ export class LlamaContext { async function createContext(contextSize: number) { const batchSize = options.batchSize ?? 
getDefaultContextBatchSize({contextSize, sequences}); - const vramRequiredEstimate = _model.fileInsights.estimateContextResourceRequirements({ + const resourceRequirementsEstimation = _model.fileInsights.estimateContextResourceRequirements({ contextSize, sequences, isEmbeddingContext: options._embeddings, modelGpuLayers: _model.gpuLayers, batchSize, flashAttention - }).gpuVram; + }); const context = new LlamaContext({_model}, {...options, contextSize, batchSize, sequences, flashAttention}); - const contextCreationMemoryReservation = options.ignoreMemorySafetyChecks + const contextCreationVramReservation = options.ignoreMemorySafetyChecks + ? null + : _model._llama._vramOrchestrator.reserveMemory(resourceRequirementsEstimation.gpuVram); + const contextCreationRamReservation = options.ignoreMemorySafetyChecks ? null - : _model._llama._vramOrchestrator.reserveMemory(vramRequiredEstimate); + : _model._llama._vramOrchestrator.reserveMemory(resourceRequirementsEstimation.cpuRam); try { if (createSignal?.aborted) @@ -730,7 +733,8 @@ export class LlamaContext { } else if (!contextLoaded) throw new Error("Failed to create context"); - contextCreationMemoryReservation?.dispose?.(); + contextCreationVramReservation?.dispose?.(); + contextCreationRamReservation?.dispose?.(); if (loraOptions != null && loraOptions.adapters.length > 0) { let loadedAdapters = 0; @@ -768,7 +772,8 @@ export class LlamaContext { return context; } finally { - contextCreationMemoryReservation?.dispose?.(); + contextCreationVramReservation?.dispose?.(); + contextCreationRamReservation?.dispose?.(); } } @@ -904,6 +909,61 @@ export class LlamaContextSequence { }; } + /** + * Erase parts of the context state to align it with the given tokens. + * + * If the given tokens do not align with the current context state, the context state will be erased to align with the given tokens. + * + * To find the first different token index between the context state and the given tokens, access the `nextTokenIndex` property. + * + * If `allowShift` is `true` (the default), shifting tokens may happen to align the context state with the given tokens, + * which incurs token evaluation of the shifted tokens. + */ + public async adaptStateToTokens(tokens: Token[], allowShift: boolean = true) { + if (this.model.fileInsights.isRecurrent || !allowShift) { + const {firstDifferentIndex} = this.compareContextTokens(tokens); + if (firstDifferentIndex < this._nextTokenIndex) + await this.eraseContextTokenRanges([{ + start: firstDifferentIndex, + end: this._nextTokenIndex + }]); + + return; + } + + const eraseRanges: ContextTokensDeleteRange[] = []; + + let tokensIndex = 0; + let differentTokenIndex: number | undefined = undefined; + for (let i = 0; i < this._contextTokens.length && tokensIndex < tokens.length; i++) { + if (compareTokens(this._contextTokens[i], tokens[tokensIndex])) { + if (differentTokenIndex != null) { + eraseRanges.push({ + start: differentTokenIndex, + end: i + }); + + differentTokenIndex = undefined; + } + + tokensIndex++; + continue; + } + + if (differentTokenIndex == null) + differentTokenIndex = i; + } + + if (differentTokenIndex != null) + eraseRanges.push({ + start: differentTokenIndex, + end: this._nextTokenIndex + }); + + if (eraseRanges.length > 0) + await this.eraseContextTokenRanges(eraseRanges); + } + /** * Clear the history of the sequence. * If `prependBos` was enabled, the BOS token will be prepended to the sequence again. 
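The `adaptStateToTokens` method added above is public API on `LlamaContextSequence`; a hedged usage sketch (the model path and prompt are placeholders, error handling is omitted):

```ts
// Hedged usage sketch of the new LlamaContextSequence.adaptStateToTokens() API.
import {getLlama} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "./models/example.gguf"}); // placeholder path
const context = await model.createContext();
const sequence = context.getSequence();

const tokens = model.tokenize("Hello, world!");

// Align the sequence state with `tokens`. With `allowShift` left at its default
// (true), matching tokens may be shifted to preserve as much state as possible;
// for recurrent models (e.g. Mamba) or with `allowShift: false`, the state is
// erased from the first token that differs instead.
await sequence.adaptStateToTokens(tokens);

// `nextTokenIndex` now reflects how much of the previous state was kept.
console.log(sequence.nextTokenIndex);
```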
@@ -970,15 +1030,23 @@ export class LlamaContextSequence { if (deletionSuccessful) deletionSuccessful &&= this._context._ctx.removeTokenCellsFromSequence(this._sequenceId, range.start, range.end); - if (deletionSuccessful && lastDeleteRangeEndPos != null && removedTokens > 0 && lastDeleteRangeEndPos !== range.start) + if (deletionSuccessful && lastDeleteRangeEndPos != null && removedTokens > 0 && lastDeleteRangeEndPos !== range.start) { this._context._ctx.shiftSequenceTokenCells(this._sequenceId, lastDeleteRangeEndPos, range.start, -removedTokens); + const shiftedTokens = range.start - lastDeleteRangeEndPos; + this._tokenMeter.useTokens(shiftedTokens, "input"); + } removedTokens += range.end - range.start; lastDeleteRangeEndPos = range.end; } - if (deletionSuccessful && lastDeleteRangeEndPos != null && removedTokens > 0 && lastDeleteRangeEndPos !== this._nextTokenIndex) + if (deletionSuccessful && lastDeleteRangeEndPos != null && removedTokens > 0 && + lastDeleteRangeEndPos !== this._nextTokenIndex + ) { this._context._ctx.shiftSequenceTokenCells(this._sequenceId, lastDeleteRangeEndPos, this._nextTokenIndex, -removedTokens); + const shiftedTokens = this._nextTokenIndex - lastDeleteRangeEndPos; + this._tokenMeter.useTokens(shiftedTokens, "input"); + } this._nextTokenIndex -= removedTokens; diff --git a/src/evaluator/LlamaModel/LlamaModel.ts b/src/evaluator/LlamaModel/LlamaModel.ts index 7d0f31a3..8cdc5e52 100644 --- a/src/evaluator/LlamaModel/LlamaModel.ts +++ b/src/evaluator/LlamaModel/LlamaModel.ts @@ -684,7 +684,7 @@ export class LlamaModel { ignoreMemorySafetyChecks: modelOptions.ignoreMemorySafetyChecks, defaultContextFlashAttention: resolvedDefaultContextFlashAttention }); - const vramRequiredEstimate = ggufInsights.estimateModelResourceRequirements({gpuLayers: gpuLayers}).gpuVram; + const resourceRequirementsEstimation = ggufInsights.estimateModelResourceRequirements({gpuLayers: gpuLayers}); const model = new LlamaModel({...modelOptions, gpuLayers, useMmap}, { _fileInfo: fileInfo, @@ -694,9 +694,12 @@ export class LlamaModel { _flashAttentionSupported: flashAttentionSupported, _defaultContextFlashAttention: resolvedDefaultContextFlashAttention }); - const modelCreationMemoryReservation = modelOptions.ignoreMemorySafetyChecks + const modelCreationVramReservation = modelOptions.ignoreMemorySafetyChecks ? null - : _llama._vramOrchestrator.reserveMemory(vramRequiredEstimate); + : _llama._vramOrchestrator.reserveMemory(resourceRequirementsEstimation.gpuVram); + const modelCreationRamReservation = modelOptions.ignoreMemorySafetyChecks + ? 
null + : _llama._ramOrchestrator.reserveMemory(resourceRequirementsEstimation.cpuRam); const loggedWarnings = new Set(); function onAbort() { @@ -741,7 +744,8 @@ export class LlamaModel { return model; } finally { loadSignal?.removeEventListener("abort", onAbort); - modelCreationMemoryReservation?.dispose?.(); + modelCreationVramReservation?.dispose?.(); + modelCreationRamReservation?.dispose?.(); } } } diff --git a/src/gguf/insights/GgufInsights.ts b/src/gguf/insights/GgufInsights.ts index e6e4bca3..5833e5a4 100644 --- a/src/gguf/insights/GgufInsights.ts +++ b/src/gguf/insights/GgufInsights.ts @@ -104,6 +104,16 @@ export class GgufInsights { return true; } + public get isRecurrent() { + switch (this._ggufFileInfo.metadata?.general?.architecture) { + case GgufArchitectureType.mamba: + case GgufArchitectureType.rwkv6: + return true; + } + + return false; + } + public estimateModelResourceRequirements({gpuLayers}: {gpuLayers: number}): GgufInsightsResourceRequirements { const {cpu, gpu} = this._getTensorResourceSplit(gpuLayers); diff --git a/src/gguf/insights/GgufInsightsConfigurationResolver.ts b/src/gguf/insights/GgufInsightsConfigurationResolver.ts index edf56084..ce57ee1d 100644 --- a/src/gguf/insights/GgufInsightsConfigurationResolver.ts +++ b/src/gguf/insights/GgufInsightsConfigurationResolver.ts @@ -1,14 +1,16 @@ -import os from "os"; import {BuildGpu} from "../../bindings/types.js"; import {LlamaModelOptions} from "../../evaluator/LlamaModel/LlamaModel.js"; import {LlamaContextOptions} from "../../evaluator/LlamaContext/types.js"; import {getDefaultContextSequences} from "../../evaluator/LlamaContext/LlamaContext.js"; +import {InsufficientMemoryError} from "../../utils/InsufficientMemoryError.js"; import {resolveModelGpuLayersOption} from "./utils/resolveModelGpuLayersOption.js"; import {resolveContextContextSizeOption} from "./utils/resolveContextContextSizeOption.js"; import {scoreLevels} from "./utils/scoreLevels.js"; +import {getRamUsageFromUnifiedVram} from "./utils/getRamUsageFromUnifiedVram.js"; import type {GgufInsights} from "./GgufInsights.js"; export const defaultTrainContextSizeForEstimationPurposes = 4096; +const defaultContextSizeForUnfitContextSizeConfiguration = 2048; export class GgufInsightsConfigurationResolver { @@ -44,13 +46,15 @@ export class GgufInsightsConfigurationResolver { flashAttention?: boolean } = {}, { getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), - getRamState = (async () => ({total: os.totalmem(), free: os.freemem()})), + getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()), + getSwapState = (() => this._ggufInsights._llama._swapOrchestrator.getMemoryState()), llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize, llamaGpu = this._ggufInsights._llama.gpu, llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading }: { - getVramState?(): Promise<{total: number, free: number}>, + getVramState?(): Promise<{total: number, free: number, unifiedSize: number}>, getRamState?(): Promise<{total: number, free: number}>, + getSwapState?(): Promise<{total: number, free: number}>, llamaVramPaddingSize?: number, llamaGpu?: BuildGpu, llamaSupportsGpuOffloading?: boolean @@ -58,78 +62,18 @@ export class GgufInsightsConfigurationResolver { const compatibilityScore = await this.scoreModelConfigurationCompatibility({ flashAttention, contextSize: targetContextSize, - embeddingContext + embeddingContext, + forceGpuLayers: targetGpuLayers, + forceStrictContextSize: 
targetContextSize != null }, { getVramState, getRamState, + getSwapState, llamaVramPaddingSize, llamaGpu, llamaSupportsGpuOffloading }); - if (targetContextSize != null || targetGpuLayers != null) { - const vramState = await getVramState(); - const resolvedGpuLayers = await this.resolveModelGpuLayers( - targetGpuLayers == null - ? { - fitContext: { - contextSize: targetContextSize, - embeddingContext - } - } - : targetGpuLayers, - { - getVramState: async () => vramState, - defaultContextFlashAttention: flashAttention, - ignoreMemorySafetyChecks: targetGpuLayers != null, - llamaGpu, - llamaSupportsGpuOffloading, - llamaVramPaddingSize - } - ); - const estimatedModelResourceUsage = this._ggufInsights.estimateModelResourceRequirements({ - gpuLayers: resolvedGpuLayers - }); - - const resolvedContextSize = await this._ggufInsights.configurationResolver.resolveContextContextSize(targetContextSize ?? "auto", { - getVramState: async () => ({ - total: vramState.total, - free: Math.max(0, vramState.free - estimatedModelResourceUsage.gpuVram) - }), - isEmbeddingContext: embeddingContext, - modelGpuLayers: resolvedGpuLayers, - modelTrainContextSize: this._ggufInsights.trainContextSize ?? defaultTrainContextSizeForEstimationPurposes, - flashAttention, - ignoreMemorySafetyChecks: targetContextSize != null, - llamaGpu - }); - const estimatedContextResourceUsage = this._ggufInsights.estimateContextResourceRequirements({ - contextSize: resolvedContextSize, - isEmbeddingContext: embeddingContext, - modelGpuLayers: resolvedGpuLayers, - flashAttention - }); - - compatibilityScore.resolvedValues = { - gpuLayers: resolvedGpuLayers, - contextSize: resolvedContextSize, - - modelRamUsage: estimatedModelResourceUsage.cpuRam, - contextRamUsage: estimatedContextResourceUsage.cpuRam, - totalRamUsage: estimatedModelResourceUsage.cpuRam + estimatedContextResourceUsage.cpuRam, - - modelVramUsage: estimatedModelResourceUsage.gpuVram, - contextVramUsage: estimatedContextResourceUsage.gpuVram, - totalVramUsage: estimatedModelResourceUsage.gpuVram + estimatedContextResourceUsage.gpuVram - }; - - if (compatibilityScore.resolvedValues.totalVramUsage > vramState.total) { - compatibilityScore.compatibilityScore = 0; - compatibilityScore.bonusScore = 0; - compatibilityScore.totalScore = 0; - } - } - return compatibilityScore; } @@ -148,27 +92,46 @@ export class GgufInsightsConfigurationResolver { * Set this to any value higher than ` / contextSize`. * Defaults to `100`. * + * `maximumUnfitConfigurationResourceMultiplier` is used to improve the proportionality of the bonus score between unfit models. + * Set this to any value higher than ` / `. + * Defaults to `100`. + * * `contextSize` defaults to `4096` (if the model train context size is lower than this, the model train context size is used instead). */ public async scoreModelConfigurationCompatibility({ contextSize = Math.min(4096, this._ggufInsights.trainContextSize ?? 4096), embeddingContext = false, flashAttention = false, - maximumFittedContextSizeMultiplier = 100 + maximumFittedContextSizeMultiplier = 100, + maximumUnfitConfigurationResourceMultiplier = 100, + forceStrictContextSize = false, + forceGpuLayers }: { contextSize?: number, embeddingContext?: boolean, flashAttention?: boolean, - maximumFittedContextSizeMultiplier?: number + maximumFittedContextSizeMultiplier?: number, + maximumUnfitConfigurationResourceMultiplier?: number, + + /** + * Do not resolve a context size larger than the specified `contextSize`. + * + * Defaults to `false`. 
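As a rough sketch of how the new `forceStrictContextSize` and `forceGpuLayers` options might be exercised through a loaded model's `fileInsights` (the model path is a placeholder; the returned fields follow the shape visible in this diff):

```ts
// Hedged sketch: scoring a model configuration with the new options from this
// patch, via a loaded model's fileInsights. The GGUF path is a placeholder.
import {getLlama} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "./models/example.gguf"}); // placeholder path

const score = await model.fileInsights.configurationResolver.scoreModelConfigurationCompatibility({
    contextSize: 8192,
    flashAttention: false,

    // new in this patch: never resolve a context size larger than `contextSize`
    forceStrictContextSize: true,

    // new in this patch: score a fixed GPU layer split instead of resolving one
    forceGpuLayers: "max"
});

// An unfit configuration now gets a compatibilityScore of 0 and a small bonus
// score proportional to how far the requirements exceed the available resources.
console.log(score.compatibilityScore, score.bonusScore, score.totalScore);
```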
+ */ + forceStrictContextSize?: boolean, + + forceGpuLayers?: number | "max" } = {}, { getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), - getRamState = (async () => ({total: os.totalmem(), free: os.freemem()})), + getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()), + getSwapState = (() => this._ggufInsights._llama._swapOrchestrator.getMemoryState()), llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize, llamaGpu = this._ggufInsights._llama.gpu, llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading }: { - getVramState?(): Promise<{total: number, free: number}>, + getVramState?(): Promise<{total: number, free: number, unifiedSize: number}>, getRamState?(): Promise<{total: number, free: number}>, + getSwapState?(): Promise<{total: number, free: number}>, llamaVramPaddingSize?: number, llamaGpu?: BuildGpu, llamaSupportsGpuOffloading?: boolean @@ -207,39 +170,100 @@ export class GgufInsightsConfigurationResolver { }> { const [ vramState, - ramState + ramState, + swapState ] = await Promise.all([ getVramState(), - getRamState() + getRamState(), + getSwapState() ]); - const resolvedGpuLayers = await this.resolveModelGpuLayers( - embeddingContext - ? {fitContext: {embeddingContext: true}} - : "auto", - { - getVramState: async () => vramState, - llamaVramPaddingSize, - llamaGpu, - llamaSupportsGpuOffloading, - defaultContextFlashAttention: flashAttention - } - ); + let resolvedGpuLayers = (forceGpuLayers == null || forceGpuLayers == "max") + ? this.ggufInsights.totalLayers + : forceGpuLayers; + let gpuLayersFitMemory = false; + + try { + resolvedGpuLayers = await this.resolveModelGpuLayers( + forceGpuLayers != null + ? forceGpuLayers + : embeddingContext + ? { + fitContext: { + embeddingContext: true, + contextSize: forceStrictContextSize + ? contextSize + : undefined + } + } + : forceStrictContextSize != null + ? {fitContext: {contextSize}} + : "auto", + { + getVramState: async () => vramState, + llamaVramPaddingSize, + llamaGpu, + llamaSupportsGpuOffloading, + defaultContextFlashAttention: flashAttention, + ignoreMemorySafetyChecks: forceGpuLayers != null + } + ); + gpuLayersFitMemory = true; + } catch (err) { + if (!(err instanceof InsufficientMemoryError)) + throw err; + } + const canUseGpu = llamaSupportsGpuOffloading && llamaGpu !== false; const estimatedModelResourceUsage = this._ggufInsights.estimateModelResourceRequirements({ gpuLayers: resolvedGpuLayers }); - const resolvedContextSize = await this.resolveContextContextSize("auto", { - getVramState: async () => ({ - total: vramState.total, - free: Math.max(0, vramState.free - estimatedModelResourceUsage.gpuVram) - }), - llamaGpu, - isEmbeddingContext: embeddingContext, - modelGpuLayers: resolvedGpuLayers, - modelTrainContextSize: this._ggufInsights.trainContextSize ?? defaultTrainContextSizeForEstimationPurposes, - flashAttention - }); + let resolvedContextSize = Math.min( + this.ggufInsights.trainContextSize ?? 
defaultContextSizeForUnfitContextSizeConfiguration, + defaultContextSizeForUnfitContextSizeConfiguration + ); + let contextFitsMemory = false; + + try { + resolvedContextSize = await this.resolveContextContextSize("auto", { + getVramState: async () => ({ + total: vramState.total, + free: Math.max(0, vramState.free - estimatedModelResourceUsage.gpuVram), + unifiedSize: vramState.unifiedSize + }), + getRamState: async () => ({ + total: ramState.total, + free: Math.max( + 0, + ramState.free - estimatedModelResourceUsage.cpuRam + + (-getRamUsageFromUnifiedVram(estimatedModelResourceUsage.gpuVram, vramState)) + ) + }), + getSwapState: async () => ({ + total: swapState.total, + free: Math.max( + 0, + swapState.free - Math.max( + 0, + estimatedModelResourceUsage.cpuRam + + (-getRamUsageFromUnifiedVram(estimatedModelResourceUsage.gpuVram, vramState)) + + (-ramState.free) + ) + ) + }), + llamaGpu, + isEmbeddingContext: embeddingContext, + modelGpuLayers: resolvedGpuLayers, + modelTrainContextSize: this._ggufInsights.trainContextSize ?? defaultTrainContextSizeForEstimationPurposes, + ignoreMemorySafetyChecks: forceStrictContextSize, + flashAttention + }); + contextFitsMemory = true; + } catch (err) { + if (!(err instanceof InsufficientMemoryError)) + throw err; + } + const estimatedContextResourceUsage = this._ggufInsights.estimateContextResourceRequirements({ contextSize: resolvedContextSize, isEmbeddingContext: embeddingContext, @@ -252,7 +276,7 @@ export class GgufInsightsConfigurationResolver { allLayersAreOffloaded: 10, contextSize: 30, ramUsageFitsInRam: 10, - cpuOnlySmallModelSize: 60, // also defined inside `scoreModelSizeForCpuOnlyUsage` + cpuOnlySmallModelSize: 70, // also defined inside `scoreModelSizeForCpuOnlyUsage` bonusContextSize: 10 } as const; @@ -260,29 +284,37 @@ export class GgufInsightsConfigurationResolver { const allLayersAreOffloadedPoints = rankPoints.allLayersAreOffloaded * ( resolvedGpuLayers === this._ggufInsights.totalLayers ? 1 : 0 ); - const contextSizePoints = rankPoints.contextSize * Math.min(1, resolvedContextSize / contextSize); + const contextSizePoints = contextFitsMemory + ? rankPoints.contextSize * Math.min(1, resolvedContextSize / contextSize) + : 0; const ramUsageFitsInRamPoints = rankPoints.ramUsageFitsInRam * ( estimatedModelResourceUsage.cpuRam <= ramState.free ? 1 - : estimatedModelResourceUsage.cpuRam <= ramState.total - ? 0.5 - : ( - 0.5 - Math.min( - 0.5, - 0.5 * ( - (estimatedModelResourceUsage.cpuRam - ramState.total) / ramState.total + : estimatedModelResourceUsage.cpuRam <= ramState.free + swapState.free + ? 0.8 + : estimatedModelResourceUsage.cpuRam <= ramState.total + ? 0.5 + : ( + 0.5 - Math.min( + 0.5, + 0.5 * ( + (estimatedModelResourceUsage.cpuRam - ramState.total) / ramState.total + ) ) ) - ) - ); - const bonusContextSizePoints = 10 * Math.min( - 1, - ( - Math.max(0, resolvedContextSize - contextSize) / contextSize - ) / maximumFittedContextSizeMultiplier ); + const bonusContextSizePoints = contextFitsMemory + ? ( + 10 * Math.min( + 1, + ( + Math.max(0, resolvedContextSize - contextSize) / contextSize + ) / maximumFittedContextSizeMultiplier + ) + ) + : 0; - const compatibilityScore = canUseGpu + let compatibilityScore = canUseGpu ? 
( (gpuLayersPoints + allLayersAreOffloadedPoints + contextSizePoints + ramUsageFitsInRamPoints) / (rankPoints.gpuLayers + rankPoints.allLayersAreOffloaded + rankPoints.contextSize + rankPoints.ramUsageFitsInRam) @@ -290,7 +322,21 @@ export class GgufInsightsConfigurationResolver { : ( (contextSizePoints + ramUsageFitsInRamPoints + scoreModelSizeForCpuOnlyUsage(this._ggufInsights.modelSize)) / (rankPoints.contextSize + rankPoints.ramUsageFitsInRam + rankPoints.cpuOnlySmallModelSize)); - const bonusScore = bonusContextSizePoints / rankPoints.bonusContextSize; + let bonusScore = bonusContextSizePoints / rankPoints.bonusContextSize; + + if (!gpuLayersFitMemory || !contextFitsMemory || + estimatedModelResourceUsage.gpuVram + estimatedContextResourceUsage.gpuVram > vramState.total || + estimatedModelResourceUsage.cpuRam + estimatedContextResourceUsage.cpuRam > ramState.total + swapState.total + ) { + const totalVramRequirement = estimatedModelResourceUsage.gpuVram + estimatedContextResourceUsage.gpuVram; + const totalRamRequirement = estimatedModelResourceUsage.cpuRam + estimatedContextResourceUsage.cpuRam; + + compatibilityScore = 0; + bonusScore = ( + (1 - (totalVramRequirement / (vramState.total * maximumUnfitConfigurationResourceMultiplier))) + + (1 - (totalRamRequirement / ((ramState.total + swapState.total) * maximumUnfitConfigurationResourceMultiplier))) + ) / 2; + } return { compatibilityScore, @@ -333,12 +379,19 @@ export class GgufInsightsConfigurationResolver { }); } + /** + * Resolve a context size option for the given options and constraints. + * + * If there's no context size that can fit the available resources, an `InsufficientMemoryError` is thrown. + */ public async resolveContextContextSize(contextSize: LlamaContextOptions["contextSize"], { modelGpuLayers, batchSize, modelTrainContextSize, flashAttention = false, getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), + getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()), + getSwapState = (() => this._ggufInsights._llama._swapOrchestrator.getMemoryState()), llamaGpu = this._ggufInsights._llama.gpu, ignoreMemorySafetyChecks = false, isEmbeddingContext = false, @@ -349,7 +402,9 @@ export class GgufInsightsConfigurationResolver { flashAttention?: boolean, batchSize?: LlamaContextOptions["batchSize"], sequences?: number, - getVramState?(): Promise<{total: number, free: number}>, + getVramState?(): Promise<{total: number, free: number, unifiedSize: number}>, + getRamState?(): Promise<{total: number, free: number}>, + getSwapState?(): Promise<{total: number, free: number}>, llamaGpu?: BuildGpu, ignoreMemorySafetyChecks?: boolean, isEmbeddingContext?: boolean @@ -363,6 +418,8 @@ export class GgufInsightsConfigurationResolver { modelTrainContextSize, flashAttention, getVramState, + getRamState, + getSwapState, llamaGpu, ignoreMemorySafetyChecks, isEmbeddingContext @@ -377,16 +434,16 @@ export class GgufInsightsConfigurationResolver { function scoreModelSizeForCpuOnlyUsage(modelSize: number) { const s1GB = Math.pow(1024, 3); - return 60 - scoreLevels(modelSize, [{ + return 70 - scoreLevels(modelSize, [{ start: s1GB, end: s1GB * 2.5, - points: 40 + points: 46 }, { start: s1GB * 2.5, end: s1GB * 4, - points: 15 + points: 17 }, { start: s1GB * 4, - points: 5 + points: 7 }]); } diff --git a/src/gguf/insights/utils/getRamUsageFromUnifiedVram.ts b/src/gguf/insights/utils/getRamUsageFromUnifiedVram.ts new file mode 100644 index 00000000..7db8a9eb --- /dev/null +++ 
b/src/gguf/insights/utils/getRamUsageFromUnifiedVram.ts @@ -0,0 +1,8 @@ +export function getRamUsageFromUnifiedVram(vramUsage: number, vramState: {total: number, free: number, unifiedSize: number}) { + const onlyVramSize = vramState.total - vramState.unifiedSize; + const existingUsage = Math.max(0, vramState.total - vramState.free); + + const unifiedRamUsage = Math.min(vramState.unifiedSize, Math.max(0, vramUsage - Math.max(0, onlyVramSize - existingUsage))); + + return unifiedRamUsage; +} diff --git a/src/gguf/insights/utils/resolveContextContextSizeOption.ts b/src/gguf/insights/utils/resolveContextContextSizeOption.ts index c4bb5fcf..f800f712 100644 --- a/src/gguf/insights/utils/resolveContextContextSizeOption.ts +++ b/src/gguf/insights/utils/resolveContextContextSizeOption.ts @@ -3,10 +3,15 @@ import {GgufInsights} from "../GgufInsights.js"; import {BuildGpu} from "../../../bindings/types.js"; import {minAllowedContextSizeInCalculations} from "../../../config.js"; import {getDefaultContextBatchSize, getDefaultModelContextSize} from "../../../evaluator/LlamaContext/LlamaContext.js"; +import {InsufficientMemoryError} from "../../../utils/InsufficientMemoryError.js"; +import {getRamUsageFromUnifiedVram} from "./getRamUsageFromUnifiedVram.js"; + +const defaultMaxContextSizeSwapUse = 2048; export async function resolveContextContextSizeOption({ - contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, flashAttention, getVramState, llamaGpu, - ignoreMemorySafetyChecks = false, isEmbeddingContext = false + contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, flashAttention, + getVramState, getRamState, getSwapState, ignoreMemorySafetyChecks = false, isEmbeddingContext = false, + maxContextSizeSwapUse = defaultMaxContextSizeSwapUse }: { contextSize?: LlamaContextOptions["contextSize"], batchSize?: LlamaContextOptions["batchSize"], @@ -15,10 +20,13 @@ export async function resolveContextContextSizeOption({ modelGpuLayers: number, modelTrainContextSize: number, flashAttention: boolean, - getVramState(): Promise<{total: number, free: number}>, + getVramState(): Promise<{total: number, free: number, unifiedSize: number}>, + getRamState(): Promise<{total: number, free: number}>, + getSwapState(): Promise<{total: number, free: number}>, llamaGpu: BuildGpu, ignoreMemorySafetyChecks?: boolean, - isEmbeddingContext?: boolean + isEmbeddingContext?: boolean, + maxContextSizeSwapUse?: number }): Promise { if (contextSize == null) contextSize = "auto"; @@ -29,30 +37,42 @@ export async function resolveContextContextSizeOption({ if (ignoreMemorySafetyChecks) return resolvedContextSize; - const vramState = await getVramState(); - const contextVram = modelFileInsights.estimateContextResourceRequirements({ + const [ + vramState, + ramState, + swapState + ] = await Promise.all([ + getVramState(), + getRamState(), + getSwapState() + ]); + const contextResourceRequirements = modelFileInsights.estimateContextResourceRequirements({ contextSize: resolvedContextSize, batchSize: batchSize ?? getDefaultContextBatchSize({contextSize: resolvedContextSize, sequences}), modelGpuLayers: modelGpuLayers, sequences, flashAttention, isEmbeddingContext - }).gpuVram; + }); - if (contextVram > vramState.free) - throw new Error(`The context size of ${resolvedContextSize}${sequences > 1 ? 
` with ${sequences} sequences` : ""} is too large for the available VRAM`); + if (contextResourceRequirements.gpuVram > vramState.free) + throw new InsufficientMemoryError(`A context size of ${resolvedContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available VRAM`); + else if (contextResourceRequirements.cpuRam > ( + ramState.free + swapState.free - getRamUsageFromUnifiedVram(contextResourceRequirements.gpuVram, vramState) + )) + throw new InsufficientMemoryError(`A context size of ${resolvedContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available RAM${swapState.total > 0 ? " (including swap)" : ""}`); return resolvedContextSize; } else if (contextSize === "auto" || typeof contextSize === "object") { - if (llamaGpu === false) - return modelTrainContextSize; - - const vramState = await getVramState(); - - if (vramState.total === 0) - return modelTrainContextSize; - - const freeVram = vramState.free; + const [ + vramState, + ramState, + swapState + ] = await Promise.all([ + getVramState(), + getRamState(), + getSwapState() + ]); const maxContextSize = contextSize === "auto" ? getDefaultModelContextSize({trainContextSize: modelTrainContextSize}) @@ -71,17 +91,25 @@ export async function resolveContextContextSizeOption({ let highestCompatibleContextSize: number | null = null; let step = -Math.max(1, Math.floor((maxContextSize - minContextSize) / 4)); for (let testContextSize = maxContextSize; testContextSize >= minContextSize && testContextSize <= maxContextSize;) { - const contextVram = modelFileInsights.estimateContextResourceRequirements({ + const contextResourceRequirements = modelFileInsights.estimateContextResourceRequirements({ contextSize: testContextSize, batchSize: batchSize ?? getDefaultContextBatchSize({contextSize: testContextSize, sequences}), modelGpuLayers: modelGpuLayers, sequences, flashAttention, isEmbeddingContext - }).gpuVram; - - if (contextVram <= freeVram) { - if (highestCompatibleContextSize == null || testContextSize > highestCompatibleContextSize) { + }); + + if (contextResourceRequirements.gpuVram <= vramState.free && + contextResourceRequirements.cpuRam <= ( + ramState.free - getRamUsageFromUnifiedVram(contextResourceRequirements.gpuVram, vramState) + ( + testContextSize <= maxContextSizeSwapUse + ? swapState.free + : 0 + ) + ) + ) { + if (highestCompatibleContextSize == null || testContextSize >= highestCompatibleContextSize) { highestCompatibleContextSize = testContextSize; if (step === -1) @@ -111,7 +139,28 @@ export async function resolveContextContextSizeOption({ if (ignoreMemorySafetyChecks) return minContextSize; - throw new Error(`The available VRAM is too small to fit the context size of ${maxContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""}`); + const minContextSizeResourceRequirements = modelFileInsights.estimateContextResourceRequirements({ + contextSize: minContextSize, + batchSize: batchSize ?? getDefaultContextBatchSize({contextSize: minContextSize, sequences}), + modelGpuLayers: modelGpuLayers, + sequences, + flashAttention, + isEmbeddingContext + }); + + const unifiedRamUsage = getRamUsageFromUnifiedVram(minContextSizeResourceRequirements.gpuVram, vramState); + if (minContextSizeResourceRequirements.gpuVram > vramState.free && + minContextSizeResourceRequirements.cpuRam > ramState.free + swapState.free - unifiedRamUsage + ) + throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? 
` with ${sequences} sequences` : ""} is too large for the available VRAM and RAM${swapState.total > 0 ? " (including swap)" : ""}`); + else if (minContextSizeResourceRequirements.gpuVram > vramState.free) + throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available VRAM`); + else if (minContextSizeResourceRequirements.cpuRam > ramState.free + swapState.free - unifiedRamUsage) + throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available RAM${swapState.total > 0 ? " (including swap)" : ""}`); + else if (minContextSizeResourceRequirements.cpuRam > ramState.free - unifiedRamUsage) + throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available RAM`); + else + throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available resources`); } throw new Error(`Invalid context size: "${contextSize}"`); diff --git a/src/gguf/utils/getGgufMetadataKeyValue.ts b/src/gguf/utils/getGgufMetadataKeyValue.ts new file mode 100644 index 00000000..b6ce51df --- /dev/null +++ b/src/gguf/utils/getGgufMetadataKeyValue.ts @@ -0,0 +1,34 @@ +export function getGgufMetadataKeyValue(metadata: Record, key: string) { + return readMedataKey(metadata, key.split(".")); +} + +function readMedataKey(metadata: Record, keyParts: string[]): any { + for (const [metadataKey, value] of Object.entries(metadata)) { + const matchLength = checkMatchLength(metadataKey, keyParts); + if (matchLength === 0) + continue; + + if (matchLength === keyParts.length) + return value; + + const res = readMedataKey(value, keyParts.slice(matchLength)); + if (res !== undefined) + return res; + } + + return undefined; +} + +function checkMatchLength(metadataKey: string, keyParts: string[]) { + const metadataKeyParts = metadataKey.split("."); + + if (metadataKeyParts.length > keyParts.length) + return 0; + + for (let i = 0; i < metadataKeyParts.length; i++) { + if (metadataKeyParts[i] !== keyParts[i]) + return 0; + } + + return metadataKeyParts.length; +} diff --git a/src/utils/findCharacterRemovalCountToFitChatHistoryInContext.ts b/src/utils/findCharacterRemovalCountToFitChatHistoryInContext.ts index c0bd096c..27efd9b5 100644 --- a/src/utils/findCharacterRemovalCountToFitChatHistoryInContext.ts +++ b/src/utils/findCharacterRemovalCountToFitChatHistoryInContext.ts @@ -1,6 +1,8 @@ import {ChatHistoryItem, Tokenizer} from "../types.js"; import {ChatWrapper} from "../ChatWrapper.js"; +const maxSequentialUnhelpfulIterations = 100; + export async function findCharacterRemovalCountToFitChatHistoryInContext({ compressChatHistory, chatHistory, @@ -9,7 +11,8 @@ export async function findCharacterRemovalCountToFitChatHistoryInContext({ chatWrapper, initialCharactersRemovalCount = 0, estimatedCharactersPerToken = 5, - maxDecompressionAttempts = 2 + maxDecompressionAttempts = 2, + failedCompressionErrorMessage = "Failed to compress chat history. Consider increasing the context size." 
}: { compressChatHistory(options: { chatHistory: readonly ChatHistoryItem[], charactersToRemove: number, estimatedCharactersPerToken: number @@ -20,7 +23,8 @@ export async function findCharacterRemovalCountToFitChatHistoryInContext({ chatWrapper: ChatWrapper, initialCharactersRemovalCount?: number, estimatedCharactersPerToken?: number, - maxDecompressionAttempts?: number + maxDecompressionAttempts?: number, + failedCompressionErrorMessage?: string }): Promise<{ removedCharactersCount: number, compressedChatHistory: ChatHistoryItem[] @@ -55,6 +59,8 @@ export async function findCharacterRemovalCountToFitChatHistoryInContext({ let latestCompressionAttempt = await getResultForCharacterRemovalCount(initialCharactersRemovalCount); const firstCompressionAttempt = latestCompressionAttempt; + let latestCompressionAttemptTokensCount = latestCompressionAttempt.tokensCount; + let sameTokensCountRepetitions = 0; if (latestCompressionAttempt.tokensCount === tokensCountToFit || (latestCompressionAttempt.tokensCount < tokensCountToFit && latestCompressionAttempt.characterRemovalCount === 0) @@ -116,6 +122,19 @@ export async function findCharacterRemovalCountToFitChatHistoryInContext({ latestCompressionAttempt.characterRemovalCount < bestCompressionAttempt.characterRemovalCount )) bestCompressionAttempt = latestCompressionAttempt; + + if (latestCompressionAttempt.tokensCount === latestCompressionAttemptTokensCount) + sameTokensCountRepetitions++; + else { + latestCompressionAttemptTokensCount = latestCompressionAttempt.tokensCount; + sameTokensCountRepetitions = 0; + } + + if (decompressionAttempts === 0 && + compressionAttempts >= maxSequentialUnhelpfulIterations && + sameTokensCountRepetitions >= maxSequentialUnhelpfulIterations + ) + throw new Error(failedCompressionErrorMessage); } return { diff --git a/src/utils/truncateTextAndRoundToWords.ts b/src/utils/truncateTextAndRoundToWords.ts index 2f2d5c53..26288ef1 100644 --- a/src/utils/truncateTextAndRoundToWords.ts +++ b/src/utils/truncateTextAndRoundToWords.ts @@ -5,68 +5,132 @@ const truncatePrefix = "..."; /** * Truncate the given text starting from the specified index and try to round to the nearest word. * @param text - The text to truncate and round - * @param truncateStartIndex - The index to start truncating the text at + * @param truncateSize - The size of the text to truncate * @param maxRound - The maximum number of extra characters to delete to round to the nearest word + * @param truncateStart - Whether to truncate from the start of the text. If false, truncate from the end. 
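To make the reworked `truncateTextAndRoundToWords` parameters concrete, a hedged usage sketch (results depend on word boundaries, so no exact outputs are asserted; the relative import is illustrative since the helper is internal):

```ts
// Hedged usage sketch of the reworked truncateTextAndRoundToWords signature:
// the second argument is now a number of characters to remove rather than a
// start index, and `truncateStart` selects which end of the text is trimmed.
import {truncateTextAndRoundToWords} from "./truncateTextAndRoundToWords.js";

const text = "The quick brown fox jumps over the lazy dog";

// remove ~10 characters from the end (the default), rounding to a word boundary
const trimmedEnd = truncateTextAndRoundToWords(text, 10);

// remove ~10 characters from the start instead
const trimmedStart = truncateTextAndRoundToWords(text, 10, undefined, true);

console.log({trimmedEnd, trimmedStart});
```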
* @returns - The truncated and rounded text */ -export function truncateTextAndRoundToWords(text: string, truncateStartIndex: number, maxRound: number = 6): string { - const res = text.slice(truncateStartIndex); +export function truncateTextAndRoundToWords( + text: string, truncateSize: number, maxRound: number = 6, truncateStart: boolean = false +): string { + if (truncateStart) { + const res = text.slice(truncateSize); - if (res.length === 0) - return res; + if (res.length === 0) + return res; - if (truncateStartIndex === 0 || text[truncateStartIndex - 1] === " ") - return res; + if (truncateSize === 0 || text[truncateSize - 1] === " ") + return res; - const nextSpaceIndex = res.indexOf(" "); + const nextSpaceIndex = res.indexOf(" "); - if (nextSpaceIndex < 0) { - if (res.length <= maxRound || res.length < truncatePrefix.length) + if (nextSpaceIndex < 0) { + if (res.length <= maxRound || res.length < truncatePrefix.length) + return ""; + + return truncatePrefix + res.slice(truncatePrefix.length); + } + + if (nextSpaceIndex <= maxRound) + return res.slice(nextSpaceIndex + 1); + + if (res.length < truncatePrefix.length) return ""; return truncatePrefix + res.slice(truncatePrefix.length); - } + } else { + const res = text.slice(0, -truncateSize); - if (nextSpaceIndex <= maxRound) - return res.slice(nextSpaceIndex + 1); + if (res.length === 0) + return res; - if (res.length < truncatePrefix.length) - return ""; + if (truncateSize === 0 || (text.length === res.length || text[res.length] === " ")) + return res; - return truncatePrefix + res.slice(truncatePrefix.length); -} + const nextSpaceIndex = res.lastIndexOf(" "); -export function truncateLlamaTextAndRoundToWords(llamaText: LlamaText, truncateStartIndex: number, maxRound: number = 6): LlamaText { - if (truncateStartIndex <= 0) - return llamaText; + if (nextSpaceIndex < 0) { + if (res.length <= maxRound || res.length < truncatePrefix.length) + return ""; - for (let i = 0; i < llamaText.values.length; i++) { - const value = llamaText.values[i]; + return res.slice(truncatePrefix.length) + truncatePrefix; + } - if (value == null) - continue; + if (nextSpaceIndex <= maxRound) + return res.slice(0, nextSpaceIndex); - if (typeof value === "string") { - if (value.length > truncateStartIndex) { - return LlamaText([ - truncateTextAndRoundToWords(value, truncateStartIndex, maxRound), - ...llamaText.values.slice(i + 1) - ]); - } + if (res.length < truncatePrefix.length) + return ""; - truncateStartIndex -= value.length; - } else if (value instanceof SpecialToken) { - truncateStartIndex--; - if (truncateStartIndex <= 0) - return LlamaText(llamaText.values.slice(i + 1)); - } else { - void (value satisfies SpecialTokensText); + return res.slice(truncatePrefix.length) + truncatePrefix; + } +} - // SpecialTokensText shouldn't be truncated - if (value.value.length > truncateStartIndex) - return LlamaText(llamaText.values.slice(i + 1)); +export function truncateLlamaTextAndRoundToWords( + llamaText: LlamaText, truncateSize: number, maxRound: number = 6, truncateStart: boolean = false +): LlamaText { + if (truncateSize <= 0) + return llamaText; - truncateStartIndex -= value.value.length; + if (truncateStart) { + for (let i = 0; i < llamaText.values.length; i++) { + const value = llamaText.values[i]; + + if (value == null) + continue; + + if (typeof value === "string") { + if (value.length > truncateSize) { + return LlamaText([ + truncateTextAndRoundToWords(value, truncateSize, maxRound, true), + ...llamaText.values.slice(i + 1) + ]); + } + + truncateSize -= 
value.length; + } else if (value instanceof SpecialToken) { + truncateSize--; + if (truncateSize <= 0) + return LlamaText(llamaText.values.slice(i + 1)); + } else { + void (value satisfies SpecialTokensText); + + // SpecialTokensText shouldn't be truncated + if (value.value.length > truncateSize) + return LlamaText(llamaText.values.slice(i + 1)); + + truncateSize -= value.value.length; + } + } + } else { + for (let i = llamaText.values.length - 1; i >= 0; i--) { + const value = llamaText.values[i]; + + if (value == null) + continue; + + if (typeof value === "string") { + if (value.length > truncateSize) { + return LlamaText([ + ...llamaText.values.slice(0, i), + truncateTextAndRoundToWords(value, truncateSize, maxRound, false) + ]); + } + + truncateSize -= value.length; + } else if (value instanceof SpecialToken) { + truncateSize--; + if (truncateSize <= 0) + return LlamaText(llamaText.values.slice(0, i)); + } else { + void (value satisfies SpecialTokensText); + + // SpecialTokensText shouldn't be truncated + if (value.value.length > truncateSize) + return LlamaText(llamaText.values.slice(0, i)); + + truncateSize -= value.value.length; + } } } diff --git a/test/modelDependent/codegemma/completion.test.ts b/test/modelDependent/codegemma/completion.test.ts index 67deed2d..bed1afc2 100644 --- a/test/modelDependent/codegemma/completion.test.ts +++ b/test/modelDependent/codegemma/completion.test.ts @@ -19,13 +19,13 @@ describe("CodeGemma", () => { contextSequence: context.getSequence() }); - const res = await completion.generateCompletion("Here is a list of sweet fruits:\n* ", { - maxTokens: 10 + const res = await completion.generateCompletion("Sweet fruit names:\n* ", { + maxTokens: 10, + seed: 30 }); expect(res).toMatchInlineSnapshot(` - "🍎 - * 🍊 - * 🍋 + "1. Apple + * 2. 
Banana " `); }); diff --git a/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts b/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts index 88cfa250..4d5ebb5c 100644 --- a/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts +++ b/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts @@ -1,4 +1,4 @@ -import {describe, expect, it} from "vitest"; +import {describe, expect, it, test} from "vitest"; import {getModelFile} from "../../utils/modelFiles.js"; import {getTestLlama} from "../../utils/getTestLlama.js"; import {LlamaModelOptions, readGgufFileInfo} from "../../../src/index.js"; @@ -18,9 +18,15 @@ describe("functionary", () => { const s1GB = Math.pow(1024, 3); async function resolveGpuLayers(gpuLayers: LlamaModelOptions["gpuLayers"], { - totalVram, freeVram, ignoreMemorySafetyChecks = false, llamaGpu = "metal" + totalVram, freeVram, unifiedMemorySize = 0, + totalRam = 0, freeRam = 0, + totalSwap = 0, freeSwap = 0, + ignoreMemorySafetyChecks = false, llamaGpu = "metal" }: { - totalVram: number, freeVram: number, ignoreMemorySafetyChecks?: boolean, llamaGpu?: BuildGpu + totalVram: number, freeVram: number, unifiedMemorySize?: number, + totalRam?: number, freeRam?: number, + totalSwap?: number, freeSwap?: number, + ignoreMemorySafetyChecks?: boolean, llamaGpu?: BuildGpu }) { const resolvedGpuLayers = await ggufInsights.configurationResolver.resolveModelGpuLayers(gpuLayers, { ignoreMemorySafetyChecks, @@ -34,27 +40,31 @@ describe("functionary", () => { }); async function resolveAutoContextSize() { - const modelVram = ggufInsights.estimateModelResourceRequirements({ - gpuLayers: resolvedGpuLayers - }).gpuVram; + const resolvedConfig = await ggufInsights.configurationResolver.resolveAndScoreConfig({ + targetGpuLayers: resolvedGpuLayers + }, { + llamaGpu, + getVramState: async () => ({ + total: llamaGpu === false ? 0 : totalVram, + free: llamaGpu === false ? 0 : freeVram, + unifiedSize: unifiedMemorySize + }), + getRamState: async () => ({ + total: totalRam, + free: freeRam + }), + getSwapState: async () => ({ + total: totalSwap, + free: freeSwap + }), + llamaSupportsGpuOffloading: llamaGpu !== false, + llamaVramPaddingSize: defaultLlamaVramPadding(llamaGpu === false ? 0 : totalVram) + }); - try { - return await ggufInsights.configurationResolver.resolveContextContextSize("auto", { - batchSize: undefined, - sequences: 1, - modelGpuLayers: resolvedGpuLayers, - modelTrainContextSize: ggufInsights.trainContextSize ?? 4096, - getVramState: async () => ({ - total: llamaGpu === false ? 0 : totalVram, - free: llamaGpu === false ? 
0 : (freeVram - modelVram) - }), - llamaGpu, - ignoreMemorySafetyChecks: false, - isEmbeddingContext: false - }); - } catch (err) { + if (resolvedConfig.compatibilityScore === 0) return null; - } + + return resolvedConfig.resolvedValues.contextSize; } return { @@ -63,403 +73,792 @@ describe("functionary", () => { }; } - it("attempts to resolve 0 gpuLayers", async () => { - { - const res = await resolveGpuLayers(0, { - totalVram: s1GB * 6, - freeVram: s1GB * 1 - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers(0, { - totalVram: s1GB * 6, - freeVram: s1GB * 0 - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } + describe("attempts to resolve 0 gpuLayers", () => { + test("no RAM", async () => { + { + const res = await resolveGpuLayers(0, { + totalVram: s1GB * 6, + freeVram: s1GB * 1 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + { + const res = await resolveGpuLayers(0, { + totalVram: s1GB * 6, + freeVram: s1GB * 0 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } - { - const res = await resolveGpuLayers(0, { - totalVram: 0, - freeVram: 0, - llamaGpu: false - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } + { + const res = await resolveGpuLayers(0, { + totalVram: 0, + freeVram: 0, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + }); + + test("some RAM", async () => { + { + const res = await resolveGpuLayers(0, { + totalVram: s1GB * 6, + freeVram: s1GB * 1, + totalRam: s1GB * 6, + freeRam: s1GB * 6 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + { + const res = await resolveGpuLayers(0, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 6, + freeRam: s1GB * 0 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + + { + const res = await resolveGpuLayers(0, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 0, + freeRam: s1GB * 0, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + }); + + test("with swap", async () => { + { + const res = await resolveGpuLayers(0, { + totalVram: s1GB * 6, + freeVram: s1GB * 1, + totalRam: s1GB * 6, + freeRam: s1GB * 5, + totalSwap: s1GB * 6, + freeSwap: s1GB * 1 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + } + { + const res = await resolveGpuLayers(0, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 6, + freeRam: s1GB * 4, + totalSwap: s1GB * 6, + freeSwap: s1GB * 1 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + } + + { + const res = await resolveGpuLayers(0, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 0, + freeRam: s1GB * 0, + totalSwap: s1GB * 0, + freeSwap: s1GB * 0, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + }); }); - it("attempts to resolve 16 gpuLayers", async () => { - { - const res = await resolveGpuLayers(16, { - totalVram: s1GB * 6, - freeVram: s1GB * 3 - }); - expect(res.gpuLayers).to.eql(16); - 
expect(res.contextSize).to.toMatchInlineSnapshot("1924"); - } - try { - await resolveGpuLayers(16, { - totalVram: s1GB * 6, - freeVram: s1GB * 0 - }); - expect.unreachable("Should have thrown an error"); - } catch (err) { - expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); - } - try { - await resolveGpuLayers(16, { - totalVram: s1GB * 6, - freeVram: s1GB * 0.2 - }); - expect.unreachable("Should have thrown an error"); - } catch (err) { - expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); - } - { - const res = await resolveGpuLayers(16, { - totalVram: s1GB * 6, + describe("attempts to resolve 16 gpuLayers", () => { + test("no RAM", async () => { + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 3 + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + try { + await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 0 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + try { + await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, - // play with this number to make the test pass, it should be low enough so that there won't be any VRAM left - // to create a context - freeVram: s1GB * 0.2, + // play with this number to make the test pass, it should be low enough so that there won't be any VRAM left + // to create a context + freeVram: s1GB * 0.2, - ignoreMemorySafetyChecks: true - }); - expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.eql(null); - } + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.eql(null); + } - { - const res = await resolveGpuLayers(16, { - totalVram: 0, - freeVram: 0, - llamaGpu: false - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers(16, { - totalVram: 0, - freeVram: 0, - llamaGpu: false, - ignoreMemorySafetyChecks: true - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - }); + { + const res = await resolveGpuLayers(16, { + totalVram: 0, + freeVram: 0, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + { + const res = await resolveGpuLayers(16, { + totalVram: 0, + freeVram: 0, + llamaGpu: false, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + }); - it("attempts to resolve 32 gpuLayers", async () => { - { - const res = await resolveGpuLayers(32, { - totalVram: s1GB * 6, - freeVram: s1GB * 6 - }); - expect(res.gpuLayers).to.eql(32); - expect(res.contextSize).to.toMatchInlineSnapshot("7562"); - } - try { - await resolveGpuLayers(32, { - totalVram: s1GB * 6, - freeVram: s1GB * 0.2 - }); - expect.unreachable("Should have thrown an error"); - } catch (err) { - expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); - } 
- { - const res = await resolveGpuLayers(32, { - totalVram: s1GB * 6, - freeVram: s1GB * 0, - ignoreMemorySafetyChecks: true - }); - expect(res.gpuLayers).to.eql(32); - expect(res.contextSize).to.toMatchInlineSnapshot("null"); - } + test("some RAM", async () => { + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 3, + totalRam: s1GB * 3, + freeRam: s1GB * 2 + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.toMatchInlineSnapshot("1924"); + } + try { + await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 3, + freeRam: s1GB * 2 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + try { + await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2, + totalRam: s1GB * 3, + freeRam: s1GB * 2 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, - { - const res = await resolveGpuLayers(32, { - totalVram: 0, - freeVram: 0, - llamaGpu: false - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers(32, { - totalVram: 0, - freeVram: 0, - llamaGpu: false, - ignoreMemorySafetyChecks: true - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - }); + // play with this number to make the test pass, it should be low enough so that there won't be any VRAM left + // to create a context + freeVram: s1GB * 0.2, - it("attempts to resolve 33 gpuLayers", async () => { - { - const res = await resolveGpuLayers(33, { - totalVram: s1GB * 6, - freeVram: s1GB * 6 - }); - expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("7562"); - } - try { - await resolveGpuLayers(33, { - totalVram: s1GB * 6, - freeVram: s1GB * 0.2 - }); - expect.unreachable("Should have thrown an error"); - } catch (err) { - expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); - } - { - const res = await resolveGpuLayers(33, { - totalVram: s1GB * 6, - freeVram: s1GB * 0.2, - ignoreMemorySafetyChecks: true - }); - expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("null"); - } + totalRam: s1GB * 3, + freeRam: s1GB * 2, - { - const res = await resolveGpuLayers(33, { - totalVram: 0, - freeVram: 0, - llamaGpu: false - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers(33, { - totalVram: 0, - freeVram: 0, - llamaGpu: false, - ignoreMemorySafetyChecks: true - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.eql(null); + } + + + { + const res = await resolveGpuLayers(16, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 7, + freeRam: s1GB * 7, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers(16, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 7, + freeRam: s1GB * 6, + llamaGpu: 
false, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + }); + + test("some unified RAM", async () => { + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: s1GB * 6, + freeRam: s1GB * 6, + unifiedMemorySize: s1GB * 6 + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.toMatchInlineSnapshot("7411"); + } + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: s1GB * 6, + freeRam: s1GB * 5, + unifiedMemorySize: s1GB * 6 + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.toMatchInlineSnapshot("2168"); + } + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: s1GB * 6, + freeRam: s1GB * 5, + unifiedMemorySize: s1GB * 5 + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.toMatchInlineSnapshot("7411"); + } + try { + await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 3, + freeRam: s1GB * 2, + unifiedMemorySize: s1GB * 6 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + try { + await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2, + totalRam: s1GB * 3, + freeRam: s1GB * 2, + unifiedMemorySize: s1GB * 6 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, + + // play with this number to make the test pass, it should be low enough so that there won't be any VRAM left + // to create a context + freeVram: s1GB * 0.2, + + totalRam: s1GB * 3, + freeRam: s1GB * 2, + unifiedMemorySize: s1GB * 6, + + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.eql(null); + } + + + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 5.4, + unifiedMemorySize: s1GB * 6, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("4352"); + } + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 5, + unifiedMemorySize: s1GB * 6, + llamaGpu: false, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + } + }); + + test("with swap", async () => { + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 3, + totalRam: s1GB * 3, + freeRam: s1GB * 2, + totalSwap: s1GB * 6, + freeSwap: s1GB * 3 + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.toMatchInlineSnapshot("1924"); + } + try { + await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 3, + freeRam: s1GB * 2, + totalSwap: s1GB * 6, + freeSwap: s1GB * 3 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + try { + await resolveGpuLayers(16, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2, + totalRam: s1GB * 3, + freeRam: s1GB * 
2, + totalSwap: s1GB * 6, + freeSwap: s1GB * 3 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + { + const res = await resolveGpuLayers(16, { + totalVram: s1GB * 6, + + // play with this number to make the test pass, it should be low enough so that there won't be any VRAM left + // to create a context + freeVram: s1GB * 0.2, + + totalRam: s1GB * 3, + freeRam: s1GB * 2, + totalSwap: s1GB * 6, + freeSwap: s1GB * 3, + + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.eql(null); + } + + + { + const res = await resolveGpuLayers(16, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 3, + totalSwap: s1GB * 6, + freeSwap: s1GB * 3, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + } + { + const res = await resolveGpuLayers(16, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 0, + totalSwap: s1GB * 6, + freeSwap: s1GB * 3, + llamaGpu: false, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + }); }); - it('attempts to resolve "max"', async () => { - try { - await resolveGpuLayers("max", { - totalVram: s1GB * 6, - freeVram: s1GB * 0 - }); - expect.unreachable("Should have thrown an error"); - } catch (err) { - expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); - } + describe("attempts to resolve 32 gpuLayers", () => { + it("no RAM", async () => { + { + const res = await resolveGpuLayers(32, { + totalVram: s1GB * 6, + freeVram: s1GB * 6 + }); + expect(res.gpuLayers).to.eql(32); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + try { + await resolveGpuLayers(32, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + { + const res = await resolveGpuLayers(32, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(32); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } - try { - await resolveGpuLayers("max", { - totalVram: s1GB * 6, - freeVram: s1GB * 0.2 - }); - expect.unreachable("Should have thrown an error"); - } catch (err) { - expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); - } + { + const res = await resolveGpuLayers(32, { + totalVram: 0, + freeVram: 0, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + { + const res = await resolveGpuLayers(32, { + totalVram: 0, + freeVram: 0, + llamaGpu: false, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + }); - try { - await resolveGpuLayers("max", { - totalVram: s1GB * 6, - freeVram: s1GB * 3.2 - }); - expect.unreachable("Should have thrown an error"); - } catch (err) { - expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); - } + it("some RAM", async () => { + { + const res = await resolveGpuLayers(32, { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: 
s1GB * 6, + freeRam: s1GB * 6 + }); + expect(res.gpuLayers).to.eql(32); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + try { + await resolveGpuLayers(32, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2, + totalRam: s1GB * 6, + freeRam: s1GB * 0.2 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + { + const res = await resolveGpuLayers(32, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 6, + freeRam: s1GB * 0, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(32); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } - { - const res = await resolveGpuLayers("max", { - totalVram: s1GB * 6, - freeVram: s1GB * 1.2, - ignoreMemorySafetyChecks: true - }); - expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("null"); - }{ - const res = await resolveGpuLayers("max", { - totalVram: s1GB * 6, - freeVram: s1GB * 4.7 - }); - expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("607"); - } - { - const res = await resolveGpuLayers("max", { - totalVram: s1GB * 6, - freeVram: s1GB * 4.8 - }); - expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("1142"); - } + { + const res = await resolveGpuLayers(32, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 5, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + }{ + const res = await resolveGpuLayers(32, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 4.8, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("1143"); + } + { + const res = await resolveGpuLayers(32, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 4, + llamaGpu: false, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + }); }); - it('attempts to resolve "auto"', async () => { - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: 0 - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 0.4 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 1.4 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("1"); - expect(res.contextSize).to.toMatchInlineSnapshot("5192"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 1.8 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("5164"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 2.4 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("6"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 3.1 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("11"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers("auto", { - 
totalVram: s1GB * 6, - freeVram: s1GB * 3.3 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("12"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 3.5 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("14"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 3.8 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 4 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 4.3 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 4.5 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); - expect(res.contextSize).to.toMatchInlineSnapshot("8076"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 4.8 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("23"); - expect(res.contextSize).to.toMatchInlineSnapshot("8140"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 5.2 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("3282"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 5.8 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("6492"); - } - { - const res = await resolveGpuLayers("auto", { - totalVram: s1GB * 6, - freeVram: s1GB * 6 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7562"); - } + describe("attempts to resolve 33 gpuLayers", () => { + test("no RAM", async () => { + { + const res = await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 6 + }); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + try { + await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + + { + const res = await resolveGpuLayers(33, { + totalVram: 0, + freeVram: 0, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: 0, + freeVram: 0, + llamaGpu: false, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + }); + + test("some RAM", async () => { + { + const res = await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: s1GB * 
6, + freeRam: s1GB * 6 + }); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: s1GB * 6, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: s1GB * 6, + freeRam: s1GB * 4 + }); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + try { + await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2, + totalRam: s1GB * 6, + freeRam: s1GB * 6 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2, + totalRam: s1GB * 6, + freeRam: s1GB * 6, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + + { + const res = await resolveGpuLayers(33, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 4, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 4.8, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("1143"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 5, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 6, + llamaGpu: false + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: 0, + freeVram: 0, + totalRam: s1GB * 6, + freeRam: s1GB * 6, + llamaGpu: false, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + }); + + test("some unified RAM", async () => { + { + const res = await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: s1GB * 6, + freeRam: s1GB * 6, + unifiedMemorySize: s1GB * 6 + }); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: s1GB * 6, + freeRam: s1GB * 5.4, + unifiedMemorySize: s1GB * 6 + }); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("4352"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: s1GB * 6, + freeRam: s1GB * 4.8, + unifiedMemorySize: s1GB * 6 + }); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("1142"); + } + try { + await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2, + totalRam: s1GB * 6, + freeRam: s1GB * 6, + unifiedMemorySize: s1GB * 6 + }); + expect.unreachable("Should have thrown an error"); + } 
catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + { + const res = await resolveGpuLayers(33, { + totalVram: s1GB * 6, + freeVram: s1GB * 0.2, + totalRam: s1GB * 6, + freeRam: s1GB * 6, + unifiedMemorySize: s1GB * 6, + ignoreMemorySafetyChecks: true + }); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); + } + }); }); - it("attempts to resolve {min?: number, max?: number}", async () => { - { - const res = await resolveGpuLayers({max: 4}, { - totalVram: s1GB * 6, - freeVram: s1GB * 0 - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } - { - const res = await resolveGpuLayers({min: 0, max: 4}, { - totalVram: s1GB * 6, - freeVram: s1GB * 0 - }); - expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } + it('attempts to resolve "max"', async () => { try { - await resolveGpuLayers({min: 2}, { + await resolveGpuLayers("max", { totalVram: s1GB * 6, freeVram: s1GB * 0 }); @@ -467,137 +866,766 @@ describe("functionary", () => { } catch (err) { expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); } + try { - await resolveGpuLayers({min: 2, max: 4}, { + await resolveGpuLayers("max", { totalVram: s1GB * 6, - freeVram: s1GB * 0 + freeVram: s1GB * 0.2 }); expect.unreachable("Should have thrown an error"); } catch (err) { expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); } - { - const res = await resolveGpuLayers({max: 16}, { - totalVram: s1GB * 6, - freeVram: s1GB * 3.8 - }); - expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - } try { - await resolveGpuLayers({min: 16}, { + await resolveGpuLayers("max", { totalVram: s1GB * 6, - freeVram: s1GB * 2 + freeVram: s1GB * 3.2 }); expect.unreachable("Should have thrown an error"); } catch (err) { expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); } + { - const res = await resolveGpuLayers({min: 16}, { + const res = await resolveGpuLayers("max", { totalVram: s1GB * 6, - freeVram: s1GB * 4 + freeVram: s1GB * 1.2, + ignoreMemorySafetyChecks: true }); - expect(res.gpuLayers).to.be.gte(16); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); } { - const res = await resolveGpuLayers({min: 16, max: 24}, { + const res = await resolveGpuLayers("max", { totalVram: s1GB * 6, - freeVram: s1GB * 4 + freeVram: s1GB * 4.7 }); - expect(res.gpuLayers).to.be.gte(16); - expect(res.gpuLayers).to.be.lte(24); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("607"); } { - const res = await resolveGpuLayers({min: 16, max: 24}, { + const res = await resolveGpuLayers("max", { totalVram: s1GB * 6, - freeVram: s1GB * 3 + freeVram: s1GB * 4.8 }); - expect(res.gpuLayers).to.be.gte(16); - expect(res.gpuLayers).to.be.lte(24); - expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("1924"); + expect(res.gpuLayers).to.eql(33); + expect(res.contextSize).to.toMatchInlineSnapshot("1142"); } }); - it("attempts to 
resolve {fitContext?: {contextSize?: number}}", async () => { - { - const contextSize = 4096; - const res = await resolveGpuLayers({fitContext: {contextSize}}, { - totalVram: 0, - freeVram: 0, - llamaGpu: false - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - expect(res.contextSize).to.be.gte(contextSize); - } - { - const contextSize = 4096; - const res = await resolveGpuLayers({fitContext: {contextSize}}, { - totalVram: s1GB * 6, - freeVram: s1GB * 4 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); - expect(res.contextSize).to.toMatchInlineSnapshot("5561"); - expect(res.contextSize).to.be.gte(contextSize); - } - { - const contextSize = 4096; - const res = await resolveGpuLayers({fitContext: {contextSize}}, { - totalVram: s1GB * 2, - freeVram: s1GB * 1.8 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("5164"); - expect(res.contextSize).to.be.gte(contextSize); - } - { - const contextSize = 8192; - const res = await resolveGpuLayers({fitContext: {contextSize}}, { - totalVram: s1GB * 6, - freeVram: s1GB * 4 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - expect(res.contextSize).to.be.gte(contextSize); - } - { - const contextSize = 8192; - const res = await resolveGpuLayers({fitContext: {contextSize}}, { - totalVram: s1GB * 1, - freeVram: s1GB * 1.8 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("2"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - expect(res.contextSize).to.be.gte(contextSize); - } - { - const contextSize = 8192; - const res = await resolveGpuLayers({fitContext: {contextSize}}, { - totalVram: s1GB * 0, - freeVram: s1GB * 0 - }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); - expect(res.contextSize).to.be.gte(contextSize); - } - { + describe('attempts to resolve "auto"', () => { + test("8GB RAM", async () => { + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: 0, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 0.4, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 1.4, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("1"); + expect(res.contextSize).to.toMatchInlineSnapshot("5192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 1.8, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); + expect(res.contextSize).to.toMatchInlineSnapshot("5164"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 2.4, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("6"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 3.1, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + 
expect(res.gpuLayers).to.toMatchInlineSnapshot("11"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 3.3, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("12"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 3.5, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("14"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 3.8, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 4, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 4.3, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 4.5, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); + expect(res.contextSize).to.toMatchInlineSnapshot("8076"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 4.8, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("23"); + expect(res.contextSize).to.toMatchInlineSnapshot("8140"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 5.2, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); + expect(res.contextSize).to.toMatchInlineSnapshot("3282"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 5.8, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); + expect(res.contextSize).to.toMatchInlineSnapshot("6492"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + }); + + test("5GB RAM", async () => { + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: 0, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 0.4, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); + expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 1.4, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("1"); + 
expect(res.contextSize).to.toMatchInlineSnapshot("5192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 1.8, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); + expect(res.contextSize).to.toMatchInlineSnapshot("5164"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 2.4, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("6"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 3.1, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("11"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 3.3, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("12"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 3.5, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("14"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 3.8, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 4, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 4.3, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 4.5, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); + expect(res.contextSize).to.toMatchInlineSnapshot("8076"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 4.8, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("23"); + expect(res.contextSize).to.toMatchInlineSnapshot("8140"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 5.2, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); + expect(res.contextSize).to.toMatchInlineSnapshot("3282"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 5.8, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); + expect(res.contextSize).to.toMatchInlineSnapshot("6492"); + } + { + const res = await resolveGpuLayers("auto", { + totalVram: s1GB * 6, + freeVram: s1GB * 6, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); + expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + } + }); + }); + + describe("attempts to 
resolve {min?: number, max?: number}", () => { + test("8GB RAM", async () => { + { + const res = await resolveGpuLayers({max: 4}, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers({min: 0, max: 4}, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } try { - await resolveGpuLayers({min: 1, fitContext: {contextSize: 8192}}, { - totalVram: s1GB * 0.2, - freeVram: s1GB * 0 + await resolveGpuLayers({min: 2}, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 8, + freeRam: s1GB * 8 }); expect.unreachable("Should have thrown an error"); } catch (err) { expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); } - } + try { + await resolveGpuLayers({min: 2, max: 4}, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + + { + const res = await resolveGpuLayers({max: 16}, { + totalVram: s1GB * 6, + freeVram: s1GB * 3.8, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + try { + await resolveGpuLayers({min: 16}, { + totalVram: s1GB * 6, + freeVram: s1GB * 2, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + { + const res = await resolveGpuLayers({min: 16}, { + totalVram: s1GB * 6, + freeVram: s1GB * 4, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.be.gte(16); + expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers({min: 16, max: 24}, { + totalVram: s1GB * 6, + freeVram: s1GB * 4, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.be.gte(16); + expect(res.gpuLayers).to.be.lte(24); + expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers({min: 16, max: 24}, { + totalVram: s1GB * 6, + freeVram: s1GB * 3, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.be.gte(16); + expect(res.gpuLayers).to.be.lte(24); + expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); + expect(res.contextSize).to.toMatchInlineSnapshot("1924"); + } + }); + + test("5GB RAM", async () => { + { + const res = await resolveGpuLayers({max: 4}, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + } + { + const res = await resolveGpuLayers({min: 0, max: 4}, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.eql(0); + expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + } + try { + await resolveGpuLayers({min: 2}, { + totalVram: s1GB * 6, + freeVram: s1GB * 
0, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + try { + await resolveGpuLayers({min: 2, max: 4}, { + totalVram: s1GB * 6, + freeVram: s1GB * 0, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + + { + const res = await resolveGpuLayers({max: 16}, { + totalVram: s1GB * 6, + freeVram: s1GB * 3.8, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.eql(16); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + try { + await resolveGpuLayers({min: 16}, { + totalVram: s1GB * 6, + freeVram: s1GB * 2, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + { + const res = await resolveGpuLayers({min: 16}, { + totalVram: s1GB * 6, + freeVram: s1GB * 4, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.be.gte(16); + expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers({min: 16, max: 24}, { + totalVram: s1GB * 6, + freeVram: s1GB * 4, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.be.gte(16); + expect(res.gpuLayers).to.be.lte(24); + expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + } + { + const res = await resolveGpuLayers({min: 16, max: 24}, { + totalVram: s1GB * 6, + freeVram: s1GB * 3, + totalRam: s1GB * 5, + freeRam: s1GB * 5 + }); + expect(res.gpuLayers).to.be.gte(16); + expect(res.gpuLayers).to.be.lte(24); + expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); + expect(res.contextSize).to.toMatchInlineSnapshot("1924"); + } + }); + }); + + describe("attempts to resolve {fitContext?: {contextSize?: number}}", () => { + test("8GB RAM", async () => { + { + const contextSize = 4096; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: 0, + freeVram: 0, + llamaGpu: false, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + const contextSize = 4096; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: s1GB * 6, + freeVram: s1GB * 4, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); + expect(res.contextSize).to.toMatchInlineSnapshot("5561"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + const contextSize = 4096; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: s1GB * 2, + freeVram: s1GB * 1.8, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); + expect(res.contextSize).to.toMatchInlineSnapshot("5164"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + const contextSize = 8192; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: s1GB * 6, + freeVram: s1GB * 4, + totalRam: s1GB * 8, + freeRam: 
s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + const contextSize = 8192; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: s1GB * 2, + freeVram: s1GB * 1.9, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("2"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + const contextSize = 8192; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: s1GB * 0, + freeVram: s1GB * 0, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + try { + await resolveGpuLayers({min: 1, fitContext: {contextSize: 8192}}, { + totalVram: s1GB * 0.2, + freeVram: s1GB * 0, + totalRam: s1GB * 8, + freeRam: s1GB * 8 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + } + }); + + test("7GB RAM", async () => { + { + const contextSize = 4096; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: 0, + freeVram: 0, + llamaGpu: false, + totalRam: s1GB * 7, + freeRam: s1GB * 7 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + const contextSize = 4096; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: s1GB * 7, + freeVram: s1GB * 4, + totalRam: s1GB * 7, + freeRam: s1GB * 7 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); + expect(res.contextSize).to.toMatchInlineSnapshot("6548"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + const contextSize = 4096; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: s1GB * 2, + freeVram: s1GB * 1.8, + totalRam: s1GB * 7, + freeRam: s1GB * 7 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); + expect(res.contextSize).to.toMatchInlineSnapshot("5164"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + const contextSize = 8192; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: s1GB * 7, + freeVram: s1GB * 4, + totalRam: s1GB * 7, + freeRam: s1GB * 7 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + const contextSize = 8192; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: s1GB * 2, + freeVram: s1GB * 1.9, + totalRam: s1GB * 7, + freeRam: s1GB * 7 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("2"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + const contextSize = 8192; + const res = await resolveGpuLayers({fitContext: {contextSize}}, { + totalVram: s1GB * 0, + freeVram: s1GB * 0, + totalRam: s1GB * 7, + freeRam: s1GB * 7 + }); + expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.contextSize).to.be.gte(contextSize); + } + { + try { + await resolveGpuLayers({min: 1, 
fitContext: {contextSize: 8192}}, { + totalVram: s1GB * 0.2, + freeVram: s1GB * 0, + totalRam: s1GB * 7, + freeRam: s1GB * 7 + }); + expect.unreachable("Should have thrown an error"); + } catch (err) { + expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]"); + } + } + }); }); }); }); diff --git a/test/modelDependent/llama3/chatSession.test.ts b/test/modelDependent/llama3/chatSession.test.ts index e083dc57..8252da13 100644 --- a/test/modelDependent/llama3/chatSession.test.ts +++ b/test/modelDependent/llama3/chatSession.test.ts @@ -108,6 +108,80 @@ describe("llama 3", () => { expect(completion).to.eql(" it is."); }); + test("prompt longer than context size incurs context shift", {timeout: 1000 * 60 * 60 * 2}, async () => { + const contextSize = 128; + + const modelPath = await getModelFile("Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"); + const llama = await getTestLlama(); + + const model = await llama.loadModel({ + modelPath + }); + const context = await model.createContext({ + contextSize + }); + const chatSession = new LlamaChatSession({ + contextSequence: context.getSequence(), + systemPrompt: "You are a helpful, respectful and honest biologist. " + + "Always answer as helpfully as possible with extensive detail." + }); + const prompt = "Describe the appearance of a llama and explain what it is. " + + "Include as much detail as possible with detailed examples and explanations, including its physical appearance, " + + "habitat, diet, social structure, and any other relevant information. " + + "Do not assume any prior knowledge on the part of the reader, and always provide detailed explanations as you describe the animal. " + + "Remember to be as helpful and detailed as possible in your response and your role in great importance in educating the reader. " + + "Assume that the reader is a student who is eager to learn and is looking to you for guidance and information, and always provide the best possible information you can. " + + "Do not provide any false or misleading information, and always be honest and respectful in your responses."; + + const initialContextState = chatSession.chatWrapper.generateContextState({ + chatHistory: [...chatSession.getChatHistory(), { + type: "user", + text: prompt + }, { + type: "model", + response: [""] + }] + }); + const initialContextStateTokens = initialContextState.contextText.tokenize(model.tokenizer); + + expect(initialContextStateTokens.length).to.be.gt(contextSize); + + const res = await chatSession.prompt(prompt, {maxTokens: contextSize}); + expect(res.length).to.be.gte(20); + + // ensure there's no repetition of the first part + const firstPart = res.slice(0, 12); + const firstPartOccurrences = res.split(firstPart).length - 1; + expect(firstPartOccurrences).to.eql(1); + }); + + test("using response prefix", {timeout: 1000 * 60 * 60 * 2}, async () => { + const modelPath = await getModelFile("Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"); + const llama = await getTestLlama(); + + const model = await llama.loadModel({ + modelPath + }); + const context = await model.createContext({ + contextSize: 2048 + }); + const chatSession = new LlamaChatSession({ + contextSequence: context.getSequence() + }); + + expect(chatSession.chatWrapper).to.be.an.instanceof(Llama3ChatWrapper); + + const prompt = "Describe the appearance of a llama"; + const responsePrefix = "Of course! 
A llama is"; + const res = await chatSession.prompt(prompt, { + responsePrefix, + maxTokens: 10 + }); + + expect(res.startsWith(responsePrefix)).to.eql(true); + expect(res).toMatchInlineSnapshot('"Of course! A llama is a domesticated mammal that belongs to the camel"'); + }); + // disabled due to getting timeout in the CI due to taking too long test.skip("context shift works correctly", {timeout: 1000 * 60 * 60 * 2}, async () => { const contextSize = 2048; diff --git a/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts b/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts index a8ea5834..13539a84 100644 --- a/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts +++ b/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts @@ -18,9 +18,15 @@ describe("stableCode", () => { const s1GB = Math.pow(1024, 3); async function resolveGpuLayers(gpuLayers: LlamaModelOptions["gpuLayers"], { - totalVram, freeVram, ignoreMemorySafetyChecks = false, llamaGpu = "metal" + totalVram, freeVram, unifiedMemorySize = 0, + totalRam = s1GB * 10, freeRam = s1GB * 10, // TODO: update all tests to test different RAM sizes + totalSwap = 0, freeSwap = 0, + ignoreMemorySafetyChecks = false, llamaGpu = "metal" }: { - totalVram: number, freeVram: number, ignoreMemorySafetyChecks?: boolean, llamaGpu?: BuildGpu + totalVram: number, freeVram: number, unifiedMemorySize?: number, + totalRam?: number, freeRam?: number, + totalSwap?: number, freeSwap?: number, + ignoreMemorySafetyChecks?: boolean, llamaGpu?: BuildGpu }) { const resolvedGpuLayers = await ggufInsights.configurationResolver.resolveModelGpuLayers(gpuLayers, { ignoreMemorySafetyChecks, @@ -34,27 +40,31 @@ describe("stableCode", () => { }); async function resolveAutoContextSize() { - const modelVram = ggufInsights.estimateModelResourceRequirements({ - gpuLayers: resolvedGpuLayers - }).gpuVram; + const resolvedConfig = await ggufInsights.configurationResolver.resolveAndScoreConfig({ + targetGpuLayers: resolvedGpuLayers + }, { + llamaGpu, + getVramState: async () => ({ + total: llamaGpu === false ? 0 : totalVram, + free: llamaGpu === false ? 0 : freeVram, + unifiedSize: unifiedMemorySize + }), + getRamState: async () => ({ + total: totalRam, + free: freeRam + }), + getSwapState: async () => ({ + total: totalSwap, + free: freeSwap + }), + llamaSupportsGpuOffloading: llamaGpu !== false, + llamaVramPaddingSize: defaultLlamaVramPadding(llamaGpu === false ? 0 : totalVram) + }); - try { - return await ggufInsights.configurationResolver.resolveContextContextSize("auto", { - batchSize: undefined, - sequences: 1, - modelGpuLayers: resolvedGpuLayers, - modelTrainContextSize: ggufInsights.trainContextSize ?? 4096, - getVramState: async () => ({ - total: llamaGpu === false ? 0 : totalVram, - free: llamaGpu === false ? 
0 : (freeVram - modelVram) - }), - llamaGpu, - ignoreMemorySafetyChecks: false, - isEmbeddingContext: false - }); - } catch (err) { + if (resolvedConfig.compatibilityScore === 0) return null; - } + + return resolvedConfig.resolvedValues.contextSize; } return { @@ -130,7 +140,7 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.eql(null); + expect(res.contextSize).to.toMatchInlineSnapshot("133"); } @@ -180,7 +190,7 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(32); - expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.contextSize).to.toMatchInlineSnapshot("94"); } { @@ -229,7 +239,7 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.contextSize).to.toMatchInlineSnapshot("94"); } { @@ -291,7 +301,7 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.contextSize).to.toMatchInlineSnapshot("94"); }{ const res = await resolveGpuLayers("max", { totalVram: s1GB * 6,