From 6c4243fae2490190b1a98fc1a91ad6d99c177309 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Wed, 11 Dec 2024 19:16:25 +0200 Subject: [PATCH 01/73] feat(minor): dynamically load `llama.cpp` backends --- llama/CMakeLists.txt | 5 ++ llama/addon/addon.cpp | 14 ++-- src/bindings/AddonTypes.ts | 2 +- src/bindings/Llama.ts | 81 ++++++++++--------- src/bindings/getLlama.ts | 21 +++-- src/bindings/utils/compileLLamaCpp.ts | 9 ++- .../utils/resolveActualBindingBinaryPath.ts | 19 +++++ src/bindings/utils/testBindingBinary.ts | 2 +- 8 files changed, 98 insertions(+), 55 deletions(-) create mode 100644 src/bindings/utils/resolveActualBindingBinaryPath.ts diff --git a/llama/CMakeLists.txt b/llama/CMakeLists.txt index 08c7a86b..d6413202 100644 --- a/llama/CMakeLists.txt +++ b/llama/CMakeLists.txt @@ -24,6 +24,11 @@ execute_process(COMMAND node -p "require('node-addon-api').include.slice(1,-1)" set(LLAMA_BUILD_COMMON ON) +if (NOT MINGW) + set(GGML_BACKEND_DL ON) + set(BUILD_SHARED_LIBS ON) +endif() + if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") add_compile_options(-Wno-c++17-extensions) endif() diff --git a/llama/addon/addon.cpp b/llama/addon/addon.cpp index 7b014079..ed4b2fb9 100644 --- a/llama/addon/addon.cpp +++ b/llama/addon/addon.cpp @@ -152,16 +152,16 @@ class AddonBackendUnloadWorker : public Napi::AsyncWorker { }; Napi::Value addonLoadBackends(const Napi::CallbackInfo& info) { - const bool forceLoadLibraries = info.Length() == 0 - ? false - : info[0].IsBoolean() - ? info[0].As().Value() - : false; + const std::string forceLoadLibrariesSearchPath = info.Length() == 0 + ? "" + : info[0].IsString() + ? info[0].As().Utf8Value() + : ""; ggml_backend_reg_count(); - if (forceLoadLibraries) { - ggml_backend_load_all(); + if (forceLoadLibrariesSearchPath.length() > 0) { + ggml_backend_load_all_from_path(forceLoadLibrariesSearchPath.c_str()); } return info.Env().Undefined(); diff --git a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts index d62faf13..6303a711 100644 --- a/src/bindings/AddonTypes.ts +++ b/src/bindings/AddonTypes.ts @@ -76,7 +76,7 @@ export type BindingModule = { free: number }, init(): Promise, - loadBackends(forceLoadLibraries?: boolean): void, + loadBackends(forceLoadLibrariesSearchPath?: string): void, dispose(): Promise }; diff --git a/src/bindings/Llama.ts b/src/bindings/Llama.ts index e48143fe..34bba99b 100644 --- a/src/bindings/Llama.ts +++ b/src/bindings/Llama.ts @@ -1,4 +1,5 @@ import os from "os"; +import path from "path"; import chalk from "chalk"; import {DisposedError, EventRelay, withLock} from "lifecycle-utils"; import {getConsoleLogPrefix} from "../utils/getConsoleLogPrefix.js"; @@ -34,7 +35,7 @@ export class Llama { /** @internal */ public readonly _memoryLock = {}; /** @internal */ public readonly _consts: ReturnType; /** @internal */ public readonly _vramOrchestrator: MemoryOrchestrator; - /** @internal */ public readonly _vramPadding: MemoryReservation; + /** @internal */ public _vramPadding: MemoryReservation; /** @internal */ public readonly _ramOrchestrator: MemoryOrchestrator; /** @internal */ public readonly _ramPadding: MemoryReservation; /** @internal */ public readonly _swapOrchestrator: MemoryOrchestrator; @@ -65,10 +66,11 @@ export class Llama { public readonly onDispose = new EventRelay(); private constructor({ - bindings, logLevel, logger, buildType, cmakeOptions, llamaCppRelease, debug, gpu, maxThreads, vramOrchestrator, vramPadding, - ramOrchestrator, ramPadding, swapOrchestrator + bindings, bindingPath, logLevel, logger, buildType, cmakeOptions, llamaCppRelease, debug, buildGpu, maxThreads, vramOrchestrator, + vramPadding, ramOrchestrator, ramPadding, swapOrchestrator }: { bindings: BindingModule, + bindingPath: string, logLevel: LlamaLogLevel, logger: (level: LlamaLogLevel, message: string) => void, buildType: "localBuild" | "prebuilt", @@ -78,7 +80,7 @@ export class Llama { release: string }, debug: boolean, - gpu: BuildGpu, + buildGpu: BuildGpu, maxThreads?: number, vramOrchestrator: MemoryOrchestrator, vramPadding: MemoryReservation, @@ -86,14 +88,31 @@ export class Llama { ramPadding: MemoryReservation, swapOrchestrator: MemoryOrchestrator }) { + this._dispatchPendingLogMicrotask = this._dispatchPendingLogMicrotask.bind(this); + this._onAddonLog = this._onAddonLog.bind(this); + this._bindings = bindings; - this._gpu = gpu; + this._debug = debug; + this._logLevel = this._debug + ? LlamaLogLevel.debug + : (logLevel ?? LlamaLogLevel.debug); + + if (!this._debug) { + this._bindings.setLogger(this._onAddonLog); + this._bindings.setLoggerLogLevel(LlamaLogLevelToAddonLogLevel.get(this._logLevel) ?? defaultLogLevel); + } + + bindings.loadBackends(); + const loadedGpu = bindings.getGpuType(); + if (loadedGpu == null || (loadedGpu === false && buildGpu !== false)) + bindings.loadBackends(path.dirname(bindingPath)); + + this._gpu = bindings.getGpuType() ?? false; this._supportsGpuOffloading = bindings.getSupportsGpuOffloading(); this._supportsMmap = bindings.getSupportsMmap(); this._supportsMlock = bindings.getSupportsMlock(); this._mathCores = bindings.getMathCores(); this._consts = bindings.getConsts(); - this._debug = debug; this._vramOrchestrator = vramOrchestrator; this._vramPadding = vramPadding; this._ramOrchestrator = ramOrchestrator; @@ -106,10 +125,6 @@ export class Llama { : 0 ) ); - - this._logLevel = this._debug - ? LlamaLogLevel.debug - : (logLevel ?? LlamaLogLevel.debug); this._logger = logger; this._buildType = buildType; this._cmakeOptions = Object.freeze({...cmakeOptions}); @@ -118,21 +133,7 @@ export class Llama { release: llamaCppRelease.release }); - this._dispatchPendingLogMicrotask = this._dispatchPendingLogMicrotask.bind(this); - this._onAddonLog = this._onAddonLog.bind(this); - - if (!this._debug) { - this._bindings.setLogger(this._onAddonLog); - this._bindings.setLoggerLogLevel(LlamaLogLevelToAddonLogLevel.get(this._logLevel) ?? defaultLogLevel); - } - - this._bindings.loadBackends(); - const loadedGpu = bindings.getGpuType(); - if (loadedGpu == null || (loadedGpu === false && gpu !== false)) - this._bindings.loadBackends(true); - this._onExit = this._onExit.bind(this); - process.on("exit", this._onExit); } @@ -446,9 +447,11 @@ export class Llama { /** @internal */ public static async _create({ - bindings, buildType, buildMetadata, logLevel, logger, vramPadding, ramPadding, maxThreads, skipLlamaInit = false, debug + bindings, bindingPath, buildType, buildMetadata, logLevel, logger, vramPadding, ramPadding, maxThreads, skipLlamaInit = false, + debug }: { bindings: BindingModule, + bindingPath: string, buildType: "localBuild" | "prebuilt", buildMetadata: BuildMetadataFile, logLevel: LlamaLogLevel, @@ -459,7 +462,6 @@ export class Llama { skipLlamaInit?: boolean, debug: boolean }) { - const gpu = bindings.getGpuType() ?? false; const vramOrchestrator = new MemoryOrchestrator(() => { const {total, used, unifiedSize} = bindings.getGpuVramInfo(); @@ -497,14 +499,6 @@ export class Llama { }; }); - let resolvedVramPadding: MemoryReservation; - if (gpu === false || vramPadding === 0) - resolvedVramPadding = vramOrchestrator.reserveMemory(0); - else if (vramPadding instanceof Function) - resolvedVramPadding = vramOrchestrator.reserveMemory(vramPadding((await vramOrchestrator.getMemoryState()).total)); - else - resolvedVramPadding = vramOrchestrator.reserveMemory(vramPadding); - let resolvedRamPadding: MemoryReservation; if (ramPadding instanceof Function) resolvedRamPadding = ramOrchestrator.reserveMemory(ramPadding((await ramOrchestrator.getMemoryState()).total)); @@ -513,6 +507,7 @@ export class Llama { const llama = new Llama({ bindings, + bindingPath, buildType, cmakeOptions: buildMetadata.buildOptions.customCmakeOptions, llamaCppRelease: { @@ -522,15 +517,27 @@ export class Llama { logLevel, logger, debug, - gpu, + buildGpu: buildMetadata.buildOptions.gpu, vramOrchestrator, maxThreads, - vramPadding: resolvedVramPadding, + vramPadding: vramOrchestrator.reserveMemory(0), ramOrchestrator, ramPadding: resolvedRamPadding, swapOrchestrator }); + if (llama.gpu === false || vramPadding === 0) { + // do nothing since `llama._vramPadding` is already set to 0 + } else if (vramPadding instanceof Function) { + const currentVramPadding = llama._vramPadding; + llama._vramPadding = vramOrchestrator.reserveMemory(vramPadding((await vramOrchestrator.getMemoryState()).total)); + currentVramPadding.dispose(); + } else { + const currentVramPadding = llama._vramPadding; + llama._vramPadding = vramOrchestrator.reserveMemory(vramPadding); + currentVramPadding.dispose(); + } + if (!skipLlamaInit) await llama._init(); @@ -612,6 +619,8 @@ function getTransformedLogLevel(level: LlamaLogLevel, message: string): LlamaLog return LlamaLogLevel.log; else if (level === LlamaLogLevel.warn && message.startsWith("ggml_cuda_init: GGML_CUDA_FORCE_") && message.endsWith(" no")) return LlamaLogLevel.log; + else if (level === LlamaLogLevel.info && message.startsWith("load_backend: loaded ")) + return LlamaLogLevel.log; return level; } diff --git a/src/bindings/getLlama.ts b/src/bindings/getLlama.ts index dffeea50..c0dcffc6 100644 --- a/src/bindings/getLlama.ts +++ b/src/bindings/getLlama.ts @@ -30,6 +30,7 @@ import {getLinuxDistroInfo, isDistroAlpineLinux} from "./utils/getLinuxDistroInf import {testBindingBinary} from "./utils/testBindingBinary.js"; import {BinaryPlatformInfo, getPlatformInfo} from "./utils/getPlatformInfo.js"; import {hasBuildingFromSourceDependenciesInstalled} from "./utils/hasBuildingFromSourceDependenciesInstalled.js"; +import {resolveActualBindingBinaryPath} from "./utils/resolveActualBindingBinaryPath.js"; const require = createRequire(import.meta.url); @@ -297,11 +298,13 @@ export async function getLlama(options?: LlamaOptions | "lastBuild", lastBuildOp await waitForLockfileRelease({resourcePath: localBuildFolder}); if (localBuildBinPath != null) { try { - const binding = loadBindingModule(localBuildBinPath); + const resolvedBindingPath = await resolveActualBindingBinaryPath(localBuildBinPath); + const binding = loadBindingModule(resolvedBindingPath); const buildMetadata = await getLocalBuildBinaryBuildMetadata(lastBuildInfo.folderName); return await Llama._create({ bindings: binding, + bindingPath: resolvedBindingPath, buildType: "localBuild", buildMetadata, logger: lastBuildOptions?.logger ?? Llama.defaultConsoleLogger, @@ -585,15 +588,17 @@ async function loadExistingLlamaBinary({ platformInfo, buildMetadata }); + const resolvedBindingPath = await resolveActualBindingBinaryPath(localBuildBinPath); const binaryCompatible = shouldTestBinaryBeforeLoading - ? await testBindingBinary(localBuildBinPath, buildOptions.gpu) + ? await testBindingBinary(resolvedBindingPath, buildOptions.gpu) : true; if (binaryCompatible) { - const binding = loadBindingModule(localBuildBinPath); + const binding = loadBindingModule(resolvedBindingPath); return await Llama._create({ bindings: binding, + bindingPath: resolvedBindingPath, buildType: "localBuild", buildMetadata, logLevel, @@ -642,15 +647,17 @@ async function loadExistingLlamaBinary({ platformInfo, buildMetadata }); + const resolvedBindingPath = await resolveActualBindingBinaryPath(prebuiltBinDetails.binaryPath); const binaryCompatible = shouldTestBinaryBeforeLoading - ? await testBindingBinary(prebuiltBinDetails.binaryPath, buildOptions.gpu) + ? await testBindingBinary(resolvedBindingPath, buildOptions.gpu) : true; if (binaryCompatible) { - const binding = loadBindingModule(prebuiltBinDetails.binaryPath); + const binding = loadBindingModule(resolvedBindingPath); return await Llama._create({ bindings: binding, + bindingPath: resolvedBindingPath, buildType: "prebuilt", buildMetadata, logLevel, @@ -744,11 +751,13 @@ async function buildAndLoadLlamaBinary({ throw new Error("Failed to build llama.cpp"); } - const binding = loadBindingModule(localBuildBinPath); + const resolvedBindingPath = await resolveActualBindingBinaryPath(localBuildBinPath); + const binding = loadBindingModule(resolvedBindingPath); const buildMetadata = await getLocalBuildBinaryBuildMetadata(buildFolderName.withCustomCmakeOptions); return await Llama._create({ bindings: binding, + bindingPath: resolvedBindingPath, buildType: "localBuild", buildMetadata, logLevel, diff --git a/src/bindings/utils/compileLLamaCpp.ts b/src/bindings/utils/compileLLamaCpp.ts index ec9655b4..e898829a 100644 --- a/src/bindings/utils/compileLLamaCpp.ts +++ b/src/bindings/utils/compileLLamaCpp.ts @@ -103,11 +103,12 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions if (!cmakeCustomOptions.has("GGML_OPENMP")) cmakeCustomOptions.set("GGML_OPENMP", "OFF"); - if (!cmakeCustomOptions.has("GGML_AMX")) - cmakeCustomOptions.set("GGML_AMX", "OFF"); - - if (!cmakeCustomOptions.has("GGML_NATIVE") && buildOptions.platform !== "mac") + if (!cmakeCustomOptions.has("GGML_NATIVE") && !(buildOptions.platform === "mac" && buildOptions.arch === "arm64")) { cmakeCustomOptions.set("GGML_NATIVE", "OFF"); + + if (!cmakeCustomOptions.has("GGML_CPU_ALL_VARIANTS")) + cmakeCustomOptions.set("GGML_CPU_ALL_VARIANTS", "ON"); + } } await fs.remove(outDirectory); diff --git a/src/bindings/utils/resolveActualBindingBinaryPath.ts b/src/bindings/utils/resolveActualBindingBinaryPath.ts new file mode 100644 index 00000000..21656519 --- /dev/null +++ b/src/bindings/utils/resolveActualBindingBinaryPath.ts @@ -0,0 +1,19 @@ +import path from "path"; +import fs from "fs-extra"; +import {runningInElectron} from "../../utils/runtime.js"; + +export async function resolveActualBindingBinaryPath(binaryPath: string) { + const absolutePath = path.resolve(binaryPath); + if (!runningInElectron) + return absolutePath; + + const fixedAsarPath = absolutePath.replace(".asar" + path.sep, ".asar.unpacked" + path.sep); + try { + if (await fs.pathExists(fixedAsarPath)) + return fixedAsarPath; + + return absolutePath; + } catch (err) { + return absolutePath; + } +} diff --git a/src/bindings/utils/testBindingBinary.ts b/src/bindings/utils/testBindingBinary.ts index 43e47ebe..b85ac213 100644 --- a/src/bindings/utils/testBindingBinary.ts +++ b/src/bindings/utils/testBindingBinary.ts @@ -200,7 +200,7 @@ if (process.env.TEST_BINDING_CP === "true" && (process.parentPort != null || pro binding.loadBackends(); const loadedGpu = binding.getGpuType(); if (loadedGpu == null || (loadedGpu === false && message.gpu !== false)) - binding.loadBackends(true); + binding.loadBackends(path.dirname(path.resolve(message.bindingBinaryPath))); await binding.init(); binding.getGpuVramInfo(); From 32b7f9e2f7eda85219b12d3a08e4e4b92f3bd73e Mon Sep 17 00:00:00 2001 From: Gilad S Date: Wed, 11 Dec 2024 19:27:19 +0200 Subject: [PATCH 02/73] docs: remove Intel AMX trick, since it's being automatically used in the prebuilt binaries now --- docs/guide/tips-and-tricks.md | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/docs/guide/tips-and-tricks.md b/docs/guide/tips-and-tricks.md index 190741ff..bfdb7086 100644 --- a/docs/guide/tips-and-tricks.md +++ b/docs/guide/tips-and-tricks.md @@ -88,37 +88,3 @@ npx --no node-llama-cpp source download ``` Now, just use `node-llama-cpp` as you normally would. - -## Intel AMX {#intel-amx} -> Intel AMX (Advanced Matrix Extensions) is a dedicated hardware block found on Intel Xeon processors -> that helps optimize and accelerate matrix multiplication operations. -> -> It's available on the 4th Gen and newer Intel Xeon processors. - -Intel AMX can improve CPU inference performance [by 2x and up to even 14x](https://github.com/ggerganov/llama.cpp/pull/7707) faster inference times on supported CPUs (on specific conditions). - -If you're using a 4th Gen or newer Intel Xeon processor, -you might want to [build `llama.cpp` from source](./building-from-source.md) to utilize these hardware-specific optimizations available on your hardware. - -To do this, run this command inside your project on the machine you run your project on: -```shell -npx --no node-llama-cpp source download -``` - -Alternatively, you can force `node-llama-cpp` to not use its prebuilt binaries -and instead build from source when calling [`getLlama`](../api/functions/getLlama.md) for the first time on a Xeon CPU: - -```typescript -import os from "os"; -import {getLlama} from "node-llama-cpp"; - -const llama = await getLlama({ - usePrebuiltBinaries: !os.cpus().some((cpu) => ( - cpu.model.toLowerCase().includes("Xeon".toLowerCase()) - )) -}); -``` -::: info NOTE -Building from source can take some time (when using CUDA even up to an hour in extreme cases), -so ensure you dedicate some time for this as part of the deployment process. -::: From 6504b23a4b90e9c970b572f69558cc7292292df6 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Sat, 14 Dec 2024 15:56:36 +0200 Subject: [PATCH 03/73] docs: update custom cmake options --- docs/guide/cmakeOptions.data.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/guide/cmakeOptions.data.ts b/docs/guide/cmakeOptions.data.ts index 1c0263c2..906562d9 100644 --- a/docs/guide/cmakeOptions.data.ts +++ b/docs/guide/cmakeOptions.data.ts @@ -68,12 +68,16 @@ function parseCmakeOptions(cmakeListsTxt: string, optionFilter: ((key: string) = for (let i = 0; i < cmakeOptions.length; i++) { const option = cmakeOptions[i]!; - if (!optionFilter(option.key) || option.key === "GGML_LLAMAFILE" || option.key === "GGML_CURL" || option.key === "GGML_RPC") { + if (!optionFilter(option.key) || option.key === "GGML_LLAMAFILE" || option.key === "GGML_CURL" || option.key === "GGML_RPC" || + option.key === "GGML_WASM_SINGLE_FILE" || option.key === "BUILD_SHARED_LIBS" || option.key === "GGML_BACKEND_DL" + ) { cmakeOptions.splice(i, 1); i--; continue; } else if (option.key === "GGML_METAL" && option.defaultValue === "${GGML_METAL_DEFAULT}") option.defaultValue = htmlEscapeWithCodeMarkdown("`ON` on macOS on Apple Silicon, `OFF` otherwise"); + else if (option.key === "GGML_BLAS" && option.defaultValue === "${GGML_BLAS_DEFAULT}") + option.defaultValue = htmlEscapeWithCodeMarkdown("`ON` on macOS, `OFF` otherwise"); else if (option.key === "GGML_METAL_EMBED_LIBRARY" && option.defaultValue === "${GGML_METAL}") option.defaultValue = htmlEscapeWithCodeMarkdown("`ON` on macOS, `OFF` otherwise"); else if (option.defaultValue === "${GGML_STANDALONE}") { From 561f9eb53973638bdaad3b61c3a73892fadd1dca Mon Sep 17 00:00:00 2001 From: Gilad S Date: Sat, 14 Dec 2024 15:57:34 +0200 Subject: [PATCH 04/73] docs: parse custom cmake options nested under ifs --- .vitepress/utils/parseCmakeListsTxtOptions.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.vitepress/utils/parseCmakeListsTxtOptions.ts b/.vitepress/utils/parseCmakeListsTxtOptions.ts index 3244aae5..b16f09d4 100644 --- a/.vitepress/utils/parseCmakeListsTxtOptions.ts +++ b/.vitepress/utils/parseCmakeListsTxtOptions.ts @@ -1,5 +1,7 @@ const maxLinesSpan = 10; +const cmakeOptionRegex = + /^\s*option\([\s\t\n\r]*(?\S+)[\s\t\n\r]+"(?(?:\\"|[^"])*)"[\s\t\n\r]+(?\S+)[\s\t\n\r]*\)/; export function parseCmakeListsTxtOptions(cmakeListsTxtString: string) { const lines = cmakeListsTxtString.split("\n"); @@ -8,9 +10,7 @@ export function parseCmakeListsTxtOptions(cmakeListsTxtString: string) { const match = lines .slice(index, index + maxLinesSpan) .join("\n") - .match( - /^option\([\s\t\n\r]*(?\S+)[\s\t\n\r]+"(?(?:\\"|[^"])*)"[\s\t\n\r]+(?\S+)[\s\t\n\r]*\)/ - ); + .match(cmakeOptionRegex); if (match == null || match.groups == null || match?.index !== 0) return null; From 14897492586cffa79732cbf5f7e24fb910ac6e83 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Sat, 14 Dec 2024 15:59:43 +0200 Subject: [PATCH 05/73] docs: sitemap fixes --- .vitepress/config.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.vitepress/config.ts b/.vitepress/config.ts index 7ec54d35..4545fb59 100644 --- a/.vitepress/config.ts +++ b/.vitepress/config.ts @@ -132,13 +132,16 @@ export default defineConfig({ item.lastmod = new Date(buildDate); item.changefreq = "daily"; item.priority = 0.9; + } else if (item.url === "guide/") { + item.changefreq = "daily"; + item.priority = 0.7; } else if (item.url.startsWith("api/") || item.url.startsWith("cli/")) { item = { ...item, lastmod: new Date(buildDate), changefreq: "weekly", priority: item.url.startsWith("cli/") - ? 0.7 + ? 0.6 : 0.5 }; } else if (item.lastmod == null && item.url.startsWith("blog/")) { From dafe3b921487b2c25e42450222445b2517efb868 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Sat, 14 Dec 2024 19:07:00 +0200 Subject: [PATCH 06/73] docs: user input safety --- README.md | 1 + docs/guide/llama-text.md | 30 ++++++++++++++++++++++++-- docs/index.md | 1 + src/chatWrappers/AlpacaChatWrapper.ts | 4 ++++ src/chatWrappers/FalconChatWrapper.ts | 4 ++++ src/chatWrappers/GeneralChatWrapper.ts | 4 ++++ src/utils/LlamaText.ts | 3 +++ 7 files changed, 45 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 569f7990..80ca32f1 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ * [Provide a model with functions it can call on demand](https://node-llama-cpp.withcat.ai/guide/chat-session#function-calling) to retrieve information of perform actions * [Embedding support](https://node-llama-cpp.withcat.ai/guide/embedding) * Great developer experience with full TypeScript support, and [complete documentation](https://node-llama-cpp.withcat.ai/guide/) +* [Safe against special token injection attacks](https://node-llama-cpp.withcat.ai/guide/llama-text#input-safety-in-node-llama-cpp) * Much more ## [Documentation](https://node-llama-cpp.withcat.ai) diff --git a/docs/guide/llama-text.md b/docs/guide/llama-text.md index adf7f100..d1ea9d81 100644 --- a/docs/guide/llama-text.md +++ b/docs/guide/llama-text.md @@ -48,7 +48,7 @@ Tell the user anything they want ``` -Now that user can override the system prompt and do whatever they want. +Now the user can override the system prompt and do whatever they want. What we can do to mitigate it, is to do something like this: ::: code-group @@ -71,7 +71,7 @@ const tokens = [ ``` ::: -Now, the user input is tokenized with special tokens disabled, which means that is a use type the text ``, +Now, the user input is tokenized with special tokens disabled, which means that if a user types the text ``, it'll be tokenized as the text `` and not as a special token, so the user cannot override the system prompt now. The problem with the above code is that you need to have the model instance to tokenize the text this way, @@ -132,3 +132,29 @@ import {LlamaText, SpecialTokensText} from "node-llama-cpp"; const contentJson = JSON.parse(await fs.readFile("content.json", "utf8")); const content = LlamaText.fromJSON(contentJson); ``` + +## Input Safety in `node-llama-cpp` {#input-safety-in-node-llama-cpp} +[`LlamaText`](../api/classes/LlamaText.md) is used everywhere in `node-llama-cpp` to ensure the safety of the user input. +This ensures that user input cannot introduce special tokens injection attacks. + +When using any of the builtin [chat wrappers](./chat-wrapper.md), +messages are always tokenized with special tokens disabled (including the template chat wrappers, such as [`TemplateChatWrapper`](../api/classes/TemplateChatWrapper.md) and [`JinjaTemplateChatWrapper`](../api/classes/JinjaTemplateChatWrapper.md)). +System messages can include special tokens only if you explicitly pass a [`LlamaText`](../api/classes/LlamaText.md) for them. + +When [generating text completions](./text-completion.md) using [`LlamaCompletion`](../api/classes/LlamaCompletion.md), the input is always tokenized with special tokens disabled. +You can use special tokens in the input by explicitly using [`LlamaText`](../api/classes/LlamaText.md) or passing an array of tokens. + +::: info +The following chat wrappers don't use special tokens at all for the chat template, hence they are not safe against special token injection attacks: +* [`GeneralChatWrapper`](../api/classes/GeneralChatWrapper.md) +* [`AlpacaChatWrapper`](../api/classes/AlpacaChatWrapper.md) +* [`FalconChatWrapper`](../api/classes/FalconChatWrapper.md) +::: + +::: tip NOTE +Most models (such as Llama, Mistral, etc.) have special tokens marked correctly in their tokenizer, +so the user input tokenization will be safe when using such models. + +However, in rare cases, some models have special tokens marked incorrectly or don't have special tokens at all, +so safety cannot be guaranteed when using such models. +::: diff --git a/docs/index.md b/docs/index.md index 899cc407..8c92f29f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -95,6 +95,7 @@ npx -y node-llama-cpp inspect gpu * [TypeScript type-safety](./api/functions/getLlama.md) * [LoRA](./api/type-aliases/LlamaContextOptions.md#lora) * [Remote GGUF reader](./api/functions/readGgufFileInfo.md) +* [User input safety](./guide/llama-text.md#input-safety-in-node-llama-cpp)