diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 63c5a0e0..dd8ea895 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -97,8 +97,8 @@ jobs: - name: Install dependencies on Windows if: startsWith(matrix.config.os, 'windows') run: | - choco install cmake.install --version=3.31.1 - choco install cmake --version=3.31.1 + choco install cmake.install --version=4.2.1 + choco install cmake --version=4.2.1 choco install ninja - name: Install dependencies on Ubuntu (1) @@ -107,9 +107,9 @@ jobs: sudo apt-get update sudo apt-get install ninja-build libtbb-dev g++-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-arm-linux-gnueabihf gcc-arm-linux-gnueabihf - wget -c https://github.com/Kitware/CMake/releases/download/v3.31.7/cmake-3.31.7-linux-x86_64.tar.gz - sudo tar --strip-components=1 -C /usr/local -xzf cmake-3.31.7-linux-x86_64.tar.gz - rm -f ./cmake-3.31.7-linux-x86_64.tar.gz + wget -c https://github.com/Kitware/CMake/releases/download/v4.2.1/cmake-4.2.1-linux-x86_64.tar.gz + sudo tar --strip-components=1 -C /usr/local -xzf cmake-4.2.1-linux-x86_64.tar.gz + rm -f ./cmake-4.2.1-linux-x86_64.tar.gz which aarch64-linux-gnu-gcc which aarch64-linux-gnu-g++ @@ -125,31 +125,19 @@ jobs: sudo apt-get update sudo apt-get install ninja-build libtbb-dev - wget -c https://github.com/Kitware/CMake/releases/download/v3.31.7/cmake-3.31.7-linux-x86_64.tar.gz - sudo tar --strip-components=1 -C /usr/local -xzf cmake-3.31.7-linux-x86_64.tar.gz - rm -f ./cmake-3.31.7-linux-x86_64.tar.gz + wget -c https://github.com/Kitware/CMake/releases/download/v4.2.1/cmake-4.2.1-linux-x86_64.tar.gz + sudo tar --strip-components=1 -C /usr/local -xzf cmake-4.2.1-linux-x86_64.tar.gz + rm -f ./cmake-4.2.1-linux-x86_64.tar.gz cmake --version - - name: Install Cuda 13.0 on Windows (1) + - name: Install Cuda 13.1 on Windows (1) if: matrix.config.name == 'Windows (1)' - shell: bash - timeout-minutes: 60 - run: | - curl -Lo cuda_13.0.0_windows_network.exe https://developer.download.nvidia.com/compute/cuda/13.0.0/network_installers/cuda_13.0.0_windows_network.exe - - echo "Installing Cuda 13.0.0" - powershell -Command "Start-Process -FilePath cuda_13.0.0_windows_network.exe -ArgumentList '-s','-n' -Wait" - echo "Cuda installation finished" - - rm -f ./cuda_13.0.0_windows_network.exe - - echo "where cudart64_13.dll: $(where cudart64_13.dll)" - - echo "CUDA_PATH=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0" >> $GITHUB_ENV - echo "CUDA_PATH_V13_0=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0" >> $GITHUB_ENV - echo "CUDA_PATH_VX_Y=CUDA_PATH_V13_0" >> $GITHUB_ENV - echo "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0\\bin" >> $GITHUB_PATH + uses: Jimver/cuda-toolkit@v0.2.30 + with: + cuda: '13.1.0' + method: 'network' + use-local-cache: false - name: Install Cuda 12.4 on Windows (2) if: matrix.config.name == 'Windows (2)' @@ -160,11 +148,11 @@ jobs: sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]' use-local-cache: false - - name: Install Cuda 13.0 on Ubuntu (1) + - name: Install Cuda 13.1 on Ubuntu (1) if: matrix.config.name == 'Ubuntu (1)' - uses: Jimver/cuda-toolkit@v0.2.27 + uses: Jimver/cuda-toolkit@v0.2.30 with: - cuda: '13.0.0' + cuda: '13.1.0' method: 'network' - name: Install Cuda 12.4 on Ubuntu (2) diff --git a/docs/guide/CUDA.md b/docs/guide/CUDA.md index 81ed8a60..66ba2d14 100644 --- a/docs/guide/CUDA.md +++ b/docs/guide/CUDA.md @@ -9,14 +9,14 @@ description: CUDA support in node-llama-cpp and 
these are automatically used when CUDA is detected on your machine. To use `node-llama-cpp`'s CUDA support with your NVIDIA GPU, -make sure you have [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) 12.4 or higher installed on your machine. +make sure you have [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) 13.1 or higher installed on your machine. If the pre-built binaries don't work with your CUDA installation, `node-llama-cpp` will automatically download a release of `llama.cpp` and build it from source with CUDA support. Building from source with CUDA support is slow and can take up to an hour. -The pre-built binaries are compiled with CUDA Toolkit 12.4, -so any version of CUDA Toolkit that is 12.4 or higher should work with the pre-built binaries. +The pre-built binaries are compiled with CUDA Toolkits 12.4 and 13.1, +so any CUDA Toolkit 12 version that is 12.4 or higher, or any CUDA Toolkit 13 version that is 13.1 or higher, should work with the pre-built binaries. If you have an older version of CUDA Toolkit installed on your machine, consider updating it to avoid having to wait the long build time. @@ -42,7 +42,7 @@ You should see an output like this: If you see `CUDA used VRAM` in the output, it means that CUDA support is working on your machine. ## Prerequisites -* [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) 12.4 or higher +* [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) 13.1 or higher * [NVIDIA Drivers](https://www.nvidia.com/en-us/drivers/) * [`cmake-js` dependencies](https://github.com/cmake-js/cmake-js#:~:text=%5Bstring%5D-,Requirements%3A,-CMake) * [CMake](https://cmake.org/download/) 3.26 or higher (optional, recommended if you have build issues) @@ -83,21 +83,21 @@ To build `node-llama-cpp` with any of these options, set an environment variable To fix this issue you have to set the `CUDACXX` environment variable to the path of the `nvcc` compiler, and the `CUDA_PATH` environment variable to the path of the CUDA home directory that contains the `nvcc` compiler. -For example, if you have installed CUDA Toolkit 12.4, you have to run a command like this: +For example, if you have installed CUDA Toolkit 13.1, you have to run a command like this: ::: code-group ```shell [Linux] -export CUDACXX=/usr/local/cuda-12.4/bin/nvcc -export CUDA_PATH=/usr/local/cuda-12.4 +export CUDACXX=/usr/local/cuda-13.1/bin/nvcc +export CUDA_PATH=/usr/local/cuda-13.1 ``` ```cmd [Windows (cmd)] -set CUDACXX=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\nvcc.exe -set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4 +set CUDACXX=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin\nvcc.exe +set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1 ``` ```cmd [Windows (PowerShell)] -$env:CUDACXX="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\nvcc.exe" -$env:CUDA_PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" +$env:CUDACXX="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin\nvcc.exe" +$env:CUDA_PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" ``` ::: diff --git a/docs/guide/text-completion.md b/docs/guide/text-completion.md index e94f6be9..f6498cac 100644 --- a/docs/guide/text-completion.md +++ b/docs/guide/text-completion.md @@ -76,3 +76,51 @@ const res = await completion.generateInfillCompletion(prefix, suffix, { console.log("Fill: " + res); ``` > This example uses [CodeGemma](https://huggingface.co/bartowski/codegemma-2b-GGUF).
+ +## Stop Text Completion Generation {#stop-generation} +To stop the generation of an ongoing text completion without throwing an error (to get the partially generated text), +you can use the [`stopOnAbortSignal`](../api/type-aliases/LlamaCompletionGenerationOptions.md#stoponabortsignal) option +to configure what happens when the given [`signal`](../api/type-aliases/LlamaCompletionGenerationOptions.md#signal) is aborted. + +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import {getLlama, LlamaCompletion} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf") +}); +const context = await model.createContext(); +const completion = new LlamaCompletion({ + contextSequence: context.getSequence() +}); + +const abortController = new AbortController(); +const input = "Here is a list of sweet fruits:\n* "; +console.log("Input: " + input); + +let result = ""; + +process.stdout.write("Streamed completion: "); +const res = await completion.generateCompletion(input, { + maxTokens: 256, + + // stop the generation, instead of cancelling it + stopOnAbortSignal: true, + + signal: abortController.signal, + onTextChunk(chunk) { + result += chunk; + process.stdout.write(chunk); + + // max 10 lines + if (result.split("\n").length >= 10) + abortController.abort(); + } +}); +console.log(); +console.log("Completion: " + res); +``` diff --git a/llama/CMakeLists.txt b/llama/CMakeLists.txt index 0298ce1f..e379d6a5 100644 --- a/llama/CMakeLists.txt +++ b/llama/CMakeLists.txt @@ -84,13 +84,6 @@ else() set(NLC_GGML_NATIVE ON) endif() -if (GGML_CUDA AND NOT DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT NLC_GGML_NATIVE) - find_package(CUDAToolkit) - if (CUDAToolkit_FOUND AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "13.0") - set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real;90-real") - endif() -endif() - add_subdirectory("llama.cpp") include_directories("llama.cpp") include_directories("./llama.cpp/common") diff --git a/llama/addon/AddonModel.cpp b/llama/addon/AddonModel.cpp index d25d043c..d58f8731 100644 --- a/llama/addon/AddonModel.cpp +++ b/llama/addon/AddonModel.cpp @@ -252,6 +252,10 @@ AddonModel::AddonModel(const Napi::CallbackInfo& info) : Napi::ObjectWrap().Value(); } + if (options.Has("useDirectIo")) { + model_params.use_direct_io = options.Get("useDirectIo").As().Value(); + } + if (options.Has("useMlock")) { model_params.use_mlock = options.Get("useMlock").As().Value(); } diff --git a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts index 122c87cf..08e19e61 100644 --- a/src/bindings/AddonTypes.ts +++ b/src/bindings/AddonTypes.ts @@ -9,6 +9,7 @@ export type BindingModule = { gpuLayers?: number, vocabOnly?: boolean, useMmap?: boolean, + useDirectIo?: boolean, useMlock?: boolean, checkTensors?: boolean, onLoadProgress?(loadPercentage: number): void, diff --git a/src/cli/commands/ChatCommand.ts b/src/cli/commands/ChatCommand.ts index e32fa62a..3de7fcdb 100644 --- a/src/cli/commands/ChatCommand.ts +++ b/src/cli/commands/ChatCommand.ts @@ -72,6 +72,7 @@ type ChatCommand = { meter: boolean, timing: boolean, noMmap: boolean, + noDirectIo: boolean, printTimings: boolean }; @@ -329,6 +330,11 @@ export const ChatCommand: CommandModule = { default: false, description: "Disable mmap (memory-mapped file) usage" }) + .option("noDirectIo", { + type: "boolean", + default: false, + 
description: "Disable Direct I/O usage when available" + }) .option("printTimings", { alias: "pt", type: "boolean", @@ -342,7 +348,8 @@ export const ChatCommand: CommandModule = { noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory, - environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings + environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, + printTimings }) { try { await RunChat({ @@ -351,7 +358,7 @@ export const ChatCommand: CommandModule = { temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, - debug, numa, meter, timing, noMmap, printTimings + debug, numa, meter, timing, noMmap, noDirectIo, printTimings }); } catch (err) { await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing @@ -368,7 +375,7 @@ async function RunChat({ jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, - tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings + tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings }: ChatCommand) { if (contextSize === -1) contextSize = undefined; if (gpuLayers === -1) gpuLayers = undefined; @@ -395,6 +402,7 @@ async function RunChat({ }); const logBatchSize = batchSize != null; const useMmap = !noMmap && llama.supportsMmap; + const useDirectIo = !noDirectIo; const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, @@ -452,6 +460,7 @@ async function RunChat({ defaultContextFlashAttention: flashAttention, defaultContextSwaFullCache: swaFullCache, useMmap, + useDirectIo, ignoreMemorySafetyChecks: gpuLayers != null, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); @@ -486,6 +495,7 @@ async function RunChat({ defaultContextFlashAttention: flashAttention, defaultContextSwaFullCache: swaFullCache, useMmap, + useDirectIo, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); }, @@ -591,6 +601,7 @@ async function RunChat({ context, draftContext, useMmap, + useDirectIo, printBos: true, printEos: true, logBatchSize, diff --git a/src/cli/commands/CompleteCommand.ts b/src/cli/commands/CompleteCommand.ts index d4d12c7e..a434721b 100644 --- a/src/cli/commands/CompleteCommand.ts +++ b/src/cli/commands/CompleteCommand.ts @@ -54,6 +54,7 @@ type CompleteCommand = { meter: boolean, timing: boolean, noMmap: boolean, + noDirectIo: boolean, printTimings: boolean }; @@ -249,6 +250,11 @@ export const CompleteCommand: CommandModule = { default: false, description: "Disable mmap (memory-mapped file) usage" }) + .option("noDirectIo", { + type: "boolean", + default: false, + description: "Disable Direct I/O usage when available" + }) .option("printTimings", { alias: "pt", type: 
"boolean", @@ -261,14 +267,14 @@ export const CompleteCommand: CommandModule = { flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, - debug, numa, meter, timing, noMmap, printTimings + debug, numa, meter, timing, noMmap, noDirectIo, printTimings }) { try { await RunCompletion({ modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, - tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings + tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings }); } catch (err) { await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing @@ -283,7 +289,7 @@ async function RunCompletion({ modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, - tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, printTimings + tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, noDirectIo, printTimings }: CompleteCommand) { if (contextSize === -1) contextSize = undefined; if (gpuLayers === -1) gpuLayers = undefined; @@ -308,6 +314,7 @@ async function RunCompletion({ }); const logBatchSize = batchSize != null; const useMmap = !noMmap && llama.supportsMmap; + const useDirectIo = !noDirectIo; const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, @@ -358,6 +365,7 @@ async function RunCompletion({ defaultContextFlashAttention: flashAttention, defaultContextSwaFullCache: swaFullCache, useMmap, + useDirectIo, ignoreMemorySafetyChecks: gpuLayers != null, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); @@ -392,6 +400,7 @@ async function RunCompletion({ defaultContextFlashAttention: flashAttention, defaultContextSwaFullCache: swaFullCache, useMmap, + useDirectIo, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); }, @@ -470,6 +479,7 @@ async function RunCompletion({ context, draftContext, useMmap, + useDirectIo, minTitleLength: "Complete".length + 1, logBatchSize, tokenMeterEnabled: meter diff --git a/src/cli/commands/InfillCommand.ts b/src/cli/commands/InfillCommand.ts index 934057c3..ffbf5ecd 100644 --- a/src/cli/commands/InfillCommand.ts +++ b/src/cli/commands/InfillCommand.ts @@ -56,6 +56,7 @@ type InfillCommand = { meter: boolean, timing: boolean, noMmap: boolean, + noDirectIo: boolean, printTimings: boolean }; @@ -259,6 +260,11 @@ export const InfillCommand: CommandModule = { default: false, description: "Disable mmap (memory-mapped file) usage" }) + .option("noDirectIo", { + type: "boolean", + default: false, + description: "Disable Direct I/O usage when available" + }) .option("printTimings", { alias: "pt", type: "boolean", @@ -271,14 +277,14 @@ export const InfillCommand: 
CommandModule = { flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, - debug, numa, meter, timing, noMmap, printTimings + debug, numa, meter, timing, noMmap, noDirectIo, printTimings }) { try { await RunInfill({ modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, - tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings + tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings }); } catch (err) { await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing @@ -293,7 +299,7 @@ async function RunInfill({ modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, - tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, printTimings + tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, noDirectIo, printTimings }: InfillCommand) { if (contextSize === -1) contextSize = undefined; if (gpuLayers === -1) gpuLayers = undefined; @@ -318,6 +324,7 @@ async function RunInfill({ }); const logBatchSize = batchSize != null; const useMmap = !noMmap && llama.supportsMmap; + const useDirectIo = !noDirectIo; const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, @@ -382,6 +389,7 @@ async function RunInfill({ defaultContextFlashAttention: flashAttention, defaultContextSwaFullCache: swaFullCache, useMmap, + useDirectIo, ignoreMemorySafetyChecks: gpuLayers != null, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); @@ -416,6 +424,7 @@ async function RunInfill({ defaultContextFlashAttention: flashAttention, defaultContextSwaFullCache: swaFullCache, useMmap, + useDirectIo, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); }, @@ -494,6 +503,7 @@ async function RunInfill({ context, draftContext, useMmap, + useDirectIo, logBatchSize, tokenMeterEnabled: meter }); diff --git a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts index c3bf548c..482353f6 100644 --- a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts +++ b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts @@ -22,6 +22,7 @@ import {documentationPageUrls} from "../../../../config.js"; import {Llama} from "../../../../bindings/Llama.js"; import {toBytes} from "../../../utils/toBytes.js"; import {padSafeContextSize} from "../../../../evaluator/LlamaContext/utils/padSafeContextSize.js"; +import {getPlatform} from "../../../../bindings/utils/getPlatform.js"; type InspectMeasureCommand = { modelPath?: string, @@ -37,6 +38,7 @@ type InspectMeasureCommand = { measures: number, memory: "vram" | "ram" 
| "all", noMmap: boolean, + noDirectIo: boolean, printHeaderBeforeEachLayer?: boolean, evaluateText?: string, repeatEvaluateText?: number @@ -135,6 +137,11 @@ export const InspectMeasureCommand: CommandModule default: false, description: "Disable mmap (memory-mapped file) usage" }) + .option("noDirectIo", { + type: "boolean", + default: false, + description: "Disable Direct I/O usage when available" + }) .option("printHeaderBeforeEachLayer", { alias: "ph", type: "boolean", @@ -155,7 +162,8 @@ export const InspectMeasureCommand: CommandModule }, async handler({ modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, swaFullCache, - batchSize, measures = 10, memory: measureMemoryType, noMmap, printHeaderBeforeEachLayer = true, evaluateText, repeatEvaluateText + batchSize, measures = 10, memory: measureMemoryType, noMmap, noDirectIo, printHeaderBeforeEachLayer = true, evaluateText, + repeatEvaluateText }: InspectMeasureCommand) { if (maxLayers === -1) maxLayers = undefined; if (maxContextSize === -1) maxContextSize = undefined; @@ -174,7 +182,9 @@ export const InspectMeasureCommand: CommandModule logLevel: LlamaLogLevel.error }); + const platform = getPlatform(); const useMmap = !noMmap && llama.supportsMmap; + const useDirectIo = !noDirectIo; const resolvedGgufPath = await resolveCommandGgufPath(ggufPath, llama, headers, { flashAttention, swaFullCache, useMmap }); @@ -188,6 +198,14 @@ export const InspectMeasureCommand: CommandModule ? "enabled" : "disabled" )); + + if (platform !== "mac") // Direct I/O is not supported on macOS + console.info(chalk.yellow("Direct I/O:") + " " + ( + useDirectIo + ? "enabled" + : "disabled" + )); + if (measureMemoryType === "ram" || measureMemoryType === "all") console.warn(chalk.yellow("RAM measurements are greatly inaccurate due to OS optimizations that prevent released memory from being immediately available")); @@ -221,6 +239,7 @@ export const InspectMeasureCommand: CommandModule const done = await measureModel({ modelPath: resolvedGgufPath, useMmap, + useDirectIo, gpu: gpu == null ? 
undefined : llama.gpu, @@ -513,11 +532,12 @@ const detectedFileName = path.basename(__filename); const expectedFileName = "InspectMeasureCommand"; async function measureModel({ - modelPath, useMmap, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, flashAttention, - swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false, onInfo + modelPath, useMmap, useDirectIo, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, + flashAttention, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false, onInfo }: { modelPath: string, useMmap?: boolean, + useDirectIo?: boolean, gpu?: BuildGpu | "auto", tests: number, initialMaxContextSize?: number, @@ -628,6 +648,7 @@ async function measureModel({ type: "start", modelPath, useMmap, + useDirectIo, tests, initialMaxContextSize, maxContextSize, @@ -828,12 +849,12 @@ async function runTestWorkerLogic() { } async function testWithGpuLayers({ - modelPath, useMmap, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, swaFullCache, batchSize, - evaluateText, exitAfterMeasurement = false + modelPath, useMmap, useDirectIo, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, swaFullCache, + batchSize, evaluateText, exitAfterMeasurement = false }: { - modelPath: string, useMmap?: boolean, gpuLayers: number, tests: number, startContextSize?: number, maxContextSize?: number, - minContextSize?: number, flashAttention?: boolean, swaFullCache?: boolean, batchSize?: number, evaluateText?: string, - exitAfterMeasurement?: boolean + modelPath: string, useMmap?: boolean, useDirectIo?: boolean, gpuLayers: number, tests: number, startContextSize?: number, + maxContextSize?: number, minContextSize?: number, flashAttention?: boolean, swaFullCache?: boolean, batchSize?: number, + evaluateText?: string, exitAfterMeasurement?: boolean }) { try { const preModelVramUsage = (await llama.getVramState()).used; @@ -841,6 +862,7 @@ async function runTestWorkerLogic() { const model = await llama.loadModel({ modelPath, useMmap, + useDirectIo, gpuLayers, defaultContextFlashAttention: flashAttention, defaultContextSwaFullCache: swaFullCache, @@ -908,6 +930,7 @@ async function runTestWorkerLogic() { const measurementsDone = await testWithGpuLayers({ modelPath: message.modelPath, useMmap: message.useMmap, + useDirectIo: message.useDirectIo, gpuLayers, tests: message.tests, startContextSize: gpuLayers == message.maxGpuLayers @@ -1005,6 +1028,7 @@ type ParentToChildMessage = { type: "start", modelPath: string, useMmap?: boolean, + useDirectIo?: boolean, tests: number, maxGpuLayers: number, minGpuLayers?: number, diff --git a/src/cli/utils/printCommonInfoLines.ts b/src/cli/utils/printCommonInfoLines.ts index f8bfea47..983a1056 100644 --- a/src/cli/utils/printCommonInfoLines.ts +++ b/src/cli/utils/printCommonInfoLines.ts @@ -1,6 +1,7 @@ import chalk from "chalk"; import {getPrettyBuildGpuName} from "../../bindings/consts.js"; import {LlamaContext} from "../../evaluator/LlamaContext/LlamaContext.js"; +import {getPlatform} from "../../bindings/utils/getPlatform.js"; import {printInfoLine} from "./printInfoLine.js"; import {toBytes} from "./toBytes.js"; @@ -9,6 +10,7 @@ export async function printCommonInfoLines({ draftContext, minTitleLength = 0, useMmap, + useDirectIo, logBatchSize = false, tokenMeterEnabled = false, printBos = false, @@ -18,11 +20,13 @@ export async function printCommonInfoLines({ draftContext?: LlamaContext, 
minTitleLength?: number, useMmap?: boolean, + useDirectIo?: boolean, logBatchSize?: boolean, tokenMeterEnabled?: boolean, printBos?: boolean, printEos?: boolean }) { + const platform = getPlatform(); const llama = context._llama; const model = context.model; const padTitle = Math.max( @@ -79,6 +83,14 @@ export async function printCommonInfoLines({ : (useMmap || useMmap == null) ? "enabled" : "disabled" + }, { + title: "Direct I/O", + show: platform !== "mac", // Direct IO is not supported on macOS + value: platform === "mac" + ? "unsupported" + : (useDirectIo || useDirectIo == null) + ? "enabled" + : "disabled" }, { show: printBos, title: "BOS", diff --git a/src/evaluator/LlamaChat/LlamaChat.ts b/src/evaluator/LlamaChat/LlamaChat.ts index faf17bca..75b5ca93 100644 --- a/src/evaluator/LlamaChat/LlamaChat.ts +++ b/src/evaluator/LlamaChat/LlamaChat.ts @@ -168,6 +168,13 @@ export type LLamaChatGenerateResponseOptions void, + /** + * An AbortSignal to later abort the generation. + * + * When the signal is aborted, the generation will stop and throw `signal.reason` as the error. + * + * > To stop an ongoing generation without throwing an error, also set `stopOnAbortSignal` to `true`. + */ signal?: AbortSignal, /** @@ -178,6 +185,7 @@ export type LLamaChatGenerateResponseOptions void, + /** + * An AbortSignal to later abort the generation. + * + * When the signal is aborted, the generation will stop and throw `signal.reason` as the error. + * + * > To stop an ongoing generation without throwing an error, also set `stopOnAbortSignal` to `true`. + */ signal?: AbortSignal, /** @@ -106,6 +113,7 @@ export type LLamaChatPromptOptions void, + /** + * An AbortSignal to later abort the generation. + * + * When the signal is aborted, the generation will stop and throw `signal.reason` as the error. + * + * > To stop an ongoing generation without throwing an error, also set `stopOnAbortSignal` to `true`. + */ signal?: AbortSignal, + + /** + * When a completion already started being generated and then the signal is aborted, + * the generation will stop and the completion will be returned as is instead of throwing an error. + * + * Defaults to `false`. 
+ */ + stopOnAbortSignal?: boolean, + + /** Maximum number of tokens to generate */ maxTokens?: number, /** @@ -160,7 +177,7 @@ export type LlamaCompletionResponse = { response: string, metadata: { remainingGenerationAfterStop?: string | Token[], - stopReason: "eogToken" | "stopGenerationTrigger" | "maxTokens" + stopReason: "eogToken" | "stopGenerationTrigger" | "maxTokens" | "abort" } | { remainingGenerationAfterStop?: string | Token[], stopReason: "customStopTrigger", @@ -247,6 +264,7 @@ export class LlamaCompletion { onTextChunk, onToken, signal, + stopOnAbortSignal = false, maxTokens, temperature, minP, @@ -295,7 +313,7 @@ export class LlamaCompletion { } const ensureNotAborted = () => { - if (signal?.aborted) + if (signal?.aborted && !stopOnAbortSignal) throw signal.reason; if (this.disposed) @@ -334,6 +352,7 @@ export class LlamaCompletion { onTextChunk: safeEventCallback(onTextChunk), onToken: safeEventCallback(onToken), signal, + stopOnAbortSignal, maxTokens: resolvedMaxTokens, temperature, minP, @@ -390,6 +409,7 @@ export class LlamaCompletion { onTextChunk, onToken, signal, + stopOnAbortSignal = false, maxTokens, temperature, minP, @@ -496,7 +516,7 @@ export class LlamaCompletion { } const ensureNotAborted = () => { - if (signal?.aborted) + if (signal?.aborted && !stopOnAbortSignal) throw signal.reason; if (this.disposed) @@ -533,6 +553,7 @@ export class LlamaCompletion { onTextChunk: safeEventCallback(onTextChunk), onToken: safeEventCallback(onToken), signal, + stopOnAbortSignal, maxTokens: resolvedMaxTokens, temperature, minP, @@ -571,6 +592,7 @@ export class LlamaCompletion { onTextChunk, onToken, signal, + stopOnAbortSignal = false, maxTokens, temperature, minP, @@ -638,7 +660,7 @@ export class LlamaCompletion { .map((stopTrigger) => customStopGenerationTriggersDetector.addStopTrigger(stopTrigger)); const ensureNotAborted = () => { - if (signal?.aborted) + if (signal?.aborted && !stopOnAbortSignal) throw signal.reason; if (this.disposed) @@ -805,7 +827,10 @@ export class LlamaCompletion { } } - if (maxTokens != null && maxTokens > 0 && generatedTokens >= maxTokens) { + const aborted = (signal?.aborted ?? false) && stopOnAbortSignal; + const maxTokensReached = maxTokens != null && maxTokens > 0 && generatedTokens >= maxTokens; + + if (aborted || maxTokensReached) { let modelResponse = model.detokenize(res); if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) @@ -814,7 +839,9 @@ export class LlamaCompletion { return { response: modelResponse, metadata: { - stopReason: "maxTokens" + stopReason: aborted + ? "abort" + : "maxTokens" } }; } diff --git a/src/evaluator/LlamaModel/LlamaModel.ts b/src/evaluator/LlamaModel/LlamaModel.ts index 9d7fa343..2da930e6 100644 --- a/src/evaluator/LlamaModel/LlamaModel.ts +++ b/src/evaluator/LlamaModel/LlamaModel.ts @@ -76,6 +76,22 @@ export type LlamaModelOptions = { */ useMmap?: boolean, + /** + * Direct I/O is a method of reading and writing data directly between the storage device and the application memory, + * bypassing OS in-memory caches. + * + * It leads to improved model loading times and reduced RAM usage, + * at the expense of longer loading times when the model is unloaded and loaded again repeatedly in a short period of time. + * + * When this option is enabled, if Direct I/O is supported by the system (and for the given file), + * it will be used and mmap will be disabled. + * + * Unsupported on macOS. + * + * Defaults to `true`. + */ + useDirectIo?: boolean, + /** * Force the system to keep the model in the RAM/VRAM.
* Use with caution as this can crash your system if the available resources are insufficient. @@ -150,6 +166,7 @@ export type LlamaModelOptions = { }; const defaultUseMmap = true; +const defaultUseDirectIo = true; const defaultContextFlashAttentionEnabled = false; const defaultContextSwaFullCache = false; @@ -181,7 +198,7 @@ export class LlamaModel { public readonly onDispose = new EventRelay(); private constructor({ - modelPath, gpuLayers, vocabOnly = false, useMmap, useMlock, checkTensors, onLoadProgress, loadSignal, metadataOverrides + modelPath, gpuLayers, vocabOnly = false, useMmap, useDirectIo, useMlock, checkTensors, onLoadProgress, loadSignal, metadataOverrides }: LlamaModelOptions & { gpuLayers: number }, { @@ -219,6 +236,7 @@ export class LlamaModel { gpuLayers, vocabOnly: this._vocabOnly, useMmap, + useDirectIo, useMlock: _llama.supportsMlock ? useMlock : undefined, @@ -709,6 +727,7 @@ export class LlamaModel { }) { const {loadSignal, defaultContextFlashAttention} = modelOptions; const useMmap = _llama.supportsMmap && (modelOptions.useMmap ?? defaultUseMmap); + const useDirectIo = modelOptions.useDirectIo ?? defaultUseDirectIo; const fileInfo = await readGgufFileInfo(modelOptions.modelPath, { sourceType: "filesystem", @@ -732,7 +751,7 @@ export class LlamaModel { useMmap }); - const model = new LlamaModel({...modelOptions, gpuLayers, useMmap}, { + const model = new LlamaModel({...modelOptions, gpuLayers, useMmap, useDirectIo}, { _fileInfo: fileInfo, _fileInsights: ggufInsights, _llama, diff --git a/src/gguf/types/GgufMetadataTypes.ts b/src/gguf/types/GgufMetadataTypes.ts index 257f7080..b4d984d5 100644 --- a/src/gguf/types/GgufMetadataTypes.ts +++ b/src/gguf/types/GgufMetadataTypes.ts @@ -12,6 +12,7 @@ export const enum GgufArchitectureType { starcoder = "starcoder", refact = "refact", bert = "bert", + modernBert = "modern-bert", nomicBert = "nomic-bert", nomicBertMoe = "nomic-bert-moe", neoBert = "neo-bert", @@ -33,6 +34,7 @@ export const enum GgufArchitectureType { phimoe = "phimoe", plamo = "plamo", plamo2 = "plamo2", + plamo3 = "plamo3", codeshell = "codeshell", orion = "orion", internlm2 = "internlm2", @@ -68,6 +70,7 @@ export const enum GgufArchitectureType { jais = "jais", nemotron = "nemotron", nemotronH = "nemotron_h", + nemotronHMoe = "nemotron_h_moe", exaone = "exaone", exaone4 = "exaone4", rwkv6 = "rwkv6", @@ -105,6 +108,9 @@ export const enum GgufArchitectureType { rnd1 = "rnd1", panguEmbedded = "pangu-embedded", mistral3 = "mistral3", + mimo2 = "mimo2", + llamaEmbed = "llama-embed", + maincoder = "maincoder", clip = "clip", unknown = "(unknown)" } diff --git a/templates/electron-typescript-react/.editorconfig b/templates/electron-typescript-react/.editorconfig index 1c7d0091..4cf71102 100644 --- a/templates/electron-typescript-react/.editorconfig +++ b/templates/electron-typescript-react/.editorconfig @@ -7,7 +7,7 @@ indent_size = 4 [{*.ts,*.tsx,*.js,*.jsx,*.css,*.scss}] insert_final_newline = true -[{package.json,package-lock.json,manifest.json,electron-builder.json5}] +[{package.json,package-lock.json,manifest.json}] indent_size = 2 [*.yml] diff --git a/test/modelDependent/llama3.2/sequenceState.test.ts b/test/modelDependent/llama3.2/sequenceState.test.ts index 72e6a94f..79addacc 100644 --- a/test/modelDependent/llama3.2/sequenceState.test.ts +++ b/test/modelDependent/llama3.2/sequenceState.test.ts @@ -108,7 +108,7 @@ describe("llama 3.2", () => { const res1_2 = await chatSession1_2.prompt("What's the exact thing I told you to remember?", {maxTokens: 12}); const 
contextSequence1TokensState4 = contextSequence1.tokenMeter.getState(); - expect(res1_2).to.toMatchInlineSnapshot('"You told me to "Remember: locks are not doors"."'); + expect(res1_2).toMatch(/^(You told me to "Remember: locks are not doors".|You told me to "Remember: locks are not doors.")/); const contextSequence1TokensState4Diff = TokenMeter.diff(contextSequence1TokensState4, contextSequence1TokensState3); expect(contextSequence1TokensState4Diff.usedInputTokens).to.be.lessThan(contextSequence1TokensState1.usedInputTokens); expect(contextSequence1TokensState4Diff).toMatchInlineSnapshot(` @@ -202,7 +202,7 @@ describe("llama 3.2", () => { }); const res1 = await chatSession1.prompt("Remember: locks are not doors. Also, write a long poem about it", {maxTokens: 154}); - expect(res1).toMatch(/^(A clever reminder indeed.|A wise phrase to ponder)/); + expect(res1).toMatch(/^(A clever reminder indeed.|A wise phrase to ponder|A wise phrase indeed)/); const stateFile1Path = await getTempTestFilePath("state1");
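The new `useDirectIo` model option and the matching `--noDirectIo` CLI flag introduced in this patch can be exercised from user code as shown below. This is a minimal sketch based on the `LlamaModelOptions` change above; the model filename is reused from the text-completion docs example and is only illustrative.

```typescript
import {fileURLToPath} from "url";
import path from "path";
import {getLlama} from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));

const llama = await getLlama();

// `useDirectIo` defaults to `true`; Direct I/O is unsupported on macOS.
// Passing `false` opts out, so the existing mmap-controlled loading behavior applies instead,
// which can be preferable when the same model is unloaded and reloaded repeatedly
// in a short period of time and the OS page cache is beneficial.
const model = await llama.loadModel({
    modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf"),
    useDirectIo: false
});

console.log("Model loaded with Direct I/O disabled");
```

On the CLI side, the same behavior is controlled with the new `--noDirectIo` flag added to the `chat`, `complete`, `infill`, and `inspect measure` commands.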