diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 63c5a0e0..dd8ea895 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -97,8 +97,8 @@ jobs: - name: Install dependencies on Windows if: startsWith(matrix.config.os, 'windows') run: | - choco install cmake.install --version=3.31.1 - choco install cmake --version=3.31.1 + choco install cmake.install --version=4.2.1 + choco install cmake --version=4.2.1 choco install ninja - name: Install dependencies on Ubuntu (1) @@ -107,9 +107,9 @@ jobs: sudo apt-get update sudo apt-get install ninja-build libtbb-dev g++-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-arm-linux-gnueabihf gcc-arm-linux-gnueabihf - wget -c https://github.com/Kitware/CMake/releases/download/v3.31.7/cmake-3.31.7-linux-x86_64.tar.gz - sudo tar --strip-components=1 -C /usr/local -xzf cmake-3.31.7-linux-x86_64.tar.gz - rm -f ./cmake-3.31.7-linux-x86_64.tar.gz + wget -c https://github.com/Kitware/CMake/releases/download/v4.2.1/cmake-4.2.1-linux-x86_64.tar.gz + sudo tar --strip-components=1 -C /usr/local -xzf cmake-4.2.1-linux-x86_64.tar.gz + rm -f ./cmake-4.2.1-linux-x86_64.tar.gz which aarch64-linux-gnu-gcc which aarch64-linux-gnu-g++ @@ -125,31 +125,19 @@ jobs: sudo apt-get update sudo apt-get install ninja-build libtbb-dev - wget -c https://github.com/Kitware/CMake/releases/download/v3.31.7/cmake-3.31.7-linux-x86_64.tar.gz - sudo tar --strip-components=1 -C /usr/local -xzf cmake-3.31.7-linux-x86_64.tar.gz - rm -f ./cmake-3.31.7-linux-x86_64.tar.gz + wget -c https://github.com/Kitware/CMake/releases/download/v4.2.1/cmake-4.2.1-linux-x86_64.tar.gz + sudo tar --strip-components=1 -C /usr/local -xzf cmake-4.2.1-linux-x86_64.tar.gz + rm -f ./cmake-4.2.1-linux-x86_64.tar.gz cmake --version - - name: Install Cuda 13.0 on Windows (1) + - name: Install Cuda 13.1 on Windows (1) if: matrix.config.name == 'Windows (1)' - shell: bash - timeout-minutes: 60 - run: | - curl -Lo cuda_13.0.0_windows_network.exe https://developer.download.nvidia.com/compute/cuda/13.0.0/network_installers/cuda_13.0.0_windows_network.exe - - echo "Installing Cuda 13.0.0" - powershell -Command "Start-Process -FilePath cuda_13.0.0_windows_network.exe -ArgumentList '-s','-n' -Wait" - echo "Cuda installation finished" - - rm -f ./cuda_13.0.0_windows_network.exe - - echo "where cudart64_13.dll: $(where cudart64_13.dll)" - - echo "CUDA_PATH=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0" >> $GITHUB_ENV - echo "CUDA_PATH_V13_0=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0" >> $GITHUB_ENV - echo "CUDA_PATH_VX_Y=CUDA_PATH_V13_0" >> $GITHUB_ENV - echo "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0\\bin" >> $GITHUB_PATH + uses: Jimver/cuda-toolkit@v0.2.30 + with: + cuda: '13.1.0' + method: 'network' + use-local-cache: false - name: Install Cuda 12.4 on Windows (2) if: matrix.config.name == 'Windows (2)' @@ -160,11 +148,11 @@ jobs: sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]' use-local-cache: false - - name: Install Cuda 13.0 on Ubuntu (1) + - name: Install Cuda 13.1 on Ubuntu (1) if: matrix.config.name == 'Ubuntu (1)' - uses: Jimver/cuda-toolkit@v0.2.27 + uses: Jimver/cuda-toolkit@v0.2.30 with: - cuda: '13.0.0' + cuda: '13.1.0' method: 'network' - name: Install Cuda 12.4 on Ubuntu (2) diff --git a/docs/guide/CUDA.md b/docs/guide/CUDA.md index 81ed8a60..66ba2d14 100644 --- a/docs/guide/CUDA.md +++ b/docs/guide/CUDA.md @@ -9,14 +9,14 @@ description: CUDA support in node-llama-cpp and 
these are automatically used when CUDA is detected on your machine. To use `node-llama-cpp`'s CUDA support with your NVIDIA GPU, -make sure you have [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) 12.4 or higher installed on your machine. +make sure you have [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) 13.1 or higher installed on your machine. If the pre-built binaries don't work with your CUDA installation, `node-llama-cpp` will automatically download a release of `llama.cpp` and build it from source with CUDA support. Building from source with CUDA support is slow and can take up to an hour. -The pre-built binaries are compiled with CUDA Toolkit 12.4, -so any version of CUDA Toolkit that is 12.4 or higher should work with the pre-built binaries. +The pre-built binaries are compiled with CUDA Toolkits 12.4 and 13.1, +so any CUDA Toolkit 12 version that is 12.4 or higher, or any CUDA Toolkit 13 version that is 13.1 or higher, should work with the pre-built binaries. If you have an older version of CUDA Toolkit installed on your machine, consider updating it to avoid having to wait the long build time. @@ -42,7 +42,7 @@ You should see an output like this: If you see `CUDA used VRAM` in the output, it means that CUDA support is working on your machine. ## Prerequisites -* [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) 12.4 or higher +* [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) 13.1 or higher * [NVIDIA Drivers](https://www.nvidia.com/en-us/drivers/) * [`cmake-js` dependencies](https://github.com/cmake-js/cmake-js#:~:text=%5Bstring%5D-,Requirements%3A,-CMake) * [CMake](https://cmake.org/download/) 3.26 or higher (optional, recommended if you have build issues) @@ -83,21 +83,21 @@ To build `node-llama-cpp` with any of these options, set an environment variable To fix this issue you have to set the `CUDACXX` environment variable to the path of the `nvcc` compiler, and the `CUDA_PATH` environment variable to the path of the CUDA home directory that contains the `nvcc` compiler. -For example, if you have installed CUDA Toolkit 12.4, you have to run a command like this: +For example, if you have installed CUDA Toolkit 13.1, you have to run a command like this: ::: code-group ```shell [Linux] -export CUDACXX=/usr/local/cuda-12.4/bin/nvcc -export CUDA_PATH=/usr/local/cuda-12.4 +export CUDACXX=/usr/local/cuda-13.1/bin/nvcc +export CUDA_PATH=/usr/local/cuda-13.1 ``` ```cmd [Windows (cmd)] -set CUDACXX=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\nvcc.exe -set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4 +set CUDACXX=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin\nvcc.exe +set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1 ``` ```cmd [Windows (PowerShell)] -$env:CUDACXX="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\nvcc.exe" -$env:CUDA_PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" +$env:CUDACXX="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin\nvcc.exe" +$env:CUDA_PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" ``` ::: diff --git a/docs/guide/text-completion.md b/docs/guide/text-completion.md index e94f6be9..f6498cac 100644 --- a/docs/guide/text-completion.md +++ b/docs/guide/text-completion.md @@ -76,3 +76,51 @@ const res = await completion.generateInfillCompletion(prefix, suffix, { console.log("Fill: " + res); ``` > This example uses [CodeGemma](https://huggingface.co/bartowski/codegemma-2b-GGUF).
+ +## Stop Text Completion Generation {#stop-generation} +To stop the generation of an ongoing text completion without throwing an error (to get the partially generated text), +you can use the [`stopOnAbortSignal`](../api/type-aliases/LlamaCompletionGenerationOptions.md#stoponabortsignal) option +to configure what happens when the given [`signal`](../api/type-aliases/LlamaCompletionGenerationOptions.md#signal) is aborted. + +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import {getLlama, LlamaCompletion} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf") +}); +const context = await model.createContext(); +const completion = new LlamaCompletion({ + contextSequence: context.getSequence() +}); + +const abortController = new AbortController(); +const input = "Here is a list of sweet fruits:\n* "; +console.log("Input: " + input); + +let result = ""; + +process.stdout.write("Streamed completion: "); +const res = await completion.generateCompletion(input, { + maxTokens: 256, + + // stop the generation, instead of cancelling it + stopOnAbortSignal: true, + + signal: abortController.signal, + onTextChunk(chunk) { + result += chunk; + process.stdout.write(chunk); + + // max 10 lines + if (result.split("\n").length >= 10) + abortController.abort(); + } +}); +console.log(); +console.log("Completion: " + res); +``` diff --git a/llama/CMakeLists.txt b/llama/CMakeLists.txt index 0298ce1f..e379d6a5 100644 --- a/llama/CMakeLists.txt +++ b/llama/CMakeLists.txt @@ -84,13 +84,6 @@ else() set(NLC_GGML_NATIVE ON) endif() -if (GGML_CUDA AND NOT DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT NLC_GGML_NATIVE) - find_package(CUDAToolkit) - if (CUDAToolkit_FOUND AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "13.0") - set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real;90-real") - endif() -endif() - add_subdirectory("llama.cpp") include_directories("llama.cpp") include_directories("./llama.cpp/common") diff --git a/llama/addon/AddonModel.cpp b/llama/addon/AddonModel.cpp index d25d043c..d58f8731 100644 --- a/llama/addon/AddonModel.cpp +++ b/llama/addon/AddonModel.cpp @@ -252,6 +252,10 @@ AddonModel::AddonModel(const Napi::CallbackInfo& info) : Napi::ObjectWrap().Value(); } + if (options.Has("useDirectIo")) { + model_params.use_direct_io = options.Get("useDirectIo").As().Value(); + } + if (options.Has("useMlock")) { model_params.use_mlock = options.Get("useMlock").As().Value(); } diff --git a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts index 122c87cf..08e19e61 100644 --- a/src/bindings/AddonTypes.ts +++ b/src/bindings/AddonTypes.ts @@ -9,6 +9,7 @@ export type BindingModule = { gpuLayers?: number, vocabOnly?: boolean, useMmap?: boolean, + useDirectIo?: boolean, useMlock?: boolean, checkTensors?: boolean, onLoadProgress?(loadPercentage: number): void, diff --git a/src/cli/commands/ChatCommand.ts b/src/cli/commands/ChatCommand.ts index e32fa62a..3de7fcdb 100644 --- a/src/cli/commands/ChatCommand.ts +++ b/src/cli/commands/ChatCommand.ts @@ -72,6 +72,7 @@ type ChatCommand = { meter: boolean, timing: boolean, noMmap: boolean, + noDirectIo: boolean, printTimings: boolean }; @@ -329,6 +330,11 @@ export const ChatCommand: CommandModule = { default: false, description: "Disable mmap (memory-mapped file) usage" }) + .option("noDirectIo", { + type: "boolean", + default: false, + 
description: "Disable Direct I/O usage when available" + }) .option("printTimings", { alias: "pt", type: "boolean", @@ -342,7 +348,8 @@ export const ChatCommand: CommandModule = { noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory, - environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings + environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, + printTimings }) { try { await RunChat({ @@ -351,7 +358,7 @@ export const ChatCommand: CommandModule = { temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, - debug, numa, meter, timing, noMmap, printTimings + debug, numa, meter, timing, noMmap, noDirectIo, printTimings }); } catch (err) { await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing @@ -368,7 +375,7 @@ async function RunChat({ jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, - tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings + tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings }: ChatCommand) { if (contextSize === -1) contextSize = undefined; if (gpuLayers === -1) gpuLayers = undefined; @@ -395,6 +402,7 @@ async function RunChat({ }); const logBatchSize = batchSize != null; const useMmap = !noMmap && llama.supportsMmap; + const useDirectIo = !noDirectIo; const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, @@ -452,6 +460,7 @@ async function RunChat({ defaultContextFlashAttention: flashAttention, defaultContextSwaFullCache: swaFullCache, useMmap, + useDirectIo, ignoreMemorySafetyChecks: gpuLayers != null, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); @@ -486,6 +495,7 @@ async function RunChat({ defaultContextFlashAttention: flashAttention, defaultContextSwaFullCache: swaFullCache, useMmap, + useDirectIo, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); }, @@ -591,6 +601,7 @@ async function RunChat({ context, draftContext, useMmap, + useDirectIo, printBos: true, printEos: true, logBatchSize, diff --git a/src/cli/commands/CompleteCommand.ts b/src/cli/commands/CompleteCommand.ts index d4d12c7e..a434721b 100644 --- a/src/cli/commands/CompleteCommand.ts +++ b/src/cli/commands/CompleteCommand.ts @@ -54,6 +54,7 @@ type CompleteCommand = { meter: boolean, timing: boolean, noMmap: boolean, + noDirectIo: boolean, printTimings: boolean }; @@ -249,6 +250,11 @@ export const CompleteCommand: CommandModule = { default: false, description: "Disable mmap (memory-mapped file) usage" }) + .option("noDirectIo", { + type: "boolean", + default: false, + description: "Disable Direct I/O usage when available" + }) .option("printTimings", { alias: "pt", type: 
"boolean", @@ -261,14 +267,14 @@ export const CompleteCommand: CommandModule = { flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, - debug, numa, meter, timing, noMmap, printTimings + debug, numa, meter, timing, noMmap, noDirectIo, printTimings }) { try { await RunCompletion({ modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, - tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings + tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings }); } catch (err) { await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing @@ -283,7 +289,7 @@ async function RunCompletion({ modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, - tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, printTimings + tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, noDirectIo, printTimings }: CompleteCommand) { if (contextSize === -1) contextSize = undefined; if (gpuLayers === -1) gpuLayers = undefined; @@ -308,6 +314,7 @@ async function RunCompletion({ }); const logBatchSize = batchSize != null; const useMmap = !noMmap && llama.supportsMmap; + const useDirectIo = !noDirectIo; const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, @@ -358,6 +365,7 @@ async function RunCompletion({ defaultContextFlashAttention: flashAttention, defaultContextSwaFullCache: swaFullCache, useMmap, + useDirectIo, ignoreMemorySafetyChecks: gpuLayers != null, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); @@ -392,6 +400,7 @@ async function RunCompletion({ defaultContextFlashAttention: flashAttention, defaultContextSwaFullCache: swaFullCache, useMmap, + useDirectIo, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); }, @@ -470,6 +479,7 @@ async function RunCompletion({ context, draftContext, useMmap, + useDirectIo, minTitleLength: "Complete".length + 1, logBatchSize, tokenMeterEnabled: meter diff --git a/src/cli/commands/InfillCommand.ts b/src/cli/commands/InfillCommand.ts index 934057c3..ffbf5ecd 100644 --- a/src/cli/commands/InfillCommand.ts +++ b/src/cli/commands/InfillCommand.ts @@ -56,6 +56,7 @@ type InfillCommand = { meter: boolean, timing: boolean, noMmap: boolean, + noDirectIo: boolean, printTimings: boolean }; @@ -259,6 +260,11 @@ export const InfillCommand: CommandModule = { default: false, description: "Disable mmap (memory-mapped file) usage" }) + .option("noDirectIo", { + type: "boolean", + default: false, + description: "Disable Direct I/O usage when available" + }) .option("printTimings", { alias: "pt", type: "boolean", @@ -271,14 +277,14 @@ export const InfillCommand: 
CommandModule = { flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, - debug, numa, meter, timing, noMmap, printTimings + debug, numa, meter, timing, noMmap, noDirectIo, printTimings }) { try { await RunInfill({ modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, - tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings + tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings }); } catch (err) { await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing @@ -293,7 +299,7 @@ async function RunInfill({ modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, - tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, printTimings + tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, noDirectIo, printTimings }: InfillCommand) { if (contextSize === -1) contextSize = undefined; if (gpuLayers === -1) gpuLayers = undefined; @@ -318,6 +324,7 @@ async function RunInfill({ }); const logBatchSize = batchSize != null; const useMmap = !noMmap && llama.supportsMmap; + const useDirectIo = !noDirectIo; const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, @@ -382,6 +389,7 @@ async function RunInfill({ defaultContextFlashAttention: flashAttention, defaultContextSwaFullCache: swaFullCache, useMmap, + useDirectIo, ignoreMemorySafetyChecks: gpuLayers != null, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); @@ -416,6 +424,7 @@ async function RunInfill({ defaultContextFlashAttention: flashAttention, defaultContextSwaFullCache: swaFullCache, useMmap, + useDirectIo, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); }, @@ -494,6 +503,7 @@ async function RunInfill({ context, draftContext, useMmap, + useDirectIo, logBatchSize, tokenMeterEnabled: meter }); diff --git a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts index c3bf548c..482353f6 100644 --- a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts +++ b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts @@ -22,6 +22,7 @@ import {documentationPageUrls} from "../../../../config.js"; import {Llama} from "../../../../bindings/Llama.js"; import {toBytes} from "../../../utils/toBytes.js"; import {padSafeContextSize} from "../../../../evaluator/LlamaContext/utils/padSafeContextSize.js"; +import {getPlatform} from "../../../../bindings/utils/getPlatform.js"; type InspectMeasureCommand = { modelPath?: string, @@ -37,6 +38,7 @@ type InspectMeasureCommand = { measures: number, memory: "vram" | "ram" 
| "all", noMmap: boolean, + noDirectIo: boolean, printHeaderBeforeEachLayer?: boolean, evaluateText?: string, repeatEvaluateText?: number @@ -135,6 +137,11 @@ export const InspectMeasureCommand: CommandModule default: false, description: "Disable mmap (memory-mapped file) usage" }) + .option("noDirectIo", { + type: "boolean", + default: false, + description: "Disable Direct I/O usage when available" + }) .option("printHeaderBeforeEachLayer", { alias: "ph", type: "boolean", @@ -155,7 +162,8 @@ export const InspectMeasureCommand: CommandModule }, async handler({ modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, swaFullCache, - batchSize, measures = 10, memory: measureMemoryType, noMmap, printHeaderBeforeEachLayer = true, evaluateText, repeatEvaluateText + batchSize, measures = 10, memory: measureMemoryType, noMmap, noDirectIo, printHeaderBeforeEachLayer = true, evaluateText, + repeatEvaluateText }: InspectMeasureCommand) { if (maxLayers === -1) maxLayers = undefined; if (maxContextSize === -1) maxContextSize = undefined; @@ -174,7 +182,9 @@ export const InspectMeasureCommand: CommandModule logLevel: LlamaLogLevel.error }); + const platform = getPlatform(); const useMmap = !noMmap && llama.supportsMmap; + const useDirectIo = !noDirectIo; const resolvedGgufPath = await resolveCommandGgufPath(ggufPath, llama, headers, { flashAttention, swaFullCache, useMmap }); @@ -188,6 +198,14 @@ export const InspectMeasureCommand: CommandModule ? "enabled" : "disabled" )); + + if (platform !== "mac") // Direct I/O is not supported on macOS + console.info(chalk.yellow("Direct I/O:") + " " + ( + useDirectIo + ? "enabled" + : "disabled" + )); + if (measureMemoryType === "ram" || measureMemoryType === "all") console.warn(chalk.yellow("RAM measurements are greatly inaccurate due to OS optimizations that prevent released memory from being immediately available")); @@ -221,6 +239,7 @@ export const InspectMeasureCommand: CommandModule const done = await measureModel({ modelPath: resolvedGgufPath, useMmap, + useDirectIo, gpu: gpu == null ? 
undefined : llama.gpu, @@ -513,11 +532,12 @@ const detectedFileName = path.basename(__filename); const expectedFileName = "InspectMeasureCommand"; async function measureModel({ - modelPath, useMmap, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, flashAttention, - swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false, onInfo + modelPath, useMmap, useDirectIo, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, + flashAttention, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false, onInfo }: { modelPath: string, useMmap?: boolean, + useDirectIo?: boolean, gpu?: BuildGpu | "auto", tests: number, initialMaxContextSize?: number, @@ -628,6 +648,7 @@ async function measureModel({ type: "start", modelPath, useMmap, + useDirectIo, tests, initialMaxContextSize, maxContextSize, @@ -828,12 +849,12 @@ async function runTestWorkerLogic() { } async function testWithGpuLayers({ - modelPath, useMmap, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, swaFullCache, batchSize, - evaluateText, exitAfterMeasurement = false + modelPath, useMmap, useDirectIo, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, swaFullCache, + batchSize, evaluateText, exitAfterMeasurement = false }: { - modelPath: string, useMmap?: boolean, gpuLayers: number, tests: number, startContextSize?: number, maxContextSize?: number, - minContextSize?: number, flashAttention?: boolean, swaFullCache?: boolean, batchSize?: number, evaluateText?: string, - exitAfterMeasurement?: boolean + modelPath: string, useMmap?: boolean, useDirectIo?: boolean, gpuLayers: number, tests: number, startContextSize?: number, + maxContextSize?: number, minContextSize?: number, flashAttention?: boolean, swaFullCache?: boolean, batchSize?: number, + evaluateText?: string, exitAfterMeasurement?: boolean }) { try { const preModelVramUsage = (await llama.getVramState()).used; @@ -841,6 +862,7 @@ async function runTestWorkerLogic() { const model = await llama.loadModel({ modelPath, useMmap, + useDirectIo, gpuLayers, defaultContextFlashAttention: flashAttention, defaultContextSwaFullCache: swaFullCache, @@ -908,6 +930,7 @@ async function runTestWorkerLogic() { const measurementsDone = await testWithGpuLayers({ modelPath: message.modelPath, useMmap: message.useMmap, + useDirectIo: message.useDirectIo, gpuLayers, tests: message.tests, startContextSize: gpuLayers == message.maxGpuLayers @@ -1005,6 +1028,7 @@ type ParentToChildMessage = { type: "start", modelPath: string, useMmap?: boolean, + useDirectIo?: boolean, tests: number, maxGpuLayers: number, minGpuLayers?: number, diff --git a/src/cli/utils/printCommonInfoLines.ts b/src/cli/utils/printCommonInfoLines.ts index f8bfea47..983a1056 100644 --- a/src/cli/utils/printCommonInfoLines.ts +++ b/src/cli/utils/printCommonInfoLines.ts @@ -1,6 +1,7 @@ import chalk from "chalk"; import {getPrettyBuildGpuName} from "../../bindings/consts.js"; import {LlamaContext} from "../../evaluator/LlamaContext/LlamaContext.js"; +import {getPlatform} from "../../bindings/utils/getPlatform.js"; import {printInfoLine} from "./printInfoLine.js"; import {toBytes} from "./toBytes.js"; @@ -9,6 +10,7 @@ export async function printCommonInfoLines({ draftContext, minTitleLength = 0, useMmap, + useDirectIo, logBatchSize = false, tokenMeterEnabled = false, printBos = false, @@ -18,11 +20,13 @@ export async function printCommonInfoLines({ draftContext?: LlamaContext, 
minTitleLength?: number, useMmap?: boolean, + useDirectIo?: boolean, logBatchSize?: boolean, tokenMeterEnabled?: boolean, printBos?: boolean, printEos?: boolean }) { + const platform = getPlatform(); const llama = context._llama; const model = context.model; const padTitle = Math.max( @@ -79,6 +83,14 @@ export async function printCommonInfoLines({ : (useMmap || useMmap == null) ? "enabled" : "disabled" + }, { + title: "Direct I/O", + show: platform !== "mac", // Direct IO is not supported on macOS + value: platform === "mac" + ? "unsupported" + : (useDirectIo || useDirectIo == null) + ? "enabled" + : "disabled" }, { show: printBos, title: "BOS", diff --git a/src/evaluator/LlamaChat/LlamaChat.ts b/src/evaluator/LlamaChat/LlamaChat.ts index faf17bca..75b5ca93 100644 --- a/src/evaluator/LlamaChat/LlamaChat.ts +++ b/src/evaluator/LlamaChat/LlamaChat.ts @@ -168,6 +168,13 @@ export type LLamaChatGenerateResponseOptions void, + /** + * An AbortSignal to later abort the generation. + * + * When the signal is aborted, the generation will stop and throw `signal.reason` as the error. + * + * > To stop an ongoing generation without throwing an error, also set `stopOnAbortSignal` to `true`. + */ signal?: AbortSignal, /** @@ -178,6 +185,7 @@ export type LLamaChatGenerateResponseOptions void, + /** + * An AbortSignal to later abort the generation. + * + * When the signal is aborted, the generation will stop and throw `signal.reason` as the error. + * + * > To stop an ongoing generation without throwing an error, also set `stopOnAbortSignal` to `true`. + */ signal?: AbortSignal, /** @@ -106,6 +113,7 @@ export type LLamaChatPromptOptions void, + /** + * An AbortSignal to later abort the generation. + * + * When the signal is aborted, the generation will stop and throw `signal.reason` as the error. + * + * > To stop an ongoing generation without throwing an error, also set `stopOnAbortSignal` to `true`. + */ signal?: AbortSignal, + + /** + * When a completion already started being generated and then the signal is aborted, + * the generation will stop and the completion will be returned as is instead of throwing an error. + * + * Defaults to `false`. 
+ */ + stopOnAbortSignal?: boolean, + + /** Maximum number of tokens to generate */ maxTokens?: number, /** @@ -160,7 +177,7 @@ export type LlamaCompletionResponse = { response: string, metadata: { remainingGenerationAfterStop?: string | Token[], - stopReason: "eogToken" | "stopGenerationTrigger" | "maxTokens" + stopReason: "eogToken" | "stopGenerationTrigger" | "maxTokens" | "abort" } | { remainingGenerationAfterStop?: string | Token[], stopReason: "customStopTrigger", @@ -247,6 +264,7 @@ export class LlamaCompletion { onTextChunk, onToken, signal, + stopOnAbortSignal = false, maxTokens, temperature, minP, @@ -295,7 +313,7 @@ export class LlamaCompletion { } const ensureNotAborted = () => { - if (signal?.aborted) + if (signal?.aborted && !stopOnAbortSignal) throw signal.reason; if (this.disposed) @@ -334,6 +352,7 @@ export class LlamaCompletion { onTextChunk: safeEventCallback(onTextChunk), onToken: safeEventCallback(onToken), signal, + stopOnAbortSignal, maxTokens: resolvedMaxTokens, temperature, minP, @@ -390,6 +409,7 @@ export class LlamaCompletion { onTextChunk, onToken, signal, + stopOnAbortSignal = false, maxTokens, temperature, minP, @@ -496,7 +516,7 @@ export class LlamaCompletion { } const ensureNotAborted = () => { - if (signal?.aborted) + if (signal?.aborted && !stopOnAbortSignal) throw signal.reason; if (this.disposed) @@ -533,6 +553,7 @@ export class LlamaCompletion { onTextChunk: safeEventCallback(onTextChunk), onToken: safeEventCallback(onToken), signal, + stopOnAbortSignal, maxTokens: resolvedMaxTokens, temperature, minP, @@ -571,6 +592,7 @@ export class LlamaCompletion { onTextChunk, onToken, signal, + stopOnAbortSignal = false, maxTokens, temperature, minP, @@ -638,7 +660,7 @@ export class LlamaCompletion { .map((stopTrigger) => customStopGenerationTriggersDetector.addStopTrigger(stopTrigger)); const ensureNotAborted = () => { - if (signal?.aborted) + if (signal?.aborted && !stopOnAbortSignal) throw signal.reason; if (this.disposed) @@ -805,7 +827,10 @@ export class LlamaCompletion { } } - if (maxTokens != null && maxTokens > 0 && generatedTokens >= maxTokens) { + const aborted = (signal?.aborted ?? false) && stopOnAbortSignal; + const maxTokensReached = maxTokens != null && maxTokens > 0 && generatedTokens >= maxTokens; + + if (aborted || maxTokensReached) { let modelResponse = model.detokenize(res); if (grammar?.trimWhitespaceSuffix || trimWhitespaceSuffix) @@ -814,7 +839,9 @@ export class LlamaCompletion { return { response: modelResponse, metadata: { - stopReason: "maxTokens" + stopReason: aborted + ? "abort" + : "maxTokens" } }; } diff --git a/src/evaluator/LlamaModel/LlamaModel.ts b/src/evaluator/LlamaModel/LlamaModel.ts index 9d7fa343..2da930e6 100644 --- a/src/evaluator/LlamaModel/LlamaModel.ts +++ b/src/evaluator/LlamaModel/LlamaModel.ts @@ -76,6 +76,22 @@ export type LlamaModelOptions = { */ useMmap?: boolean, + /** + * Direct I/O is a method of reading and writing data directly between the storage device and the application memory, + * bypassing OS in-memory caches. + * + * It leads to improved model loading times and reduced RAM usage, + * at the expense of longer loading times when the model is unloaded and loaded again repeatedly in a short period of time. + * + * When this option is enabled, if Direct I/O is supported by the system (and for the given file), + * it will be used and mmap will be disabled. + * + * Unsupported on macOS. + * + * Defaults to `true`. + */ + useDirectIo?: boolean, + /** * Force the system to keep the model in the RAM/VRAM.
* Use with caution as this can crash your system if the available resources are insufficient. @@ -150,6 +166,7 @@ export type LlamaModelOptions = { }; const defaultUseMmap = true; +const defaultUseDirectIo = true; const defaultContextFlashAttentionEnabled = false; const defaultContextSwaFullCache = false; @@ -181,7 +198,7 @@ export class LlamaModel { public readonly onDispose = new EventRelay(); private constructor({ - modelPath, gpuLayers, vocabOnly = false, useMmap, useMlock, checkTensors, onLoadProgress, loadSignal, metadataOverrides + modelPath, gpuLayers, vocabOnly = false, useMmap, useDirectIo, useMlock, checkTensors, onLoadProgress, loadSignal, metadataOverrides }: LlamaModelOptions & { gpuLayers: number }, { @@ -219,6 +236,7 @@ export class LlamaModel { gpuLayers, vocabOnly: this._vocabOnly, useMmap, + useDirectIo, useMlock: _llama.supportsMlock ? useMlock : undefined, @@ -709,6 +727,7 @@ export class LlamaModel { }) { const {loadSignal, defaultContextFlashAttention} = modelOptions; const useMmap = _llama.supportsMmap && (modelOptions.useMmap ?? defaultUseMmap); + const useDirectIo = modelOptions.useDirectIo ?? defaultUseDirectIo; const fileInfo = await readGgufFileInfo(modelOptions.modelPath, { sourceType: "filesystem", @@ -732,7 +751,7 @@ export class LlamaModel { useMmap }); - const model = new LlamaModel({...modelOptions, gpuLayers, useMmap}, { + const model = new LlamaModel({...modelOptions, gpuLayers, useMmap, useDirectIo}, { _fileInfo: fileInfo, _fileInsights: ggufInsights, _llama, diff --git a/src/gguf/types/GgufMetadataTypes.ts b/src/gguf/types/GgufMetadataTypes.ts index 257f7080..b4d984d5 100644 --- a/src/gguf/types/GgufMetadataTypes.ts +++ b/src/gguf/types/GgufMetadataTypes.ts @@ -12,6 +12,7 @@ export const enum GgufArchitectureType { starcoder = "starcoder", refact = "refact", bert = "bert", + modernBert = "modern-bert", nomicBert = "nomic-bert", nomicBertMoe = "nomic-bert-moe", neoBert = "neo-bert", @@ -33,6 +34,7 @@ export const enum GgufArchitectureType { phimoe = "phimoe", plamo = "plamo", plamo2 = "plamo2", + plamo3 = "plamo3", codeshell = "codeshell", orion = "orion", internlm2 = "internlm2", @@ -68,6 +70,7 @@ export const enum GgufArchitectureType { jais = "jais", nemotron = "nemotron", nemotronH = "nemotron_h", + nemotronHMoe = "nemotron_h_moe", exaone = "exaone", exaone4 = "exaone4", rwkv6 = "rwkv6", @@ -105,6 +108,9 @@ export const enum GgufArchitectureType { rnd1 = "rnd1", panguEmbedded = "pangu-embedded", mistral3 = "mistral3", + mimo2 = "mimo2", + llamaEmbed = "llama-embed", + maincoder = "maincoder", clip = "clip", unknown = "(unknown)" } diff --git a/templates/electron-typescript-react/.editorconfig b/templates/electron-typescript-react/.editorconfig index 1c7d0091..4cf71102 100644 --- a/templates/electron-typescript-react/.editorconfig +++ b/templates/electron-typescript-react/.editorconfig @@ -7,7 +7,7 @@ indent_size = 4 [{*.ts,*.tsx,*.js,*.jsx,*.css,*.scss}] insert_final_newline = true -[{package.json,package-lock.json,manifest.json,electron-builder.json5}] +[{package.json,package-lock.json,manifest.json}] indent_size = 2 [*.yml] diff --git a/test/modelDependent/llama3.2/sequenceState.test.ts b/test/modelDependent/llama3.2/sequenceState.test.ts index 72e6a94f..79addacc 100644 --- a/test/modelDependent/llama3.2/sequenceState.test.ts +++ b/test/modelDependent/llama3.2/sequenceState.test.ts @@ -108,7 +108,7 @@ describe("llama 3.2", () => { const res1_2 = await chatSession1_2.prompt("What's the exact thing I told you to remember?", {maxTokens: 12}); const 
contextSequence1TokensState4 = contextSequence1.tokenMeter.getState(); - expect(res1_2).to.toMatchInlineSnapshot('"You told me to "Remember: locks are not doors"."'); + expect(res1_2).toMatch(/^(You told me to "Remember: locks are not doors".|You told me to "Remember: locks are not doors.")/); const contextSequence1TokensState4Diff = TokenMeter.diff(contextSequence1TokensState4, contextSequence1TokensState3); expect(contextSequence1TokensState4Diff.usedInputTokens).to.be.lessThan(contextSequence1TokensState1.usedInputTokens); expect(contextSequence1TokensState4Diff).toMatchInlineSnapshot(` @@ -202,7 +202,7 @@ describe("llama 3.2", () => { }); const res1 = await chatSession1.prompt("Remember: locks are not doors. Also, write a long poem about it", {maxTokens: 154}); - expect(res1).toMatch(/^(A clever reminder indeed.|A wise phrase to ponder)/); + expect(res1).toMatch(/^(A clever reminder indeed.|A wise phrase to ponder|A wise phrase indeed)/); const stateFile1Path = await getTempTestFilePath("state1");
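The new `useDirectIo` model option and the matching `--noDirectIo` CLI flag introduced in this patch can be exercised from user code as shown below. This is a minimal sketch based on the `LlamaModelOptions` change above; the model filename is reused from the text-completion docs example and is only illustrative.

```typescript
import {fileURLToPath} from "url";
import path from "path";
import {getLlama} from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));

const llama = await getLlama();

// `useDirectIo` defaults to `true`; Direct I/O is unsupported on macOS.
// Passing `false` opts out, so the existing mmap-controlled loading behavior applies instead,
// which can be preferable when the same model is unloaded and reloaded repeatedly
// in a short period of time and the OS page cache is beneficial.
const model = await llama.loadModel({
    modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf"),
    useDirectIo: false
});

console.log("Model loaded with Direct I/O disabled");
```

On the CLI side, the same behavior is controlled with the new `--noDirectIo` flag added to the `chat`, `complete`, `infill`, and `inspect measure` commands.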