Commit b10999d
fix: CUDA 13 support (#494)
fix: prebuilt binaries CUDA 13 support
1 parent 12749c0 commit b10999d

9 files changed: +188 additions, -33 deletions

.github/workflows/build.yml

Lines changed: 93 additions & 18 deletions
@@ -56,15 +56,18 @@ jobs:
       fail-fast: false
       matrix:
         config:
-          - name: "Windows for x64"
+          - name: "Windows (1)"
             os: windows-2022
-            artifact: "win-x64"
-          - name: "Windows for Arm"
+            artifact: "win-1"
+          - name: "Windows (2)"
             os: windows-2022
-            artifact: "win-arm"
-          - name: "Ubuntu"
+            artifact: "win-2"
+          - name: "Ubuntu (1)"
+            os: ubuntu-22.04
+            artifact: "linux-1"
+          - name: "Ubuntu (2)"
             os: ubuntu-22.04
-            artifact: "linux"
+            artifact: "linux-2"
           - name: "macOS x64"
             os: macos-13
             artifact: "mac-x64"
@@ -97,8 +100,8 @@ jobs:
           choco install cmake --version=3.31.1
           choco install ninja
 
-      - name: Install dependencies on Ubuntu
-        if: matrix.config.name == 'Ubuntu'
+      - name: Install dependencies on Ubuntu (1)
+        if: matrix.config.name == 'Ubuntu (1)'
         run: |
           sudo apt-get update
           sudo apt-get install ninja-build cmake libtbb-dev g++-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-arm-linux-gnueabihf gcc-arm-linux-gnueabihf
@@ -108,25 +111,66 @@ jobs:
 
           which arm-linux-gnueabihf-gcc
           which arm-linux-gnueabihf-g++
+
+          cmake --version
+
+      - name: Install dependencies on Ubuntu (2)
+        if: matrix.config.name == 'Ubuntu (2)'
+        run: |
+          sudo apt-get update
+          sudo apt-get install ninja-build libtbb-dev
+
+          wget -c https://github.com/Kitware/CMake/releases/download/v3.31.7/cmake-3.31.7-linux-x86_64.tar.gz
+          sudo tar --strip-components=1 -C /usr/local -xzf cmake-3.31.7-linux-x86_64.tar.gz
+          rm -f ./cmake-3.31.7-linux-x86_64.tar.gz
+
+          cmake --version
 
-      - name: Install Cuda on Windows for x64
-        if: matrix.config.name == 'Windows for x64'
+      - name: Install Cuda 12.4 on Windows (1)
+        if: matrix.config.name == 'Windows (1)'
         uses: Jimver/[email protected]
         with:
           cuda: '12.4.0'
           method: 'network'
           sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
           use-local-cache: false
 
-      - name: Install Cuda on Ubuntu
-        if: matrix.config.name == 'Ubuntu'
+      - name: Install Cuda 13.0 on Windows (2)
+        if: matrix.config.name == 'Windows (2)'
+        shell: bash
+        timeout-minutes: 30
+        run: |
+          curl -Lo cuda_13.0.0_windows_network.exe https://developer.download.nvidia.com/compute/cuda/13.0.0/network_installers/cuda_13.0.0_windows_network.exe
+
+          echo "Installing Cuda 13.0.0"
+          powershell -Command "Start-Process -FilePath cuda_13.0.0_windows_network.exe -ArgumentList '-s','-n' -Wait"
+          echo "Cuda installation finished"
+
+          rm -f ./cuda_13.0.0_windows_network.exe
+
+          echo "where cudart64_13.dll: $(where cudart64_13.dll)"
+
+          echo "CUDA_PATH=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0" >> $GITHUB_ENV
+          echo "CUDA_PATH_V13_0=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0" >> $GITHUB_ENV
+          echo "CUDA_PATH_VX_Y=CUDA_PATH_V13_0" >> $GITHUB_ENV
+          echo "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0\\bin" >> $GITHUB_PATH
+
+      - name: Install Cuda 12.4 on Ubuntu
+        if: matrix.config.name == 'Ubuntu (1)'
         uses: Jimver/[email protected]
         with:
           cuda: '12.4.0'
           method: 'network'
 
-      - name: Install Vulkan SDK on Windows for x64
-        if: matrix.config.name == 'Windows for x64'
+      - name: Install Cuda 13.0 on Ubuntu
+        if: matrix.config.name == 'Ubuntu (2)'
+        uses: Jimver/[email protected]
+        with:
+          cuda: '13.0.0'
+          method: 'network'
+
+      - name: Install Vulkan SDK on Windows (1)
+        if: matrix.config.name == 'Windows (1)'
         shell: powershell
         env:
           VULKAN_VERSION: 1.3.261.1
@@ -137,7 +181,7 @@ jobs:
           Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
 
       - name: Install Vulkan SDK on Ubuntu
-        if: matrix.config.name == 'Ubuntu'
+        if: matrix.config.name == 'Ubuntu (1)'
         run: |
           wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
           sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
@@ -205,18 +249,21 @@ jobs:
           }
 
           // build binaries
-          if (process.env.ARTIFACT_NAME === "win-x64") {
+          if (process.env.ARTIFACT_NAME === "win-1") {
               await buildBinary("x64", ["--gpu", "false"]);
               await buildBinary("x64", ["--gpu", "cuda"]);
               await buildBinary("x64", ["--gpu", "vulkan"]);
-          } else if (process.env.ARTIFACT_NAME === "win-arm") {
+          } else if (process.env.ARTIFACT_NAME === "win-2") {
               await buildBinary("arm64", ["--gpu", "false"], windowsOnArmNodeVersion);
-          } else if (process.env.ARTIFACT_NAME === "linux") {
+              await buildBinary("x64", ["--gpu", "cuda"]);
+          } else if (process.env.ARTIFACT_NAME === "linux-1") {
               await buildBinary("x64", ["--gpu", "false"]);
               await buildBinary("x64", ["--gpu", "cuda"]);
               await buildBinary("x64", ["--gpu", "vulkan"]);
               await buildBinary("arm64", ["--gpu", "false"]);
               await buildBinary("armv7l", ["--gpu", "false"]);
+          } else if (process.env.ARTIFACT_NAME === "linux-2") {
+              await buildBinary("x64", ["--gpu", "cuda"]);
           } else if (process.env.ARTIFACT_NAME === "mac-x64") {
               await buildBinary("x64", ["--gpu", "false"]);
           } else if (process.env.ARTIFACT_NAME === "mac-arm64") {
@@ -233,6 +280,26 @@ jobs:
               );
           }
 
+          if (process.env.ARTIFACT_NAME === "win-2") {
+              await fs.move(
+                  path.join(llamaBinsDirectoryPath, "win-x64-cuda"),
+                  path.join(llamaBinsDirectoryPath, "win-x64-cuda-2")
+              );
+
+              if (!(await fs.pathExists(path.join(llamaBinsDirectoryPath, "win-x64-cuda-2", "ggml-cuda.dll")))) {
+                  throw new Error("ggml-cuda.dll not found in win-x64-cuda-2");
+              }
+          } else if (process.env.ARTIFACT_NAME === "linux-2") {
+              await fs.move(
+                  path.join(llamaBinsDirectoryPath, "linux-x64-cuda"),
+                  path.join(llamaBinsDirectoryPath, "linux-x64-cuda-2")
+              );
+
+              if (!(await fs.pathExists(path.join(llamaBinsDirectoryPath, "linux-x64-cuda-2", "libggml-cuda.so")))) {
+                  throw new Error("libggml-cuda.so not found in linux-x64-cuda-2");
+              }
+          }
+
           await $`echo "Built binaries:"`;
           await $`ls bins`;
 
@@ -494,6 +561,14 @@ jobs:
           mv artifacts/bins-*/* bins/
           mv artifacts/build dist/
 
+          mkdir -p ./bins/win-x64-cuda/fallback
+          mv ./bins/win-x64-cuda-2/ggml-cuda.dll ./bins/win-x64-cuda/fallback/ggml-cuda.dll
+          rm -rf ./bins/win-x64-cuda-2
+
+          mkdir -p ./bins/linux-x64-cuda/fallback
+          mv ./bins/linux-x64-cuda-2/libggml-cuda.so ./bins/linux-x64-cuda/fallback/libggml-cuda.so
+          rm -rf ./bins/linux-x64-cuda-2
+
           cp -r artifacts/llama.cpp/llama.cpp/grammars llama/grammars
 
           rm -f ./llama/binariesGithubRelease.json
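
The release job above merges the two CUDA builds: the CUDA 13 backend produced by the secondary jobs is moved into a `fallback` folder inside the regular CUDA binary folder. Below is a minimal Node.js sketch of that rearrangement using `fs-extra`; the paths and folder names mirror the shell steps above, and the function name is illustrative.

```ts
import path from "path";
import fs from "fs-extra";

// Sketch only: move the CUDA 13 backend from the "-2" artifact folder
// into a "fallback" subfolder of the CUDA 12.4 binary folder.
async function mergeCudaFallback(binsDirectoryPath: string, platform: "win" | "linux") {
    const mainDir = path.join(binsDirectoryPath, `${platform}-x64-cuda`);
    const secondaryDir = path.join(binsDirectoryPath, `${platform}-x64-cuda-2`);
    const backendFile = platform === "win" ? "ggml-cuda.dll" : "libggml-cuda.so";

    await fs.ensureDir(path.join(mainDir, "fallback"));
    await fs.move(
        path.join(secondaryDir, backendFile),
        path.join(mainDir, "fallback", backendFile)
    );
    await fs.remove(secondaryDir);
}

await mergeCudaFallback("./bins", "win");
await mergeCudaFallback("./bins", "linux");
```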

docs/guide/CUDA.md

Lines changed: 22 additions & 3 deletions
@@ -43,6 +43,7 @@ If you see `CUDA used VRAM` in the output, it means that CUDA support is working
 
 ## Prerequisites
 * [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) 12.4 or higher
+* [NVIDIA Drivers](https://www.nvidia.com/en-us/drivers/)
 * [`cmake-js` dependencies](https://github.com/cmake-js/cmake-js#:~:text=projectRoot/build%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%5Bstring%5D-,Requirements%3A,-CMake)
 * [CMake](https://cmake.org/download/) 3.26 or higher (optional, recommended if you have build issues)
 

@@ -89,10 +90,15 @@ export CUDACXX=/usr/local/cuda-12.4/bin/nvcc
 export CUDA_PATH=/usr/local/cuda-12.4
 ```
 
-```cmd [Windows]
+```cmd [Windows (cmd)]
 set CUDACXX=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\nvcc.exe
 set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4
 ```
+
+```cmd [Windows (PowerShell)]
+$env:CUDACXX="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\nvcc.exe"
+$env:CUDA_PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
+```
 :::
 
 Then run the build command again to check whether setting the `CUDACXX` and `CUDA_PATH` environment variables fixed the issue.
@@ -110,9 +116,13 @@ To do this, set the `NODE_LLAMA_CPP_CMAKE_OPTION_CMAKE_GENERATOR_TOOLSET` enviro
 export NODE_LLAMA_CPP_CMAKE_OPTION_CMAKE_GENERATOR_TOOLSET=$CUDA_PATH
 ```
 
-```cmd [Windows]
+```cmd [Windows (cmd)]
 set NODE_LLAMA_CPP_CMAKE_OPTION_CMAKE_GENERATOR_TOOLSET=%CUDA_PATH%
 ```
+
+```cmd [Windows (PowerShell)]
+$env:NODE_LLAMA_CPP_CMAKE_OPTION_CMAKE_GENERATOR_TOOLSET=$env:CUDA_PATH
+```
 :::
 
 Then run the build command again to check whether setting the `CMAKE_GENERATOR_TOOLSET` cmake option fixed the issue.
@@ -136,13 +146,22 @@ Run this command inside of your project:
 ldd ./node_modules/@node-llama-cpp/linux-x64-cuda/bins/linux-x64-cuda/libggml-cuda.so
 ```
 
-```cmd [Windows]
+```cmd [Windows (cmd)]
 "C:\Program Files\Git\usr\bin\ldd.exe" node_modules\@node-llama-cpp\win-x64-cuda\bins\win-x64-cuda\ggml-cuda.dll
 ```
+
+```cmd [Windows (PowerShell)]
+& "C:\Program Files\Git\usr\bin\ldd.exe" node_modules\@node-llama-cpp\win-x64-cuda\bins\win-x64-cuda\ggml-cuda.dll
+```
 :::
 
 ::::
 
+### Fix the `ggml_cuda_init: failed to initialize CUDA: (null)` Error {#fix-failed-to-initialize-cuda-null}
+This error usually happens when the NVIDIA drivers installed on your machine are incompatible with the version of CUDA you have installed.
+
+To fix it, update your NVIDIA drivers to the latest version from the [NVIDIA Driver Downloads](https://www.nvidia.com/en-us/drivers/) page.
+
 
 ## Using `node-llama-cpp` With CUDA
 It's recommended to use [`getLlama`](../api/functions/getLlama) without specifying a GPU type,
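
For reference, the troubleshooting flow this guide describes ends with loading the library through `getLlama`. A minimal usage sketch is shown below; it uses node-llama-cpp's documented `getLlama` API, and the explicit `gpu: "cuda"` option is only needed when you want to force the CUDA backend rather than let it auto-detect.

```ts
import {getLlama} from "node-llama-cpp";

// Force the CUDA backend; omit the option to let node-llama-cpp pick the best available GPU.
const llama = await getLlama({gpu: "cuda"});
console.log("GPU type:", llama.gpu); // expected: "cuda" when the CUDA backend loaded successfully
```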

llama/CMakeLists.txt

Lines changed: 15 additions & 0 deletions
@@ -76,6 +76,21 @@ execute_process(COMMAND node -p "require('node-addon-api').include.slice(1,-1)"
                 OUTPUT_STRIP_TRAILING_WHITESPACE)
 include_directories(${NODE_ADDON_API_DIR} ${CMAKE_JS_INC})
 
+if (DEFINED GGML_NATIVE)
+    set(NLC_GGML_NATIVE ${GGML_NATIVE})
+elseif(CMAKE_CROSSCOMPILING OR DEFINED ENV{SOURCE_DATE_EPOCH})
+    set(NLC_GGML_NATIVE OFF)
+else()
+    set(NLC_GGML_NATIVE ON)
+endif()
+
+if (GGML_CUDA AND NOT DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT NLC_GGML_NATIVE)
+    find_package(CUDAToolkit)
+    if (CUDAToolkit_FOUND AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "13.0")
+        set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real;90-real")
+    endif()
+    endif()
+
 add_subdirectory("llama.cpp")
 include_directories("llama.cpp")
 include_directories("./llama.cpp/common")
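
The new CMake block only pins `CMAKE_CUDA_ARCHITECTURES` for non-native CUDA 13+ builds (the prebuilt-binary case) and never overrides an explicitly provided value. Below is a rough TypeScript sketch of that selection logic, with the version comparison simplified to the major version; the function and parameter names are illustrative and not part of the build scripts.

```ts
// Sketch of the architecture-selection logic added to llama/CMakeLists.txt,
// expressed as a plain function. The "75-virtual;..." list is the one from the diff.
function pickCudaArchitectures(options: {
    cudaToolkitMajorVersion: number,
    nativeBuild: boolean,              // corresponds to GGML_NATIVE / building for the local machine
    explicitArchitectures?: string     // a user-provided CMAKE_CUDA_ARCHITECTURES value
}): string | undefined {
    if (options.explicitArchitectures != null)
        return options.explicitArchitectures; // never override an explicit choice

    if (options.nativeBuild)
        return undefined; // let the toolchain detect the local GPU

    if (options.cudaToolkitMajorVersion >= 13)
        return "75-virtual;80-virtual;86-real;89-real;90-real";

    return undefined; // CUDA 12.x keeps llama.cpp's own defaults
}

console.log(pickCudaArchitectures({cudaToolkitMajorVersion: 13, nativeBuild: false}));
```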

src/bindings/Llama.ts

Lines changed: 19 additions & 4 deletions
@@ -11,7 +11,10 @@ import {LlamaGrammar, LlamaGrammarOptions} from "../evaluator/LlamaGrammar.js";
 import {ThreadsSplitter} from "../utils/ThreadsSplitter.js";
 import {getLlamaClasses, LlamaClasses} from "../utils/getLlamaClasses.js";
 import {BindingModule} from "./AddonTypes.js";
-import {BuildGpu, BuildMetadataFile, LlamaGpuType, LlamaLocks, LlamaLogLevel, LlamaLogLevelGreaterThanOrEqual, LlamaNuma} from "./types.js";
+import {
+    BuildGpu, BuildMetadataFile, LlamaGpuType, LlamaLocks, LlamaLogLevel,
+    LlamaLogLevelGreaterThan, LlamaLogLevelGreaterThanOrEqual, LlamaNuma
+} from "./types.js";
 import {MemoryOrchestrator, MemoryReservation} from "./utils/MemoryOrchestrator.js";
 
 export const LlamaLogLevelToAddonLogLevel: ReadonlyMap<LlamaLogLevel, number> = new Map([
@@ -41,6 +44,7 @@ export class Llama {
     /** @internal */ public readonly _swapOrchestrator: MemoryOrchestrator;
     /** @internal */ public readonly _debug: boolean;
     /** @internal */ public readonly _threadsSplitter: ThreadsSplitter;
+    /** @internal */ public _hadErrorLogs: boolean = false;
     /** @internal */ private readonly _gpu: LlamaGpuType;
     /** @internal */ private readonly _numa: LlamaNuma;
     /** @internal */ private readonly _buildType: "localBuild" | "prebuilt";
@@ -107,9 +111,17 @@ export class Llama {
         }
 
         bindings.loadBackends();
-        const loadedGpu = bindings.getGpuType();
-        if (loadedGpu == null || (loadedGpu === false && buildGpu !== false))
-            bindings.loadBackends(path.dirname(bindingPath));
+        let loadedGpu = bindings.getGpuType();
+        if (loadedGpu == null || (loadedGpu === false && buildGpu !== false)) {
+            const backendsPath = path.dirname(bindingPath);
+            const fallbackBackendsDir = path.join(backendsPath, "fallback");
+
+            bindings.loadBackends(backendsPath);
+
+            loadedGpu = bindings.getGpuType();
+            if (loadedGpu == null || (loadedGpu === false && buildGpu !== false))
+                bindings.loadBackends(fallbackBackendsDir);
+        }
 
         bindings.ensureGpuDeviceIsSupported();
 
@@ -462,6 +474,9 @@ export class Llama {
 
         this._previousLog = message;
         this._previousLogLevel = level;
+
+        if (!this._hadErrorLogs && LlamaLogLevelGreaterThan(level, LlamaLogLevel.error))
+            this._hadErrorLogs = true;
     }
 
     /** @internal */
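
The backend-loading change retries in two stages: first from the directory containing the binding binary, then from a `fallback` subdirectory (where the release workflow places the CUDA 13 build of the backend). Below is a condensed sketch of that pattern, with the addon surface reduced to the two calls involved; the real interface is `BindingModule` from `./AddonTypes.js`.

```ts
import path from "path";

// Minimal, illustrative view of the addon surface used by this logic.
type GpuType = "cuda" | "vulkan" | "metal" | false;
interface BackendLoader {
    loadBackends(backendsDir?: string): void,
    getGpuType(): GpuType | undefined
}

function loadBackendsWithFallback(bindings: BackendLoader, bindingPath: string, buildGpu: GpuType) {
    bindings.loadBackends(); // first try the default search path

    let loadedGpu = bindings.getGpuType();
    if (loadedGpu == null || (loadedGpu === false && buildGpu !== false)) {
        const backendsPath = path.dirname(bindingPath);

        // retry next to the binding binary, then from its "fallback" folder
        bindings.loadBackends(backendsPath);

        loadedGpu = bindings.getGpuType();
        if (loadedGpu == null || (loadedGpu === false && buildGpu !== false))
            bindings.loadBackends(path.join(backendsPath, "fallback"));
    }
}
```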

src/bindings/types.ts

Lines changed: 11 additions & 2 deletions
@@ -117,14 +117,23 @@ export const LlamaVocabularyTypeValues = Object.freeze([
 ] as const);
 
 /**
- *Check if a log level is higher than another log level
+ * Check if a log level is higher than another log level
+ * @example
+ * ```ts
+ * LlamaLogLevelGreaterThan(LlamaLogLevel.error, LlamaLogLevel.info); // true
+ * ```
  */
 export function LlamaLogLevelGreaterThan(a: LlamaLogLevel, b: LlamaLogLevel): boolean {
     return LlamaLogLevelValues.indexOf(a) < LlamaLogLevelValues.indexOf(b);
 }
 
 /**
- *Check if a log level is higher than or equal to another log level
+ * Check if a log level is higher than or equal to another log level
+ * @example
+ * ```ts
+ * LlamaLogLevelGreaterThanOrEqual(LlamaLogLevel.error, LlamaLogLevel.info); // true
+ * LlamaLogLevelGreaterThanOrEqual(LlamaLogLevel.error, LlamaLogLevel.error); // true
+ * ```
  */
 export function LlamaLogLevelGreaterThanOrEqual(a: LlamaLogLevel, b: LlamaLogLevel): boolean {
     return LlamaLogLevelValues.indexOf(a) <= LlamaLogLevelValues.indexOf(b);
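
Both helpers rely on `LlamaLogLevelValues` being ordered from most to least severe, so a smaller index means a "greater" (more severe) level. Below is a standalone sketch of that comparison with an illustrative ordering; the real enum and value list live in this file.

```ts
// Illustrative ordering only; the real values come from the LlamaLogLevel enum.
const logLevelValues = ["disabled", "fatal", "error", "warn", "info", "log", "debug"] as const;
type LogLevel = (typeof logLevelValues)[number];

// "greater" means more severe, i.e. an earlier position in the ordered list
function logLevelGreaterThan(a: LogLevel, b: LogLevel): boolean {
    return logLevelValues.indexOf(a) < logLevelValues.indexOf(b);
}

console.log(logLevelGreaterThan("error", "info")); // true
console.log(logLevelGreaterThan("info", "error")); // false
```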

src/bindings/utils/compileLLamaCpp.ts

Lines changed: 5 additions & 0 deletions
@@ -245,6 +245,7 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
     else if (buildOptions.gpu === "cuda") {
         if (!ignoreWorkarounds.includes("cudaArchitecture") && (platform === "win" || platform === "linux") &&
             err instanceof SpawnError && (
+                err.combinedStd.toLowerCase().includes("CUDA Toolkit not found".toLowerCase()) ||
                 err.combinedStd.toLowerCase().includes("Failed to detect a default CUDA architecture".toLowerCase()) ||
                 err.combinedStd.toLowerCase().includes("CMAKE_CUDA_COMPILER-NOTFOUND".toLowerCase()) || (
                     err.combinedStd.toLowerCase().includes(
@@ -253,6 +254,10 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                     err.combinedStd.toLowerCase().includes(
                         'variable "CUDACXX" or the CMake cache entry CMAKE_CUDA_COMPILER to the full'.toLowerCase()
                     )
+                ) || (
+                    err.combinedStd.toLowerCase().includes("The CUDA compiler".toLowerCase()) &&
+                    err.combinedStd.toLowerCase().includes("is not able to compile a simple test program".toLowerCase()) &&
+                    err.combinedStd.toLowerCase().includes("nvcc fatal".toLowerCase())
                 )
             )) {
             for (const {nvccPath, cudaHomePath} of await getCudaNvccPaths()) {
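
The added conditions extend a case-insensitive substring check over the failed build's combined stdout/stderr, which decides whether to retry the build with an explicitly located nvcc. Below is a simplified sketch of that matching approach; the helper name is illustrative, while the matched strings are the ones from the diff.

```ts
// Illustrative helper: does the captured build output look like one of the known
// CUDA toolchain failures that warrant retrying with an explicit nvcc path?
function looksLikeCudaToolchainFailure(combinedStd: string): boolean {
    const output = combinedStd.toLowerCase();
    const has = (text: string) => output.includes(text.toLowerCase());

    return has("CUDA Toolkit not found") ||
        has("Failed to detect a default CUDA architecture") ||
        has("CMAKE_CUDA_COMPILER-NOTFOUND") || (
            // all three fragments must appear for the "simple test program" case
            has("The CUDA compiler") &&
            has("is not able to compile a simple test program") &&
            has("nvcc fatal")
        );
}

console.log(looksLikeCudaToolchainFailure("-- The CUDA compiler identification is unknown"));
// false: the "simple test program" case also requires "nvcc fatal" in the output
```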

src/bindings/utils/testBindingBinary.ts

Lines changed: 11 additions & 3 deletions
@@ -286,9 +286,17 @@ if (process.env.TEST_BINDING_CP === "true" && (process.parentPort != null || pro
         throw new Error("Binding binary is not loaded");
 
     binding.loadBackends();
-    const loadedGpu = binding.getGpuType();
-    if (loadedGpu == null || (loadedGpu === false && message.gpu !== false))
-        binding.loadBackends(path.dirname(path.resolve(message.bindingBinaryPath)));
+    let loadedGpu = binding.getGpuType();
+    if (loadedGpu == null || (loadedGpu === false && message.gpu !== false)) {
+        const backendsPath = path.dirname(path.resolve(message.bindingBinaryPath));
+        const fallbackBackendsDir = path.join(backendsPath, "fallback");
+
+        binding.loadBackends(backendsPath);
+
+        loadedGpu = binding.getGpuType();
+        if (loadedGpu == null || (loadedGpu === false && message.gpu !== false))
+            binding.loadBackends(fallbackBackendsDir);
+    }
 
     await binding.init();
     binding.getGpuVramInfo();
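
This file applies the same two-stage fallback inside a helper process, so a native backend that crashes while being probed cannot take down the parent process. Below is a generic sketch of that isolation idea using `child_process.fork`; the worker module path and message shape are illustrative and not this file's actual protocol.

```ts
import {fork} from "node:child_process";

// Illustrative only: run a risky native-binding probe in a child process and
// report failure instead of crashing the caller if the child dies.
async function probeBindingInSubprocess(workerModulePath: string, bindingBinaryPath: string): Promise<boolean> {
    return await new Promise<boolean>((resolve) => {
        const child = fork(workerModulePath, [], {
            env: {...process.env, TEST_BINDING_CP: "true"}
        });

        child.once("message", (result: {success: boolean}) => {
            resolve(result.success);
            child.kill();
        });
        child.once("exit", () => resolve(false)); // no result message: treat as a failed probe
        child.send({bindingBinaryPath});
    });
}
```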
