4 changes: 2 additions & 2 deletions docs/blog/v3.12-gpt-oss.md
@@ -24,7 +24,7 @@ image:

Here are a few highlights of these models:
* Due to the low number of active parameters, these models are very fast
* These are reasoning models, and you can adjust their reasoning efforts
* These are reasoning models, and you can adjust their reasoning effort
* They are very good at function calling, and are built with agentic capabilities in mind
* These models were trained with native MXFP4 precision, so no need to quantize them further.
They're small compared to their capabilities already
@@ -74,7 +74,7 @@ but offers better precision and thus better quality.
To quickly try out [`gpt-oss-20b`](https://huggingface.co/giladgd/gpt-oss-20b-GGUF), you can use the [CLI `chat` command](../cli/chat.md):

```shell
npx -y node-llama-cpp chat --ef --prompt "Hi there" hf:giladgd/gpt-oss-20b-GGUF/gpt-oss-20b.MXFP4.gguf
npx -y node-llama-cpp chat --prompt "Hi there" hf:giladgd/gpt-oss-20b-GGUF/gpt-oss-20b.MXFP4.gguf
```
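
For reference, below is a minimal programmatic equivalent of the CLI command above. This is a sketch that assumes the standard `getLlama`/`LlamaChatSession` API and that `resolveModelFile` accepts the same `hf:` URI the CLI uses:

```typescript
import {getLlama, LlamaChatSession, resolveModelFile} from "node-llama-cpp";

// Resolve (and download if needed) the same model the CLI command uses
const modelPath = await resolveModelFile("hf:giladgd/gpt-oss-20b-GGUF/gpt-oss-20b.MXFP4.gguf");

const llama = await getLlama();
const model = await llama.loadModel({modelPath});
const context = await model.createContext();
const session = new LlamaChatSession({contextSequence: context.getSequence()});

console.log(await session.prompt("Hi there"));
```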


21 changes: 12 additions & 9 deletions docs/guide/CUDA.md
@@ -9,14 +9,14 @@ description: CUDA support in node-llama-cpp
and these are automatically used when CUDA is detected on your machine.

To use `node-llama-cpp`'s CUDA support with your NVIDIA GPU,
make sure you have [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) 12.2 or higher installed on your machine.
make sure you have [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) 12.4 or higher installed on your machine.

If the pre-built binaries don't work with your CUDA installation,
`node-llama-cpp` will automatically download a release of `llama.cpp` and build it from source with CUDA support.
Building from source with CUDA support is slow and can take up to an hour.

The pre-built binaries are compiled with CUDA Toolkit 12.2,
so any version of CUDA Toolkit that is 12.2 or higher should work with the pre-built binaries.
The pre-built binaries are compiled with CUDA Toolkit 12.4,
so any version of CUDA Toolkit that is 12.4 or higher should work with the pre-built binaries.
If you have an older version of CUDA Toolkit installed on your machine,
consider updating it to avoid having to wait the long build time.

@@ -42,7 +42,7 @@ You should see an output like this:
If you see `CUDA used VRAM` in the output, it means that CUDA support is working on your machine.
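
The same check can be done programmatically. The sketch below assumes `getLlama` accepts a `gpu: "cuda"` option and exposes a `getVramState()` method; verify the exact names against the current API:

```typescript
import {getLlama} from "node-llama-cpp";

// Request the CUDA backend explicitly instead of relying on auto-detection;
// this should fail if the CUDA binaries can't be loaded
const llama = await getLlama({gpu: "cuda"});

console.log("GPU type:", llama.gpu); // expected to be "cuda"

const vramState = await llama.getVramState();
console.log(`VRAM used: ${vramState.used} of ${vramState.total} bytes`);
```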

## Prerequisites
* [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) 12.2 or higher
* [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) 12.4 or higher
* [`cmake-js` dependencies](https://github.com/cmake-js/cmake-js#:~:text=projectRoot/build%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%5Bstring%5D-,Requirements%3A,-CMake)
* [CMake](https://cmake.org/download/) 3.26 or higher (optional, recommended if you have build issues)

@@ -79,20 +79,23 @@ const cudaCmakeOptionsTable = data.cudaCmakeOptionsTable;
To build `node-llama-cpp` with any of these options, set an environment variable of an option prefixed with `NODE_LLAMA_CPP_CMAKE_OPTION_`.

### Fix the `Failed to detect a default CUDA architecture` Build Error
To fix this issue you have to set the `CUDACXX` environment variable to the path of the `nvcc` compiler.
To fix this issue you have to set the `CUDACXX` environment variable to the path of the `nvcc` compiler,
and the `CUDA_PATH` environment variable to the path of the CUDA home directory that contains the `nvcc` compiler.

For example, if you have installed CUDA Toolkit 12.2, you have to run a command like this:
For example, if you have installed CUDA Toolkit 12.4, you have to run a command like this:
::: code-group
```shell [Linux]
export CUDACXX=/usr/local/cuda-12.2/bin/nvcc
export CUDACXX=/usr/local/cuda-12.4/bin/nvcc
export CUDA_PATH=/usr/local/cuda-12.4
```

```cmd [Windows]
set CUDACXX=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin\nvcc.exe
set CUDACXX=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\nvcc.exe
set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4
```
:::

Then run the build command again to check whether setting the `CUDACXX` environment variable fixed the issue.
Then run the build command again to check whether setting the `CUDACXX` and `CUDA_PATH` environment variables fixed the issue.
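
If you'd rather not set these variables in every shell, a small Node helper script can set them for the build process only. This is a hypothetical sketch that assumes CUDA Toolkit 12.4 is installed in its default location and that the `source download --gpu cuda` CLI command is used for the build:

```typescript
// buildWithCuda.ts - hypothetical helper; adjust the paths to match your CUDA installation
import {execSync} from "node:child_process";

const cudaHome = process.platform === "win32"
    ? "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.4"
    : "/usr/local/cuda-12.4";
const nvccPath = process.platform === "win32"
    ? `${cudaHome}\\bin\\nvcc.exe`
    : `${cudaHome}/bin/nvcc`;

// Rebuild the bindings with CUDA, exposing the compiler paths to this build only
execSync("npx --no node-llama-cpp source download --gpu cuda", {
    stdio: "inherit",
    env: {...process.env, CUDACXX: nvccPath, CUDA_PATH: cudaHome}
});
```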

### Fix the `The CUDA compiler identification is unknown` Build Error
The solution to this error is the same as [the solution to the `Failed to detect a default CUDA architecture` error](#fix-the-failed-to-detect-a-default-cuda-architecture-build-error).
5 changes: 5 additions & 0 deletions llama/CMakeLists.txt
@@ -4,6 +4,8 @@ if (NLC_CURRENT_PLATFORM STREQUAL "win-x64" OR NLC_CURRENT_PLATFORM STREQUAL "wi
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif()

include("./cmake/addVariantSuffix.cmake")

if (NLC_CURRENT_PLATFORM STREQUAL "win-x64")
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebugDLL" CACHE STRING "" FORCE)
@@ -109,6 +111,9 @@ list(REMOVE_DUPLICATES GPU_INFO_HEADERS)
list(REMOVE_DUPLICATES GPU_INFO_SOURCES)
list(REMOVE_DUPLICATES GPU_INFO_EXTRA_LIBS)

addVariantSuffix(llama ${NLC_VARIANT})
addVariantSuffix(ggml ${NLC_VARIANT})

file(GLOB SOURCE_FILES "addon/*.cpp" "addon/**/*.cpp" ${GPU_INFO_SOURCES})

add_library(${PROJECT_NAME} SHARED ${SOURCE_FILES} ${CMAKE_JS_SRC} ${GPU_INFO_HEADERS})
12 changes: 8 additions & 4 deletions llama/addon/globals/getGpuInfo.cpp
@@ -54,9 +54,13 @@ Napi::Value getGpuVramInfo(const Napi::CallbackInfo& info) {
// this means that we counted memory from devices that aren't used by llama.cpp
vulkanDeviceUnifiedVramSize = 0;
}

unifiedVramSize += vulkanDeviceUnifiedVramSize;
}

if (used == 0 && vulkanDeviceUsed != 0) {
used = vulkanDeviceUsed;
}
#endif

Napi::Object result = Napi::Object::New(info.Env());
@@ -93,7 +97,7 @@ std::pair<ggml_backend_dev_t, std::string> getGpuDevice() {
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
ggml_backend_dev_t device = ggml_backend_dev_get(i);
const auto deviceName = std::string(ggml_backend_dev_name(device));

if (deviceName == "Metal") {
return std::pair<ggml_backend_dev_t, std::string>(device, "metal");
} else if (std::string(deviceName).find("Vulkan") == 0) {
@@ -106,7 +110,7 @@ std::pair<ggml_backend_dev_t, std::string> getGpuDevice() {
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
ggml_backend_dev_t device = ggml_backend_dev_get(i);
const auto deviceName = std::string(ggml_backend_dev_name(device));

if (deviceName == "CPU") {
return std::pair<ggml_backend_dev_t, std::string>(device, "cpu");
}
@@ -119,7 +123,7 @@ Napi::Value getGpuType(const Napi::CallbackInfo& info) {
const auto gpuDeviceRes = getGpuDevice();
const auto device = gpuDeviceRes.first;
const auto deviceType = gpuDeviceRes.second;

if (deviceType == "cpu") {
return Napi::Boolean::New(info.Env(), false);
} else if (device != nullptr && deviceType != "") {
21 changes: 21 additions & 0 deletions llama/cmake/addVariantSuffix.cmake
@@ -0,0 +1,21 @@
function(addVariantSuffix originalTarget variantSuffix)
if (NOT TARGET ${originalTarget} OR variantSuffix STREQUAL "")
return()
endif()

set(_name "${originalTarget}.${variantSuffix}")

set_target_properties(${originalTarget} PROPERTIES
OUTPUT_NAME "${_name}"
RUNTIME_OUTPUT_NAME "${_name}" # Windows .dll
LIBRARY_OUTPUT_NAME "${_name}" # Unix shared lib
ARCHIVE_OUTPUT_NAME "${_name}" # static / import lib
)

if (APPLE)
set_target_properties(${originalTarget} PROPERTIES
MACOSX_RPATH ON
INSTALL_NAME_DIR "@rpath"
)
endif()
endfunction()