10 changes: 4 additions & 6 deletions .github/ISSUE_TEMPLATE/bug-report.yml
@@ -35,11 +35,10 @@ body:
id: steps
attributes:
label: Steps to reproduce
description: >-
description: |-
Your bug can be investigated much faster if your code can be run without any dependencies other than `node-llama-cpp`.
Issues without reproduction steps or code examples may be closed as not actionable.
Please try to provide a Minimal, Complete, and Verifiable example ([link](http://stackoverflow.com/help/mcve)).
Please include a link to the model file you used if possible.
Please try to provide a Minimal, Complete, and Verifiable example ([link](http://stackoverflow.com/help/mcve)), including a link to the model file you used if possible.
Also, please enable debug logs by using `getLlama({debug: true})` to get more information.
placeholder: >-
Please try to provide a Minimal, Complete, and Verifiable example.
@@ -50,10 +49,9 @@ body:
id: env
attributes:
label: My Environment
description: >-
description: |-
Please include the result of the command `npx --yes node-llama-cpp inspect gpu`.
Please also add any other relevant dependencies to this table at the end.
For example: Electron, Bun, Webpack.
Please also add any other relevant dependencies to this table at the end. For example: Electron, Bun, Webpack.
value: |
| Dependency | Version |
| --- | --- |
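For reference, the debug flag mentioned in the template is passed to `getLlama` — a minimal sketch (the surrounding code is illustrative):

```typescript
import {getLlama} from "node-llama-cpp";

// enable debug logs to get more detailed output when reproducing a bug
const llama = await getLlama({debug: true});
```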
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/documentation-issue.yml
@@ -13,7 +13,7 @@ body:
id: details
attributes:
label: What was unclear or otherwise insufficient?
description: >-
description: |-
If relevant, please be clear about the documentation URL, as well as the location within the page.
Add a link to the relevant documentation you're referring to.
placeholder: >-
4 changes: 4 additions & 0 deletions .github/ISSUE_TEMPLATE/feature-request.yml
@@ -51,8 +51,12 @@ body:
required: false
- label: CUDA support
required: false
- label: Vulkan support
required: false
- label: Grammar
required: false
- label: Function calling
required: false
- type: dropdown
id: pr
attributes:
34 changes: 30 additions & 4 deletions .vitepress/config.ts
@@ -34,7 +34,8 @@ const packageVersion = env.get("DOCS_PACKAGE_VERSION")
.default(packageJson.version)
.asString();

const hostname = "https://node-llama-cpp.withcat.ai/";
const hostname = "https://node-llama-cpp.withcat.ai/"
const buildDate = new Date();

const socialPosterLink = hostname + "social.poster.jpg";
const defaultPageTitle = "node-llama-cpp - node.js bindings for llama.cpp";
@@ -90,7 +91,7 @@ export default defineConfig({
base: urlBase,
sitemap: {
hostname,
transformItems(items) {
async transformItems(items) {
function priorityMatch(a: {url: string}, b: {url: string}, matchers: ((url: string) => boolean)[]): number {
for (const matcher of matchers) {
const aMatch = matcher(a.url);
@@ -105,13 +106,38 @@
return 0;
}

const blogPosts = await createContentLoader("blog/*.md", {
excerpt: true,
render: true
})
.load();
const blogPostMap = new Map<string, typeof blogPosts[number]>();
for (const blogPost of blogPosts) {
let url = blogPost.url;
if (url.startsWith("/"))
url = url.slice("/".length);

blogPostMap.set(url, blogPost);
}

return items
.map((item) => {
if (item.url.startsWith("api/") || item.url.startsWith("cli/")) {
if (item.url === "" || item.url === "blog/") {
item.lastmod = new Date(buildDate);
} else if (item.url.startsWith("api/") || item.url.startsWith("cli/")) {
item = {
...item,
lastmod: undefined
lastmod: new Date(buildDate)
};
} else if (item.lastmod == null && item.url.startsWith("blog/")) {
const postDate = blogPostMap.get(item.url)?.frontmatter.date;
if (postDate != null) {
const parsedDate = new Date(postDate);
if (Number.isFinite(parsedDate.getTime()))
item.lastmod = parsedDate;
}
} else if (item.lastmod == null) {
item.lastmod = new Date(buildDate);
}

return item;
2 changes: 1 addition & 1 deletion README.md
@@ -1,5 +1,5 @@
<div align="center">
<img alt="node-llama-cpp Logo" src="https://raw.githubusercontent.com/withcatai/node-llama-cpp/master/assets/logo.v3.roundEdges.avif" width="360px" />
<a href="https://node-llama-cpp.withcat.ai" target="_blank"><img alt="node-llama-cpp Logo" src="https://raw.githubusercontent.com/withcatai/node-llama-cpp/master/assets/logo.v3.roundEdges.avif" width="360px" /></a>
<h1>node-llama-cpp</h1>
<p>Run AI models locally on your machine</p>
<sub>Pre-built bindings are provided with a fallback to building from source with cmake</sub>
31 changes: 31 additions & 0 deletions docs/guide/chat-session.md
@@ -671,3 +671,34 @@ await new Promise(resolve => setTimeout(resolve, 1500));
const cachedCompletion = completionEngine.complete("Hi there! How");
console.log("Cached completion:", cachedCompletion);
```

## Response Prefix {#response-prefix}
You can force the model's response to start with a specific prefix
to steer the response in a certain direction.

```typescript
import {fileURLToPath} from "url";
import path from "path";
import {getLlama, LlamaChatSession, GeneralChatWrapper} from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));

const llama = await getLlama();
const model = await llama.loadModel({
modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf")
});
const context = await model.createContext();
const session = new LlamaChatSession({
contextSequence: context.getSequence(),
chatWrapper: new GeneralChatWrapper()
});


const q1 = "Hi there, how are you?";
console.log("User: " + q1);

const a1 = await session.prompt(q1, {
responsePrefix: "The weather today is"
});
console.log("AI: " + a1);
```
24 changes: 24 additions & 0 deletions docs/guide/electron.md
@@ -37,3 +37,27 @@ so that `node-llama-cpp` can find them.
Cross packaging from one platform to another is not supported, since binaries for other platforms are not downloaded to your machine when you run `npm install`.

Packaging an `arm64` app on an `x64` machine is supported, but packaging an `x64` app on an `arm64` machine is not.

## Bundling
When bundling your code for Electron using [Electron Vite](https://electron-vite.org) or Webpack,
ensure that `node-llama-cpp` is not bundled, and is instead treated as an external module.

Marking `node-llama-cpp` as an external module will prevent its code from being bundled with your application code,
and instead, it'll be loaded from the `node_modules` directory at runtime (which should be packed into a `.asar` archive).

The file structure of `node-llama-cpp` is crucial for it to function correctly,
so bundling it will break its functionality.
Moreover, since `node-llama-cpp` includes prebuilt binaries (and also local builds from source),
those files must be retained in their original structure for it to work.

Electron has [its own bundling solution called ASAR](https://www.electronjs.org/docs/latest/tutorial/asar-archives) that is designed to work with node modules.
ASAR retains the original file structure of node modules by packing all the files into a single `.asar` archive file that Electron will read from at runtime like it would from the file system.
This method ensures node modules work as intended in Electron applications, even though they are bundled into a single file.

Using ASAR is the recommended way to bundle `node-llama-cpp` in your Electron app.

If you're using the scaffolded Electron app, this is already taken care of.

::: tip NOTE
We recommend using [Electron Vite](https://electron-vite.org) over Webpack for your Electron app due to Vite's speed and Webpack's lack of proper ESM support in the output bundle, which complicates the bundling process.
:::
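To illustrate the guidance above, here is a minimal sketch of marking `node-llama-cpp` as an external module in an [Electron Vite](https://electron-vite.org) config, assuming the default electron-vite project layout (the exact options in your config may differ):

```typescript
// electron.vite.config.ts
import {defineConfig, externalizeDepsPlugin} from "electron-vite";

export default defineConfig({
    main: {
        // keep dependencies, including node-llama-cpp, out of the bundle
        // so they are loaded from node_modules (inside the .asar archive) at runtime
        plugins: [externalizeDepsPlugin()],
        build: {
            rollupOptions: {
                external: ["node-llama-cpp"]
            }
        }
    },
    preload: {
        plugins: [externalizeDepsPlugin()]
    },
    renderer: {}
});
```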
34 changes: 34 additions & 0 deletions docs/guide/tips-and-tricks.md
@@ -85,3 +85,37 @@ npx --no node-llama-cpp source download
```

Now, just use `node-llama-cpp` as you normally would.

## Intel AMX {#intel-amx}
> Intel AMX (Advanced Matrix Extensions) is a dedicated hardware block found on Intel Xeon processors
> that helps optimize and accelerate matrix multiplication operations.
>
> It's available on the 4th Gen and newer Intel Xeon processors.

Intel AMX can improve CPU inference performance [by 2x and even up to 14x](https://github.com/ggerganov/llama.cpp/pull/7707) on supported CPUs (under specific conditions).

If you're using a 4th Gen or newer Intel Xeon processor,
you might want to [build `llama.cpp` from source](./building-from-source.md) to utilize these hardware-specific optimizations.

To do this, run this command inside your project directory on the machine that will run it:
```shell
npx --no node-llama-cpp source download
```

Alternatively, you can force `node-llama-cpp` to not use its prebuilt binaries
and instead build from source when calling [`getLlama`](../api/functions/getLlama.md) for the first time on a Xeon CPU:

```typescript
import os from "os";
import {getLlama} from "node-llama-cpp";

const llama = await getLlama({
usePrebuiltBinaries: !os.cpus().some((cpu) => (
cpu.model.toLowerCase().includes("Xeon".toLowerCase())
))
});
```
::: info NOTE
Building from source can take some time (when using CUDA even up to an hour in extreme cases),
so ensure you dedicate some time for this as part of the deployment process.
:::
6 changes: 6 additions & 0 deletions llama/CMakeLists.txt
@@ -22,6 +22,12 @@ execute_process(COMMAND node -p "require('node-addon-api').include.slice(1,-1)"
OUTPUT_VARIABLE NODE_ADDON_API_DIR
OUTPUT_STRIP_TRAILING_WHITESPACE)

set(LLAMA_BUILD_COMMON ON)

if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
add_compile_options(-Wno-c++17-extensions)
endif()

include_directories(${NODE_ADDON_API_DIR} ${CMAKE_JS_INC})

add_subdirectory("llama.cpp")
2 changes: 1 addition & 1 deletion llama/addon/AddonContext.cpp
@@ -447,7 +447,7 @@ Napi::Value AddonContext::AddToBatch(const Napi::CallbackInfo& info) {
GGML_ASSERT(batch.n_tokens + tokensLength <= batch_n_tokens);

for (size_t i = 0; i < tokensLength; i++) {
llama_batch_add(batch, static_cast<llama_token>(tokens[i]), firstTokenContextIndex + i, { sequenceId }, false);
common_batch_add(batch, static_cast<llama_token>(tokens[i]), firstTokenContextIndex + i, { sequenceId }, false);
}

if (generateLogitAtTheEnd) {
8 changes: 4 additions & 4 deletions llama/addon/AddonModel.cpp
@@ -426,7 +426,7 @@ Napi::Value AddonModel::Tokenize(const Napi::CallbackInfo& info) {
std::string text = info[0].As<Napi::String>().Utf8Value();
bool specialTokens = info[1].As<Napi::Boolean>().Value();

std::vector<llama_token> tokens = llama_tokenize(model, text, false, specialTokens);
std::vector<llama_token> tokens = common_tokenize(model, text, false, specialTokens);

Napi::Uint32Array result = Napi::Uint32Array::New(info.Env(), tokens.size());
for (size_t i = 0; i < tokens.size(); ++i) {
@@ -539,23 +539,23 @@ Napi::Value AddonModel::PrefixToken(const Napi::CallbackInfo& info) {
return info.Env().Undefined();
}

return getNapiToken(info, model, llama_token_prefix(model));
return getNapiToken(info, model, llama_token_fim_pre(model));
}
Napi::Value AddonModel::MiddleToken(const Napi::CallbackInfo& info) {
if (disposed) {
Napi::Error::New(info.Env(), "Model is disposed").ThrowAsJavaScriptException();
return info.Env().Undefined();
}

return getNapiToken(info, model, llama_token_middle(model));
return getNapiToken(info, model, llama_token_fim_mid(model));
}
Napi::Value AddonModel::SuffixToken(const Napi::CallbackInfo& info) {
if (disposed) {
Napi::Error::New(info.Env(), "Model is disposed").ThrowAsJavaScriptException();
return info.Env().Undefined();
}

return getNapiToken(info, model, llama_token_suffix(model));
return getNapiToken(info, model, llama_token_fim_suf(model));
}
Napi::Value AddonModel::EotToken(const Napi::CallbackInfo& info) {
if (disposed) {
13 changes: 0 additions & 13 deletions llama/addon/AddonSampler.cpp
@@ -52,11 +52,6 @@ void AddonSampler::dispose() {
topPSampler = nullptr;
}

if (softmaxSampler != nullptr) {
llama_sampler_free(softmaxSampler);
softmaxSampler = nullptr;
}

if (seedSampler != nullptr) {
llama_sampler_free(seedSampler);
seedSampler = nullptr;
@@ -135,10 +130,6 @@ void AddonSampler::rebuildChainIfNeeded() {
llama_sampler_chain_add(chain, temperatureSampler);
}

if (softmaxSampler != nullptr) {
llama_sampler_chain_add(chain, softmaxSampler);
}

if (seedSampler != nullptr) {
llama_sampler_chain_add(chain, seedSampler);
}
@@ -206,10 +197,6 @@ Napi::Value AddonSampler::ApplyConfig(const Napi::CallbackInfo& info) {
}
}

if (softmaxSampler == nullptr) {
softmaxSampler = llama_sampler_init_softmax();
}

if (config.Has("minP")) {
auto minP = config.Get("minP").As<Napi::Number>().FloatValue();
if (minP != minPSampler_minP) {
2 changes: 0 additions & 2 deletions llama/addon/AddonSampler.h
@@ -25,8 +25,6 @@ class AddonSampler : public Napi::ObjectWrap<AddonSampler> {

llama_sampler * topPSampler = nullptr;
float topPSampler_topP = 0.0f; // Top p sampling >=1.0 = disabled

llama_sampler * softmaxSampler = nullptr;

llama_sampler * seedSampler = nullptr;
uint32_t seedSampler_seed = 0;
2 changes: 2 additions & 0 deletions llama/addon/addon.cpp
@@ -8,6 +8,7 @@
#include "globals/addonLog.h"
#include "globals/addonProgress.h"
#include "globals/getGpuInfo.h"
#include "globals/getSwapInfo.h"

bool backendInitialized = false;
bool backendDisposed = false;
@@ -203,6 +204,7 @@ Napi::Object registerCallback(Napi::Env env, Napi::Object exports) {
Napi::PropertyDescriptor::Function("getGpuVramInfo", getGpuVramInfo),
Napi::PropertyDescriptor::Function("getGpuDeviceInfo", getGpuDeviceInfo),
Napi::PropertyDescriptor::Function("getGpuType", getGpuType),
Napi::PropertyDescriptor::Function("getSwapInfo", getSwapInfo),
Napi::PropertyDescriptor::Function("init", addonInit),
Napi::PropertyDescriptor::Function("dispose", addonDispose),
});
10 changes: 8 additions & 2 deletions llama/addon/globals/getGpuInfo.cpp
@@ -26,6 +26,7 @@ void logVulkanWarning(const char* message) {
Napi::Value getGpuVramInfo(const Napi::CallbackInfo& info) {
uint64_t total = 0;
uint64_t used = 0;
uint64_t unifiedVramSize = 0;

#ifdef GPU_INFO_USE_CUDA
size_t cudaDeviceTotal = 0;
@@ -41,26 +42,31 @@
#ifdef GPU_INFO_USE_VULKAN
uint64_t vulkanDeviceTotal = 0;
uint64_t vulkanDeviceUsed = 0;
const bool vulkanDeviceSupportsMemoryBudgetExtension = gpuInfoGetTotalVulkanDevicesInfo(&vulkanDeviceTotal, &vulkanDeviceUsed, logVulkanWarning);
uint64_t vulkanDeviceUnifiedVramSize = 0;
const bool vulkanDeviceSupportsMemoryBudgetExtension = gpuInfoGetTotalVulkanDevicesInfo(&vulkanDeviceTotal, &vulkanDeviceUsed, &vulkanDeviceUnifiedVramSize, logVulkanWarning);

if (vulkanDeviceSupportsMemoryBudgetExtension) {
total += vulkanDeviceTotal;
used += vulkanDeviceUsed;
unifiedVramSize += vulkanDeviceUnifiedVramSize;
}
#endif

#ifdef GPU_INFO_USE_METAL
uint64_t metalDeviceTotal = 0;
uint64_t metalDeviceUsed = 0;
getMetalGpuInfo(&metalDeviceTotal, &metalDeviceUsed);
uint64_t metalDeviceUnifiedVramSize = 0;
getMetalGpuInfo(&metalDeviceTotal, &metalDeviceUsed, &metalDeviceUnifiedVramSize);

total += metalDeviceTotal;
used += metalDeviceUsed;
unifiedVramSize += metalDeviceUnifiedVramSize;
#endif

Napi::Object result = Napi::Object::New(info.Env());
result.Set("total", Napi::Number::From(info.Env(), total));
result.Set("used", Napi::Number::From(info.Env(), used));
result.Set("unifiedSize", Napi::Number::From(info.Env(), unifiedVramSize));

return result;
}