Merged

33 commits
aa5b6e0
feat: extract function calling syntax from the jinja template, QwenCh…
giladgd Mar 20, 2025
4f3b017
fix: limit the context size by default in the node-typescript template
giladgd Mar 20, 2025
1887300
docs: style fix
giladgd Mar 20, 2025
46f0922
docs: troubleshooting function calling when using `JinjaTemplateChatW…
giladgd Mar 20, 2025
f7dae11
docs: fix the command to scaffold a new Electron project
giladgd Mar 20, 2025
d4e1603
fix: adapt to breaking `llama.cpp` changes
giladgd Mar 20, 2025
c1610b8
docs: fix types
giladgd Mar 20, 2025
df96282
fix: better handling for function calling with empty parameters
giladgd Mar 20, 2025
4c1567c
feat: export a `llama` instance getter on a model instance
giladgd Mar 20, 2025
b2bdacc
fix: small optimization
giladgd Mar 20, 2025
a8cc9c0
test: make LlamaText snapshots more readable
giladgd Mar 20, 2025
f89d98a
test: add tests
giladgd Mar 20, 2025
6f043ec
docs: update ID
giladgd Mar 20, 2025
d880b1d
feat: add QwQ 32B to recommended models
giladgd Mar 20, 2025
9a38355
build: run tests on ubuntu arm machine
giladgd Mar 21, 2025
1af81b2
build: change machine type used to run tests on
giladgd Mar 21, 2025
e7c176c
build: change machine type used to run tests on
giladgd Mar 21, 2025
441a8ce
build: make tests run correctly
giladgd Mar 21, 2025
93a41a7
build: cache models on test fail
giladgd Mar 21, 2025
f0e7f0a
build: dispose the llama instance before a test worker exits
giladgd Mar 22, 2025
388542b
chore: update vitest
giladgd Mar 22, 2025
845736a
fix: better husky setup
giladgd Mar 22, 2025
15590c3
test: fix hook
giladgd Mar 22, 2025
fb4c4df
fix: reranking edge case crash
giladgd Mar 22, 2025
dcf795d
fix: prepare script
giladgd Mar 22, 2025
c5ce43e
fix: vitest config
giladgd Mar 22, 2025
817c820
test: remove dispose hook
giladgd Mar 22, 2025
ff8170f
fix: bump min nodejs version to 20
giladgd Mar 22, 2025
6cd8d95
docs: debugging native crash on Linux
giladgd Mar 22, 2025
b9726e6
chore: update modules
giladgd Mar 22, 2025
a8dff21
fix: log level
giladgd Mar 22, 2025
6ccbca3
chore: configure eslint
giladgd Mar 22, 2025
c3c0066
fix: `defineChatSessionFunction` type
giladgd Mar 27, 2025
31 changes: 24 additions & 7 deletions .github/workflows/build.yml
@@ -188,7 +188,7 @@ jobs:

const {versions: latestNodeVersions} = await getLatestNodeVersions(Date.now() - 1000 * 60 * 60 * 24 * 14);

const nodeVersion = latestNodeVersions.get(18);
const nodeVersion = latestNodeVersions.get(20);
const windowsOnArmNodeVersion = latestNodeVersions.get(20);

if (nodeVersion == null || windowsOnArmNodeVersion == null) {
@@ -389,7 +389,7 @@ jobs:

model-dependent-tests:
name: Model dependent tests
runs-on: ubuntu-24.04
runs-on: macos-13
env:
NODE_LLAMA_CPP_GPU: false
needs:
@@ -412,10 +412,16 @@ jobs:
name: llama.cpp
path: llama

- name: Install dependencies on Ubuntu
# - name: Install dependencies on Ubuntu
# run: |
# sudo apt-get update
# sudo apt-get install ninja-build cmake

- name: Install dependencies on macOS
if: matrix.config.name == 'macOS'
run: |
sudo apt-get update
sudo apt-get install ninja-build cmake
brew install cmake ninja
alias make=cmake

- name: Install modules
run: npm ci
@@ -427,18 +433,29 @@
run: node ./dist/cli/cli.js inspect gpu

- name: Cache models
id: cache-test-models
uses: actions/cache@v4
id: cache-restore-test-models
uses: actions/cache/restore@v4
with:
path: "test/.models/**.gguf"
key: cache-test-models-${{ runner.os }}-${{ github.workflow }}

- name: Download models or ensure all models are downloaded
id: download-all-test-models
run: npm run dev:setup:downloadAllTestModels

- name: Run model dependent tests
env:
NODE_OPTIONS: "--max-old-space-size=4096"
run: npm run test:modelDependent

- name: Save cached models
id: cache-save-test-models
if: steps.download-all-test-models.outcome == 'success' && always()
uses: actions/cache/save@v4
with:
path: "test/.models/**.gguf"
key: cache-test-models-${{ runner.os }}-${{ github.workflow }}

release:
name: Release
if: needs.resolve-next-release.outputs.next-version != '' && needs.resolve-next-release.outputs.next-version != 'false'
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -15,7 +15,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-node@v4
with:
node-version: "18"
node-version: "20"
- name: Install modules
run: npm ci
- name: ESLint
9 changes: 9 additions & 0 deletions docs/guide/development.md
@@ -79,6 +79,15 @@ lldb node -- ./node_modules/.bin/vite-node ./src/cli/cli.ts chat <path-to-a-mode
After it finishes loading, type `run` (or `process launch` if `run` fails) and press Enter for the execution of `node` to start.
When the process crashes, you'll get a stack trace in the terminal.

#### Finding Process Crash Stack Trace for Native Code (Linux) {#native-crash-stack-trace-linux}
To get the stack trace of a crash stemming from `llama.cpp` or the bindings, run `node` with `gdb`:
```shell
gdb --args node ./node_modules/.bin/vite-node ./src/cli/cli.ts chat <path-to-a-model-file-on-your-computer>
```

After it finishes loading, type `run` and press Enter for the execution of `node` to start.
When the process crashes, type `bt full` and press Enter to see the stack trace.

### Updating the Documentation
All the documentation is written in Markdown files in the `docs` directory.
To see the changes you made to the documentation, run the following command:
2 changes: 1 addition & 1 deletion docs/guide/electron.md
@@ -9,7 +9,7 @@ Trying to use `node-llama-cpp` on a renderer process will crash the application.

You can scaffold an example Electron app that uses `node-llama-cpp` with complete configuration for packaging and distribution by running the following command:
```shell
npm create node-llama-cpp@latest --template electron-typescript-react
npm create node-llama-cpp@latest -- --template electron-typescript-react
```

::: tip
39 changes: 39 additions & 0 deletions docs/guide/function-calling.md
@@ -408,3 +408,42 @@ getFruitPrice({name: "banana"}) result: {name: "banana", price: "$4"};


```

## Troubleshooting {#troubleshooting}
### Function Calling Issues With [`JinjaTemplateChatWrapper`](../api/classes/JinjaTemplateChatWrapper.md) {#troubleshoot-jinja-function-calling-issues}
If function calling doesn't work well (or at all) with a model you're trying to use,
and the [chat wrapper](./chat-wrapper.md) used by your [`LlamaChatSession`](../api/classes/LlamaChatSession.md)
is a [`JinjaTemplateChatWrapper`](../api/classes/JinjaTemplateChatWrapper.md)
(you can check that by accessing [`.chatWrapper`](../api/classes/LlamaChatSession.md#chatwrapper)),
you can try to force it to not use the function calling template defined in the Jinja template.

Doing this can help you achieve better function calling performance with some models.

To do this, create your [`LlamaChatSession`](../api/classes/LlamaChatSession.md) like this:
```typescript
import {fileURLToPath} from "url";
import path from "path";
import {getLlama} from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));

const llama = await getLlama();
const model = await llama.loadModel({
modelPath: path.join(__dirname, "models", "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf")
});
const context = await model.createContext();

// ---cut---
import {LlamaChatSession, resolveChatWrapper} from "node-llama-cpp";

const session = new LlamaChatSession({
contextSequence: context.getSequence(),
chatWrapper: resolveChatWrapper(model, {
customWrapperSettings: {
jinjaTemplate: {
functionCallMessageTemplate: "noJinja"
}
}
})
});
```
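
A minimal usage sketch (not part of the diff) of how the session configured above could then exercise function calling, reusing the guide's `getFruitPrice` example; the prompt and the hardcoded price are illustrative only:

```typescript
import {defineChatSessionFunction} from "node-llama-cpp";

// Define a function the model can call; the schema mirrors the guide's
// `getFruitPrice` example, with the price hardcoded for illustration
const functions = {
    getFruitPrice: defineChatSessionFunction({
        description: "Get the price of a fruit",
        params: {
            type: "object",
            properties: {
                name: {
                    type: "string"
                }
            }
        },
        handler({name}) {
            return {name, price: "$4"};
        }
    })
};

// The session uses the "noJinja" function call message template configured above
const response = await session.prompt("How much does a banana cost?", {functions});
console.log(response);
```
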
4 changes: 4 additions & 0 deletions docs/public/giscus/style.css
@@ -31,6 +31,10 @@ body, #__next {
border-start-start-radius: 0;
}

.gsc-comment .gsc-replies {
padding-top: 0;
}

.gsc-reactions-popover {
border-radius: 12px;

3 changes: 2 additions & 1 deletion eslint.config.js
@@ -148,7 +148,8 @@ export default tseslint.config({
]
}],
"@stylistic/no-trailing-spaces": ["off"],
"@stylistic/no-multi-spaces": ["warn"]
"@stylistic/no-multi-spaces": ["warn"],
"@stylistic/generator-star-spacing": ["off"]
}
}, {
files: ["**/**.{,c,m}ts"],
20 changes: 11 additions & 9 deletions llama/addon/AddonContext.cpp
@@ -583,7 +583,7 @@ Napi::Value AddonContext::DisposeSequence(const Napi::CallbackInfo& info) {

int32_t sequenceId = info[0].As<Napi::Number>().Int32Value();

bool result = llama_kv_cache_seq_rm(ctx, sequenceId, -1, -1);
bool result = llama_kv_self_seq_rm(ctx, sequenceId, -1, -1);

if (!result) {
Napi::Error::New(info.Env(), "Failed to dispose sequence").ThrowAsJavaScriptException();
@@ -602,7 +602,7 @@ Napi::Value AddonContext::RemoveTokenCellsFromSequence(const Napi::CallbackInfo&
int32_t startPos = info[1].As<Napi::Number>().Int32Value();
int32_t endPos = info[2].As<Napi::Number>().Int32Value();

bool result = llama_kv_cache_seq_rm(ctx, sequenceId, startPos, endPos);
bool result = llama_kv_self_seq_rm(ctx, sequenceId, startPos, endPos);

return Napi::Boolean::New(info.Env(), result);
}
@@ -617,7 +617,7 @@ Napi::Value AddonContext::ShiftSequenceTokenCells(const Napi::CallbackInfo& info
int32_t endPos = info[2].As<Napi::Number>().Int32Value();
int32_t shiftDelta = info[3].As<Napi::Number>().Int32Value();

llama_kv_cache_seq_add(ctx, sequenceId, startPos, endPos, shiftDelta);
llama_kv_self_seq_add(ctx, sequenceId, startPos, endPos, shiftDelta);

return info.Env().Undefined();
}
@@ -639,6 +639,7 @@ Napi::Value AddonContext::GetEmbedding(const Napi::CallbackInfo& info) {
}

int32_t inputTokensLength = info[0].As<Napi::Number>().Int32Value();
int32_t maxVectorSize = (info.Length() > 1 && info[1].IsNumber()) ? info[1].As<Napi::Number>().Int32Value() : 0;

if (inputTokensLength <= 0) {
Napi::Error::New(info.Env(), "Invalid input tokens length").ThrowAsJavaScriptException();
@@ -650,15 +651,16 @@ Napi::Value AddonContext::GetEmbedding(const Napi::CallbackInfo& info) {
const auto* embeddings = pooling_type == LLAMA_POOLING_TYPE_NONE ? NULL : llama_get_embeddings_seq(ctx, 0);
if (embeddings == NULL) {
embeddings = llama_get_embeddings_ith(ctx, inputTokensLength - 1);
}

if (embeddings == NULL) {
Napi::Error::New(info.Env(), std::string("Failed to get embeddings for token ") + std::to_string(inputTokensLength - 1)).ThrowAsJavaScriptException();
return info.Env().Undefined();
}
if (embeddings == NULL) {
Napi::Error::New(info.Env(), std::string("Failed to get embeddings for token ") + std::to_string(inputTokensLength - 1)).ThrowAsJavaScriptException();
return info.Env().Undefined();
}

Napi::Float64Array result = Napi::Float64Array::New(info.Env(), n_embd);
for (size_t i = 0; i < n_embd; ++i) {
size_t resultSize = maxVectorSize == 0 ? n_embd : std::min(n_embd, maxVectorSize);
Napi::Float64Array result = Napi::Float64Array::New(info.Env(), resultSize);
for (size_t i = 0; i < resultSize; i++) {
result[i] = embeddings[i];
}
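
A minimal TypeScript sketch (not part of the diff) of the truncation rule the new optional `maxVectorSize` argument introduces; the helper name is hypothetical and not part of the addon's or node-llama-cpp's public API:

```typescript
// Mirrors the C++ logic above: a maxVectorSize of 0 keeps the full
// n_embd-sized embedding, while any positive value caps the result
// at min(n_embd, maxVectorSize) values.
function truncateEmbedding(embedding: readonly number[], maxVectorSize: number = 0): number[] {
    const resultSize = maxVectorSize === 0
        ? embedding.length
        : Math.min(embedding.length, maxVectorSize);

    return embedding.slice(0, resultSize);
}

// Example: keep only the first 512 values of a 4096-dimensional embedding
// const shortened = truncateEmbedding(fullEmbedding, 512);
```
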
