
Commit 314d7e8

fix: metadata string encoding (#420)
* fix: metadata string encoding
* fix: Vulkan parallel decoding
* fix: try auth token on 401 response
1 parent 86e1bee commit 314d7e8

File tree

6 files changed: +43 -38 lines changed


docs/guide/external-chat-state.md

Lines changed: 4 additions & 4 deletions
@@ -9,9 +9,9 @@ You can [save and restore a chat history](./chat-session.md#save-and-restore) on
 :::
 
 To interact with a model in a chat form, you can use [`LlamaChatSession`](../api/classes/LlamaChatSession.md),
-which is stateful chat session that manages the chat state on its own.
+which is a stateful chat session that manages the chat state on its own.
 
-When building a library around `node-llama-cpp`, you may want to store that chat state externally and control the evaluations on your own.
+When building a library around `node-llama-cpp`, you may want to store that chat state externally and control the evaluations yourself.
 
 This is where [`LlamaChat`](../api/classes/LlamaChat.md) may come in handy.
 [`LlamaChat`](../api/classes/LlamaChat.md) allows you to generate a completion to an existing chat session and manage the evaluation yourself,
@@ -69,9 +69,9 @@ const res = await llamaChat.generateResponse(chatHistory, {
 console.log("AI: " + res.response);
 ```
 
-Now, let say we want to ask the model a follow-up question based on the previous response.
+Now, let's say we want to ask the model a follow-up question based on the previous response.
 Since we already have a context sequence loaded with the previous chat history,
-we'd want to use it as much a possible.
+we'd want to reuse it as much as possible.
 
 To do so, we pass the context window of the previous evaluation output to the new evaluation.
 This is important, since if a context shift has happened, we want to use the existing post-context-shift context sequence state
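
The follow-up flow described in this doc change can be sketched roughly as follows. This is a sketch, not an authoritative API listing: the `lastEvaluation` fields and the `lastEvaluationContextWindow` option shape are assumptions based on the guide's surrounding code.

```ts
import type {ChatHistoryItem} from "node-llama-cpp";

// `llamaChat` and `res` are assumed to come from the guide's previous snippet
const newHistory: ChatHistoryItem[] = [
    ...res.lastEvaluation.cleanHistory,
    {type: "user", text: "And how does that compare to the alternatives?"},
    {type: "model", response: []}
];

// Pass the previous evaluation's context window so the already-loaded
// context sequence state (including post-context-shift state) is reused
const res2 = await llamaChat.generateResponse(newHistory, {
    lastEvaluationContextWindow: {
        history: res.lastEvaluation.contextWindow,
        minimumOverlapPercentageToPreventContextShift: 0.5
    }
});
console.log("AI: " + res2.response);
```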

src/evaluator/LlamaContext/LlamaContext.ts

Lines changed: 11 additions & 1 deletion
@@ -1,4 +1,4 @@
-import {acquireLock, AsyncDisposeAggregator, DisposeAggregator, DisposedError, EventRelay, withLock} from "lifecycle-utils";
+import {acquireLock, AsyncDisposeAggregator, DisposeAggregator, DisposedError, EventRelay, Lock, withLock} from "lifecycle-utils";
 import {removeNullFields} from "../../utils/removeNullFields.js";
 import {Token} from "../../types.js";
 import {AddonContext, AddonModelLora, BatchLogitIndex} from "../../bindings/AddonTypes.js";
@@ -32,6 +32,10 @@ const defaultFailedCreationRemedy = {
 } as const satisfies Required<LlamaContextOptions["failedCreationRemedy"]>;
 const defaultEvaluationPriority: EvaluationPriority = 5;
 
+const decodeSyncWorkaround = {
+    vulkanLock: {}
+};
+
 export class LlamaContext {
     /** @internal */ public readonly _llama: Llama;
     /** @internal */ public readonly _ctx: AddonContext;
@@ -573,11 +577,17 @@ export class LlamaContext {
                 return;
             }
 
+            let decodeLock: Lock | undefined;
+            // this is a workaround to prevent Vulkan from crashing the process when decoding on multiple contexts in parallel
+            if (this._llama.gpu === "vulkan")
+                decodeLock = await acquireLock(decodeSyncWorkaround.vulkanLock, "decode");
+
             try {
                 await decodeTokenBatchItems(currentBatchItems, currentBatchSize);
 
                 shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
             } finally {
+                decodeLock?.dispose();
                 preventDisposalHandle.dispose();
             }
         }
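
The added lines serialize decodes whenever the Vulkan backend is active: every decode first acquires a module-wide lock keyed on the shared `decodeSyncWorkaround.vulkanLock` object and releases it in the `finally` block, so at most one decode runs at a time. A minimal standalone sketch of the same pattern, with an illustrative `runDecode` callback standing in for the real batch decode:

```ts
import {acquireLock, Lock} from "lifecycle-utils";

// Module-wide scope object; locks acquired on it with the same key are serialized
const vulkanLock = {};

async function decodeWithVulkanWorkaround(isVulkan: boolean, runDecode: () => Promise<void>) {
    let decodeLock: Lock | undefined;

    // Only the Vulkan backend needs serialization; others keep decoding in parallel
    if (isVulkan)
        decodeLock = await acquireLock(vulkanLock, "decode");

    try {
        await runDecode();
    } finally {
        // Release the lock so the next queued decode can proceed
        decodeLock?.dispose();
    }
}
```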

src/gguf/fileReaders/GgufFileReader.ts

Lines changed: 1 addition & 6 deletions
@@ -99,12 +99,7 @@ export abstract class GgufFileReader {
         const readLength = valueTypeToBytesToRead.uint8 * length;
 
         return this._withBufferRead(offset, readLength, (resolvedOffset) => {
-            const res: string[] = [];
-
-            for (let i = resolvedOffset; i < resolvedOffset + readLength && i < this._buffer.length; i++)
-                res.push(String.fromCharCode(this._buffer[i]!));
-
-            return res.join("");
+            return this._buffer.toString("utf8", resolvedOffset, Math.min(resolvedOffset + readLength, this._buffer.length));
         });
     }

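This is the commit's headline fix: the old code decoded each byte separately with `String.fromCharCode`, which treats bytes as Latin-1 code points and mangles any multi-byte UTF-8 character, whereas `buffer.toString("utf8", start, end)` decodes the whole range correctly. A standalone demonstration (not library code) using the `Ġ` character that appears in the snapshot updates below:

```ts
// "Ġ" (U+0120) encodes to two bytes in UTF-8: 0xC4 0xA0
const buffer = Buffer.from("Ġ Ġ", "utf8");

// Old approach: one JS character per byte (effectively Latin-1 decoding)
let broken = "";
for (let i = 0; i < buffer.length; i++)
    broken += String.fromCharCode(buffer[i]!);

// New approach: decode the byte range as UTF-8 in one call
const fixed = buffer.toString("utf8", 0, buffer.length);

console.log(broken); // "Ä\u00A0 Ä\u00A0" (the "Ä " mojibake seen in the old snapshots)
console.log(fixed);  // "Ġ Ġ"
```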
src/gguf/fileReaders/GgufNetworkFetchFileReader.ts

Lines changed: 1 addition & 1 deletion
@@ -104,7 +104,7 @@ export class GgufNetworkFetchFileReader extends GgufFileReader {
                 signal: this._signal
             });
 
-            if ((response.status >= 500 || response.status === 429) && headersToTry.length > 0)
+            if ((response.status >= 500 || response.status === 429 || response.status === 401) && headersToTry.length > 0)
                 continue;
 
             if (!response.ok)
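
From the visible context, this fetch sits in a loop that works through successive header candidates (e.g. anonymous first, then with an auth token); the change makes a 401 advance to the next candidate the same way 429 and 5xx responses already did. A rough sketch of that retry shape; `headersToTry` and the function name here are assumptions based on the hunk, not the file's actual structure:

```ts
async function fetchWithHeaderFallback(url: string, headersToTry: Array<Record<string, string>>): Promise<Response> {
    while (headersToTry.length > 0) {
        const headers = headersToTry.shift()!;
        const response = await fetch(url, {headers});

        // On server errors, rate limiting, and now also 401 (unauthorized), fall
        // through to the next header candidate (e.g. retry with an auth token)
        if ((response.status >= 500 || response.status === 429 || response.status === 401) && headersToTry.length > 0)
            continue;

        return response;
    }

    throw new Error("no header candidates to try");
}
```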

test/modelDependent/functionary/gguf/__snapshots__/ggufParser.test.ts.snap

Lines changed: 18 additions & 18 deletions
@@ -114,16 +114,16 @@ exports[`gguf > parser > should fetch GGUF metadata 1`] = `
       "bos_token_id": 128000,
       "eos_token_id": 128001,
       "merges": [
-        "Ä  Ä ",
-        "Ä  Ä Ä Ä ",
-        "Ä Ä  Ä Ä ",
-        "Ä Ä Ä  Ä ",
+        "Ġ Ġ",
+        "Ġ ĠĠĠ",
+        "ĠĠ ĠĠ",
+        "ĠĠĠ Ġ",
         "i n",
-        "Ä  t",
-        "Ä  Ä Ä Ä Ä Ä Ä Ä ",
-        "Ä Ä  Ä Ä Ä Ä Ä Ä ",
-        "Ä Ä Ä Ä  Ä Ä Ä Ä ",
-        "Ä Ä Ä  Ä Ä Ä Ä Ä ",
+        "Ġ t",
+        "Ġ ĠĠĠĠĠĠĠ",
+        "ĠĠ ĠĠĠĠĠĠ",
+        "ĠĠĠĠ ĠĠĠĠ",
+        "ĠĠĠ ĠĠĠĠĠ",
       ],
       "model": "gpt2",
       "padding_token_id": 128001,
@@ -325,16 +325,16 @@ exports[`gguf > parser > should parse local gguf model 1`] = `
       "bos_token_id": 128000,
       "eos_token_id": 128001,
       "merges": [
-        "Ä  Ä ",
-        "Ä  Ä Ä Ä ",
-        "Ä Ä  Ä Ä ",
-        "Ä Ä Ä  Ä ",
+        "Ġ Ġ",
+        "Ġ ĠĠĠ",
+        "ĠĠ ĠĠ",
+        "ĠĠĠ Ġ",
         "i n",
-        "Ä  t",
-        "Ä  Ä Ä Ä Ä Ä Ä Ä ",
-        "Ä Ä  Ä Ä Ä Ä Ä Ä ",
-        "Ä Ä Ä Ä  Ä Ä Ä Ä ",
-        "Ä Ä Ä  Ä Ä Ä Ä Ä ",
+        "Ġ t",
+        "Ġ ĠĠĠĠĠĠĠ",
+        "ĠĠ ĠĠĠĠĠĠ",
+        "ĠĠĠĠ ĠĠĠĠ",
+        "ĠĠĠ ĠĠĠĠĠ",
       ],
       "model": "gpt2",
       "padding_token_id": 128001,

test/standalone/gguf/__snapshots__/ggufStandaloneParser.test.ts.snap

Lines changed: 8 additions & 8 deletions
@@ -83,15 +83,15 @@ exports[`gguf > parser > should parse remote gguf model 1`] = `
     "ggml": {
       "eos_token_id": 11,
       "merges": [
-        "Ä  t",
-        "Ä  a",
+        "Ġ t",
+        "Ġ a",
         "i n",
         "h e",
         "r e",
         "o n",
         "e r",
-        "Ä  s",
-        "Ä t he",
+        "Ġ s",
+        "Ġt he",
         "a t",
       ],
       "model": "gpt2",
@@ -229,15 +229,15 @@ exports[`gguf > parser > should parse remote gguf model without tensor info 1`]
     "ggml": {
       "eos_token_id": 11,
       "merges": [
-        "Ä  t",
-        "Ä  a",
+        "Ġ t",
+        "Ġ a",
         "i n",
         "h e",
         "r e",
         "o n",
         "e r",
-        "Ä  s",
-        "Ä t he",
+        "Ġ s",
+        "Ġt he",
         "a t",
       ],
       "model": "gpt2",
