Commit 62ee6e3

feat: evaluateWithMetadata, token confidence
1 parent f050fa4 commit 62ee6e3

File tree

13 files changed: +1180 -107 lines changed


.config/typedoc.json

Lines changed: 1 addition & 1 deletion
@@ -27,6 +27,6 @@
    "interfacePropertiesFormat": "list",
    "sort": ["source-order"],
    "docsRoot": "../docs",
-    "intentionallyNotExported": ["MergeOptionalUnionTypes", "GbnfJsonSchemaToTSType", "_LlamaText"],
+    "intentionallyNotExported": ["MergeOptionalUnionTypes", "PickOptions", "GbnfJsonSchemaToTSType", "_LlamaText"],
    "useHTMLEncodedBrackets": true
}

docs/guide/low-level-api.md

Lines changed: 78 additions & 7 deletions
@@ -38,13 +38,25 @@ and you can pass no sampling options to avoid making any adjustments to the prob
It's best to avoid getting the full probabilities list unless you really need it,
as passing it to the JavaScript side can be slow.

+### Context Shift {#context-shift}
+When the context sequence is full and you want to evaluate more tokens onto it,
+some tokens will have to be removed to make room for new ones to be added.
+
+Ideally, you'd want to do that on your logic level, so you can control which content to keep and which to remove.
+> All the high-level APIs of `node-llama-cpp` [automatically do that](./chat-context-shift.md).
+
+If you don't do that, `node-llama-cpp` will automatically remove the oldest tokens from the context sequence state to make room for new ones.
+
+You can customize the context shift strategy `node-llama-cpp` uses for the context sequence by configuring the [`contextShift`](../api/classes/LlamaContext.md#parameters) option when calling [`.getSequence(...)`](../api/classes/LlamaContext.md#getsequence),
+or by passing a customized [`contextShift`](../api/type-aliases/SequenceEvaluateOptions#contextshift) option to the evaluation method you use.
+
## Simple Evaluation {#simple-evaluation}
-You can evaluate the given input tokens onto a context sequence using [`.evaluate`](../api/classes/LlamaContextSequence.md#evaluate)
+You can evaluate the given input tokens onto a context sequence using [`.evaluate(...)`](../api/classes/LlamaContextSequence.md#evaluate)
and generate the next token for the last input token.

On each iteration of the returned iterator, the generated token is then added to the context sequence state and the next token is generated for it, and so on.

-When using [`.evaluate`](../api/classes/LlamaContextSequence.md#evaluate), the configured [token predictor](./token-prediction.md) is used to speed up the generation process.
+When using [`.evaluate(...)`](../api/classes/LlamaContextSequence.md#evaluate), the configured [token predictor](./token-prediction.md) is used to speed up the generation process.

```typescript
import {fileURLToPath} from "url";
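
A rough sketch of what the customized context shift described in the new section above could look like when obtaining a sequence. The `size` and `strategy: "eraseBeginning"` fields used here are assumptions, not taken from this commit; check the `contextShift` API references linked above for the exact shape.

```typescript
// Illustrative sketch only: the `size` and `strategy` fields of `contextShift`
// are assumed here and may differ from the actual API; see the linked references.
const sequence = context.getSequence({
    contextShift: {
        // assumed option: how many tokens to free up once the sequence state is full
        size: 512,

        // assumed option: remove tokens from the beginning of the state
        strategy: "eraseBeginning"
    }
});
```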
@@ -130,9 +142,67 @@ console.log("Result: " + resText);
```
> If you want to adjust the token probabilities when generating output, consider using [token bias](./token-bias.md) instead

+### With Metadata {#evaluation-with-metadata}
+You can use [`.evaluateWithMetadata(...)`](../api/classes/LlamaContextSequence.md#evaluatewithmetadata) to evaluate tokens onto the context sequence state like [`.evaluate(...)`](#simple-evaluation), but with metadata emitted for each token.
+
+```typescript
+import {fileURLToPath} from "url";
+import path from "path";
+import {getLlama, Token, SequenceEvaluateOptions} from "node-llama-cpp";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+const llama = await getLlama();
+const model = await llama.loadModel({
+    modelPath: path.join(__dirname, "models", "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf")
+});
+const context = await model.createContext();
+const sequence = context.getSequence();
+
+const input = "The best way to";
+const tokens = model.tokenize(input);
+const maxTokens = 10;
+const res: Array<{
+    token: Token,
+    confidence: number,
+    probabilities: Map<Token, number>
+}> = [];
+const metadataOptions = {
+    // configure which metadata should be returned
+    confidence: true,
+    probabilities: true
+} as const;
+const options: SequenceEvaluateOptions = {
+    temperature: 0.8
+};
+
+const iterator = sequence.evaluateWithMetadata(
+    tokens,
+    metadataOptions,
+    options
+);
+for await (const item of iterator) {
+    res.push({
+        token: item.token,
+        confidence: item.confidence,
+        probabilities: new Map(
+            // only keep the top 5 probabilities
+            [...item.probabilities.entries()].slice(0, 5)
+        )
+    });
+
+    if (res.length >= maxTokens)
+        break;
+}
+
+const resText = model.detokenize(res.map(({token}) => token));
+console.log("Result: " + resText);
+console.log("With metadata:", res);
+```
+
### No Generation {#evaluation-without-generation}
To evaluate the input tokens onto a context sequence without generating new tokens,
-you can use [`.evaluateWithoutGeneratingNewTokens`](../api/classes/LlamaContextSequence.md#evaluatewithoutgeneratingnewtokens).
+you can use [`.evaluateWithoutGeneratingNewTokens(...)`](../api/classes/LlamaContextSequence.md#evaluatewithoutgeneratingnewtokens).

```typescript
import {fileURLToPath} from "url";
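
Judging by the addon changes in this commit, the `confidence` value emitted by `evaluateWithMetadata` is the probability (between 0 and 1) that the sampler assigned to the token it picked, and `probabilities` maps candidate tokens to their probabilities, ordered from most to least likely. A smaller sketch that continues from the example above and requests only the confidence metadata (assuming the metadata fields can be toggled independently, as the "configure which metadata should be returned" comment suggests):

```typescript
// Sketch: request only the `confidence` metadata for each generated token
let generatedTokens = 0;

for await (const item of sequence.evaluateWithMetadata(tokens, {confidence: true}, {temperature: 0.8})) {
    // `item.confidence` is the probability the sampler assigned to `item.token`
    console.log(model.detokenize([item.token]), item.confidence.toFixed(3));

    if (++generatedTokens >= maxTokens)
        break;
}
```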
@@ -154,7 +224,8 @@ await sequence.evaluateWithoutGeneratingNewTokens(tokens);
```

## Controlled Evaluation {#controlled-evaluation}
-To manually control for which of the input tokens to generate output, you can use [`.controlledEvaluate`](../api/classes/LlamaContextSequence.md#controlledevaluate).
+To manually control for which of the input tokens to generate output,
+you can use [`.controlledEvaluate(...)`](../api/classes/LlamaContextSequence.md#controlledevaluate).

```typescript
import {fileURLToPath} from "url";
@@ -179,8 +250,8 @@ const lastToken = evaluateInput.pop() as Token;
if (lastToken != null)
    evaluateInput.push([lastToken, {
        generateNext: {
-            singleToken: true,
-            probabilitiesList: true,
+            token: true,
+            probabilities: true,
            options: {
                temperature: 0.8
            }
@@ -222,7 +293,7 @@ as it may lead to unexpected results.

### Erase State Ranges {#erase-state-ranges}
To erase a range of tokens from the context sequence state,
-you can use [`.eraseContextTokenRanges`](../api/classes/LlamaContextSequence.md#erasecontexttokenranges).
+you can use [`.eraseContextTokenRanges(...)`](../api/classes/LlamaContextSequence.md#erasecontexttokenranges).

```typescript
import {fileURLToPath} from "url";

llama/addon/AddonContext.cpp

Lines changed: 86 additions & 27 deletions
@@ -191,11 +191,13 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker {
    AddonContext* ctx;
    AddonSampler* sampler;
    bool arrayResult = false;
-    bool returnLogprobs = false;
-    bool has_logprobs = false;
-    size_t logprobs_size;
-    llama_token * logprobs_tokens;
-    float * logprobs_probs;
+    bool returnProbabilities = false;
+    bool returnConfidence = false;
+    float tokenConfidence = -1;
+    bool has_probabilities = false;
+    size_t probabilities_size;
+    llama_token * probabilities_tokens;
+    float * probabilities_probs;
    int32_t batchLogitIndex;
    llama_token result;
    bool no_output = false;
@@ -209,16 +211,17 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker {
        batchLogitIndex = info[0].As<Napi::Number>().Int32Value();
        sampler = Napi::ObjectWrap<AddonSampler>::Unwrap(info[1].As<Napi::Object>());
        arrayResult = info.Length() > 2 && info[2].IsBoolean();
-        returnLogprobs = arrayResult ? info[2].As<Napi::Boolean>().Value() : false;
+        returnProbabilities = arrayResult ? info[2].As<Napi::Boolean>().Value() : false;
+        returnConfidence = arrayResult && info.Length() > 3 && info[3].IsBoolean() ? info[3].As<Napi::Boolean>().Value() : false;
        sampler->Ref();
    }
    ~AddonContextSampleTokenWorker() {
        ctx->Unref();
        sampler->Unref();

-        if (has_logprobs) {
-            delete[] logprobs_tokens;
-            delete[] logprobs_probs;
+        if (has_probabilities) {
+            delete[] probabilities_tokens;
+            delete[] probabilities_probs;
        }
    }
@@ -264,32 +267,84 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker {

        llama_sampler_apply(sampler->chain, &cur_p);

-        if (returnLogprobs) {
+        if (!(cur_p.selected >= 0 && cur_p.selected < (int32_t)cur_p.size)) {
+            no_output = true;
+            return;
+        }
+
+        auto new_token_id = cur_p.data[cur_p.selected].id;
+
+        if (returnProbabilities || returnConfidence) {
            if (!cur_p.sorted) {
                std::sort(cur_p.data, cur_p.data + cur_p.size, [](const llama_token_data & a, const llama_token_data & b) {
                    return a.logit > b.logit;
                });
                cur_p.sorted = true;
+
+                for (size_t i = 0; i < cur_p.size; i++) {
+                    if (cur_p.data[i].id == new_token_id) {
+                        cur_p.selected = i;
+                        break;
+                    }
+                }
            }
+        }

-            logprobs_size = cur_p.size;
-            logprobs_tokens = new llama_token[logprobs_size];
-            logprobs_probs = new float[logprobs_size];
+        if (returnProbabilities) {
+            probabilities_size = cur_p.size;
+            probabilities_tokens = new llama_token[probabilities_size];
+            probabilities_probs = new float[probabilities_size];
+            float maxLogit = cur_p.size > 0 ? cur_p.data[0].logit : -INFINITY;

            for (size_t i = 0; i < cur_p.size; i++) {
-                logprobs_tokens[i] = cur_p.data[i].id;
-                logprobs_probs[i] = cur_p.data[i].logit;
+                auto logit = cur_p.data[i].logit;
+
+                probabilities_tokens[i] = cur_p.data[i].id;
+                probabilities_probs[i] = logit;
+
+                if (logit > maxLogit) {
+                    maxLogit = logit;
+                }
            }
+
+            if (probabilities_size > 0 && maxLogit != -INFINITY) {
+                float sum = 0.0f;
+                for (size_t i = 0; i < probabilities_size; i++) {
+                    float prob = expf(probabilities_probs[i] - maxLogit);
+                    probabilities_probs[i] = prob;
+                    sum += prob;
+                }
+
+                for (size_t i = 0; i < probabilities_size; i++) {
+                    probabilities_probs[i] /= sum;
+                }
            }

-            has_logprobs = true;
+            has_probabilities = true;
        }

-        if (!(cur_p.selected >= 0 && cur_p.selected < (int32_t)cur_p.size)) {
-            no_output = true;
-            return;
+        if (returnConfidence) {
+            if (has_probabilities && cur_p.selected < probabilities_size) {
+                tokenConfidence = probabilities_probs[cur_p.selected];
+            } else {
+                float maxLogit = cur_p.data[0].logit;
+                float sum = 0.0f;
+                for (size_t i = 0; i < cur_p.size; i++) {
+                    auto logit = cur_p.data[i].logit;
+
+                    if (logit > maxLogit) {
+                        maxLogit = logit;
+                    }
+                }
+
+                for (size_t i = 0; i < cur_p.size; i++) {
+                    sum += expf(cur_p.data[i].logit - maxLogit);
+                }
+
+                tokenConfidence = expf(cur_p.data[cur_p.selected].logit - maxLogit) / sum;
+            }
        }

-        auto new_token_id = cur_p.data[cur_p.selected].id;
        sampler->acceptToken(new_token_id);
        result = new_token_id;
    }
@@ -308,14 +363,18 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker {

        Napi::Array resultArray = Napi::Array::New(Env(), 2);
        resultArray.Set(Napi::Number::New(Env(), 0), resultToken);
-
-        if (has_logprobs) {
-            Napi::Array logprobs = Napi::Array::New(Env(), logprobs_size * 2);
-            for (size_t i = 0; i < logprobs_size; i++) {
-                logprobs.Set(i * 2, Napi::Number::New(Env(), logprobs_tokens[i]));
-                logprobs.Set(i * 2 + 1, Napi::Number::New(Env(), logprobs_probs[i]));
+
+        if (has_probabilities) {
+            Napi::Array probabilities = Napi::Array::New(Env(), probabilities_size * 2);
+            for (size_t i = 0; i < probabilities_size; i++) {
+                probabilities.Set(i * 2, Napi::Number::New(Env(), probabilities_tokens[i]));
+                probabilities.Set(i * 2 + 1, Napi::Number::New(Env(), probabilities_probs[i]));
            }
-            resultArray.Set(1, logprobs);
+            resultArray.Set(1, probabilities);
+        }
+
+        if (returnConfidence && tokenConfidence != -1) {
+            resultArray.Set(2, Napi::Number::New(Env(), tokenConfidence));
        }

        deferred.Resolve(resultArray);
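
For orientation, both the `probabilities` and the `confidence` values computed above are a softmax over the candidates' logits after the sampler chain has been applied, using the usual max-logit subtraction for numerical stability; `confidence` is simply the resulting probability of the selected token. The same computation expressed in TypeScript (illustrative only, not part of the bindings):

```typescript
// Softmax with max-logit subtraction, mirroring the addon code above (illustrative only)
function softmaxConfidence(logits: number[], selectedIndex: number): number {
    const maxLogit = Math.max(...logits);
    const exps = logits.map((logit) => Math.exp(logit - maxLogit));
    const sum = exps.reduce((acc, value) => acc + value, 0);

    // probability of the selected candidate among all remaining candidates
    return exps[selectedIndex]! / sum;
}

// example: three candidates left after the sampler chain, the first one was selected
console.log(softmaxConfidence([2.1, 1.3, -0.4], 0)); // ~0.65
```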

src/bindings/AddonTypes.ts

Lines changed: 3 additions & 2 deletions
@@ -131,8 +131,9 @@ export type AddonContext = {
    sampleToken(
        batchLogitIndex: BatchLogitIndex,
        sampler: AddonSampler,
-        logprobs: boolean
-    ): Promise<[Token | -1, (Token | number)[] | undefined]>,
+        probabilities: boolean,
+        confidence?: boolean
+    ): Promise<[token: Token | -1, probabilities: (Token | number)[] | undefined, confidence: number | undefined]>,
    disposeSequence(sequenceId: number): void,

    // startPos in inclusive, endPos is exclusive
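
The `probabilities` element of the tuple returned by the native `sampleToken` is a flat array of interleaved `[token, probability, token, probability, ...]` pairs, sorted from most to least probable (see the `probabilities.Set(i * 2, ...)` and `Set(i * 2 + 1, ...)` calls in `AddonContext.cpp` above). A sketch of how such a flat array could be folded into the `Map<Token, number>` that the documentation example exposes; the actual conversion lives elsewhere in the library and may differ:

```typescript
import type {Token} from "node-llama-cpp";

// Sketch: convert the interleaved [token, prob, token, prob, ...] array returned by the
// native binding into a Map, preserving its most-probable-first order (illustrative only)
function probabilitiesArrayToMap(flat: (Token | number)[] | undefined): Map<Token, number> {
    const map = new Map<Token, number>();

    if (flat == null)
        return map;

    for (let i = 0; i + 1 < flat.length; i += 2)
        map.set(flat[i] as Token, flat[i + 1] as number);

    return map;
}
```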
