
Commit 288fb82

docs: response segments, stop generation
1 parent e9e94c8 commit 288fb82

File tree

- docs/guide/chat-session.md
- docs/guide/external-chat-state.md
- docs/guide/function-calling.md
- docs/index.md

4 files changed: +167 −8 lines changed

docs/guide/chat-session.md

Lines changed: 116 additions & 4 deletions
@@ -100,9 +100,10 @@ const a1 = await session.prompt(q1, {
         process.stdout.write(chunk);
     }
 });
-
 ```
 
+> To stream `thought` segments, see [Stream Response Segments](#stream-response-segments)
+
 ## Repeat Penalty Customization {#repeat-penalty}
 You can see all the possible options of the [`prompt`](../api/classes/LlamaChatSession.md#prompt) function [here](../api/type-aliases/LLamaChatPromptOptions.md).
 ```typescript
@@ -682,7 +683,7 @@ to make the model follow a certain direction in its response.
 ```typescript
 import {fileURLToPath} from "url";
 import path from "path";
-import {getLlama, LlamaChatSession, GeneralChatWrapper} from "node-llama-cpp";
+import {getLlama, LlamaChatSession} from "node-llama-cpp";
 
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 
@@ -692,8 +693,7 @@ const model = await llama.loadModel({
 });
 const context = await model.createContext();
 const session = new LlamaChatSession({
-    contextSequence: context.getSequence(),
-    chatWrapper: new GeneralChatWrapper()
+    contextSequence: context.getSequence()
 });
 
 
@@ -705,3 +705,115 @@ const a1 = await session.prompt(q1, {
 });
 console.log("AI: " + a1);
 ```
+
+## Stop Response Generation {#stop-response-generation}
+To stop the generation of the current response, without removing the existing partial generation from the chat history,
+you can use the [`stopOnAbortSignal`](../api/type-aliases/LLamaChatPromptOptions.md#stoponabortsignal) option
+to configure what happens when the given [`signal`](../api/type-aliases/LLamaChatPromptOptions.md#signal) is aborted.
+
+```typescript
+import {fileURLToPath} from "url";
+import path from "path";
+import {getLlama, LlamaChatSession} from "node-llama-cpp";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+const llama = await getLlama();
+const model = await llama.loadModel({
+    modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf")
+});
+const context = await model.createContext();
+const session = new LlamaChatSession({
+    contextSequence: context.getSequence()
+});
+
+
+const abortController = new AbortController();
+const q1 = "Hi there, how are you?";
+console.log("User: " + q1);
+
+let response = "";
+
+const a1 = await session.prompt(q1, {
+    // stop the generation, instead of cancelling it
+    stopOnAbortSignal: true,
+
+    signal: abortController.signal,
+    onTextChunk(chunk) {
+        response += chunk;
+
+        if (response.length >= 10)
+            abortController.abort();
+    }
+});
+console.log("AI: " + a1);
+```
+
+
+## Stream Response Segments {#stream-response-segments}
+The raw model response is automatically segmented into different types of segments.
+The main response is not segmented, but other kinds of sections, like thoughts (chain of thought), are segmented.
+
+To stream response segments you can use the [`onResponseChunk`](../api/type-aliases/LLamaChatPromptOptions.md#onresponsechunk) option.
+
+```typescript
+import {fileURLToPath} from "url";
+import path from "path";
+import {getLlama, LlamaChatSession} from "node-llama-cpp";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+const llama = await getLlama();
+const model = await llama.loadModel({
+    modelPath: path.join(__dirname, "models", "DeepSeek-R1-Distill-Qwen-14B.Q4_K_M.gguf")
+});
+const context = await model.createContext();
+const session = new LlamaChatSession({
+    contextSequence: context.getSequence()
+});
+
+
+const q1 = "Hi there, how are you?";
+console.log("User: " + q1);
+
+process.stdout.write("AI: ");
+const a1 = await session.promptWithMeta(q1, {
+    onResponseChunk(chunk) {
+        const isThoughtSegment = chunk.type === "segment" &&
+            chunk.segmentType === "thought";
+
+        if (chunk.type === "segment" && chunk.segmentStartTime != null)
+            process.stdout.write(` [segment start: ${chunk.segmentType}] `);
+
+        process.stdout.write(chunk.text);
+
+        if (chunk.type === "segment" && chunk.segmentEndTime != null)
+            process.stdout.write(` [segment end: ${chunk.segmentType}] `);
+    }
+});
+
+const fullResponse = a1.response
+    .map((item) => {
+        if (typeof item === "string")
+            return item;
+        else if (item.type === "segment") {
+            const isThoughtSegment = item.segmentType === "thought";
+            let res = "";
+
+            if (item.startTime != null)
+                res += ` [segment start: ${item.segmentType}] `;
+
+            res += item.text;
+
+            if (item.endTime != null)
+                res += ` [segment end: ${item.segmentType}] `;
+
+            return res;
+        }
+
+        return "";
+    })
+    .join("");
+
+console.log("Full response: " + fullResponse);
+```
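
The committed stop-generation example aborts from inside `onTextChunk` once 10 characters have arrived. The same `stopOnAbortSignal` behavior works with any other `AbortSignal` source as well; the sketch below is only an illustration, assuming a `session` created as in the examples above, and uses Node's built-in `AbortSignal.timeout()` with an arbitrary 5-second limit.

```typescript
// a minimal sketch, not part of the commit: stop (rather than cancel) generation
// after a time limit; `session` is assumed to be an existing LlamaChatSession
const q2 = "Write a long story about llamas";
console.log("User: " + q2);

const a2 = await session.prompt(q2, {
    // keep the partial response in the chat history when the signal aborts
    stopOnAbortSignal: true,

    // abort generation after 5 seconds (arbitrary value)
    signal: AbortSignal.timeout(5_000)
});
console.log("AI (possibly truncated): " + a2);
```

The streaming example above defines an `isThoughtSegment` flag that the snippet itself doesn't consume; one possible use, sketched under the same assumptions, is to hide chain-of-thought output and stream only the main response.

```typescript
// a minimal sketch, not part of the commit: skip `thought` segments while streaming
await session.prompt("Hi there, how are you?", {
    onResponseChunk(chunk) {
        const isThoughtSegment = chunk.type === "segment" &&
            chunk.segmentType === "thought";

        // don't print chain-of-thought text; only stream the main response
        if (isThoughtSegment)
            return;

        process.stdout.write(chunk.text);
    }
});
```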

docs/guide/external-chat-state.md

Lines changed: 47 additions & 1 deletion
@@ -66,7 +66,29 @@ const res = await llamaChat.generateResponse(chatHistory, {
     }
 });
 
+const fullResponse = res.fullResponse
+    .map((item) => {
+        if (typeof item === "string")
+            return item;
+        else if (item.type === "segment") {
+            let res = "";
+            if (item.startTime != null)
+                res += ` [segment start: ${item.segmentType}] `;
+
+            res += item.text;
+
+            if (item.endTime != null)
+                res += ` [segment end: ${item.segmentType}] `;
+
+            return res;
+        }
+
+        return "";
+    })
+    .join("");
+
 console.log("AI: " + res.response);
+console.log("Full response:", fullResponse);
 ```
 
 Now, let's say we want to ask the model a follow-up question based on the previous response.
@@ -169,6 +191,7 @@ const res2 = await llamaChat.generateResponse(chatHistory, {
 });
 
 console.log("AI: " + res2.response);
+console.log("Full response:", res2.fullResponse);
 ```
 
 ## Handling Function Calling {#function-calling}
@@ -270,8 +293,31 @@ while (true) {
     lastContextShiftMetadata = res.lastEvaluation.contextShiftMetadata;
 
     // print the text the model generated before calling functions
-    if (res.response !== "")
+    if (res.response !== "") {
+        const fullResponse = res.fullResponse
+            .map((item) => {
+                if (typeof item === "string")
+                    return item;
+                else if (item.type === "segment") {
+                    let res = "";
+                    if (item.startTime != null)
+                        res += ` [segment start: ${item.segmentType}] `;
+
+                    res += item.text;
+
+                    if (item.endTime != null)
+                        res += ` [segment end: ${item.segmentType}] `;
+
+                    return res;
+                }
+
+                return "";
+            })
+            .join("");
+
         console.log("AI: " + res.response);
+        console.log("Full response:", fullResponse);
+    }
 
     // when there are no function calls,
     // it means the model has finished generating the response
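
The `fullResponse`-to-string mapping added above now appears in three places across this commit (twice in this file and once in `chat-session.md`). A possible follow-up, sketched below, is to factor it into a small helper; the `renderFullResponse` name and the item typing are hypothetical, inferred only from the shapes used in the diffs above.

```typescript
// a hypothetical helper, not part of the commit; the item shape is inferred
// from the examples above (plain strings and {type: "segment"} objects)
type FullResponseItem = string | {
    type: "segment",
    segmentType: string,
    text: string,
    startTime?: unknown,
    endTime?: unknown
};

function renderFullResponse(fullResponse: readonly FullResponseItem[]): string {
    return fullResponse
        .map((item) => {
            if (typeof item === "string")
                return item;
            else if (item.type === "segment") {
                let res = "";
                if (item.startTime != null)
                    res += ` [segment start: ${item.segmentType}] `;

                res += item.text;

                if (item.endTime != null)
                    res += ` [segment end: ${item.segmentType}] `;

                return res;
            }

            return "";
        })
        .join("");
}

// possible usage in the examples above:
// console.log("Full response:", renderFullResponse(res.fullResponse));
```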

docs/guide/function-calling.md

Lines changed: 2 additions & 2 deletions
@@ -7,15 +7,15 @@ description: Using function calling
 When prompting a model using a [`LlamaChatSession`](../api/classes/LlamaChatSession.md), you can provide a list of functions that a model can call during generation to retrieve information or perform actions.
 
 For this to work, `node-llama-cpp` tells the model what functions are available and what parameters they take, and instructs it to call those as needed.
-It also ensures that the model can only call functions with the correct parameters.
+It also ensures that when the model calls a function, it always uses the correct parameters.
 
 Some models have built-in support for function calling, and some of them are not trained for that.
 
 For example, _Llama 3_ is not trained for function calling.
 When using a _Llama 3_ model, the [`Llama3ChatWrapper`](../api/classes/Llama3ChatWrapper.md) is automatically used, and it includes a custom handling for function calling,
 which contains a fine-tuned instruction for explaining the model how to call functions and when to do so.
 
-There are also model that do have built-in support for function calling, like _Llama 3.1_.
+There are also models that do have built-in support for function calling, like _Llama 3.1_.
 When using a _Llama 3.1_ model, the [`Llama3_1ChatWrapper`](../api/classes/Llama3_1ChatWrapper.md) is automatically used, and it knows how to handle function calling for this model.
 
 In order for the model to know what functions can do and what they return, you need to provide this information in the function description.
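
The last context line above notes that the function description is how the model learns what a function does and returns. For reference, this is roughly what providing such a function can look like with `defineChatSessionFunction`; the `getFruitPrice` function, its schema, and the hard-coded prices below are illustrative only, and `session` is assumed to be an existing `LlamaChatSession`.

```typescript
import {defineChatSessionFunction} from "node-llama-cpp";

// an illustrative sketch, not part of this commit
const fruitPrices: Record<string, string> = {
    apple: "$6",
    banana: "$4"
};
const functions = {
    getFruitPrice: defineChatSessionFunction({
        // the description tells the model what the function does and what it returns
        description: "Get the price of a fruit. Returns the price as a string, " +
            "or a message when the fruit is not recognized.",
        params: {
            type: "object",
            properties: {
                name: {
                    type: "string"
                }
            }
        },
        handler(params) {
            const name = params.name.toLowerCase();
            if (fruitPrices[name] != null)
                return {name, price: fruitPrices[name]};

            return `Unrecognized fruit "${params.name}"`;
        }
    })
};

const answer = await session.prompt("Is an apple more expensive than a banana?", {functions});
console.log(answer);
```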

docs/index.md

Lines changed: 2 additions & 1 deletion
@@ -42,7 +42,7 @@ features:
     linkText: Learn more
   - icon: <svg xmlns="http://www.w3.org/2000/svg" height="24" viewBox="0 -960 960 960" width="24" fill="currentColor"><path d="M600-160q-17 0-28.5-11.5T560-200q0-17 11.5-28.5T600-240h80q17 0 28.5-11.5T720-280v-80q0-38 22-69t58-44v-14q-36-13-58-44t-22-69v-80q0-17-11.5-28.5T680-720h-80q-17 0-28.5-11.5T560-760q0-17 11.5-28.5T600-800h80q50 0 85 35t35 85v80q0 17 11.5 28.5T840-560t28.5 11.5Q880-537 880-520v80q0 17-11.5 28.5T840-400t-28.5 11.5Q800-377 800-360v80q0 50-35 85t-85 35h-80Zm-320 0q-50 0-85-35t-35-85v-80q0-17-11.5-28.5T120-400t-28.5-11.5Q80-423 80-440v-80q0-17 11.5-28.5T120-560t28.5-11.5Q160-583 160-600v-80q0-50 35-85t85-35h80q17 0 28.5 11.5T400-760q0 17-11.5 28.5T360-720h-80q-17 0-28.5 11.5T240-680v80q0 38-22 69t-58 44v14q36 13 58 44t22 69v80q0 17 11.5 28.5T280-240h80q17 0 28.5 11.5T400-200q0 17-11.5 28.5T360-160h-80Z"/></svg>
     title: Powerful features
-    details: Enforce a model to generate output according to a JSON schema, provide a model with functions it can call on demand, and much more
+    details: Force a model to generate output according to a JSON schema, provide a model with functions it can call on demand, and much more
     link: /guide/grammar#json-schema
     linkText: Learn more
 ---
@@ -98,6 +98,7 @@ npx -y node-llama-cpp inspect gpu
 * [User input safety](./guide/llama-text.md#input-safety-in-node-llama-cpp)
 * [Token prediction](./guide/token-prediction.md)
 * [Reranking](./guide/embedding.md#reranking)
+* [Thought segmentation](./guide/chat-session.md#stream-response-segments)
 
 </template>
 <template v-slot:simple-code>
