
Commit 25f016d

feat: thought budget, improve prompt completion

1 parent: 2e4877a

8 files changed: +431 −31 lines

docs/guide/chat-session.md

Lines changed: 55 additions & 0 deletions
````diff
@@ -898,3 +898,58 @@ const fullResponse = a1.response
 
 console.log("Full response: " + fullResponse);
 ```
+
+## Set Thinking Budget {#thinking-budget}
+You can set a thinking budget to limit the number of tokens a thinking model can spend on [thought segments](#stream-response-segments).
+```typescript
+import {
+    getLlama, LlamaChatSession, resolveModelFile, Token
+} from "node-llama-cpp";
+
+const modelPath = await resolveModelFile("hf:Qwen/Qwen3-14B-GGUF:Q4_K_M");
+
+const llama = await getLlama();
+const model = await llama.loadModel({modelPath});
+const context = await model.createContext();
+const session = new LlamaChatSession({
+    contextSequence: context.getSequence()
+});
+
+
+const q1 = "Where do llamas come from?";
+console.log("User: " + q1);
+
+const maxThoughtTokens = 100;
+
+let responseTokens = 0;
+let thoughtTokens = 0;
+
+process.stdout.write("AI: ");
+const response = await session.prompt(q1, {
+    budgets: {
+        thoughtTokens: maxThoughtTokens
+    },
+    onResponseChunk(chunk) {
+        const isThoughtSegment = chunk.type === "segment" &&
+            chunk.segmentType === "thought";
+
+        if (chunk.type === "segment" && chunk.segmentStartTime != null)
+            process.stdout.write(` [segment start: ${chunk.segmentType}] `);
+
+        process.stdout.write(chunk.text);
+
+        if (chunk.type === "segment" && chunk.segmentEndTime != null)
+            process.stdout.write(` [segment end: ${chunk.segmentType}] `);
+
+        if (isThoughtSegment)
+            thoughtTokens += chunk.tokens.length;
+        else
+            responseTokens += chunk.tokens.length;
+    }
+});
+
+console.log("Response: " + response);
+
+console.log("Response tokens: " + responseTokens);
+console.log("Thought tokens: " + thoughtTokens);
+```
````
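Note how the new docs example uses `chunk.segmentType` inside `onResponseChunk` to count thought tokens separately from visible response tokens. Per the CLI option description added in this commit, a budget of `0` disables reasoning entirely; a minimal sketch of the programmatic equivalent, reusing the `session` from the example above (an assumption inferred from the CLI wiring, not shown in these docs):

```typescript
// Assumed behavior, mirroring the CLI's `--thoughtBudget 0`:
// a thought budget of 0 suppresses thought segments, so the
// entire output is the visible response.
const answer = await session.prompt("What is 2 + 2?", {
    budgets: {
        thoughtTokens: 0
    }
});

console.log("Answer: " + answer);
```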

src/chatWrappers/generic/utils/extractSegmentSettingsFromTokenizerAndChatTemplate.ts

Lines changed: 2 additions & 1 deletion
```diff
@@ -41,7 +41,8 @@ export function extractSegmentSettingsFromTokenizerAndChatTemplate(
     return removeUndefinedFields({
         thought: tryMatchPrefixSuffixPair([
             ["<think>", "</think>"], // DeepSeek, QwQ
-            ["<thought>", "</thought>"] // EXAONE Deep
+            ["<thought>", "</thought>"], // EXAONE Deep
+            ["<|START_THINKING|>", "<|END_THINKING|>"] // Command R7B
         ])
     });
 }
```
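This change teaches the segment extractor to recognize Command R7B's `<|START_THINKING|>`/`<|END_THINKING|>` tags as thought segments. For context, `tryMatchPrefixSuffixPair` selects whichever thought-tag pair the loaded model actually uses; a minimal sketch of the idea, assuming the matcher simply checks the model's chat template for both tags (an illustration only, not the library's actual implementation):

```typescript
// Illustrative sketch, not node-llama-cpp's real implementation:
// return the first [prefix, suffix] pair whose tags both appear
// in the model's chat template.
function tryMatchPrefixSuffixPair(
    pairs: Array<[prefix: string, suffix: string]>,
    chatTemplate: string
): {prefix: string, suffix: string} | undefined {
    for (const [prefix, suffix] of pairs) {
        if (chatTemplate.includes(prefix) && chatTemplate.includes(suffix))
            return {prefix, suffix};
    }

    return undefined;
}
```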

src/cli/commands/ChatCommand.ts

Lines changed: 16 additions & 4 deletions
```diff
@@ -62,6 +62,7 @@ type ChatCommand = {
     repeatFrequencyPenalty?: number,
     repeatPresencePenalty?: number,
     maxTokens: number,
+    thoughtBudget?: number,
     noHistory: boolean,
     environmentFunctions: boolean,
     tokenPredictionDraftModel?: string,
@@ -262,6 +263,13 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
            default: 0,
            description: "Maximum number of tokens to generate in responses. Set to `0` to disable. Set to `-1` to set to the context size"
        })
+        .option("thoughtBudget", {
+            alias: ["tb", "thinkingBudget", "reasoningBudget"],
+            type: "number",
+            default: -1,
+            defaultDescription: "Unlimited",
+            description: "Maximum number of tokens the model can use for thoughts. Set to `0` to disable reasoning"
+        })
        .option("noHistory", {
            alias: "nh",
            type: "boolean",
@@ -318,7 +326,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
        promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, swaFullCache,
        noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK,
        topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
-        repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory,
+        repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, thoughtBudget, noHistory,
        environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
    }) {
        try {
@@ -327,8 +335,8 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
            batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads,
            temperature, minP, topK, topP, seed,
            gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
-            maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter,
-            timing, noMmap, printTimings
+            maxTokens, thoughtBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize,
+            debug, meter, timing, noMmap, printTimings
        });
    } catch (err) {
        await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -344,11 +352,12 @@ async function RunChat({
    contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg,
    jsonSchemaGrammarFile: jsonSchemaGrammarFilePath,
    threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine,
-    repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel,
+    repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, thoughtBudget, noHistory, environmentFunctions, tokenPredictionDraftModel,
    tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
}: ChatCommand) {
    if (contextSize === -1) contextSize = undefined;
    if (gpuLayers === -1) gpuLayers = undefined;
+    if (thoughtBudget === -1) thoughtBudget = undefined;
 
    const headers = resolveHeaderFlag(headerArg);
    const trimWhitespace = !noTrimWhitespace;
@@ -686,6 +695,9 @@ async function RunChat({
        seed: seed ?? undefined,
        signal: abortController.signal,
        stopOnAbortSignal: true,
+        budgets: {
+            thoughtTokens: thoughtBudget
+        },
        repeatPenalty: {
            penalty: repeatPenalty,
            frequencyPenalty: repeatFrequencyPenalty != null ? repeatFrequencyPenalty : undefined,
```
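With this wiring in place, the thought budget can also be set from the CLI, e.g. `node-llama-cpp chat --thoughtBudget 100` (aliases: `--tb`, `--thinkingBudget`, `--reasoningBudget`). The default of `-1` is normalized to `undefined` before being passed to `budgets.thoughtTokens`, leaving thinking unlimited, while `0` disables reasoning altogether.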
