
Commit 25f016d

feat: thought budget, improve prompt completion

1 parent: 2e4877a

8 files changed: +431 −31 lines

docs/guide/chat-session.md

Lines changed: 55 additions & 0 deletions
````diff
@@ -898,3 +898,58 @@ const fullResponse = a1.response
 
 console.log("Full response: " + fullResponse);
 ```
+
+## Set Thinking Budget {#thinking-budget}
+You can set a thinking budget to limit the number of tokens a thinking model can spend on [thought segments](#stream-response-segments).
+```typescript
+import {
+    getLlama, LlamaChatSession, resolveModelFile, Token
+} from "node-llama-cpp";
+
+const modelPath = await resolveModelFile("hf:Qwen/Qwen3-14B-GGUF:Q4_K_M");
+
+const llama = await getLlama();
+const model = await llama.loadModel({modelPath});
+const context = await model.createContext();
+const session = new LlamaChatSession({
+    contextSequence: context.getSequence()
+});
+
+
+const q1 = "Where do llamas come from?";
+console.log("User: " + q1);
+
+const maxThoughtTokens = 100;
+
+let responseTokens = 0;
+let thoughtTokens = 0;
+
+process.stdout.write("AI: ");
+const response = await session.prompt(q1, {
+    budgets: {
+        thoughtTokens: maxThoughtTokens
+    },
+    onResponseChunk(chunk) {
+        const isThoughtSegment = chunk.type === "segment" &&
+            chunk.segmentType === "thought";
+
+        if (chunk.type === "segment" && chunk.segmentStartTime != null)
+            process.stdout.write(` [segment start: ${chunk.segmentType}] `);
+
+        process.stdout.write(chunk.text);
+
+        if (chunk.type === "segment" && chunk.segmentEndTime != null)
+            process.stdout.write(` [segment end: ${chunk.segmentType}] `);
+
+        if (isThoughtSegment)
+            thoughtTokens += chunk.tokens.length;
+        else
+            responseTokens += chunk.tokens.length;
+    }
+});
+
+console.log("Response: " + response);
+
+console.log("Response tokens: " + responseTokens);
+console.log("Thought tokens: " + thoughtTokens);
+```
````
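Note how the new docs example uses `chunk.segmentType` inside `onResponseChunk` to count thought tokens separately from visible response tokens. Per the CLI option description added in this commit, a budget of `0` disables reasoning entirely; a minimal sketch of the programmatic equivalent, reusing the `session` from the example above (an assumption inferred from the CLI wiring, not shown in these docs):

```typescript
// Assumed behavior, mirroring the CLI's `--thoughtBudget 0`:
// a thought budget of 0 suppresses thought segments, so the
// entire output is the visible response.
const answer = await session.prompt("What is 2 + 2?", {
    budgets: {
        thoughtTokens: 0
    }
});

console.log("Answer: " + answer);
```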

src/chatWrappers/generic/utils/extractSegmentSettingsFromTokenizerAndChatTemplate.ts

Lines changed: 2 additions & 1 deletion
```diff
@@ -41,7 +41,8 @@ export function extractSegmentSettingsFromTokenizerAndChatTemplate(
     return removeUndefinedFields({
         thought: tryMatchPrefixSuffixPair([
             ["<think>", "</think>"], // DeepSeek, QwQ
-            ["<thought>", "</thought>"] // EXAONE Deep
+            ["<thought>", "</thought>"], // EXAONE Deep
+            ["<|START_THINKING|>", "<|END_THINKING|>"] // Command R7B
         ])
     });
 }
```
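This change teaches the segment extractor to recognize Command R7B's `<|START_THINKING|>`/`<|END_THINKING|>` tags as thought segments. For context, `tryMatchPrefixSuffixPair` selects whichever thought-tag pair the loaded model actually uses; a minimal sketch of the idea, assuming the matcher simply checks the model's chat template for both tags (an illustration only, not the library's actual implementation):

```typescript
// Illustrative sketch, not node-llama-cpp's real implementation:
// return the first [prefix, suffix] pair whose tags both appear
// in the model's chat template.
function tryMatchPrefixSuffixPair(
    pairs: Array<[prefix: string, suffix: string]>,
    chatTemplate: string
): {prefix: string, suffix: string} | undefined {
    for (const [prefix, suffix] of pairs) {
        if (chatTemplate.includes(prefix) && chatTemplate.includes(suffix))
            return {prefix, suffix};
    }

    return undefined;
}
```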

src/cli/commands/ChatCommand.ts

Lines changed: 16 additions & 4 deletions
```diff
@@ -62,6 +62,7 @@ type ChatCommand = {
     repeatFrequencyPenalty?: number,
     repeatPresencePenalty?: number,
     maxTokens: number,
+    thoughtBudget?: number,
     noHistory: boolean,
     environmentFunctions: boolean,
     tokenPredictionDraftModel?: string,
@@ -262,6 +263,13 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
            default: 0,
            description: "Maximum number of tokens to generate in responses. Set to `0` to disable. Set to `-1` to set to the context size"
        })
+        .option("thoughtBudget", {
+            alias: ["tb", "thinkingBudget", "reasoningBudget"],
+            type: "number",
+            default: -1,
+            defaultDescription: "Unlimited",
+            description: "Maximum number of tokens the model can use for thoughts. Set to `0` to disable reasoning"
+        })
        .option("noHistory", {
            alias: "nh",
            type: "boolean",
@@ -318,7 +326,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
        promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, swaFullCache,
        noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK,
        topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
-        repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory,
+        repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, thoughtBudget, noHistory,
        environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
    }) {
        try {
@@ -327,8 +335,8 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
            batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads,
            temperature, minP, topK, topP, seed,
            gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
-            maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter,
-            timing, noMmap, printTimings
+            maxTokens, thoughtBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize,
+            debug, meter, timing, noMmap, printTimings
        });
    } catch (err) {
        await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -344,11 +352,12 @@ async function RunChat({
    contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg,
    jsonSchemaGrammarFile: jsonSchemaGrammarFilePath,
    threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine,
-    repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel,
+    repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, thoughtBudget, noHistory, environmentFunctions, tokenPredictionDraftModel,
    tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
}: ChatCommand) {
    if (contextSize === -1) contextSize = undefined;
    if (gpuLayers === -1) gpuLayers = undefined;
+    if (thoughtBudget === -1) thoughtBudget = undefined;
 
    const headers = resolveHeaderFlag(headerArg);
    const trimWhitespace = !noTrimWhitespace;
@@ -686,6 +695,9 @@ async function RunChat({
        seed: seed ?? undefined,
        signal: abortController.signal,
        stopOnAbortSignal: true,
+        budgets: {
+            thoughtTokens: thoughtBudget
+        },
        repeatPenalty: {
            penalty: repeatPenalty,
            frequencyPenalty: repeatFrequencyPenalty != null ? repeatFrequencyPenalty : undefined,
```
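With this wiring in place, the thought budget can also be set from the CLI, e.g. `node-llama-cpp chat --thoughtBudget 100` (aliases: `--tb`, `--thinkingBudget`, `--reasoningBudget`). The default of `-1` is normalized to `undefined` before being passed to `budgets.thoughtTokens`, leaving thinking unlimited, while `0` disables reasoning altogether.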
