@@ -62,6 +62,7 @@ type ChatCommand = {
     repeatFrequencyPenalty?: number,
     repeatPresencePenalty?: number,
     maxTokens: number,
+    thoughtBudget?: number,
     noHistory: boolean,
     environmentFunctions: boolean,
     tokenPredictionDraftModel?: string,
@@ -262,6 +263,13 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
             default: 0,
             description: "Maximum number of tokens to generate in responses. Set to `0` to disable. Set to `-1` to set to the context size"
         })
+        .option("thoughtBudget", {
+            alias: ["tb", "thinkingBudget", "reasoningBudget"],
+            type: "number",
+            default: -1,
+            defaultDescription: "Unlimited",
+            description: "Maximum number of tokens the model can use for thoughts. Set to `0` to disable reasoning"
+        })
         .option("noHistory", {
             alias: "nh",
             type: "boolean",
@@ -318,7 +326,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
         promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, swaFullCache,
         noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK,
         topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
-        repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory,
+        repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, thoughtBudget, noHistory,
         environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
     }) {
         try {
@@ -327,8 +335,8 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
             batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads,
             temperature, minP, topK, topP, seed,
             gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
-            maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter,
-            timing, noMmap, printTimings
+            maxTokens, thoughtBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize,
+            debug, meter, timing, noMmap, printTimings
         });
     } catch (err) {
         await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -344,11 +352,12 @@ async function RunChat({
     contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg,
     jsonSchemaGrammarFile: jsonSchemaGrammarFilePath,
     threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine,
-    repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel,
+    repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, thoughtBudget, noHistory, environmentFunctions, tokenPredictionDraftModel,
     tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
 }: ChatCommand) {
     if (contextSize === -1) contextSize = undefined;
     if (gpuLayers === -1) gpuLayers = undefined;
+    if (thoughtBudget === -1) thoughtBudget = undefined;

     const headers = resolveHeaderFlag(headerArg);
     const trimWhitespace = !noTrimWhitespace;
@@ -686,6 +695,9 @@ async function RunChat({
             seed: seed ?? undefined,
             signal: abortController.signal,
             stopOnAbortSignal: true,
+            budgets: {
+                thoughtTokens: thoughtBudget
+            },
             repeatPenalty: {
                 penalty: repeatPenalty,
                 frequencyPenalty: repeatFrequencyPenalty != null ? repeatFrequencyPenalty : undefined,