@@ -45,6 +45,7 @@ type ChatCommand = {
4545 contextSize ?: number ,
4646 batchSize ?: number ,
4747 flashAttention ?: boolean ,
48+ swaFullCache ?: boolean ,
4849 noTrimWhitespace : boolean ,
4950 grammar : "text" | Parameters < typeof LlamaGrammar . getFor > [ 1 ] ,
5051 jsonSchemaGrammarFile ?: string ,
@@ -162,6 +163,12 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
162163 default : false ,
163164 description : "Enable flash attention"
164165 } )
166+ . option ( "swaFullCache" , {
167+ alias : "noSwa" ,
168+ type : "boolean" ,
169+ default : false ,
170+ description : "Disable SWA (Sliding Window Attention) on supported models"
171+ } )
165172 . option ( "noTrimWhitespace" , {
166173 type : "boolean" ,
167174 alias : [ "noTrim" ] ,
@@ -308,7 +315,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
308315 } ,
309316 async handler ( {
310317 modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt,
311- promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention,
318+ promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, swaFullCache ,
312319 noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK,
313320 topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
314321 repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory,
@@ -317,7 +324,8 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
317324 try {
318325 await RunChat ( {
319326 modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, contextSize,
320- batchSize, flashAttention, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed,
327+ batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads,
328+ temperature, minP, topK, topP, seed,
321329 gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
322330 maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter,
323331 timing, noMmap, printTimings
@@ -333,7 +341,8 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
333341
334342async function RunChat ( {
335343 modelPath : modelArg , header : headerArg , gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja,
336- contextSize, batchSize, flashAttention, noTrimWhitespace, grammar : grammarArg , jsonSchemaGrammarFile : jsonSchemaGrammarFilePath ,
344+ contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar : grammarArg ,
345+ jsonSchemaGrammarFile : jsonSchemaGrammarFilePath ,
337346 threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine,
338347 repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel,
339348 tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
@@ -363,11 +372,13 @@ async function RunChat({
363372
364373 const resolvedModelPath = await resolveCommandGgufPath ( modelArg , llama , headers , {
365374 flashAttention,
375+ swaFullCache,
366376 useMmap
367377 } ) ;
368378 const resolvedDraftModelPath = ( tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "" )
369379 ? await resolveCommandGgufPath ( tokenPredictionDraftModel , llama , headers , {
370380 flashAttention,
381+ swaFullCache,
371382 useMmap,
372383 consoleTitle : "Draft model file"
373384 } )
@@ -413,6 +424,7 @@ async function RunChat({
413424 ? { fitContext : { contextSize} }
414425 : undefined ,
415426 defaultContextFlashAttention : flashAttention ,
427+ defaultContextSwaFullCache : swaFullCache ,
416428 useMmap,
417429 ignoreMemorySafetyChecks : gpuLayers != null ,
418430 onLoadProgress ( loadProgress : number ) {
@@ -446,6 +458,7 @@ async function RunChat({
446458 return await llama . loadModel ( {
447459 modelPath : resolvedDraftModelPath ,
448460 defaultContextFlashAttention : flashAttention ,
461+ defaultContextSwaFullCache : swaFullCache ,
449462 useMmap,
450463 onLoadProgress ( loadProgress : number ) {
451464 progressUpdater . setProgress ( loadProgress ) ;
0 commit comments