@@ -28,6 +28,7 @@ type ChatCommand = {
     temperature: number,
     topK: number,
     topP: number,
+    gpuLayers?: number,
     repeatPenalty: number,
     lastTokensRepeatPenalty: number,
     penalizeRepeatingNewLine: boolean,
@@ -122,6 +123,12 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
             description: "Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P, and samples the next token only from this set. A float number between `0` and `1`. Set to `1` to disable. Only relevant when `temperature` is set to a value greater than `0`.",
             group: "Optional:"
         })
+        .option("gpuLayers", {
+            alias: "gl",
+            type: "number",
+            description: "number of layers to store in VRAM",
+            group: "Optional:"
+        })
         .option("repeatPenalty", {
             alias: "rp",
             type: "number",
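A minimal standalone sketch of how the new flag behaves once parsed (illustrative only, not part of the PR; it assumes yargs v17 and simply mirrors the option block added above):

```ts
import yargs from "yargs";
import {hideBin} from "yargs/helpers";

// Mirrors the `gpuLayers` option added in this PR; parsing it in isolation here
// is only a sketch to show the resulting value and type.
const argv = yargs(hideBin(process.argv))
    .option("gpuLayers", {
        alias: "gl",
        type: "number",
        description: "number of layers to store in VRAM",
        group: "Optional:"
    })
    .parseSync();

// When `--gpuLayers`/`-gl` is omitted, `argv.gpuLayers` stays `undefined`,
// which is why the `ChatCommand` type declares it as `gpuLayers?: number`.
console.log(argv.gpuLayers);
```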
@@ -165,12 +172,12 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
     },
     async handler({
         model, systemInfo, systemPrompt, prompt, wrapper, contextSize,
-        grammar, threads, temperature, topK, topP, repeatPenalty,
+        grammar, threads, temperature, topK, topP, gpuLayers, repeatPenalty,
         lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens
     }) {
         try {
             await RunChat({
-                model, systemInfo, systemPrompt, prompt, wrapper, contextSize, grammar, threads, temperature, topK, topP,
+                model, systemInfo, systemPrompt, prompt, wrapper, contextSize, grammar, threads, temperature, topK, topP, gpuLayers,
                 lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens
             });
         } catch (err) {
@@ -183,7 +190,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
 
 async function RunChat({
     model: modelArg, systemInfo, systemPrompt, prompt, wrapper, contextSize, grammar: grammarArg, threads, temperature, topK, topP,
-    lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens
+    gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens
 }: ChatCommand) {
     const {LlamaChatSession} = await import("../../llamaEvaluator/LlamaChatSession.js");
     const {LlamaModel} = await import("../../llamaEvaluator/LlamaModel.js");
@@ -192,7 +199,8 @@ async function RunChat({
 
     let initialPrompt = prompt ?? null;
     const model = new LlamaModel({
-        modelPath: path.resolve(process.cwd(), modelArg)
+        modelPath: path.resolve(process.cwd(), modelArg),
+        gpuLayers: gpuLayers != null ? gpuLayers : undefined
     });
     const context = new LlamaContext({
         model,
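Taken together, the changes pass the parsed CLI value straight through to the model constructor. A rough sketch of the equivalent programmatic call (the public import path, the model file name, and the layer count are assumptions, not part of the diff):

```ts
import path from "path";
import {LlamaModel} from "node-llama-cpp"; // assumed public export; the diff imports it from an internal path

const model = new LlamaModel({
    modelPath: path.resolve(process.cwd(), "models/llama-model.gguf"), // hypothetical model file
    gpuLayers: 32 // offload 32 layers to VRAM; leave undefined to keep the library default
});
```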