@@ -28,6 +28,7 @@ type ChatCommand = {
     temperature: number,
     topK: number,
     topP: number,
+    gpuLayers?: number,
     repeatPenalty: number,
     lastTokensRepeatPenalty: number,
     penalizeRepeatingNewLine: boolean,
@@ -122,6 +123,12 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
                 description: "Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P, and samples the next token only from this set. A float number between `0` and `1`. Set to `1` to disable. Only relevant when `temperature` is set to a value greater than `0`.",
                 group: "Optional:"
             })
+            .option("gpuLayers", {
+                alias: "gl",
+                type: "number",
+                description: "number of layers to store in VRAM",
+                group: "Optional:"
+            })
             .option("repeatPenalty", {
                 alias: "rp",
                 type: "number",
@@ -165,12 +172,12 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
     },
     async handler({
         model, systemInfo, systemPrompt, prompt, wrapper, contextSize,
-        grammar, threads, temperature, topK, topP, repeatPenalty,
+        grammar, threads, temperature, topK, topP, gpuLayers, repeatPenalty,
         lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens
     }) {
         try {
             await RunChat({
-                model, systemInfo, systemPrompt, prompt, wrapper, contextSize, grammar, threads, temperature, topK, topP,
+                model, systemInfo, systemPrompt, prompt, wrapper, contextSize, grammar, threads, temperature, topK, topP, gpuLayers,
                 lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens
             });
         } catch (err) {
@@ -183,7 +190,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {

 async function RunChat({
     model: modelArg, systemInfo, systemPrompt, prompt, wrapper, contextSize, grammar: grammarArg, threads, temperature, topK, topP,
-    lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens
+    gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens
 }: ChatCommand) {
     const {LlamaChatSession} = await import("../../llamaEvaluator/LlamaChatSession.js");
     const {LlamaModel} = await import("../../llamaEvaluator/LlamaModel.js");
@@ -192,7 +199,8 @@ async function RunChat({

     let initialPrompt = prompt ?? null;
     const model = new LlamaModel({
-        modelPath: path.resolve(process.cwd(), modelArg)
+        modelPath: path.resolve(process.cwd(), modelArg),
+        gpuLayers: gpuLayers != null ? gpuLayers : undefined
     });
     const context = new LlamaContext({
         model,
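A minimal usage sketch of the new option, mirroring the constructors this diff touches. It assumes the package's public exports (LlamaModel, LlamaContext, LlamaChatSession) correspond to the internal modules imported above; the model path, layer count, and prompt are placeholders.

import path from "path";
import {LlamaModel, LlamaContext, LlamaChatSession} from "node-llama-cpp";

// Offload 32 layers to VRAM; roughly what `--gpuLayers 32` (alias `-gl 32`)
// now does for the chat command before the model is loaded.
const model = new LlamaModel({
    modelPath: path.resolve(process.cwd(), "models/llama-model.bin"), // placeholder path
    gpuLayers: 32
});
const context = new LlamaContext({model});
const session = new LlamaChatSession({context});

console.log(await session.prompt("Hi there"));

When the flag is omitted, the handler passes undefined (gpuLayers != null ? gpuLayers : undefined), so the library's default layer count applies.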