@@ -76,13 +76,6 @@ export type LlamaModelOptions = {
      */
     topP?: number,
 
-    /**
-     * use fp16 for KV cache
-     * @deprecated use the `f16Kv` option on `LlamaContext` instead
-     * @hidden
-     */
-    f16Kv?: boolean,
-
     /**
      * the llama_eval() call computes all logits, not just the last one
      * @deprecated use the `logitsAll` option on `LlamaContext` instead
@@ -116,7 +109,6 @@ export class LlamaModel {
         seed: LlamaModelOptions["seed"],
         contextSize: LlamaModelOptions["contextSize"],
         batchSize: LlamaModelOptions["batchSize"],
-        f16Kv: LlamaModelOptions["f16Kv"],
         logitsAll: LlamaModelOptions["logitsAll"],
         embedding: LlamaModelOptions["embedding"],
         threads: LlamaModelOptions["threads"]
@@ -160,7 +152,6 @@ export class LlamaModel {
      * Set to `1` to disable.
      *
      * Only relevant when `temperature` is set to a value greater than `0`.
-     * @param {boolean} [options.f16Kv] - use fp16 for KV cache
      * @param {boolean} [options.logitsAll] - the llama_eval() call computes all logits, not just the last one
      * @param {boolean} [options.vocabOnly] - only load the vocabulary, no weights
      * @param {boolean} [options.useMmap] - use mmap if possible
@@ -169,7 +160,7 @@ export class LlamaModel {
      */
     public constructor({
         modelPath, seed = null, contextSize = 1024 * 4, batchSize, gpuLayers,
-        threads = 6, temperature = 0, topK = 40, topP = 0.95, f16Kv, logitsAll, vocabOnly, useMmap, useMlock, embedding
+        threads = 6, temperature = 0, topK = 40, topP = 0.95, logitsAll, vocabOnly, useMmap, useMlock, embedding
     }: LlamaModelOptions) {
         this._model = new LLAMAModel(path.resolve(process.cwd(), modelPath), removeNullFields({
             gpuLayers,
@@ -182,7 +173,6 @@ export class LlamaModel {
             seed,
             contextSize,
             batchSize,
-            f16Kv,
             logitsAll,
             embedding,
             threads
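
For callers that were passing `f16Kv` to `LlamaModel`, the deprecation notes removed above point to the `f16Kv` option on `LlamaContext` instead. A minimal migration sketch, assuming the `LlamaContext` options shape implied by those deprecation notes (the model path is a placeholder):

```ts
import {LlamaModel, LlamaContext} from "node-llama-cpp";

// Before this change: new LlamaModel({modelPath, f16Kv: true})
// After: f16Kv is no longer a LlamaModel option; set it on the context instead.
const model = new LlamaModel({
    modelPath: "path/to/model.gguf" // placeholder path
});

const context = new LlamaContext({
    model,
    f16Kv: true // use fp16 for the KV cache (assumed per the deprecation note)
});
```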