@@ -2,6 +2,75 @@ import {removeNullFields} from "../utils/removeNullFields.js";
 import {llamaCppNode, LLAMAModel} from "./LlamaBins.js";


+export type LlamaModelOptions = {
+    /** path to the model on the filesystem */
+    modelPath: string,
+
+    /** If null, a random seed will be used */
+    seed?: number | null,
+
+    /** text context size */
+    contextSize?: number,
+
+    /** prompt processing batch size */
+    batchSize?: number,
+
+    /** number of layers to store in VRAM */
+    gpuLayers?: number,
+
+    /** if true, reduce VRAM usage at the cost of performance */
+    lowVram?: boolean,
+
+    /**
+     * Temperature is a hyperparameter that controls the randomness of the generated text.
+     * It affects the probability distribution of the model's output tokens.
+     * A higher temperature (e.g., 1.5) makes the output more random and creative,
+     * while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative.
+     * The suggested temperature is 0.8, which provides a balance between randomness and determinism.
+     * At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
+     *
+     * Set to `0` to disable.
+     */
+    temperature?: number,
+
+    /**
+     * Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation.
+     * An integer number between `1` and the size of the vocabulary.
+     * Set to `0` to disable (which uses the full vocabulary).
+     *
+     * Only relevant when `temperature` is set to a value greater than 0.
+     * */
+    topK?: number,
+
+    /**
+     * Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P,
+     * and samples the next token only from this set.
+     * A float number between `0` and `1`.
+     * Set to `1` to disable.
+     *
+     * Only relevant when `temperature` is set to a value greater than `0`.
+     * */
+    topP?: number,
+
+    /** use fp16 for KV cache */
+    f16Kv?: boolean,
+
+    /** the llama_eval() call computes all logits, not just the last one */
+    logitsAll?: boolean,
+
+    /** only load the vocabulary, no weights */
+    vocabOnly?: boolean,
+
+    /** use mmap if possible */
+    useMmap?: boolean,
+
+    /** force system to keep model in RAM */
+    useMlock?: boolean,
+
+    /** embedding mode only */
+    embedding?: boolean
+};
+
 export class LlamaModel {
     /** @internal */
     public readonly _model: LLAMAModel;
@@ -46,74 +115,7 @@ export class LlamaModel {
     public constructor({
         modelPath, seed = null, contextSize = 1024 * 4, batchSize, gpuLayers,
         lowVram, temperature = 0, topK = 40, topP = 0.95, f16Kv, logitsAll, vocabOnly, useMmap, useMlock, embedding
-    }: {
-        /** path to the model on the filesystem */
-        modelPath: string,
-
-        /** If null, a random seed will be used */
-        seed?: number | null,
-
-        /** text context size */
-        contextSize?: number,
-
-        /** prompt processing batch size */
-        batchSize?: number,
-
-        /** number of layers to store in VRAM */
-        gpuLayers?: number,
-
-        /** if true, reduce VRAM usage at the cost of performance */
-        lowVram?: boolean,
-
-        /**
-         * Temperature is a hyperparameter that controls the randomness of the generated text.
-         * It affects the probability distribution of the model's output tokens.
-         * A higher temperature (e.g., 1.5) makes the output more random and creative,
-         * while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative.
-         * The suggested temperature is 0.8, which provides a balance between randomness and determinism.
-         * At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
-         *
-         * Set to `0` to disable.
-         */
-        temperature?: number,
-
-        /**
-         * Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation.
-         * An integer number between `1` and the size of the vocabulary.
-         * Set to `0` to disable (which uses the full vocabulary).
-         *
-         * Only relevant when `temperature` is set to a value greater than 0.
-         * */
-        topK?: number,
-
-        /**
-         * Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P,
-         * and samples the next token only from this set.
-         * A float number between `0` and `1`.
-         * Set to `1` to disable.
-         *
-         * Only relevant when `temperature` is set to a value greater than `0`.
-         * */
-        topP?: number,
-
-        /** use fp16 for KV cache */
-        f16Kv?: boolean,
-
-        /** the llama_eval() call computes all logits, not just the last one */
-        logitsAll?: boolean,
-
-        /** only load the vocabulary, no weights */
-        vocabOnly?: boolean,
-
-        /** use mmap if possible */
-        useMmap?: boolean,
-
-        /** force system to keep model in RAM */
-        useMlock?: boolean,
-
-        /** embedding mode only */
-        embedding?: boolean
-    }) {
+    }: LlamaModelOptions) {
         this._model = new LLAMAModel(modelPath, removeNullFields({
             seed: seed != null ? Math.max(-1, seed) : undefined,
             contextSize,
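For reference, a minimal usage sketch of the newly exported type. The `node-llama-cpp` import specifier and the model path below are illustrative assumptions and are not part of this diff; only `LlamaModel` and `LlamaModelOptions` come from the code above.

import {LlamaModel, type LlamaModelOptions} from "node-llama-cpp"; // package specifier is an assumption

// With the type exported, the options object can be typed, stored, and reused
// independently of the constructor call.
const options: LlamaModelOptions = {
    modelPath: "path/to/model.bin", // hypothetical path
    contextSize: 1024 * 4,          // matches the constructor default
    temperature: 0.8,               // the suggested balance between randomness and determinism
    topK: 40,
    topP: 0.95
};

const model = new LlamaModel(options);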