forked from go-skynet/go-llama.cpp
-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy path types.go
More file actions
158 lines (134 loc) · 3.97 KB
/
types.go
File metadata and controls
158 lines (134 loc) · 3.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
package llama
import (
"runtime"
)
// contextConfig holds configuration for context creation.
//
// These settings control the token window, batching, threading, and
// KV-cache behaviour of an inference context. Defaults live in
// defaultContextConfig.
type contextConfig struct {
	contextSize   int    // Token context window; 0 = use model's native maximum (queried after load)
	batchSize     int    // Maximum number of tokens processed per batch
	threads       int    // Worker threads for generation
	threadsBatch  int    // Worker threads for batch processing; 0 = same as threads (set in wrapper)
	nParallel     int    // Number of parallel sequences (for batch embeddings)
	f16Memory     bool   // Use 16-bit floats for memory — presumably the KV cache; confirm in wrapper
	embeddings    bool   // Enable embedding mode for this context
	prefixCaching bool   // Enable KV cache prefix reuse (default: true)
	kvCacheType   string // KV cache quantization type: "f16", "q8_0", "q4_0" (default: "q8_0")
	flashAttn     string // Flash Attention mode: "auto", "enabled", "disabled" (default: "auto")
}
// generateConfig holds configuration for text generation.
//
// Most sampling parameters have a "disabled" sentinel value; see
// defaultGenerateConfig for the default of each field. Fields are grouped
// by the sampling stage they affect.
type generateConfig struct {
	// Basic generation
	maxTokens   int      // Maximum number of tokens to generate
	temperature float32  // Sampling temperature
	seed        int      // RNG seed; -1 = random (per defaultGenerateConfig)
	stopWords   []string // Strings that terminate generation when produced; nil = none
	draftTokens int      // Number of draft tokens — presumably for speculative decoding; confirm in wrapper
	debug       bool     // Enable debug output

	// Basic sampling parameters
	topK      int     // Keep only the K most likely tokens
	topP      float32 // Nucleus sampling cumulative-probability cutoff
	minP      float32 // Minimum probability cutoff relative to the top token
	typP      float32 // Typical-P sampling; 1.0 = disabled
	topNSigma float32 // Top-n-sigma sampling; -1.0 = disabled
	minKeep   int     // Minimum number of candidate tokens to keep

	// Repetition penalties
	penaltyLastN   int     // Window of recent tokens the penalties consider
	penaltyRepeat  float32 // Repetition penalty; 1.0 = disabled
	penaltyFreq    float32 // Frequency penalty; 0.0 = disabled
	penaltyPresent float32 // Presence penalty; 0.0 = disabled

	// DRY (Don't Repeat Yourself) sampling
	dryMultiplier       float32  // DRY penalty multiplier; 0.0 = disabled
	dryBase             float32  // DRY exponential base
	dryAllowedLength    int      // Repeat length allowed before the DRY penalty applies
	dryPenaltyLastN     int      // Window for DRY; -1 = context size
	drySequenceBreakers []string // Tokens that reset DRY sequence matching

	// Dynamic temperature
	dynatempRange    float32 // Temperature range; 0.0 = disabled
	dynatempExponent float32 // Exponent shaping the dynamic-temperature curve

	// XTC (eXclude Top Choices) sampling
	xtcProbability float32 // Probability of applying XTC; 0.0 = disabled
	xtcThreshold   float32 // Probability threshold above which tokens may be excluded

	// Mirostat sampling
	mirostat    int     // Mirostat version (1 or 2); 0 = disabled
	mirostatTau float32 // Mirostat target entropy (tau)
	mirostatEta float32 // Mirostat learning rate (eta)

	// Other parameters
	nPrev     int  // Number of previous tokens to remember for sampling
	nProbs    int  // Number of token probabilities to report; 0 = disabled
	ignoreEOS bool // Keep generating past the end-of-sequence token
}
// defaultContextConfig is the default context configuration; ContextOption
// functions mutate a copy of it.
var defaultContextConfig = contextConfig{
	contextSize:   0, // 0 = use model's native maximum (queried after load)
	batchSize:     512,
	threads:       runtime.NumCPU(),
	threadsBatch:  0,    // 0 means use same as threads (set in wrapper)
	nParallel:     1,    // 1 for generation, auto-set higher for embeddings
	f16Memory:     false,
	embeddings:    false,
	prefixCaching: true,   // Enable by default for performance
	kvCacheType:   "q8_0", // 50% VRAM savings with ~0.1% quality loss
	flashAttn:     "auto", // Let llama.cpp choose optimal path
}
// defaultGenerateConfig is the default generation configuration;
// GenerateOption functions mutate a copy of it. Fields omitted from the
// literal (stopWords, among others) take their zero value: no stop words
// by default.
var defaultGenerateConfig = generateConfig{
	// Basic generation
	maxTokens:   128,
	temperature: 0.8,
	seed:        -1, // -1 = random seed
	draftTokens: 16,
	debug:       false,

	// Basic sampling parameters
	topK:      40,
	topP:      0.95,
	minP:      0.05,
	typP:      1.0,  // 1.0 = disabled
	topNSigma: -1.0, // -1.0 = disabled
	minKeep:   0,

	// Repetition penalties
	penaltyLastN:   64,
	penaltyRepeat:  1.0, // 1.0 = disabled
	penaltyFreq:    0.0, // 0.0 = disabled
	penaltyPresent: 0.0, // 0.0 = disabled

	// DRY sampling
	dryMultiplier:       0.0, // 0.0 = disabled
	dryBase:             1.75,
	dryAllowedLength:    2,
	dryPenaltyLastN:     -1, // -1 = context size
	drySequenceBreakers: []string{"\n", ":", "\"", "*"},

	// Dynamic temperature
	dynatempRange:    0.0, // 0.0 = disabled
	dynatempExponent: 1.0,

	// XTC sampling
	xtcProbability: 0.0, // 0.0 = disabled
	xtcThreshold:   0.1,

	// Mirostat sampling
	mirostat:    0, // 0 = disabled
	mirostatTau: 5.0,
	mirostatEta: 0.1,

	// Other parameters
	nPrev:     64,
	nProbs:    0, // 0 = disabled
	ignoreEOS: false,
}
// modelConfig holds configuration for model loading (model-level only).
//
// Context-level settings belong in contextConfig; these options apply once,
// when the model weights are loaded.
type modelConfig struct {
	gpuLayers               int    // Layers to offload to GPU; -1 = all (see defaultModelConfig)
	mlock                   bool   // Lock model memory to prevent swapping
	mmap                    bool   // Memory-map model weights instead of reading them into RAM
	mainGPU                 string // Primary GPU selector — string form; verify expected format against the wrapper
	tensorSplit             string // Per-GPU tensor split — presumably comma-separated proportions; confirm in wrapper
	disableProgressCallback bool   // Suppress load-progress reporting
	progressCallback        ProgressCallback // Invoked during model load — see ProgressCallback's declaration for semantics
}
// defaultModelConfig is the default model-loading configuration;
// ModelOption functions mutate a copy of it. Omitted fields (mainGPU,
// tensorSplit, disableProgressCallback, progressCallback) take their zero
// values: no GPU pinning, no tensor split, progress reporting enabled with
// no callback.
var defaultModelConfig = modelConfig{
	gpuLayers: -1, // Offload all layers to GPU by default (falls back to CPU if unavailable)
	mlock:     false,
	mmap:      true,
}
// ModelOption configures model loading behaviour (model-level settings).
// It follows the functional-options pattern: each option mutates a
// modelConfig seeded from defaultModelConfig.
type ModelOption func(*modelConfig)

// ContextOption configures context creation (context-level settings).
// Each option mutates a contextConfig seeded from defaultContextConfig.
type ContextOption func(*contextConfig)

// GenerateOption configures text generation behaviour.
// Each option mutates a generateConfig seeded from defaultGenerateConfig.
type GenerateOption func(*generateConfig)