forked from go-skynet/go-llama.cpp
-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy path types.go
More file actions
158 lines (134 loc) · 3.97 KB
/
types.go
File metadata and controls
158 lines (134 loc) · 3.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
package llama
import (
"runtime"
)
// contextConfig holds configuration for context creation.
//
// These settings control the token window, batching, threading, and
// KV-cache behaviour of an inference context. Defaults live in
// defaultContextConfig.
type contextConfig struct {
	contextSize   int    // Token context window; 0 = use model's native maximum (queried after load)
	batchSize     int    // Maximum number of tokens processed per batch
	threads       int    // Worker threads for generation
	threadsBatch  int    // Worker threads for batch processing; 0 = same as threads (set in wrapper)
	nParallel     int    // Number of parallel sequences (for batch embeddings)
	f16Memory     bool   // Use 16-bit floats for memory — presumably the KV cache; confirm in wrapper
	embeddings    bool   // Enable embedding mode for this context
	prefixCaching bool   // Enable KV cache prefix reuse (default: true)
	kvCacheType   string // KV cache quantization type: "f16", "q8_0", "q4_0" (default: "q8_0")
	flashAttn     string // Flash Attention mode: "auto", "enabled", "disabled" (default: "auto")
}
// generateConfig holds configuration for text generation.
//
// Most sampling parameters have a "disabled" sentinel value; see
// defaultGenerateConfig for the default of each field. Fields are grouped
// by the sampling stage they affect.
type generateConfig struct {
	// Basic generation
	maxTokens   int      // Maximum number of tokens to generate
	temperature float32  // Sampling temperature
	seed        int      // RNG seed; -1 = random (per defaultGenerateConfig)
	stopWords   []string // Strings that terminate generation when produced; nil = none
	draftTokens int      // Number of draft tokens — presumably for speculative decoding; confirm in wrapper
	debug       bool     // Enable debug output

	// Basic sampling parameters
	topK      int     // Keep only the K most likely tokens
	topP      float32 // Nucleus sampling cumulative-probability cutoff
	minP      float32 // Minimum probability cutoff relative to the top token
	typP      float32 // Typical-P sampling; 1.0 = disabled
	topNSigma float32 // Top-n-sigma sampling; -1.0 = disabled
	minKeep   int     // Minimum number of candidate tokens to keep

	// Repetition penalties
	penaltyLastN   int     // Window of recent tokens the penalties consider
	penaltyRepeat  float32 // Repetition penalty; 1.0 = disabled
	penaltyFreq    float32 // Frequency penalty; 0.0 = disabled
	penaltyPresent float32 // Presence penalty; 0.0 = disabled

	// DRY (Don't Repeat Yourself) sampling
	dryMultiplier       float32  // DRY penalty multiplier; 0.0 = disabled
	dryBase             float32  // DRY exponential base
	dryAllowedLength    int      // Repeat length allowed before the DRY penalty applies
	dryPenaltyLastN     int      // Window for DRY; -1 = context size
	drySequenceBreakers []string // Tokens that reset DRY sequence matching

	// Dynamic temperature
	dynatempRange    float32 // Temperature range; 0.0 = disabled
	dynatempExponent float32 // Exponent shaping the dynamic-temperature curve

	// XTC (eXclude Top Choices) sampling
	xtcProbability float32 // Probability of applying XTC; 0.0 = disabled
	xtcThreshold   float32 // Probability threshold above which tokens may be excluded

	// Mirostat sampling
	mirostat    int     // Mirostat version (1 or 2); 0 = disabled
	mirostatTau float32 // Mirostat target entropy (tau)
	mirostatEta float32 // Mirostat learning rate (eta)

	// Other parameters
	nPrev     int  // Number of previous tokens to remember for sampling
	nProbs    int  // Number of token probabilities to report; 0 = disabled
	ignoreEOS bool // Keep generating past the end-of-sequence token
}
// defaultContextConfig is the default context configuration; ContextOption
// functions mutate a copy of it.
var defaultContextConfig = contextConfig{
	contextSize:   0, // 0 = use model's native maximum (queried after load)
	batchSize:     512,
	threads:       runtime.NumCPU(),
	threadsBatch:  0,    // 0 means use same as threads (set in wrapper)
	nParallel:     1,    // 1 for generation, auto-set higher for embeddings
	f16Memory:     false,
	embeddings:    false,
	prefixCaching: true,   // Enable by default for performance
	kvCacheType:   "q8_0", // 50% VRAM savings with ~0.1% quality loss
	flashAttn:     "auto", // Let llama.cpp choose optimal path
}
// defaultGenerateConfig is the default generation configuration;
// GenerateOption functions mutate a copy of it. Fields omitted from the
// literal (stopWords, among others) take their zero value: no stop words
// by default.
var defaultGenerateConfig = generateConfig{
	// Basic generation
	maxTokens:   128,
	temperature: 0.8,
	seed:        -1, // -1 = random seed
	draftTokens: 16,
	debug:       false,

	// Basic sampling parameters
	topK:      40,
	topP:      0.95,
	minP:      0.05,
	typP:      1.0,  // 1.0 = disabled
	topNSigma: -1.0, // -1.0 = disabled
	minKeep:   0,

	// Repetition penalties
	penaltyLastN:   64,
	penaltyRepeat:  1.0, // 1.0 = disabled
	penaltyFreq:    0.0, // 0.0 = disabled
	penaltyPresent: 0.0, // 0.0 = disabled

	// DRY sampling
	dryMultiplier:       0.0, // 0.0 = disabled
	dryBase:             1.75,
	dryAllowedLength:    2,
	dryPenaltyLastN:     -1, // -1 = context size
	drySequenceBreakers: []string{"\n", ":", "\"", "*"},

	// Dynamic temperature
	dynatempRange:    0.0, // 0.0 = disabled
	dynatempExponent: 1.0,

	// XTC sampling
	xtcProbability: 0.0, // 0.0 = disabled
	xtcThreshold:   0.1,

	// Mirostat sampling
	mirostat:    0, // 0 = disabled
	mirostatTau: 5.0,
	mirostatEta: 0.1,

	// Other parameters
	nPrev:     64,
	nProbs:    0, // 0 = disabled
	ignoreEOS: false,
}
// modelConfig holds configuration for model loading (model-level only).
//
// Context-level settings belong in contextConfig; these options apply once,
// when the model weights are loaded.
type modelConfig struct {
	gpuLayers               int    // Layers to offload to GPU; -1 = all (see defaultModelConfig)
	mlock                   bool   // Lock model memory to prevent swapping
	mmap                    bool   // Memory-map model weights instead of reading them into RAM
	mainGPU                 string // Primary GPU selector — string form; verify expected format against the wrapper
	tensorSplit             string // Per-GPU tensor split — presumably comma-separated proportions; confirm in wrapper
	disableProgressCallback bool   // Suppress load-progress reporting
	progressCallback        ProgressCallback // Invoked during model load — see ProgressCallback's declaration for semantics
}
// defaultModelConfig is the default model-loading configuration;
// ModelOption functions mutate a copy of it. Omitted fields (mainGPU,
// tensorSplit, disableProgressCallback, progressCallback) take their zero
// values: no GPU pinning, no tensor split, progress reporting enabled with
// no callback.
var defaultModelConfig = modelConfig{
	gpuLayers: -1, // Offload all layers to GPU by default (falls back to CPU if unavailable)
	mlock:     false,
	mmap:      true,
}
// ModelOption configures model loading behaviour (model-level settings).
// It follows the functional-options pattern: each option mutates a
// modelConfig seeded from defaultModelConfig.
type ModelOption func(*modelConfig)

// ContextOption configures context creation (context-level settings).
// Each option mutates a contextConfig seeded from defaultContextConfig.
type ContextOption func(*contextConfig)

// GenerateOption configures text generation behaviour.
// Each option mutates a generateConfig seeded from defaultGenerateConfig.
type GenerateOption func(*generateConfig)