Skip to content

Commit e785341

Browse files
committed
fix(inference): scale llama-server ready timeout for large GGUF
- Derive the ready-wait from the model file size (2 min base + ~1 min/GiB, capped at 45 min), with a CSGHUB_LITE_LLAMA_READY_TIMEOUT override
- Emit an SSE/CLI heartbeat (seconds elapsed) while a large GGUF is loading
- Keep a larger llama-server log tail for error diagnosis

Made-with: Cursor
1 parent f49f919 commit e785341

File tree

3 files changed

+51
-7
lines changed

3 files changed

+51
-7
lines changed

internal/cli/client.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,9 @@ func preloadModel(serverURL, modelID string) error {
189189
if lr.Total > 0 {
190190
pct := lr.Current * 100 / lr.Total
191191
fmt.Fprintf(os.Stderr, "\r\033[K %s (%d/%d) %d%%", lr.Step, lr.Current, lr.Total, pct)
192+
} else if lr.Current > 0 {
193+
// Heartbeat (e.g. seconds waiting for llama-server while loading a large GGUF).
194+
fmt.Fprintf(os.Stderr, "\r\033[K %s (%ds)", lr.Step, lr.Current)
192195
} else if lr.Step != lastStep {
193196
if lastStep != "" {
194197
fmt.Fprintf(os.Stderr, "\n")

internal/inference/engine.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,14 +52,14 @@ func LoadEngineWithProgress(modelDir string, lm *model.LocalModel, progress Conv
5252

5353
switch format {
5454
case model.FormatGGUF:
55-
return newLlamaEngine(modelFile, lm.FullName(), verbose, mmproj)
55+
return newLlamaEngine(modelFile, lm.FullName(), verbose, progress, mmproj)
5656

5757
case model.FormatSafeTensors:
5858
ggufPath, err := convertSafeTensors(modelDir, progress)
5959
if err != nil {
6060
return nil, fmt.Errorf("auto-converting SafeTensors to GGUF: %w", err)
6161
}
62-
eng, err := newLlamaEngine(ggufPath, lm.FullName(), verbose, mmproj)
62+
eng, err := newLlamaEngine(ggufPath, lm.FullName(), verbose, progress, mmproj)
6363
if err != nil {
6464
log.Printf("removing invalid converted GGUF: %s", ggufPath)
6565
os.Remove(ggufPath)

internal/inference/llama.go

Lines changed: 46 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"os/exec"
1515
"path/filepath"
1616
"runtime"
17+
"strconv"
1718
"strings"
1819
"sync"
1920
"time"
@@ -99,7 +100,34 @@ func findFreePort() (int, error) {
99100
return l.Addr().(*net.TCPAddr).Port, nil
100101
}
101102

102-
func newLlamaEngine(modelPath, modelName string, verbose bool, mmproj ...string) (*llamaEngine, error) {
103+
// llamaReadyTimeout returns how long to wait for llama-server /health after start.
104+
// Large GGUF files can take many minutes to mmap / load to GPU.
105+
func llamaReadyTimeout(modelPath string) time.Duration {
106+
if v := strings.TrimSpace(os.Getenv("CSGHUB_LITE_LLAMA_READY_TIMEOUT")); v != "" {
107+
if d, err := time.ParseDuration(v); err == nil && d > 0 {
108+
return d
109+
}
110+
if sec, err := strconv.Atoi(v); err == nil && sec > 0 {
111+
return time.Duration(sec) * time.Second
112+
}
113+
}
114+
fi, err := os.Stat(modelPath)
115+
if err != nil {
116+
return 20 * time.Minute
117+
}
118+
gb := float64(fi.Size()) / (1024 * 1024 * 1024)
119+
// 2 min base + ~1 min per GiB (F16 9B is ~17GiB on disk → ~19 min).
120+
sec := int(120 + gb*60)
121+
if sec < 120 {
122+
sec = 120
123+
}
124+
if sec > 45*60 {
125+
sec = 45 * 60
126+
}
127+
return time.Duration(sec) * time.Second
128+
}
129+
130+
func newLlamaEngine(modelPath, modelName string, verbose bool, progress ConvertProgressFunc, mmproj ...string) (*llamaEngine, error) {
103131
binary := findLlamaBinary()
104132
if binary == "" {
105133
return nil, fmt.Errorf("llama-server not found in PATH.\n" +
@@ -141,7 +169,8 @@ func newLlamaEngine(modelPath, modelName string, verbose bool, mmproj ...string)
141169
engine.cmd.Stdout = os.Stderr
142170
engine.cmd.Stderr = os.Stderr
143171
} else {
144-
w := newCappedWriter(8192)
172+
// Large models print long tensor/KV lists; keep more tail for error diagnosis.
173+
w := newCappedWriter(64 * 1024)
145174
engine.cmd.Stdout = w
146175
engine.cmd.Stderr = w
147176
engine.logBuf = w
@@ -164,15 +193,19 @@ func newLlamaEngine(modelPath, modelName string, verbose bool, mmproj ...string)
164193
return nil, fmt.Errorf("starting llama-server: %w", err)
165194
}
166195

167-
if err := engine.waitForReady(30 * time.Second); err != nil {
196+
readyTimeout := llamaReadyTimeout(modelPath)
197+
if progress != nil {
198+
progress("Starting llama-server", 0, 0)
199+
}
200+
if err := engine.waitForReady(readyTimeout, progress); err != nil {
168201
engine.Close()
169202
return nil, fmt.Errorf("llama-server failed to start: %w", err)
170203
}
171204

172205
return engine, nil
173206
}
174207

175-
func (e *llamaEngine) waitForReady(timeout time.Duration) error {
208+
func (e *llamaEngine) waitForReady(timeout time.Duration, progress ConvertProgressFunc) error {
176209
deadline := time.Now().Add(timeout)
177210
url := fmt.Sprintf("http://127.0.0.1:%d/health", e.port)
178211

@@ -182,7 +215,15 @@ func (e *llamaEngine) waitForReady(timeout time.Duration) error {
182215
exited := make(chan error, 1)
183216
go func() { exited <- e.cmd.Wait() }()
184217

218+
start := time.Now()
219+
lastBeat := time.Time{}
220+
185221
for time.Now().Before(deadline) {
222+
if progress != nil && time.Since(lastBeat) >= 2*time.Second {
223+
progress("Loading model with llama-server", int(time.Since(start).Seconds()), 0)
224+
lastBeat = time.Now()
225+
}
226+
186227
select {
187228
case err := <-exited:
188229
msg := "llama-server exited unexpectedly"
@@ -209,7 +250,7 @@ func (e *llamaEngine) waitForReady(timeout time.Duration) error {
209250
time.Sleep(500 * time.Millisecond)
210251
}
211252

212-
msg := "timeout waiting for llama-server to be ready"
253+
msg := fmt.Sprintf("timeout waiting for llama-server to be ready (waited %v; large models need more time — try CSGHUB_LITE_LLAMA_READY_TIMEOUT=45m)", timeout)
213254
if e.logBuf != nil {
214255
if tail := strings.TrimSpace(e.logBuf.String()); tail != "" {
215256
msg += "\n\nllama-server output:\n" + tail

0 commit comments

Comments
 (0)