Commit 33b40c0

Piotr Stankiewicz committed
inference: Ignore parse errors when estimating model memory
We will run into cases where our model runner is ahead of gguf-parser-go. In such cases we may want to load a model that gguf-parser-go cannot yet parse. So, for now, ignore model parsing errors in such cases and assume the model requires no resources. In the future we should come up with a cleaner way of dealing with this (e.g. ship a model memory estimator along with the llama-server).

Signed-off-by: Piotr Stankiewicz <piotr.stankiewicz@docker.com>
1 parent 9e639fd commit 33b40c0

File tree

3 files changed: +17 -2 lines changed


pkg/inference/backend.go

Lines changed: 5 additions & 0 deletions
@@ -2,6 +2,7 @@ package inference
 
 import (
 	"context"
+	"errors"
 	"net/http"
 )
 
@@ -17,6 +18,10 @@ const (
 	BackendModeEmbedding
 )
 
+var (
+	ErrGGUFParse = errors.New("failed to parse GGUF file")
+)
+
 // String implements Stringer.String for BackendMode.
 func (m BackendMode) String() string {
 	switch m {
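
Since ErrGGUFParse is a package-level sentinel, callers in other packages can detect this specific failure with errors.Is instead of string matching. A minimal, self-contained sketch (not part of the commit; estimate is a hypothetical stand-in for a backend's memory estimator) showing that the sentinel still matches through wrapping:

package main

import (
	"errors"
	"fmt"
)

var ErrGGUFParse = errors.New("failed to parse GGUF file")

// estimate stands in for a backend memory estimator; wrapping the sentinel
// with %w keeps it matchable by errors.Is further up the stack.
func estimate() error {
	return fmt.Errorf("estimating model memory: %w", ErrGGUFParse)
}

func main() {
	err := estimate()
	fmt.Println(errors.Is(err, ErrGGUFParse)) // prints "true"
}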

pkg/inference/backends/llamacpp/llamacpp.go

Lines changed: 2 additions & 1 deletion
@@ -234,7 +234,8 @@ func (l *llamaCpp) GetRequiredMemoryForModel(model string, config *inference.Bac
 	}
 	mdlGguf, err := parser.ParseGGUFFile(mdlPath)
 	if err != nil {
-		return nil, fmt.Errorf("parsing gguf(%s): %w", mdlPath, err)
+		l.log.Warnf("Failed to parse gguf(%s): %s", mdlPath, err)
+		return nil, inference.ErrGGUFParse
 	}
 	mdlConfig, err := mdl.Config()
 	if err != nil {
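
One consequence of this change: the parser's underlying error is only logged at the backend, and the bare sentinel is returned, so callers cannot recover the original parse failure from the error chain. A hedged alternative sketch for the same if block (not what the commit does) would wrap the sentinel instead, preserving the detail while errors.Is(err, inference.ErrGGUFParse) still matches:

	if err != nil {
		// Wrap with %w so the sentinel still satisfies errors.Is, and fold
		// the parser's message in with %v for anyone logging the full chain.
		return nil, fmt.Errorf("%w: gguf(%s): %v", inference.ErrGGUFParse, mdlPath, err)
	}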

pkg/inference/scheduling/loader.go

Lines changed: 10 additions & 1 deletion
@@ -421,7 +421,16 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
 		runnerConfig = &rc
 	}
 	memory, err := backend.GetRequiredMemoryForModel(modelID, runnerConfig)
-	if err != nil {
+	if errors.Is(err, inference.ErrGGUFParse) {
+		// TODO(p1-0tr): For now override memory checks in case model can't be parsed
+		// e.g. model is too new for gguf-parser-go to know. We should provide a cleaner
+		// way to bypass these checks.
+		l.log.Warnf("Could not parse model(%s), memory checks will be ignored for it.", modelID)
+		memory = &inference.RequiredMemory{
+			RAM:  0,
+			VRAM: 0,
+		}
+	} else if err != nil {
 		return nil, err
 	}
 	l.log.Infof("Loading %s, which will require %dMB RAM and %dMB VRAM", modelID, memory.RAM/1024/1024, memory.VRAM/1024/1024)
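
Taken together, a GGUF parse failure now degrades to a zero estimate rather than aborting the load: the scheduler's memory admission check passes trivially, and the subsequent log line reports 0MB RAM and 0MB VRAM. A runnable sketch of that fallback decision in isolation (RequiredMemory and requiredMemory here are simplified stand-ins for the real types):

package main

import (
	"errors"
	"fmt"
)

var ErrGGUFParse = errors.New("failed to parse GGUF file")

type RequiredMemory struct{ RAM, VRAM uint64 }

// requiredMemory mirrors the loader's new control flow: a GGUF parse
// failure is downgraded to a zero estimate, any other error still aborts.
func requiredMemory(estimate func() (*RequiredMemory, error)) (*RequiredMemory, error) {
	memory, err := estimate()
	if errors.Is(err, ErrGGUFParse) {
		return &RequiredMemory{RAM: 0, VRAM: 0}, nil
	} else if err != nil {
		return nil, err
	}
	return memory, nil
}

func main() {
	failing := func() (*RequiredMemory, error) { return nil, ErrGGUFParse }
	memory, err := requiredMemory(failing)
	fmt.Println(err) // <nil>: the parse failure was swallowed
	fmt.Printf("will require %dMB RAM and %dMB VRAM\n",
		memory.RAM/1024/1024, memory.VRAM/1024/1024) // 0MB RAM, 0MB VRAM
}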
