Commit 33b40c0

Piotr Stankiewicz committed
inference: Ignore parse errors when estimating model memory
We will run into cases where our model runner is ahead of gguf-parser-go. In such cases we may want to load a model that gguf-parser-go cannot yet parse. So, for now, ignore model parsing errors in such cases and assume the model requires no resources. In the future we should come up with a cleaner way of dealing with this (e.g. ship a model memory estimator along with the llama-server).

Signed-off-by: Piotr Stankiewicz <piotr.stankiewicz@docker.com>
1 parent 9e639fd commit 33b40c0

File tree

3 files changed: +17 -2 lines changed


pkg/inference/backend.go

Lines changed: 5 additions & 0 deletions
@@ -2,6 +2,7 @@ package inference
 
 import (
 	"context"
+	"errors"
 	"net/http"
 )
 
@@ -17,6 +18,10 @@ const (
 	BackendModeEmbedding
 )
 
+var (
+	ErrGGUFParse = errors.New("failed to parse GGUF file")
+)
+
 // String implements Stringer.String for BackendMode.
 func (m BackendMode) String() string {
 	switch m {
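
Since ErrGGUFParse is a package-level sentinel, callers in other packages can detect this specific failure with errors.Is instead of string matching. A minimal, self-contained sketch (not part of the commit; estimate is a hypothetical stand-in for a backend's memory estimator) showing that the sentinel still matches through wrapping:

package main

import (
	"errors"
	"fmt"
)

var ErrGGUFParse = errors.New("failed to parse GGUF file")

// estimate stands in for a backend memory estimator; wrapping the sentinel
// with %w keeps it matchable by errors.Is further up the stack.
func estimate() error {
	return fmt.Errorf("estimating model memory: %w", ErrGGUFParse)
}

func main() {
	err := estimate()
	fmt.Println(errors.Is(err, ErrGGUFParse)) // prints "true"
}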

pkg/inference/backends/llamacpp/llamacpp.go

Lines changed: 2 additions & 1 deletion
@@ -234,7 +234,8 @@ func (l *llamaCpp) GetRequiredMemoryForModel(model string, config *inference.Bac
 	}
 	mdlGguf, err := parser.ParseGGUFFile(mdlPath)
 	if err != nil {
-		return nil, fmt.Errorf("parsing gguf(%s): %w", mdlPath, err)
+		l.log.Warnf("Failed to parse gguf(%s): %s", mdlPath, err)
+		return nil, inference.ErrGGUFParse
 	}
 	mdlConfig, err := mdl.Config()
 	if err != nil {
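
One consequence of this change: the parser's underlying error is only logged at the backend, and the bare sentinel is returned, so callers cannot recover the original parse failure from the error chain. A hedged alternative sketch for the same if block (not what the commit does) would wrap the sentinel instead, preserving the detail while errors.Is(err, inference.ErrGGUFParse) still matches:

	if err != nil {
		// Wrap with %w so the sentinel still satisfies errors.Is, and fold
		// the parser's message in with %v for anyone logging the full chain.
		return nil, fmt.Errorf("%w: gguf(%s): %v", inference.ErrGGUFParse, mdlPath, err)
	}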

pkg/inference/scheduling/loader.go

Lines changed: 10 additions & 1 deletion
@@ -421,7 +421,16 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
 		runnerConfig = &rc
 	}
 	memory, err := backend.GetRequiredMemoryForModel(modelID, runnerConfig)
-	if err != nil {
+	if errors.Is(err, inference.ErrGGUFParse) {
+		// TODO(p1-0tr): For now override memory checks in case model can't be parsed
+		// e.g. model is too new for gguf-parser-go to know. We should provide a cleaner
+		// way to bypass these checks.
+		l.log.Warnf("Could not parse model(%s), memory checks will be ignored for it.", modelID)
+		memory = &inference.RequiredMemory{
+			RAM:  0,
+			VRAM: 0,
+		}
+	} else if err != nil {
 		return nil, err
 	}
 	l.log.Infof("Loading %s, which will require %dMB RAM and %dMB VRAM", modelID, memory.RAM/1024/1024, memory.VRAM/1024/1024)
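
Taken together, a GGUF parse failure now degrades to a zero estimate rather than aborting the load: the scheduler's memory admission check passes trivially, and the subsequent log line reports 0MB RAM and 0MB VRAM. A runnable sketch of that fallback decision in isolation (RequiredMemory and requiredMemory here are simplified stand-ins for the real types):

package main

import (
	"errors"
	"fmt"
)

var ErrGGUFParse = errors.New("failed to parse GGUF file")

type RequiredMemory struct{ RAM, VRAM uint64 }

// requiredMemory mirrors the loader's new control flow: a GGUF parse
// failure is downgraded to a zero estimate, any other error still aborts.
func requiredMemory(estimate func() (*RequiredMemory, error)) (*RequiredMemory, error) {
	memory, err := estimate()
	if errors.Is(err, ErrGGUFParse) {
		return &RequiredMemory{RAM: 0, VRAM: 0}, nil
	} else if err != nil {
		return nil, err
	}
	return memory, nil
}

func main() {
	failing := func() (*RequiredMemory, error) { return nil, ErrGGUFParse }
	memory, err := requiredMemory(failing)
	fmt.Println(err) // <nil>: the parse failure was swallowed
	fmt.Printf("will require %dMB RAM and %dMB VRAM\n",
		memory.RAM/1024/1024, memory.VRAM/1024/1024) // 0MB RAM, 0MB VRAM
}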
