Skip to content

Commit 1d72cb6

Browse files
author
Piotr Stankiewicz
committed
inference, gpuinfo: Limit allowed models to 1 on windows/arm64 for now
Signed-off-by: Piotr Stankiewicz <piotr.stankiewicz@docker.com>
1 parent b841913 commit 1d72cb6

File tree

2 files changed

+18
-0
lines changed

2 files changed

+18
-0
lines changed

pkg/gpuinfo/memory_windows.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,19 @@ import (
66
"errors"
77
"os/exec"
88
"path/filepath"
9+
"runtime"
910
"strconv"
1011
"strings"
1112
)
1213

1314
// getVRAMSize returns total system GPU memory in bytes
1415
func getVRAMSize(ctx context.Context, modelRuntimeInstallPath string) (uint64, error) {
16+
if runtime.GOARCH == "arm64" {
17+
// TODO(p1-0tr): For now, on windows/arm64, stick to the old behaviour. This will
18+
// require backend.GetRequiredMemoryForModel to return 1 as well.
19+
return 1, nil
20+
}
21+
1522
nvGPUInfoBin := filepath.Join(modelRuntimeInstallPath, "com.docker.nv-gpu-info.exe")
1623

1724
cmd := exec.CommandContext(ctx, nvGPUInfoBin)

pkg/inference/backends/llamacpp/llamacpp.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,17 @@ func (l *llamaCpp) GetRequiredMemoryForModel(model string, config *inference.Bac
234234
return nil, err
235235
}
236236

237+
if runtime.GOOS == "windows" && runtime.GOARCH == "arm64" {
238+
if mdlConfig.Quantization == "Q4_0" {
239+
// TODO(p1-0tr): For now, on windows/arm64, stick to the old behaviour of allowing
240+
// one model at a time. This WA requires gpuinfo.GetVRAMSize to return 1.
241+
return &inference.RequiredMemory{
242+
RAM: 0,
243+
VRAM: 1,
244+
}, nil
245+
}
246+
}
247+
237248
contextSize := GetContextSize(&mdlConfig, config)
238249

239250
// FIXME(p1-0tr): for now assume we are running on GPU (single one) - Devices[1];

0 commit comments

Comments (0)