Skip to content

Commit 1d72cb6

Browse files
author
Piotr Stankiewicz
committed
inference, gpuinfo: Limit allowed models to 1 on windows/arm64 for now
Signed-off-by: Piotr Stankiewicz <piotr.stankiewicz@docker.com>
1 parent b841913 commit 1d72cb6

File tree

2 files changed

+18
-0
lines changed

2 files changed

+18
-0
lines changed

pkg/gpuinfo/memory_windows.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,19 @@ import (
66
"errors"
77
"os/exec"
88
"path/filepath"
9+
"runtime"
910
"strconv"
1011
"strings"
1112
)
1213

1314
// getVRAMSize returns total system GPU memory in bytes
1415
func getVRAMSize(ctx context.Context, modelRuntimeInstallPath string) (uint64, error) {
16+
if runtime.GOARCH == "arm64" {
17+
// TODO(p1-0tr): For now, on windows/arm64, stick to the old behaviour. This will
18+
// require backend.GetRequiredMemoryForModel to return 1 as well.
19+
return 1, nil
20+
}
21+
1522
nvGPUInfoBin := filepath.Join(modelRuntimeInstallPath, "com.docker.nv-gpu-info.exe")
1623

1724
cmd := exec.CommandContext(ctx, nvGPUInfoBin)

pkg/inference/backends/llamacpp/llamacpp.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,17 @@ func (l *llamaCpp) GetRequiredMemoryForModel(model string, config *inference.Bac
234234
return nil, err
235235
}
236236

237+
if runtime.GOOS == "windows" && runtime.GOARCH == "arm64" {
238+
if mdlConfig.Quantization == "Q4_0" {
239+
// TODO(p1-0tr): For now, on windows/arm64, stick to the old behaviour of allowing
240+
// one model at a time. This WA requires gpuinfo.GetVRAMSize to return 1.
241+
return &inference.RequiredMemory{
242+
RAM: 0,
243+
VRAM: 1,
244+
}, nil
245+
}
246+
}
247+
237248
contextSize := GetContextSize(&mdlConfig, config)
238249

239250
// FIXME(p1-0tr): for now assume we are running on GPU (single one) - Devices[1];

0 commit comments

Comments (0)