docker · p1-0tr · Jul 23, 2025 · Jul 11, 2025 · Jul 11, 2025 · Jul 11, 2025
diff --git a/Dockerfile b/Dockerfile
@@ -27,7 +27,7 @@ COPY --link . .
 # Build the Go binary (static build)
 RUN --mount=type=cache,target=/go/pkg/mod \
     --mount=type=cache,target=/root/.cache/go-build \
-    CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o model-runner ./main.go
+    CGO_ENABLED=1 GOOS=linux go build -ldflags="-s -w" -o model-runner ./main.go
 
 # --- Get llama.cpp binary ---
 FROM docker/docker-model-backend-llamacpp:${LLAMA_SERVER_VERSION}-${LLAMA_SERVER_VARIANT} AS llama-server

diff --git a/Makefile b/Makefile
@@ -17,7 +17,7 @@ LLAMA_ARGS ?=
 
 # Build the Go application
 build:
-	CGO_ENABLED=0 go build -ldflags="-s -w" -o $(APP_NAME) ./main.go
+	CGO_ENABLED=1 go build -ldflags="-s -w" -o $(APP_NAME) ./main.go
 
 # Run the application locally
 run: build

diff --git a/go.mod b/go.mod
@@ -29,6 +29,8 @@ require (
 	github.com/docker/cli v27.5.0+incompatible // indirect
 	github.com/docker/distribution v2.8.3+incompatible // indirect
 	github.com/docker/docker-credential-helpers v0.8.2 // indirect
+	github.com/elastic/go-sysinfo v1.15.3 // indirect
+	github.com/elastic/go-windows v1.0.2 // indirect
 	github.com/felixge/httpsnoop v1.0.4 // indirect
 	github.com/go-logr/logr v1.4.2 // indirect
 	github.com/go-logr/stdr v1.2.2 // indirect
@@ -45,6 +47,7 @@ require (
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
+	github.com/prometheus/procfs v0.15.1 // indirect
 	github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 // indirect
 	github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d // indirect
 	github.com/vbatts/tar-split v0.11.6 // indirect

diff --git a/go.sum b/go.sum
@@ -40,6 +40,10 @@ github.com/docker/docker-credential-helpers v0.8.2 h1:bX3YxiGzFP5sOXWc3bTPEXdEaZ
 github.com/docker/docker-credential-helpers v0.8.2/go.mod h1:P3ci7E3lwkZg6XiHdRKft1KckHiO9a2rNtyFbZ/ry9M=
 github.com/docker/model-distribution v0.0.0-20250710123110-a633223e127e h1:qBkjP4A20f3RXvtstitIPiStQ4p+bK8xcjosrXLBQZ0=
 github.com/docker/model-distribution v0.0.0-20250710123110-a633223e127e/go.mod h1:dThpO9JoG5Px3i+rTluAeZcqLGw8C0qepuEL4gL2o/c=
+github.com/elastic/go-sysinfo v1.15.3 h1:W+RnmhKFkqPTCRoFq2VCTmsT4p/fwpo+3gKNQsn1XU0=
+github.com/elastic/go-sysinfo v1.15.3/go.mod h1:K/cNrqYTDrSoMh2oDkYEMS2+a72GRxMvNP+GC+vRIlo=
+github.com/elastic/go-windows v1.0.2 h1:yoLLsAsV5cfg9FLhZ9EXZ2n2sQFKeDYrHenkcivY4vI=
+github.com/elastic/go-windows v1.0.2/go.mod h1:bGcDpBzXgYSqM0Gx3DM4+UxFj300SZLixie9u9ixLM8=
 github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
 github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
 github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
@@ -108,6 +112,8 @@ github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNw
 github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
 github.com/prometheus/common v0.64.0 h1:pdZeA+g617P7oGv1CzdTzyeShxAGrTBsolKNOLQPGO4=
 github.com/prometheus/common v0.64.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8=
+github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
+github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
 github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
 github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
 github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 h1:18kd+8ZUlt/ARXhljq+14TwAoKa61q6dX8jtwOf6DH8=

diff --git a/main.go b/main.go
@@ -10,6 +10,7 @@ import (
 	"strings"
 	"syscall"
 
+	"github.com/docker/model-runner/pkg/gpuinfo"
 	"github.com/docker/model-runner/pkg/inference"
 	"github.com/docker/model-runner/pkg/inference/backends/llamacpp"
 	"github.com/docker/model-runner/pkg/inference/config"
@@ -89,6 +90,8 @@ func main() {
 		log.Fatalf("unable to initialize %s backend: %v", llamacpp.Name, err)
 	}
 
+	gpuInfo := gpuinfo.New(llamaServerPath)
+
 	scheduler := scheduling.NewScheduler(
 		log,
 		map[string]inference.Backend{llamacpp.Name: llamaCppBackend},
@@ -102,6 +105,7 @@ func main() {
 			"",
 			false,
 		),
+		gpuInfo,
 	)
 
 	router := routing.NewNormalizedServeMux()

diff --git a/pkg/gpuinfo/gpuinfo.go b/pkg/gpuinfo/gpuinfo.go
@@ -0,0 +1,17 @@
+package gpuinfo
+// GPUInfo carries the install path handed to the platform-specific getVRAMSize.
+type GPUInfo struct {
+	// modelRuntimeInstallPath is the location where DMR installed its llama-server
+	// and accompanying tools.
+	modelRuntimeInstallPath string
+}
+
+// New returns a GPUInfo that remembers modelRuntimeInstallPath for later VRAM queries.
+func New(modelRuntimeInstallPath string) *GPUInfo {
+	info := GPUInfo{modelRuntimeInstallPath: modelRuntimeInstallPath}
+	return &info
+}
+
+// GetVRAMSize forwards to the platform-specific getVRAMSize, passing the
+// install path that was given to New.
+func (g *GPUInfo) GetVRAMSize() (uint64, error) { return getVRAMSize(g.modelRuntimeInstallPath) }
diff --git a/pkg/gpuinfo/memory_darwin.go b/pkg/gpuinfo/memory_darwin.go
@@ -0,0 +1,17 @@
+package gpuinfo
+
+/*
+#cgo LDFLAGS: -framework Metal
+#include "metal.h"
+*/
+import "C"
+import "errors"
+
+// getVRAMSize returns total system GPU memory in bytes as reported by the C
+// helper declared in metal.h; a zero result is turned into an error.
+func getVRAMSize(_ string) (uint64, error) {
+	if size := uint64(C.getVRAMSize()); size != 0 {
+		return size, nil
+	}
+	return 0, errors.New("could not get metal VRAM size")
+}
diff --git a/pkg/gpuinfo/memory_linux.go b/pkg/gpuinfo/memory_linux.go
@@ -0,0 +1,17 @@
+package gpuinfo
+
+/*
+#cgo LDFLAGS: -ldl
+#include "nvidia.h"
+*/
+import "C"
+import "errors"
+
+// getVRAMSize returns total system GPU memory in bytes as reported by the C
+// helper declared in nvidia.h; a zero result is turned into an error.
+func getVRAMSize(_ string) (uint64, error) {
+	if size := uint64(C.getVRAMSize()); size != 0 {
+		return size, nil
+	}
+	return 0, errors.New("could not get nvidia VRAM size")
+}
diff --git a/pkg/gpuinfo/memory_windows.go b/pkg/gpuinfo/memory_windows.go
@@ -0,0 +1,40 @@
+package gpuinfo
+
+import (
+	"bufio"
+	"context"
+	"errors"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"strconv"
+	"strings"
+	"time"
+)
+
+// getVRAMSize returns total system GPU memory in bytes, parsed from the
+// "GPU[0]: dedicated memory:" line printed by bin/com.docker.nv-gpu-info.exe.
+func getVRAMSize(modelRuntimeInstallPath string) (uint64, error) {
+	if runtime.GOARCH == "arm64" {
+		// TODO(p1-0tr): For now, on windows/arm64, stick to the old behaviour. This will
+		// require backend.GetRequiredMemoryForModel to return 1 as well.
+		return 1, nil
+	}
+
+	nvGPUInfoBin := filepath.Join(modelRuntimeInstallPath, "bin", "com.docker.nv-gpu-info.exe")
+
+	// Keep and defer the cancel func so the timeout's timer is released as soon as we return.
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+	out, err := exec.CommandContext(ctx, nvGPUInfoBin).CombinedOutput()
+	if err != nil {
+		return 0, err
+	}
+	sc := bufio.NewScanner(strings.NewReader(string(out)))
+	for sc.Scan() {
+		if vram, found := strings.CutPrefix(sc.Text(), "GPU[0]: dedicated memory:"); found {
+			return strconv.ParseUint(strings.TrimSpace(vram), 10, 64)
+		}
+	}
+	return 0, errors.New("unexpected nv-gpu-info output format")
+}
diff --git a/pkg/gpuinfo/metal.h b/pkg/gpuinfo/metal.h
@@ -0,0 +1,5 @@
+//go:build darwin
+
+#include <stddef.h>
+// Implemented in metal.m: returns 0 when no default Metal device is available.
+size_t getVRAMSize();
diff --git a/pkg/gpuinfo/metal.m b/pkg/gpuinfo/metal.m
@@ -0,0 +1,15 @@
+//go:build darwin
+
+#include <Metal/Metal.h>
+
+#include "metal.h"
+
+// Reports recommendedMaxWorkingSetSize of the system default Metal device,
+// or 0 when MTLCreateSystemDefaultDevice yields no device.
+size_t getVRAMSize() {
+    id<MTLDevice> gpu = MTLCreateSystemDefaultDevice();
+    if (!gpu) return 0;
+    size_t workingSet = [gpu recommendedMaxWorkingSetSize];
+    [gpu release];
+    return workingSet;
+}
diff --git a/pkg/gpuinfo/nvidia.c b/pkg/gpuinfo/nvidia.c
@@ -0,0 +1,71 @@
+//go:build linux
+
+#include "nvidia.h"
+// Local declarations of the NVML types used below; nvml.h itself is not included.
+typedef enum {
+    NVML_SUCCESS = 0
+} nvmlReturn_t;
+// Presumably mirrors nvmlMemory_t from nvml.h (verify); only `total` is read here.
+typedef struct {
+    unsigned long long total;
+    unsigned long long free;
+    unsigned long long used;
+} nvmlMemory_t;
+
+typedef void* nvmlDevice_t;
+// Returns memory.total for NVML device index 0, or 0 if loading NVML or any NVML call fails.
+size_t getVRAMSize() {
+    void* handle;
+    nvmlReturn_t (*nvmlInit)(void);
+    nvmlReturn_t (*nvmlShutdown)(void);
+    nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int index, nvmlDevice_t* device);
+    nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t device, nvmlMemory_t* memory);
+
+    nvmlReturn_t result;
+    nvmlDevice_t device;
+    nvmlMemory_t memory;
+
+    // Resolve NVML at run time: try libnvidia-ml.so.1 first, then fall back to libnvidia-ml.so
+    handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY);
+    if (!handle) {
+        handle = dlopen("libnvidia-ml.so", RTLD_LAZY);
+        if (!handle) {
+            return 0;
+        }
+    }
+
+    // Look up the four NVML entry points used below; a missing one is caught by the check that follows
+    nvmlInit = dlsym(handle, "nvmlInit");
+    nvmlShutdown = dlsym(handle, "nvmlShutdown");
+    nvmlDeviceGetHandleByIndex = dlsym(handle, "nvmlDeviceGetHandleByIndex");
+    nvmlDeviceGetMemoryInfo = dlsym(handle, "nvmlDeviceGetMemoryInfo");
+
+    if (!nvmlInit || !nvmlShutdown || !nvmlDeviceGetHandleByIndex || !nvmlDeviceGetMemoryInfo) {
+        dlclose(handle);
+        return 0;
+    }
+
+    result = nvmlInit();
+    if (result != NVML_SUCCESS) {
+        dlclose(handle);
+        return 0;
+    }
+    // NVML is initialised from here on, so the remaining exits call nvmlShutdown() before dlclose().
+    result = nvmlDeviceGetHandleByIndex(0, &device);
+    if (result != NVML_SUCCESS) {
+        nvmlShutdown();
+        dlclose(handle);
+        return 0;
+    }
+
+    result = nvmlDeviceGetMemoryInfo(device, &memory);
+    if (result != NVML_SUCCESS) {
+        nvmlShutdown();
+        dlclose(handle);
+        return 0;
+    }
+
+    nvmlShutdown();
+    dlclose(handle);
+    return memory.total;
+}
diff --git a/pkg/gpuinfo/nvidia.h b/pkg/gpuinfo/nvidia.h
@@ -0,0 +1,6 @@
+//go:build linux
+
+#include <stddef.h>
+#include <dlfcn.h>
+// Implemented in nvidia.c: returns 0 when NVML cannot be loaded or queried.
+size_t getVRAMSize();
diff --git a/pkg/inference/backend.go b/pkg/inference/backend.go
@@ -34,6 +34,11 @@ type BackendConfiguration struct {
 	RuntimeFlags []string `json:"runtime-flags,omitempty"`
 }
 
+type RequiredMemory struct {
+	RAM  uint64 // NOTE(review): presumably host memory in bytes — confirm
+	VRAM uint64 // TODO(p1-0tr): for now assume we are working with single GPU set-ups
+}
+
 // Backend is the interface implemented by inference engine backends. Backend
 // implementations need not be safe for concurrent invocation of the following
 // methods, though their underlying server implementations do need to support
@@ -76,4 +81,7 @@ type Backend interface {
 	Status() string
 	// GetDiskUsage returns the disk usage of the backend.
 	GetDiskUsage() (int64, error)
+	// GetRequiredMemoryForModel returns the required working memory for a given
+	// model.
+	GetRequiredMemoryForModel(model string, config *BackendConfiguration) (*RequiredMemory, error)
 }