Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ COPY --link . .
# Build the Go binary (static build)
RUN --mount=type=cache,target=/go/pkg/mod \
--mount=type=cache,target=/root/.cache/go-build \
CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o model-runner ./main.go
CGO_ENABLED=1 GOOS=linux go build -ldflags="-s -w" -o model-runner ./main.go

# --- Get llama.cpp binary ---
FROM docker/docker-model-backend-llamacpp:${LLAMA_SERVER_VERSION}-${LLAMA_SERVER_VARIANT} AS llama-server
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ LLAMA_ARGS ?=

# Build the Go application
build:
CGO_ENABLED=0 go build -ldflags="-s -w" -o $(APP_NAME) ./main.go
CGO_ENABLED=1 go build -ldflags="-s -w" -o $(APP_NAME) ./main.go

# Run the application locally
run: build
Expand Down
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ require (
github.com/docker/cli v27.5.0+incompatible // indirect
github.com/docker/distribution v2.8.3+incompatible // indirect
github.com/docker/docker-credential-helpers v0.8.2 // indirect
github.com/elastic/go-sysinfo v1.15.3 // indirect
github.com/elastic/go-windows v1.0.2 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/go-logr/logr v1.4.2 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
Expand All @@ -45,6 +47,7 @@ require (
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 // indirect
github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d // indirect
github.com/vbatts/tar-split v0.11.6 // indirect
Expand Down
6 changes: 6 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ github.com/docker/docker-credential-helpers v0.8.2 h1:bX3YxiGzFP5sOXWc3bTPEXdEaZ
github.com/docker/docker-credential-helpers v0.8.2/go.mod h1:P3ci7E3lwkZg6XiHdRKft1KckHiO9a2rNtyFbZ/ry9M=
github.com/docker/model-distribution v0.0.0-20250710123110-a633223e127e h1:qBkjP4A20f3RXvtstitIPiStQ4p+bK8xcjosrXLBQZ0=
github.com/docker/model-distribution v0.0.0-20250710123110-a633223e127e/go.mod h1:dThpO9JoG5Px3i+rTluAeZcqLGw8C0qepuEL4gL2o/c=
github.com/elastic/go-sysinfo v1.15.3 h1:W+RnmhKFkqPTCRoFq2VCTmsT4p/fwpo+3gKNQsn1XU0=
github.com/elastic/go-sysinfo v1.15.3/go.mod h1:K/cNrqYTDrSoMh2oDkYEMS2+a72GRxMvNP+GC+vRIlo=
github.com/elastic/go-windows v1.0.2 h1:yoLLsAsV5cfg9FLhZ9EXZ2n2sQFKeDYrHenkcivY4vI=
github.com/elastic/go-windows v1.0.2/go.mod h1:bGcDpBzXgYSqM0Gx3DM4+UxFj300SZLixie9u9ixLM8=
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
Expand Down Expand Up @@ -108,6 +112,8 @@ github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNw
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
github.com/prometheus/common v0.64.0 h1:pdZeA+g617P7oGv1CzdTzyeShxAGrTBsolKNOLQPGO4=
github.com/prometheus/common v0.64.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8=
github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 h1:18kd+8ZUlt/ARXhljq+14TwAoKa61q6dX8jtwOf6DH8=
Expand Down
4 changes: 4 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"strings"
"syscall"

"github.com/docker/model-runner/pkg/gpuinfo"
"github.com/docker/model-runner/pkg/inference"
"github.com/docker/model-runner/pkg/inference/backends/llamacpp"
"github.com/docker/model-runner/pkg/inference/config"
Expand Down Expand Up @@ -89,6 +90,8 @@ func main() {
log.Fatalf("unable to initialize %s backend: %v", llamacpp.Name, err)
}

gpuInfo := gpuinfo.New(llamaServerPath)

scheduler := scheduling.NewScheduler(
log,
map[string]inference.Backend{llamacpp.Name: llamaCppBackend},
Expand All @@ -102,6 +105,7 @@ func main() {
"",
false,
),
gpuInfo,
)

router := routing.NewNormalizedServeMux()
Expand Down
17 changes: 17 additions & 0 deletions pkg/gpuinfo/gpuinfo.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package gpuinfo

// GPUInfo reports GPU memory information by delegating to the
// platform-specific getVRAMSize implementation of this package.
type GPUInfo struct {
// modelRuntimeInstallPath is the location where DMR installed its llama-server
// and accompanying tools. It is passed unchanged to getVRAMSize.
modelRuntimeInstallPath string
}

// New builds a GPUInfo that remembers modelRuntimeInstallPath so later VRAM
// queries can hand it to the platform-specific lookup.
func New(modelRuntimeInstallPath string) *GPUInfo {
	info := new(GPUInfo)
	info.modelRuntimeInstallPath = modelRuntimeInstallPath
	return info
}

// GetVRAMSize returns whatever the platform-specific getVRAMSize reports for
// the install path stored in g, including its error unchanged.
func (g *GPUInfo) GetVRAMSize() (uint64, error) {
	installPath := g.modelRuntimeInstallPath
	return getVRAMSize(installPath)
}
17 changes: 17 additions & 0 deletions pkg/gpuinfo/memory_darwin.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package gpuinfo

/*
#cgo LDFLAGS: -framework Metal
#include "metal.h"
*/
import "C"
import "errors"

// getVRAMSize returns total system GPU memory in bytes, as reported by the C
// helper declared in metal.h. The string argument is unused on this platform.
// A zero result from the helper is treated as a failure.
func getVRAMSize(_ string) (uint64, error) {
	size := uint64(C.getVRAMSize())
	if size == 0 {
		return 0, errors.New("could not get metal VRAM size")
	}
	return size, nil
}
17 changes: 17 additions & 0 deletions pkg/gpuinfo/memory_linux.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package gpuinfo

/*
#cgo LDFLAGS: -ldl
#include "nvidia.h"
*/
import "C"
import "errors"

// getVRAMSize returns total system GPU memory in bytes, as reported by the C
// helper declared in nvidia.h. The string argument is unused on this platform.
// A zero result from the helper is treated as a failure.
func getVRAMSize(_ string) (uint64, error) {
	size := uint64(C.getVRAMSize())
	if size == 0 {
		return 0, errors.New("could not get nvidia VRAM size")
	}
	return size, nil
}
40 changes: 40 additions & 0 deletions pkg/gpuinfo/memory_windows.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package gpuinfo

import (
"bufio"
"context"
"errors"
"os/exec"
"path/filepath"
"runtime"
"strconv"
"strings"
"time"
)

// getVRAMSize returns total system GPU memory in bytes.
//
// On arm64 it short-circuits and returns the placeholder value 1 (see the
// TODO below). Otherwise it runs com.docker.nv-gpu-info.exe from the "bin"
// directory under modelRuntimeInstallPath, bounded by a 30 second timeout,
// and parses the value following the "GPU[0]: dedicated memory:" prefix in
// the tool's combined stdout/stderr output.
//
// It returns an error when the tool cannot be run or exits unsuccessfully,
// when the reported value is not a valid unsigned integer, or when no line
// with the expected prefix is present.
func getVRAMSize(modelRuntimeInstallPath string) (uint64, error) {
	if runtime.GOARCH == "arm64" {
		// TODO(p1-0tr): For now, on windows/arm64, stick to the old behaviour. This will
		// require backend.GetRequiredMemoryForModel to return 1 as well.
		return 1, nil
	}

	nvGPUInfoBin := filepath.Join(modelRuntimeInstallPath, "bin", "com.docker.nv-gpu-info.exe")

	// Always release the timeout context; discarding the cancel func keeps the
	// timer and context alive until the deadline fires.
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	out, err := exec.CommandContext(ctx, nvGPUInfoBin).CombinedOutput()
	if err != nil {
		return 0, err
	}

	sc := bufio.NewScanner(strings.NewReader(string(out)))
	for sc.Scan() {
		vram, found := strings.CutPrefix(sc.Text(), "GPU[0]: dedicated memory:")
		if !found {
			continue
		}
		// Only the first GPU's entry is consulted; stop at the first match.
		return strconv.ParseUint(strings.TrimSpace(vram), 10, 64)
	}
	// Surface a scanning failure (e.g. an over-long line) instead of
	// misreporting it as a format problem.
	if err := sc.Err(); err != nil {
		return 0, err
	}
	return 0, errors.New("unexpected nv-gpu-info output format")
}
5 changes: 5 additions & 0 deletions pkg/gpuinfo/metal.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
//go:build darwin

#include <stddef.h>

// getVRAMSize returns the default Metal device's recommendedMaxWorkingSetSize,
// or 0 when no default device is available (implemented in metal.m).
size_t getVRAMSize();
15 changes: 15 additions & 0 deletions pkg/gpuinfo/metal.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
//go:build darwin

#include <Metal/Metal.h>

#include "metal.h"

// Returns the recommendedMaxWorkingSetSize of the system default Metal device,
// or 0 when no default device could be created.
size_t getVRAMSize() {
    id<MTLDevice> dev = MTLCreateSystemDefaultDevice();
    if (!dev) {
        return 0;
    }

    size_t workingSetBytes = [dev recommendedMaxWorkingSetSize];
    // Drop our reference to the device before returning.
    [dev release];
    return workingSetBytes;
}
71 changes: 71 additions & 0 deletions pkg/gpuinfo/nvidia.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
//go:build linux

#include "nvidia.h"

// Local stand-ins for the NVML types used by getVRAMSize below, which resolves
// the NVML entry points at runtime with dlsym instead of including an NVML
// header. NOTE(review): these presumably mirror NVIDIA's nvml.h definitions —
// confirm the layouts against the NVML version being targeted.

// Only the success code is declared; every other return value is treated as
// failure by the caller.
typedef enum {
NVML_SUCCESS = 0
} nvmlReturn_t;

// Memory figures filled in by nvmlDeviceGetMemoryInfo; only `total` is read in
// this file.
typedef struct {
unsigned long long total;
unsigned long long free;
unsigned long long used;
} nvmlMemory_t;

// Opaque device handle obtained from nvmlDeviceGetHandleByIndex.
typedef void* nvmlDevice_t;

size_t getVRAMSize() {
void* handle;
nvmlReturn_t (*nvmlInit)(void);
nvmlReturn_t (*nvmlShutdown)(void);
nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int index, nvmlDevice_t* device);
nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t device, nvmlMemory_t* memory);

nvmlReturn_t result;
nvmlDevice_t device;
nvmlMemory_t memory;

// Try to load libnvidia-ml.so.1 first, then fallback to libnvidia-ml.so
handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY);
if (!handle) {
handle = dlopen("libnvidia-ml.so", RTLD_LAZY);
if (!handle) {
return 0;
}
}

// Load required functions
nvmlInit = dlsym(handle, "nvmlInit");
nvmlShutdown = dlsym(handle, "nvmlShutdown");
nvmlDeviceGetHandleByIndex = dlsym(handle, "nvmlDeviceGetHandleByIndex");
nvmlDeviceGetMemoryInfo = dlsym(handle, "nvmlDeviceGetMemoryInfo");

if (!nvmlInit || !nvmlShutdown || !nvmlDeviceGetHandleByIndex || !nvmlDeviceGetMemoryInfo) {
dlclose(handle);
return 0;
}

result = nvmlInit();
if (result != NVML_SUCCESS) {
dlclose(handle);
return 0;
}

result = nvmlDeviceGetHandleByIndex(0, &device);
if (result != NVML_SUCCESS) {
nvmlShutdown();
dlclose(handle);
return 0;
}

result = nvmlDeviceGetMemoryInfo(device, &memory);
if (result != NVML_SUCCESS) {
nvmlShutdown();
dlclose(handle);
return 0;
}

nvmlShutdown();
dlclose(handle);
return memory.total;
}
6 changes: 6 additions & 0 deletions pkg/gpuinfo/nvidia.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
//go:build linux

#include <stddef.h>
#include <dlfcn.h>

// getVRAMSize returns the total memory NVML reports for device index 0, or 0
// when NVML cannot be loaded or queried (implemented in nvidia.c).
size_t getVRAMSize();
8 changes: 8 additions & 0 deletions pkg/inference/backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ type BackendConfiguration struct {
RuntimeFlags []string `json:"runtime-flags,omitempty"`
}

// RequiredMemory is the result type of Backend.GetRequiredMemoryForModel and
// describes the working memory a model needs, split into system memory and
// GPU memory.
// NOTE(review): units are presumably bytes, matching gpuinfo's VRAM size —
// confirm against the backend implementations.
type RequiredMemory struct {
// RAM is the required amount of system memory.
RAM uint64
// VRAM is the required amount of GPU memory.
VRAM uint64 // TODO(p1-0tr): for now assume we are working with single GPU set-ups
}

// Backend is the interface implemented by inference engine backends. Backend
// implementations need not be safe for concurrent invocation of the following
// methods, though their underlying server implementations do need to support
Expand Down Expand Up @@ -76,4 +81,7 @@ type Backend interface {
Status() string
// GetDiskUsage returns the disk usage of the backend.
GetDiskUsage() (int64, error)
// GetRequiredMemoryForModel returns the required working memory for a given
// model.
GetRequiredMemoryForModel(model string, config *BackendConfiguration) (*RequiredMemory, error)
}
Loading