diff --git a/Dockerfile b/Dockerfile
index b7f028abe..67a7f91aa 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -27,7 +27,7 @@ COPY --link . .
 # Build the Go binary (static build)
 RUN --mount=type=cache,target=/go/pkg/mod \
     --mount=type=cache,target=/root/.cache/go-build \
-    CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o model-runner ./main.go
+    CGO_ENABLED=1 GOOS=linux go build -ldflags="-s -w" -o model-runner ./main.go

 # --- Get llama.cpp binary ---
 FROM docker/docker-model-backend-llamacpp:${LLAMA_SERVER_VERSION}-${LLAMA_SERVER_VARIANT} AS llama-server
diff --git a/Makefile b/Makefile
index 41fd154c3..93f90fd15 100644
--- a/Makefile
+++ b/Makefile
@@ -17,7 +17,7 @@ LLAMA_ARGS ?=

 # Build the Go application
 build:
-	CGO_ENABLED=0 go build -ldflags="-s -w" -o $(APP_NAME) ./main.go
+	CGO_ENABLED=1 go build -ldflags="-s -w" -o $(APP_NAME) ./main.go

 # Run the application locally
 run: build
diff --git a/go.mod b/go.mod
index 9b95317a8..2d93f6f36 100644
--- a/go.mod
+++ b/go.mod
@@ -29,6 +29,8 @@ require (
 	github.com/docker/cli v27.5.0+incompatible // indirect
 	github.com/docker/distribution v2.8.3+incompatible // indirect
 	github.com/docker/docker-credential-helpers v0.8.2 // indirect
+	github.com/elastic/go-sysinfo v1.15.3 // indirect
+	github.com/elastic/go-windows v1.0.2 // indirect
 	github.com/felixge/httpsnoop v1.0.4 // indirect
 	github.com/go-logr/logr v1.4.2 // indirect
 	github.com/go-logr/stdr v1.2.2 // indirect
@@ -45,6 +47,7 @@ require (
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
+	github.com/prometheus/procfs v0.15.1 // indirect
 	github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 // indirect
 	github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d // indirect
 	github.com/vbatts/tar-split v0.11.6 // indirect
diff --git a/go.sum b/go.sum
index 0aeec9e91..d008aff19 100644
--- a/go.sum
+++ b/go.sum
@@ -40,6 +40,10 @@ github.com/docker/docker-credential-helpers v0.8.2 h1:bX3YxiGzFP5sOXWc3bTPEXdEaZ
 github.com/docker/docker-credential-helpers v0.8.2/go.mod h1:P3ci7E3lwkZg6XiHdRKft1KckHiO9a2rNtyFbZ/ry9M=
 github.com/docker/model-distribution v0.0.0-20250710123110-a633223e127e h1:qBkjP4A20f3RXvtstitIPiStQ4p+bK8xcjosrXLBQZ0=
 github.com/docker/model-distribution v0.0.0-20250710123110-a633223e127e/go.mod h1:dThpO9JoG5Px3i+rTluAeZcqLGw8C0qepuEL4gL2o/c=
+github.com/elastic/go-sysinfo v1.15.3 h1:W+RnmhKFkqPTCRoFq2VCTmsT4p/fwpo+3gKNQsn1XU0=
+github.com/elastic/go-sysinfo v1.15.3/go.mod h1:K/cNrqYTDrSoMh2oDkYEMS2+a72GRxMvNP+GC+vRIlo=
+github.com/elastic/go-windows v1.0.2 h1:yoLLsAsV5cfg9FLhZ9EXZ2n2sQFKeDYrHenkcivY4vI=
+github.com/elastic/go-windows v1.0.2/go.mod h1:bGcDpBzXgYSqM0Gx3DM4+UxFj300SZLixie9u9ixLM8=
 github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
 github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
 github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
@@ -108,6 +112,8 @@ github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNw
 github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
 github.com/prometheus/common v0.64.0 h1:pdZeA+g617P7oGv1CzdTzyeShxAGrTBsolKNOLQPGO4=
 github.com/prometheus/common v0.64.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8=
+github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
+github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
 github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
 github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
 github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 h1:18kd+8ZUlt/ARXhljq+14TwAoKa61q6dX8jtwOf6DH8=
diff --git a/main.go b/main.go
index ffa4dde21..287b64b80 100644
--- a/main.go
+++ b/main.go
@@ -10,6 +10,7 @@ import (
 	"strings"
 	"syscall"

+	"github.com/docker/model-runner/pkg/gpuinfo"
 	"github.com/docker/model-runner/pkg/inference"
 	"github.com/docker/model-runner/pkg/inference/backends/llamacpp"
 	"github.com/docker/model-runner/pkg/inference/config"
@@ -89,6 +90,8 @@ func main() {
 		log.Fatalf("unable to initialize %s backend: %v", llamacpp.Name, err)
 	}

+	gpuInfo := gpuinfo.New(llamaServerPath)
+
 	scheduler := scheduling.NewScheduler(
 		log,
 		map[string]inference.Backend{llamacpp.Name: llamaCppBackend},
@@ -102,6 +105,7 @@ func main() {
 			"",
 			false,
 		),
+		gpuInfo,
 	)

 	router := routing.NewNormalizedServeMux()
diff --git a/pkg/gpuinfo/gpuinfo.go b/pkg/gpuinfo/gpuinfo.go
new file mode 100644
index 000000000..3bc8f66ee
--- /dev/null
+++ b/pkg/gpuinfo/gpuinfo.go
@@ -0,0 +1,17 @@
+package gpuinfo
+
+type GPUInfo struct {
+	// modelRuntimeInstallPath is the location where DMR installed its llama-server
+	// and accompanying tools
+	modelRuntimeInstallPath string
+}
+
+func New(modelRuntimeInstallPath string) *GPUInfo {
+	return &GPUInfo{
+		modelRuntimeInstallPath: modelRuntimeInstallPath,
+	}
+}
+
+func (g *GPUInfo) GetVRAMSize() (uint64, error) {
+	return getVRAMSize(g.modelRuntimeInstallPath)
+}
diff --git a/pkg/gpuinfo/memory_darwin.go b/pkg/gpuinfo/memory_darwin.go
new file mode 100644
index 000000000..73677ca07
--- /dev/null
+++ b/pkg/gpuinfo/memory_darwin.go
@@ -0,0 +1,17 @@
+package gpuinfo
+
+/*
+#cgo LDFLAGS: -framework Metal
+#include "metal.h"
+*/
+import "C"
+import "errors"
+
+// getVRAMSize returns total system GPU memory in bytes
+func getVRAMSize(_ string) (uint64, error) {
+	vramSize := C.getVRAMSize()
+	if vramSize == 0 {
+		return 0, errors.New("could not get metal VRAM size")
+	}
+	return uint64(vramSize), nil
+}
diff --git a/pkg/gpuinfo/memory_linux.go b/pkg/gpuinfo/memory_linux.go
new file mode 100644
index 000000000..94525711c
--- /dev/null
+++ b/pkg/gpuinfo/memory_linux.go
@@ -0,0 +1,17 @@
+package gpuinfo
+
+/*
+#cgo LDFLAGS: -ldl
+#include "nvidia.h"
+*/
+import "C"
+import "errors"
+
+// getVRAMSize returns total system GPU memory in bytes
+func getVRAMSize(_ string) (uint64, error) {
+	vramSize := C.getVRAMSize()
+	if vramSize == 0 {
+		return 0, errors.New("could not get nvidia VRAM size")
+	}
+	return uint64(vramSize), nil
+}
diff --git a/pkg/gpuinfo/memory_windows.go b/pkg/gpuinfo/memory_windows.go
new file mode 100644
index 000000000..7ca9a0e49
--- /dev/null
+++ b/pkg/gpuinfo/memory_windows.go
@@ -0,0 +1,40 @@
+package gpuinfo
+
+import (
+	"bufio"
+	"context"
+	"errors"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"strconv"
+	"strings"
+	"time"
+)
+
+// getVRAMSize returns total system GPU memory in bytes
+func getVRAMSize(modelRuntimeInstallPath string) (uint64, error) {
+	if runtime.GOARCH == "arm64" {
+		// TODO(p1-0tr): For now, on windows/arm64, stick to the old behaviour. This will
+		// require backend.GetRequiredMemoryForModel to return 1 as well.
+		return 1, nil
+	}
+
+	nvGPUInfoBin := filepath.Join(modelRuntimeInstallPath, "bin", "com.docker.nv-gpu-info.exe")
+
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+	out, err := exec.CommandContext(ctx, nvGPUInfoBin).CombinedOutput()
+	if err != nil {
+		return 0, err
+	}
+	sc := bufio.NewScanner(strings.NewReader(string(out)))
+	for sc.Scan() {
+		vram, found := strings.CutPrefix(sc.Text(), "GPU[0]: dedicated memory:")
+		if found {
+			vram = strings.TrimSpace(vram)
+			return strconv.ParseUint(vram, 10, 64)
+		}
+	}
+	return 0, errors.New("unexpected nv-gpu-info output format")
+}
diff --git a/pkg/gpuinfo/metal.h b/pkg/gpuinfo/metal.h
new file mode 100644
index 000000000..d7e96a5e9
--- /dev/null
+++ b/pkg/gpuinfo/metal.h
@@ -0,0 +1,5 @@
+//go:build darwin
+
+#include <stddef.h>
+
+size_t getVRAMSize();
\ No newline at end of file
diff --git a/pkg/gpuinfo/metal.m b/pkg/gpuinfo/metal.m
new file mode 100644
index 000000000..edcfce1ec
--- /dev/null
+++ b/pkg/gpuinfo/metal.m
@@ -0,0 +1,15 @@
+//go:build darwin
+
+#include <Metal/Metal.h>
+
+#include "metal.h"
+
+size_t getVRAMSize() {
+    id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+    if (device) {
+        size_t vramsz = [device recommendedMaxWorkingSetSize];
+        [device release];
+        return vramsz;
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/pkg/gpuinfo/nvidia.c b/pkg/gpuinfo/nvidia.c
new file mode 100644
index 000000000..e00aeb189
--- /dev/null
+++ b/pkg/gpuinfo/nvidia.c
@@ -0,0 +1,71 @@
+//go:build linux
+
+#include "nvidia.h"
+
+typedef enum {
+    NVML_SUCCESS = 0
+} nvmlReturn_t;
+
+typedef struct {
+    unsigned long long total;
+    unsigned long long free;
+    unsigned long long used;
+} nvmlMemory_t;
+
+typedef void* nvmlDevice_t;
+
+size_t getVRAMSize() {
+    void* handle;
+    nvmlReturn_t (*nvmlInit)(void);
+    nvmlReturn_t (*nvmlShutdown)(void);
+    nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int index, nvmlDevice_t* device);
+    nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t device, nvmlMemory_t* memory);
+
+    nvmlReturn_t result;
+    nvmlDevice_t device;
+    nvmlMemory_t memory;
+
+    // Try to load libnvidia-ml.so.1 first, then fall back to libnvidia-ml.so
+    handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY);
+    if (!handle) {
+        handle = dlopen("libnvidia-ml.so", RTLD_LAZY);
+        if (!handle) {
+            return 0;
+        }
+    }
+
+    // Load required functions
+    nvmlInit = dlsym(handle, "nvmlInit");
+    nvmlShutdown = dlsym(handle, "nvmlShutdown");
+    nvmlDeviceGetHandleByIndex = dlsym(handle, "nvmlDeviceGetHandleByIndex");
+    nvmlDeviceGetMemoryInfo = dlsym(handle, "nvmlDeviceGetMemoryInfo");
+
+    if (!nvmlInit || !nvmlShutdown || !nvmlDeviceGetHandleByIndex || !nvmlDeviceGetMemoryInfo) {
+        dlclose(handle);
+        return 0;
+    }
+
+    result = nvmlInit();
+    if (result != NVML_SUCCESS) {
+        dlclose(handle);
+        return 0;
+    }
+
+    result = nvmlDeviceGetHandleByIndex(0, &device);
+    if (result != NVML_SUCCESS) {
+        nvmlShutdown();
+        dlclose(handle);
+        return 0;
+    }
+
+    result = nvmlDeviceGetMemoryInfo(device, &memory);
+    if (result != NVML_SUCCESS) {
+        nvmlShutdown();
+        dlclose(handle);
+        return 0;
+    }
+
+    nvmlShutdown();
+    dlclose(handle);
+    return memory.total;
+}
\ No newline at end of file
diff --git a/pkg/gpuinfo/nvidia.h b/pkg/gpuinfo/nvidia.h
new file mode 100644
index 000000000..302673b5e
--- /dev/null
+++ b/pkg/gpuinfo/nvidia.h
@@ -0,0 +1,6 @@
+//go:build linux
+
+#include <stddef.h>
+#include <dlfcn.h>
+
+size_t getVRAMSize();
\ No newline at end of file
diff --git a/pkg/inference/backend.go b/pkg/inference/backend.go
index 0eae5d4cc..944ec126a 100644
--- a/pkg/inference/backend.go
+++ b/pkg/inference/backend.go
@@ -34,6 +34,11 @@ type BackendConfiguration struct {
 	RuntimeFlags []string `json:"runtime-flags,omitempty"`
 }

+type RequiredMemory struct {
+	RAM  uint64
+	VRAM uint64 // TODO(p1-0tr): for now assume we are working with single GPU set-ups
+}
+
 // Backend is the interface implemented by inference engine backends. Backend
 // implementations need not be safe for concurrent invocation of the following
 // methods, though their underlying server implementations do need to support
@@ -76,4 +81,7 @@ type Backend interface {
 	Status() string
 	// GetDiskUsage returns the disk usage of the backend.
 	GetDiskUsage() (int64, error)
+	// GetRequiredMemoryForModel returns the required working memory for a given
+	// model.
+	GetRequiredMemoryForModel(model string, config *BackendConfiguration) (*RequiredMemory, error)
 }
diff --git a/pkg/inference/backends/llamacpp/llamacpp.go b/pkg/inference/backends/llamacpp/llamacpp.go
index f745320d0..09de11f5d 100644
--- a/pkg/inference/backends/llamacpp/llamacpp.go
+++ b/pkg/inference/backends/llamacpp/llamacpp.go
@@ -1,6 +1,7 @@
 package llamacpp

 import (
+	"bufio"
 	"context"
 	"errors"
 	"fmt"
@@ -10,9 +11,12 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"regexp"
 	"runtime"
 	"strings"

+	parser "github.com/gpustack/gguf-parser-go"
+
 	"github.com/docker/model-runner/pkg/diskusage"
 	"github.com/docker/model-runner/pkg/inference"
 	"github.com/docker/model-runner/pkg/inference/config"
@@ -44,6 +48,8 @@ type llamaCpp struct {
 	status string
 	// config is the configuration for the llama.cpp backend.
 	config config.BackendConfig
+	// gpuSupported indicates whether the underlying llama-server is built with GPU support.
+	gpuSupported bool
 }

 // New creates a new llama.cpp-based backend.
@@ -116,6 +122,9 @@ func (l *llamaCpp) Install(ctx context.Context, httpClient *http.Client) error {
 		l.updatedLlamaCpp = true
 	}

+	l.gpuSupported = l.checkGPUSupport(ctx)
+	l.log.Infof("installed llama-server with gpuSupport=%t", l.gpuSupported)
+
 	return nil
 }

@@ -213,3 +222,86 @@ func (l *llamaCpp) GetDiskUsage() (int64, error) {
 	}
 	return size, nil
 }
+
+func (l *llamaCpp) GetRequiredMemoryForModel(model string, config *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
+	mdl, err := l.modelManager.GetModel(model)
+	if err != nil {
+		return nil, fmt.Errorf("getting model(%s): %w", model, err)
+	}
+	mdlPath, err := mdl.GGUFPath()
+	if err != nil {
+		return nil, fmt.Errorf("getting gguf path for model(%s): %w", model, err)
+	}
+	mdlGguf, err := parser.ParseGGUFFile(mdlPath)
+	if err != nil {
+		return nil, fmt.Errorf("parsing gguf(%s): %w", mdlPath, err)
+	}
+	mdlConfig, err := mdl.Config()
+	if err != nil {
+		return nil, fmt.Errorf("accessing model(%s) config: %w", model, err)
+	}
+
+	contextSize := GetContextSize(&mdlConfig, config)
+
+	ngl := uint64(0)
+	if l.gpuSupported {
+		ngl = 100
+		if runtime.GOOS == "windows" && runtime.GOARCH == "arm64" && mdlConfig.Quantization != "Q4_0" {
+			ngl = 0 // only Q4_0 models can be accelerated on Adreno
+		}
+	}
+
+	// TODO(p1-0tr): for now assume we are running on a single GPU - Devices[1];
+	// sum up weights + KV cache + context for an estimate of the total GPU memory needed
+	// while running inference with the given model
+	estimate := mdlGguf.EstimateLLaMACppRun(parser.WithLLaMACppContextSize(int32(contextSize)),
+		// TODO(p1-0tr): add logic for resolving other param values, instead of hardcoding them
+		parser.WithLLaMACppLogicalBatchSize(2048),
+		parser.WithLLaMACppOffloadLayers(ngl))
+	ram := uint64(estimate.Devices[0].Weight.Sum() + estimate.Devices[0].KVCache.Sum() + estimate.Devices[0].Computation.Sum())
+	var vram uint64
+	if len(estimate.Devices) > 1 {
+		vram = uint64(estimate.Devices[1].Weight.Sum() + estimate.Devices[1].KVCache.Sum() + estimate.Devices[1].Computation.Sum())
+	}
+
+	if runtime.GOOS == "windows" && runtime.GOARCH == "arm64" {
+		// TODO(p1-0tr): For now, on windows/arm64, stick to the old behaviour of allowing
+		// one model at a time. This workaround requires gpuinfo.GetVRAMSize to return 1.
+		vram = 1
+	}
+
+	return &inference.RequiredMemory{
+		RAM:  ram,
+		VRAM: vram,
+	}, nil
+}
+
+func (l *llamaCpp) checkGPUSupport(ctx context.Context) bool {
+	binPath := l.vendoredServerStoragePath
+	if l.updatedLlamaCpp {
+		binPath = l.updatedServerStoragePath
+	}
+	out, err := exec.CommandContext(
+		ctx,
+		filepath.Join(binPath, "com.docker.llama-server"),
+		"--list-devices",
+	).CombinedOutput()
+	if err != nil {
+		l.log.Warnf("Failed to determine if llama-server is built with GPU support: %s", err)
+		return false
+	}
+	sc := bufio.NewScanner(strings.NewReader(string(out)))
+	expectDev := false
+	devRe := regexp.MustCompile(`\s{2}.*:\s`)
+	ndevs := 0
+	for sc.Scan() {
+		if expectDev {
+			if devRe.MatchString(sc.Text()) {
+				ndevs++
+			}
+		} else {
+			expectDev = strings.HasPrefix(sc.Text(), "Available devices:")
+		}
+	}
+	return ndevs > 0
+}
diff --git a/pkg/inference/backends/llamacpp/llamacpp_config.go b/pkg/inference/backends/llamacpp/llamacpp_config.go
index 5c8822d3a..becc3a1bc 100644
--- a/pkg/inference/backends/llamacpp/llamacpp_config.go
+++ b/pkg/inference/backends/llamacpp/llamacpp_config.go
@@ -57,16 +57,10 @@ func (c *Config) GetArgs(model types.Model, socket string, mode inference.Backen
 		args = append(args, "--embeddings")
 	}

-	// Add arguments from model config
-	if modelCfg.ContextSize != nil {
-		args = append(args, "--ctx-size", strconv.FormatUint(*modelCfg.ContextSize, 10))
-	}
+	args = append(args, "--ctx-size", strconv.FormatUint(GetContextSize(&modelCfg, config), 10))

 	// Add arguments from backend config
 	if config != nil {
-		if config.ContextSize > 0 && !containsArg(args, "--ctx-size") {
-			args = append(args, "--ctx-size", strconv.FormatInt(config.ContextSize, 10))
-		}
 		args = append(args, config.RuntimeFlags...)
 	}

@@ -79,6 +73,19 @@ func (c *Config) GetArgs(model types.Model, socket string, mode inference.Backen
 	return args, nil
 }

+func GetContextSize(modelCfg *types.Config, backendCfg *inference.BackendConfiguration) uint64 {
+	// Model config takes precedence
+	if modelCfg != nil && modelCfg.ContextSize != nil {
+		return *modelCfg.ContextSize
+	}
+	// else use backend config
+	if backendCfg != nil && backendCfg.ContextSize > 0 {
+		return uint64(backendCfg.ContextSize)
+	}
+	// finally return default
+	return 4096 // llama.cpp default
+}
+
 // containsArg checks if the given argument is already in the args slice.
 func containsArg(args []string, arg string) bool {
 	for _, a := range args {
diff --git a/pkg/inference/backends/llamacpp/llamacpp_config_test.go b/pkg/inference/backends/llamacpp/llamacpp_config_test.go
index 4b145ca6b..f01300916 100644
--- a/pkg/inference/backends/llamacpp/llamacpp_config_test.go
+++ b/pkg/inference/backends/llamacpp/llamacpp_config_test.go
@@ -91,6 +91,7 @@ func TestGetArgs(t *testing.T) {
 				"--metrics",
 				"--model", modelPath,
 				"--host", socket,
+				"--ctx-size", "4096",
 			},
 		},
 		{
@@ -106,6 +107,7 @@ func TestGetArgs(t *testing.T) {
 				"--model", modelPath,
 				"--host", socket,
 				"--embeddings",
+				"--ctx-size", "4096",
 			},
 		},
 		{
@@ -165,6 +167,7 @@ func TestGetArgs(t *testing.T) {
 				"--model", modelPath,
 				"--host", socket,
 				"--embeddings",
+				"--ctx-size", "4096",
 				"--some", "flag", // model config takes precedence
 			},
 		},
diff --git a/pkg/inference/backends/mlx/mlx.go b/pkg/inference/backends/mlx/mlx.go
index d6cf86e09..2bae36730 100644
--- a/pkg/inference/backends/mlx/mlx.go
+++ b/pkg/inference/backends/mlx/mlx.go
@@ -62,3 +62,7 @@ func (m *mlx) Status() string {
 func (m *mlx) GetDiskUsage() (int64, error) {
 	return 0, nil
 }
+
+func (m *mlx) GetRequiredMemoryForModel(model string, config *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
+	return nil, errors.New("not implemented")
+}
diff --git a/pkg/inference/backends/vllm/vllm.go b/pkg/inference/backends/vllm/vllm.go
index c03c367ad..86334d4e6 100644
--- a/pkg/inference/backends/vllm/vllm.go
+++ b/pkg/inference/backends/vllm/vllm.go
@@ -62,3 +62,7 @@ func (v *vLLM) Status() string {
 func (v *vLLM) GetDiskUsage() (int64, error) {
 	return 0, nil
 }
+
+func (v *vLLM) GetRequiredMemoryForModel(model string, config *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
+	return nil, errors.New("not implemented")
+}
diff --git a/pkg/inference/scheduling/loader.go b/pkg/inference/scheduling/loader.go
index 8efee1455..5ac78e412 100644
--- a/pkg/inference/scheduling/loader.go
+++ b/pkg/inference/scheduling/loader.go
@@ -10,10 +10,12 @@ import (
 	"time"

 	"github.com/docker/model-runner/pkg/environment"
+	"github.com/docker/model-runner/pkg/gpuinfo"
 	"github.com/docker/model-runner/pkg/inference"
 	"github.com/docker/model-runner/pkg/inference/models"
 	"github.com/docker/model-runner/pkg/logging"
 	"github.com/docker/model-runner/pkg/metrics"
+	"github.com/elastic/go-sysinfo"
 )

 const (
@@ -71,7 +73,7 @@ type loader struct {
 	// runnerIdleTimeout is the loader-specific default runner idle timeout.
 	runnerIdleTimeout time.Duration
 	// totalMemory is the total system memory allocated to the loader.
-	totalMemory uint64
+	totalMemory inference.RequiredMemory
 	// idleCheck is used to signal the run loop when timestamps have updated.
 	idleCheck chan struct{}
 	// guard is a sempahore controlling access to all subsequent fields. It is
@@ -82,7 +84,7 @@ type loader struct {
 	// loadsEnabled signals that loads are currently enabled.
 	loadsEnabled bool
 	// availableMemory is the available portion of the loader's total memory.
-	availableMemory uint64
+	availableMemory inference.RequiredMemory
 	// waiters is the set of signal channels associated with waiting loaders. We
 	// use a set of signaling channels (instead of a sync.Cond) to enable
 	// polling. Each signaling channel should be buffered (with size 1).
@@ -95,7 +97,7 @@ type loader struct {
 	// references maps slot indices to reference counts.
 	references []uint
 	// allocations maps slot indices to memory allocation sizes.
-	allocations []uint64
+	allocations []inference.RequiredMemory
 	// timestamps maps slot indices to last usage times. Values in this slice
 	// are only valid if the corresponding reference count is zero.
 	timestamps []time.Time
@@ -111,6 +113,7 @@ func newLoader(
 	backends map[string]inference.Backend,
 	modelManager *models.Manager,
 	openAIRecorder *metrics.OpenAIRecorder,
+	gpuInfo *gpuinfo.GPUInfo,
 ) *loader {
 	// Compute the number of runner slots to allocate. Because of RAM and VRAM
 	// limitations, it's unlikely that we'll ever be able to fully populate
@@ -132,20 +135,31 @@ func newLoader(
 	}

 	// Compute the amount of available memory.
-	//
-	// TODO: For now, we treat the system as having memory size 1 and all models
-	// as having size 1 (and thus we'll only load a single model at a time).
-	// However, the loader is designed to use "real" values for each and to
-	// schedule appropriately. Thus, we should switch to polling the system
-	// VRAM size here (and potentially even reserving a portion of it) and
-	// computing model size through estimation (using parameter count and
-	// quantization data type size).
-	//
-	// HACK: On GPU-enabled cloud engines, we'll bump this to 2. We can remove
-	// this once we have VRAM estimation.
-	totalMemory := uint64(1)
-	if isGPUEnabledCloudEnvironment {
-		totalMemory = 2
+	// TODO(p1-0tr): improve error handling
+	vramSize, err := gpuInfo.GetVRAMSize()
+	if err != nil {
+		vramSize = 1
+		log.Warnf("Could not read VRAM size: %s", err)
+	} else {
+		log.Infof("Running on system with %dMB VRAM", vramSize/1024/1024)
+	}
+	ramSize := uint64(1)
+	hostInfo, err := sysinfo.Host()
+	if err != nil {
+		log.Warnf("Could not read host info: %s", err)
+	} else {
+		ram, err := hostInfo.Memory()
+		if err != nil {
+			log.Warnf("Could not read host RAM size: %s", err)
+		} else {
+			ramSize = ram.Total
+			log.Infof("Running on system with %dMB RAM", ramSize/1024/1024)
+		}
+	}
+
+	totalMemory := inference.RequiredMemory{
+		RAM:  ramSize,
+		VRAM: vramSize,
 	}

 	// Create the loader.
@@ -162,7 +176,7 @@
 		runners:        make(map[runnerKey]runnerInfo, nSlots),
 		slots:          make([]*runner, nSlots),
 		references:     make([]uint, nSlots),
-		allocations:    make([]uint64, nSlots),
+		allocations:    make([]inference.RequiredMemory, nSlots),
 		timestamps:     make([]time.Time, nSlots),
 		runnerConfigs:  make(map[runnerKey]inference.BackendConfiguration),
 		openAIRecorder: openAIRecorder,
@@ -219,8 +233,9 @@ func (l *loader) evict(idleOnly bool) int {
 		)
 		l.slots[runnerInfo.slot].terminate()
 		l.slots[runnerInfo.slot] = nil
-		l.availableMemory += l.allocations[runnerInfo.slot]
-		l.allocations[runnerInfo.slot] = 0
+		l.availableMemory.RAM += l.allocations[runnerInfo.slot].RAM
+		l.availableMemory.VRAM += l.allocations[runnerInfo.slot].VRAM
+		l.allocations[runnerInfo.slot] = inference.RequiredMemory{RAM: 0, VRAM: 0}
 		l.timestamps[runnerInfo.slot] = time.Time{}
 		delete(l.runners, r)
 	}
@@ -240,8 +255,9 @@ func (l *loader) evictRunner(backend, model string, mode inference.BackendMode)
 	)
 	l.slots[runnerInfo.slot].terminate()
 	l.slots[runnerInfo.slot] = nil
-	l.availableMemory += l.allocations[runnerInfo.slot]
-	l.allocations[runnerInfo.slot] = 0
+	l.availableMemory.RAM += l.allocations[runnerInfo.slot].RAM
+	l.availableMemory.VRAM += l.allocations[runnerInfo.slot].VRAM
+	l.allocations[runnerInfo.slot] = inference.RequiredMemory{RAM: 0, VRAM: 0}
 	l.timestamps[runnerInfo.slot] = time.Time{}
 	delete(l.runners, r)
 }
@@ -399,15 +415,24 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string

 	// Estimate the amount of memory that will be used by the model and check
 	// that we're even capable of loading it.
-	//
-	// TODO: For now, we treat the system as having memory size 1 and all models
-	// as having size 1 (and thus we'll only load a single model at a time).
-	// However, the loader is designed to use "real" values for each and to
-	// schedule appropriately. Thus, we should switch to computing model size
-	// here through estimation (using parameter count and quantization data type
-	// size).
-	memory := uint64(1)
-	if memory > l.totalMemory {
+	var runnerConfig *inference.BackendConfiguration
+	if rc, ok := l.runnerConfigs[runnerKey{backendName, modelID, mode}]; ok {
+		runnerConfig = &rc
+	}
+	memory, err := backend.GetRequiredMemoryForModel(modelID, runnerConfig)
+	if err != nil {
+		return nil, err
+	}
+	l.log.Infof("Loading %s, which will require %dMB RAM and %dMB VRAM", modelID, memory.RAM/1024/1024, memory.VRAM/1024/1024)
+	if l.totalMemory.RAM == 1 {
+		l.log.Warnf("RAM size unknown; assuming the model fits, but loading only one model at a time.")
+		memory.RAM = 1
+	}
+	if l.totalMemory.VRAM == 1 {
+		l.log.Warnf("VRAM size unknown; assuming the model fits, but loading only one model at a time.")
+		memory.VRAM = 1
+	}
+	if memory.RAM > l.totalMemory.RAM || memory.VRAM > l.totalMemory.VRAM {
 		return nil, errModelTooBig
 	}

@@ -454,12 +479,12 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string

 	// If there's not sufficient memory or all slots are full, then try
 	// evicting unused runners.
-	if memory > l.availableMemory || len(l.runners) == len(l.slots) {
+	if memory.RAM > l.availableMemory.RAM || memory.VRAM > l.availableMemory.VRAM || len(l.runners) == len(l.slots) {
 		l.evict(false)
 	}

 	// If there's sufficient memory and a free slot, then find the slot.
-	if memory <= l.availableMemory && len(l.runners) < len(l.slots) {
+	if memory.RAM <= l.availableMemory.RAM && memory.VRAM <= l.availableMemory.VRAM && len(l.runners) < len(l.slots) {
 		for s, runner := range l.slots {
 			if runner == nil {
 				slot = s
@@ -499,11 +524,13 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
 	}

 	// Perform registration and return the runner.
-	l.availableMemory -= memory
+	l.availableMemory.RAM -= memory.RAM
+	l.availableMemory.VRAM -= memory.VRAM
 	l.runners[runnerKey{backendName, modelID, mode}] = runnerInfo{slot, modelRef}
 	l.slots[slot] = runner
 	l.references[slot] = 1
-	l.allocations[slot] = memory
+	l.allocations[slot].RAM = memory.RAM
+	l.allocations[slot].VRAM = memory.VRAM
 	return runner, nil
 }

diff --git a/pkg/inference/scheduling/scheduler.go b/pkg/inference/scheduling/scheduler.go
index 23477d12c..05d7926b4 100644
--- a/pkg/inference/scheduling/scheduler.go
+++ b/pkg/inference/scheduling/scheduler.go
@@ -13,6 +13,7 @@ import (
 	"time"

 	"github.com/docker/model-distribution/distribution"
+	"github.com/docker/model-runner/pkg/gpuinfo"
 	"github.com/docker/model-runner/pkg/inference"
 	"github.com/docker/model-runner/pkg/inference/models"
 	"github.com/docker/model-runner/pkg/logging"
@@ -55,6 +56,7 @@ func NewScheduler(
 	httpClient *http.Client,
 	allowedOrigins []string,
 	tracker *metrics.Tracker,
+	gpuInfo *gpuinfo.GPUInfo,
 ) *Scheduler {
 	openAIRecorder := metrics.NewOpenAIRecorder(log.WithField("component", "openai-recorder"), modelManager)

@@ -65,7 +67,7 @@ func NewScheduler(
 		defaultBackend: defaultBackend,
 		modelManager:   modelManager,
 		installer:      newInstaller(log, backends, httpClient),
-		loader:         newLoader(log, backends, modelManager, openAIRecorder),
+		loader:         newLoader(log, backends, modelManager, openAIRecorder, gpuInfo),
 		router:         http.NewServeMux(),
 		tracker:        tracker,
 		openAIRecorder: openAIRecorder,
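
Usage sketch (not part of the patch): a minimal standalone program showing how the pieces introduced above are meant to compose — the gpuinfo probe supplies a VRAM budget, and scheduling admission compares it against a RequiredMemory estimate, mirroring the checks in newLoader and load. The fits helper, the hard-coded install path, and the example RAM/estimate numbers are illustrative assumptions only; the package paths and types come from this diff.

package main

import (
	"fmt"
	"log"

	"github.com/docker/model-runner/pkg/gpuinfo"
	"github.com/docker/model-runner/pkg/inference"
)

// fits is a hypothetical helper mirroring the loader's admission check:
// a model is loadable only if both its RAM and VRAM requirements fit the budget.
func fits(required, available inference.RequiredMemory) bool {
	return required.RAM <= available.RAM && required.VRAM <= available.VRAM
}

func main() {
	// The install path normally comes from the runner's configuration
	// (llamaServerPath in main.go); this literal is only for illustration.
	gpu := gpuinfo.New("/opt/model-runner")

	vram, err := gpu.GetVRAMSize()
	if err != nil {
		// Same fallback the loader uses: treat VRAM as "unknown" (size 1),
		// which effectively limits scheduling to one model at a time.
		log.Printf("could not read VRAM size: %v", err)
		vram = 1
	}

	// Assumed budget and estimate, purely for demonstration.
	budget := inference.RequiredMemory{RAM: 16 << 30, VRAM: vram}
	estimate := inference.RequiredMemory{RAM: 2 << 30, VRAM: 6 << 30}

	fmt.Printf("VRAM budget: %d MiB, model fits: %t\n", budget.VRAM/1024/1024, fits(estimate, budget))
}

In the real code path, the estimate comes from Backend.GetRequiredMemoryForModel (backed by gguf-parser-go in the llama.cpp backend) rather than a literal, and the loader additionally tracks availableMemory per slot as shown in the loader.go hunks above.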