diff --git a/pkg/gpuinfo/gpuinfo.go b/pkg/gpuinfo/gpuinfo.go
deleted file mode 100644
index 3bc8f66ee..000000000
--- a/pkg/gpuinfo/gpuinfo.go
+++ /dev/null
@@ -1,17 +0,0 @@
-package gpuinfo
-
-type GPUInfo struct {
-	// modelRuntimeInstallPath is the location where DMR installed it's llama-server
-	// and accompanying tools
-	modelRuntimeInstallPath string
-}
-
-func New(modelRuntimeInstallPath string) *GPUInfo {
-	return &GPUInfo{
-		modelRuntimeInstallPath: modelRuntimeInstallPath,
-	}
-}
-
-func (g *GPUInfo) GetVRAMSize() (uint64, error) {
-	return getVRAMSize(g.modelRuntimeInstallPath)
-}
diff --git a/pkg/gpuinfo/memory_darwin_cgo.go b/pkg/gpuinfo/memory_darwin_cgo.go
deleted file mode 100644
index 95a20e3da..000000000
--- a/pkg/gpuinfo/memory_darwin_cgo.go
+++ /dev/null
@@ -1,19 +0,0 @@
-//go:build darwin && cgo
-
-package gpuinfo
-
-/*
-#cgo LDFLAGS: -framework Metal
-#include "metal.h"
-*/
-import "C"
-import "errors"
-
-// getVRAMSize returns total system GPU memory in bytes
-func getVRAMSize(_ string) (uint64, error) {
-	vramSize := C.getVRAMSize()
-	if vramSize == 0 {
-		return 0, errors.New("could not get metal VRAM size")
-	}
-	return uint64(vramSize), nil
-}
diff --git a/pkg/gpuinfo/memory_darwin_nocgo.go b/pkg/gpuinfo/memory_darwin_nocgo.go
deleted file mode 100644
index 915af4487..000000000
--- a/pkg/gpuinfo/memory_darwin_nocgo.go
+++ /dev/null
@@ -1,10 +0,0 @@
-//go:build darwin && !cgo
-
-package gpuinfo
-
-import "errors"
-
-// getVRAMSize returns total system GPU memory in bytes
-func getVRAMSize(_ string) (uint64, error) {
-	return 0, errors.New("unimplemented without cgo")
-}
diff --git a/pkg/gpuinfo/memory_linux_cgo.go b/pkg/gpuinfo/memory_linux_cgo.go
deleted file mode 100644
index 041219ed2..000000000
--- a/pkg/gpuinfo/memory_linux_cgo.go
+++ /dev/null
@@ -1,19 +0,0 @@
-//go:build linux && cgo
-
-package gpuinfo
-
-/*
-#cgo LDFLAGS: -ldl
-#include "nvidia.h"
-*/
-import "C"
-import "errors"
-
-// getVRAMSize returns total system GPU memory in bytes
-func getVRAMSize(_ string) (uint64, error) {
-	vramSize := C.getVRAMSize()
-	if vramSize == 0 {
-		return 0, errors.New("could not get nvidia VRAM size")
-	}
-	return uint64(vramSize), nil
-}
diff --git a/pkg/gpuinfo/memory_linux_nocgo.go b/pkg/gpuinfo/memory_linux_nocgo.go
deleted file mode 100644
index abe74c18e..000000000
--- a/pkg/gpuinfo/memory_linux_nocgo.go
+++ /dev/null
@@ -1,10 +0,0 @@
-//go:build linux && !cgo
-
-package gpuinfo
-
-import "errors"
-
-// getVRAMSize returns total system GPU memory in bytes
-func getVRAMSize(_ string) (uint64, error) {
-	return 0, errors.New("unimplemented without cgo")
-}
diff --git a/pkg/gpuinfo/memory_windows.go b/pkg/gpuinfo/memory_windows.go
deleted file mode 100644
index f627ed130..000000000
--- a/pkg/gpuinfo/memory_windows.go
+++ /dev/null
@@ -1,41 +0,0 @@
-package gpuinfo
-
-import (
-	"bufio"
-	"context"
-	"errors"
-	"os/exec"
-	"path/filepath"
-	"runtime"
-	"strconv"
-	"strings"
-	"time"
-)
-
-// getVRAMSize returns total system GPU memory in bytes
-func getVRAMSize(modelRuntimeInstallPath string) (uint64, error) {
-	if runtime.GOARCH == "arm64" {
-		// TODO(p1-0tr): For now, on windows/arm64, stick to the old behaviour. This will
-		// require backend.GetRequiredMemoryForModel to return 1 as well.
-		return 1, nil
-	}
-
-	nvGPUInfoBin := filepath.Join(modelRuntimeInstallPath, "bin", "com.docker.nv-gpu-info.exe")
-
-	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
-	defer cancel()
-	cmd := exec.CommandContext(ctx, nvGPUInfoBin)
-	out, err := cmd.CombinedOutput()
-	if err != nil {
-		return 0, err
-	}
-	sc := bufio.NewScanner(strings.NewReader(string(out)))
-	for sc.Scan() {
-		vram, found := strings.CutPrefix(sc.Text(), "GPU[0]: dedicated memory:")
-		if found {
-			vram = strings.TrimSpace(vram)
-			return strconv.ParseUint(vram, 10, 64)
-		}
-	}
-	return 0, errors.New("unexpected nv-gpu-info output format")
-}
diff --git a/pkg/gpuinfo/metal.h b/pkg/gpuinfo/metal.h
deleted file mode 100644
index d7e96a5e9..000000000
--- a/pkg/gpuinfo/metal.h
+++ /dev/null
@@ -1,5 +0,0 @@
-//go:build darwin
-
-#include <stddef.h>
-
-size_t getVRAMSize();
\ No newline at end of file
diff --git a/pkg/gpuinfo/metal.m b/pkg/gpuinfo/metal.m
deleted file mode 100644
index edcfce1ec..000000000
--- a/pkg/gpuinfo/metal.m
+++ /dev/null
@@ -1,15 +0,0 @@
-//go:build darwin
-
-#include <Metal/Metal.h>
-
-#include "metal.h"
-
-size_t getVRAMSize() {
-    id<MTLDevice> device = MTLCreateSystemDefaultDevice();
-    if (device) {
-        size_t vramsz = [device recommendedMaxWorkingSetSize];
-        [device release];
-        return vramsz;
-    }
-    return 0;
-}
\ No newline at end of file
diff --git a/pkg/gpuinfo/nvidia.c b/pkg/gpuinfo/nvidia.c
deleted file mode 100644
index e00aeb189..000000000
--- a/pkg/gpuinfo/nvidia.c
+++ /dev/null
@@ -1,71 +0,0 @@
-//go:build linux
-
-#include "nvidia.h"
-
-typedef enum {
-    NVML_SUCCESS = 0
-} nvmlReturn_t;
-
-typedef struct {
-    unsigned long long total;
-    unsigned long long free;
-    unsigned long long used;
-} nvmlMemory_t;
-
-typedef void* nvmlDevice_t;
-
-size_t getVRAMSize() {
-    void* handle;
-    nvmlReturn_t (*nvmlInit)(void);
-    nvmlReturn_t (*nvmlShutdown)(void);
-    nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int index, nvmlDevice_t* device);
-    nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t device, nvmlMemory_t* memory);
-
-    nvmlReturn_t result;
-    nvmlDevice_t device;
-    nvmlMemory_t memory;
-
-    // Try to load libnvidia-ml.so.1 first, then fallback to libnvidia-ml.so
-    handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY);
-    if (!handle) {
-        handle = dlopen("libnvidia-ml.so", RTLD_LAZY);
-        if (!handle) {
-            return 0;
-        }
-    }
-
-    // Load required functions
-    nvmlInit = dlsym(handle, "nvmlInit");
-    nvmlShutdown = dlsym(handle, "nvmlShutdown");
-    nvmlDeviceGetHandleByIndex = dlsym(handle, "nvmlDeviceGetHandleByIndex");
-    nvmlDeviceGetMemoryInfo = dlsym(handle, "nvmlDeviceGetMemoryInfo");
-
-    if (!nvmlInit || !nvmlShutdown || !nvmlDeviceGetHandleByIndex || !nvmlDeviceGetMemoryInfo) {
-        dlclose(handle);
-        return 0;
-    }
-
-    result = nvmlInit();
-    if (result != NVML_SUCCESS) {
-        dlclose(handle);
-        return 0;
-    }
-
-    result = nvmlDeviceGetHandleByIndex(0, &device);
-    if (result != NVML_SUCCESS) {
-        nvmlShutdown();
-        dlclose(handle);
-        return 0;
-    }
-
-    result = nvmlDeviceGetMemoryInfo(device, &memory);
-    if (result != NVML_SUCCESS) {
-        nvmlShutdown();
-        dlclose(handle);
-        return 0;
-    }
-
-    nvmlShutdown();
-    dlclose(handle);
-    return memory.total;
-}
\ No newline at end of file
diff --git a/pkg/gpuinfo/nvidia.h b/pkg/gpuinfo/nvidia.h
deleted file mode 100644
index 302673b5e..000000000
--- a/pkg/gpuinfo/nvidia.h
+++ /dev/null
@@ -1,6 +0,0 @@
-//go:build linux
-
-#include <stddef.h>
-#include <dlfcn.h>
-
-size_t getVRAMSize();
\ No newline at end of file
diff --git a/pkg/inference/backend.go b/pkg/inference/backend.go
index 36b7580a1..716de07a4 100644
--- a/pkg/inference/backend.go
+++ b/pkg/inference/backend.go
@@ -159,7 +159,4 @@ type Backend interface {
 	Status() string
 	// GetDiskUsage returns the disk usage of the backend.
 	GetDiskUsage() (int64, error)
-	// GetRequiredMemoryForModel returns the required working memory for a given
-	// model.
-	GetRequiredMemoryForModel(ctx context.Context, model string, config *BackendConfiguration) (RequiredMemory, error)
 }
diff --git a/pkg/inference/backends/llamacpp/llamacpp.go b/pkg/inference/backends/llamacpp/llamacpp.go
index 8c8ba6f76..b7fbabba8 100644
--- a/pkg/inference/backends/llamacpp/llamacpp.go
+++ b/pkg/inference/backends/llamacpp/llamacpp.go
@@ -14,11 +14,8 @@ import (
 	"strconv"
 	"strings"
 
-	"github.com/docker/model-runner/pkg/distribution/types"
-	v1 "github.com/docker/model-runner/pkg/go-containerregistry/pkg/v1"
-	parser "github.com/gpustack/gguf-parser-go"
-
 	"github.com/docker/model-runner/pkg/diskusage"
+	"github.com/docker/model-runner/pkg/distribution/types"
 	"github.com/docker/model-runner/pkg/inference"
 	"github.com/docker/model-runner/pkg/inference/backends"
 	"github.com/docker/model-runner/pkg/inference/config"
@@ -192,143 +189,6 @@ func (l *llamaCpp) GetDiskUsage() (int64, error) {
 	return size, nil
 }
 
-func (l *llamaCpp) GetRequiredMemoryForModel(ctx context.Context, model string, config *inference.BackendConfiguration) (inference.RequiredMemory, error) {
-	mdlGguf, mdlConfig, err := l.parseModel(ctx, model)
-	if err != nil {
-		return inference.RequiredMemory{}, &inference.ErrGGUFParse{Err: err}
-	}
-
-	configuredContextSize := GetContextSize(mdlConfig, config)
-	contextSize := int32(4096) // default context size
-	if configuredContextSize != nil {
-		contextSize = *configuredContextSize
-	}
-
-	var ngl uint64
-	if l.gpuSupported {
-		ngl = 999
-		if runtime.GOOS == "windows" && runtime.GOARCH == "arm64" && mdlConfig.Quantization != "Q4_0" {
-			ngl = 0 // only Q4_0 models can be accelerated on Adreno
-		}
-	}
-
-	memory := l.estimateMemoryFromGGUF(mdlGguf, contextSize, ngl)
-
-	if config != nil && config.Speculative != nil && config.Speculative.DraftModel != "" {
-		draftGguf, _, err := l.parseModel(ctx, config.Speculative.DraftModel)
-		if err != nil {
-			return inference.RequiredMemory{}, fmt.Errorf("estimating draft model memory: %w", &inference.ErrGGUFParse{Err: err})
-		}
-		draftMemory := l.estimateMemoryFromGGUF(draftGguf, contextSize, ngl)
-		memory.RAM += draftMemory.RAM
-		memory.VRAM += draftMemory.VRAM
-	}
-
-	if runtime.GOOS == "windows" && runtime.GOARCH == "arm64" {
-		memory.VRAM = 1
-	}
-
-	return memory, nil
-}
-
-// parseModel parses a model (local or remote) and returns the GGUF file and config.
-func (l *llamaCpp) parseModel(ctx context.Context, model string) (*parser.GGUFFile, types.Config, error) {
-	inStore, err := l.modelManager.InStore(model)
-	if err != nil {
-		return nil, types.Config{}, fmt.Errorf("checking if model is in local store: %w", err)
-	}
-	if inStore {
-		return l.parseLocalModel(model)
-	}
-	return l.parseRemoteModel(ctx, model)
-}
-
-// estimateMemoryFromGGUF estimates memory requirements from a parsed GGUF file.
-func (l *llamaCpp) estimateMemoryFromGGUF(ggufFile *parser.GGUFFile, contextSize int32, ngl uint64) inference.RequiredMemory {
-	estimate := ggufFile.EstimateLLaMACppRun(
-		parser.WithLLaMACppContextSize(contextSize),
-		parser.WithLLaMACppLogicalBatchSize(2048),
-		parser.WithLLaMACppOffloadLayers(ngl),
-	)
-	ram := uint64(estimate.Devices[0].Weight.Sum() + estimate.Devices[0].KVCache.Sum() + estimate.Devices[0].Computation.Sum())
-	var vram uint64
-	if len(estimate.Devices) > 1 {
-		vram = uint64(estimate.Devices[1].Weight.Sum() + estimate.Devices[1].KVCache.Sum() + estimate.Devices[1].Computation.Sum())
-	}
-
-	return inference.RequiredMemory{
-		RAM:  ram,
-		VRAM: vram,
-	}
-}
-
-func (l *llamaCpp) parseLocalModel(model string) (*parser.GGUFFile, types.Config, error) {
-	bundle, err := l.modelManager.GetBundle(model)
-	if err != nil {
-		return nil, types.Config{}, fmt.Errorf("getting model(%s): %w", model, err)
-	}
-	modelGGUF, err := parser.ParseGGUFFile(bundle.GGUFPath())
-	if err != nil {
-		return nil, types.Config{}, fmt.Errorf("parsing gguf(%s): %w", bundle.GGUFPath(), err)
-	}
-	return modelGGUF, bundle.RuntimeConfig(), nil
-}
-
-func (l *llamaCpp) parseRemoteModel(ctx context.Context, model string) (*parser.GGUFFile, types.Config, error) {
-	mdl, err := l.modelManager.GetRemote(ctx, model)
-	if err != nil {
-		return nil, types.Config{}, fmt.Errorf("getting remote model(%s): %w", model, err)
-	}
-	layers, err := mdl.Layers()
-	if err != nil {
-		return nil, types.Config{}, fmt.Errorf("getting layers of model(%s): %w", model, err)
-	}
-	ggufLayers := getGGUFLayers(layers)
-	if len(ggufLayers) != 1 {
-		return nil, types.Config{}, fmt.Errorf(
-			"remote memory estimation only supported for models with single GGUF layer, found %d layers", len(ggufLayers),
-		)
-	}
-	ggufDigest, err := ggufLayers[0].Digest()
-	if err != nil {
-		return nil, types.Config{}, fmt.Errorf("getting digest of GGUF layer for model(%s): %w", model, err)
-	}
-	if ggufDigest.String() == "" {
-		return nil, types.Config{}, fmt.Errorf("model(%s) has no GGUF layer", model)
-	}
-	blobURL, err := l.modelManager.GetRemoteBlobURL(model, ggufDigest)
-	if err != nil {
-		return nil, types.Config{}, fmt.Errorf("getting GGUF blob URL for model(%s): %w", model, err)
-	}
-	tok, err := l.modelManager.BearerTokenForModel(ctx, model)
-	if err != nil {
-		return nil, types.Config{}, fmt.Errorf("getting bearer token for model(%s): %w", model, err)
-	}
-	mdlGguf, err := parser.ParseGGUFFileRemote(ctx, blobURL, parser.UseBearerAuth(tok))
-	if err != nil {
-		return nil, types.Config{}, fmt.Errorf("parsing GGUF for model(%s): %w", model, err)
-	}
-	config, err := mdl.Config()
-	if err != nil {
-		return nil, types.Config{}, fmt.Errorf("getting config for model(%s): %w", model, err)
-	}
-	return mdlGguf, config, nil
-}
-
-func getGGUFLayers(layers []v1.Layer) []v1.Layer {
-	var filtered []v1.Layer
-	for _, layer := range layers {
-		mt, err := layer.MediaType()
-		if err != nil {
-			continue
-		}
-		if mt == types.MediaTypeGGUF {
-			filtered = append(filtered, layer)
-		}
-	}
-	return filtered
-}
-
 func (l *llamaCpp) checkGPUSupport(ctx context.Context) bool {
 	binPath := l.vendoredServerStoragePath
 	if l.updatedLlamaCpp {
diff --git a/pkg/inference/backends/mlx/mlx.go b/pkg/inference/backends/mlx/mlx.go
index 420c7c5f3..ea4b71105 100644
--- a/pkg/inference/backends/mlx/mlx.go
+++ b/pkg/inference/backends/mlx/mlx.go
@@ -138,12 +138,3 @@ func (m *mlx) GetDiskUsage() (int64, error) {
 	// It's installed via pip in the system Python environment
 	return 0, nil
 }
-
-func (m *mlx) GetRequiredMemoryForModel(ctx context.Context, model string, config *inference.BackendConfiguration) (inference.RequiredMemory, error) {
-	// TODO: Implement accurate memory estimation based on model size.
-	// MLX runs on unified memory architecture (Apple Silicon), so memory estimation
-	// will need to account for the unified nature of RAM and VRAM on Apple Silicon.
-	// Returning an error prevents the scheduler from making incorrect decisions based
-	// on placeholder values.
-	return inference.RequiredMemory{}, errors.New("not implemented")
-}
diff --git a/pkg/inference/backends/vllm/vllm.go b/pkg/inference/backends/vllm/vllm.go
index aa50bb280..0155707e9 100644
--- a/pkg/inference/backends/vllm/vllm.go
+++ b/pkg/inference/backends/vllm/vllm.go
@@ -165,17 +165,6 @@ func (v *vLLM) GetDiskUsage() (int64, error) {
 	return size, nil
 }
 
-func (v *vLLM) GetRequiredMemoryForModel(_ context.Context, _ string, _ *inference.BackendConfiguration) (inference.RequiredMemory, error) {
-	if !platform.SupportsVLLM() {
-		return inference.RequiredMemory{}, errors.New("not implemented")
-	}
-
-	return inference.RequiredMemory{
-		RAM:  1,
-		VRAM: 1,
-	}, nil
-}
-
 func (v *vLLM) binaryPath() string {
 	return filepath.Join(vllmDir, "vllm")
 }
diff --git a/pkg/inference/scheduling/loader_test.go b/pkg/inference/scheduling/loader_test.go
index 7ac5841b2..ffa5b666e 100644
--- a/pkg/inference/scheduling/loader_test.go
+++ b/pkg/inference/scheduling/loader_test.go
@@ -39,10 +39,6 @@ func (m *mockBackend) GetDiskUsage() (int64, error) {
 	return 0, nil
 }
 
-func (m *mockBackend) GetRequiredMemoryForModel(ctx context.Context, model string, config *inference.BackendConfiguration) (inference.RequiredMemory, error) {
-	return m.requiredMemory, nil
-}
-
 func (m *mockBackend) UsesExternalModelManagement() bool {
 	return m.usesExternalModelMgmt
 }