17 changes: 0 additions & 17 deletions pkg/gpuinfo/gpuinfo.go

This file was deleted.

19 changes: 0 additions & 19 deletions pkg/gpuinfo/memory_darwin_cgo.go

This file was deleted.

10 changes: 0 additions & 10 deletions pkg/gpuinfo/memory_darwin_nocgo.go

This file was deleted.

19 changes: 0 additions & 19 deletions pkg/gpuinfo/memory_linux_cgo.go

This file was deleted.

10 changes: 0 additions & 10 deletions pkg/gpuinfo/memory_linux_nocgo.go

This file was deleted.

41 changes: 0 additions & 41 deletions pkg/gpuinfo/memory_windows.go

This file was deleted.

5 changes: 0 additions & 5 deletions pkg/gpuinfo/metal.h

This file was deleted.

15 changes: 0 additions & 15 deletions pkg/gpuinfo/metal.m

This file was deleted.

71 changes: 0 additions & 71 deletions pkg/gpuinfo/nvidia.c

This file was deleted.

6 changes: 0 additions & 6 deletions pkg/gpuinfo/nvidia.h

This file was deleted.

3 changes: 0 additions & 3 deletions pkg/inference/backend.go
@@ -159,7 +159,4 @@ type Backend interface {
Status() string
// GetDiskUsage returns the disk usage of the backend.
GetDiskUsage() (int64, error)
// GetRequiredMemoryForModel returns the required working memory for a given
// model.
GetRequiredMemoryForModel(ctx context.Context, model string, config *BackendConfiguration) (RequiredMemory, error)
}
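
For context on what callers lose with this interface change, here is a minimal sketch of how the removed GetRequiredMemoryForModel could have been consulted before scheduling a model. checkFits, totalRAM, and totalVRAM are hypothetical names; Backend, BackendConfiguration, and RequiredMemory are the types shown in this interface:

package scheduler

import (
	"context"
	"fmt"

	"github.com/docker/model-runner/pkg/inference"
)

// checkFits is a hypothetical helper: it asks the backend for its memory
// estimate and compares it against the host's available RAM and VRAM.
func checkFits(ctx context.Context, b inference.Backend, model string,
	cfg *inference.BackendConfiguration, totalRAM, totalVRAM uint64) (bool, error) {
	req, err := b.GetRequiredMemoryForModel(ctx, model, cfg)
	if err != nil {
		return false, fmt.Errorf("estimating memory for %s: %w", model, err)
	}
	// RequiredMemory carries RAM and VRAM estimates (bytes, per the
	// llama.cpp estimator removed elsewhere in this PR).
	return req.RAM <= totalRAM && req.VRAM <= totalVRAM, nil
}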
142 changes: 1 addition & 141 deletions pkg/inference/backends/llamacpp/llamacpp.go
@@ -14,11 +14,8 @@ import (
"strconv"
"strings"

"github.com/docker/model-runner/pkg/distribution/types"
v1 "github.com/docker/model-runner/pkg/go-containerregistry/pkg/v1"
parser "github.com/gpustack/gguf-parser-go"

"github.com/docker/model-runner/pkg/diskusage"
"github.com/docker/model-runner/pkg/distribution/types"
"github.com/docker/model-runner/pkg/inference"
"github.com/docker/model-runner/pkg/inference/backends"
"github.com/docker/model-runner/pkg/inference/config"
@@ -192,143 +189,6 @@ func (l *llamaCpp) GetDiskUsage() (int64, error) {
return size, nil
}

func (l *llamaCpp) GetRequiredMemoryForModel(ctx context.Context, model string, config *inference.BackendConfiguration) (inference.RequiredMemory, error) {
mdlGguf, mdlConfig, err := l.parseModel(ctx, model)
if err != nil {
return inference.RequiredMemory{}, &inference.ErrGGUFParse{Err: err}
}

configuredContextSize := GetContextSize(mdlConfig, config)
contextSize := int32(4096) // default context size
if configuredContextSize != nil {
contextSize = *configuredContextSize
}

var ngl uint64
if l.gpuSupported {
ngl = 999
if runtime.GOOS == "windows" && runtime.GOARCH == "arm64" && mdlConfig.Quantization != "Q4_0" {
ngl = 0 // only Q4_0 models can be accelerated on Adreno
}
}

memory := l.estimateMemoryFromGGUF(mdlGguf, contextSize, ngl)

if config != nil && config.Speculative != nil && config.Speculative.DraftModel != "" {
draftGguf, _, err := l.parseModel(ctx, config.Speculative.DraftModel)
if err != nil {
return inference.RequiredMemory{}, fmt.Errorf("estimating draft model memory: %w", &inference.ErrGGUFParse{Err: err})
}
draftMemory := l.estimateMemoryFromGGUF(draftGguf, contextSize, ngl)
memory.RAM += draftMemory.RAM
memory.VRAM += draftMemory.VRAM
}

if runtime.GOOS == "windows" && runtime.GOARCH == "arm64" {
memory.VRAM = 1
}

return memory, nil
}

// parseModel parses a model (local or remote) and returns the GGUF file and config.
func (l *llamaCpp) parseModel(ctx context.Context, model string) (*parser.GGUFFile, types.Config, error) {
inStore, err := l.modelManager.InStore(model)
if err != nil {
return nil, types.Config{}, fmt.Errorf("checking if model is in local store: %w", err)
}
if inStore {
return l.parseLocalModel(model)
}
return l.parseRemoteModel(ctx, model)
}

// estimateMemoryFromGGUF estimates memory requirements from a parsed GGUF file.
func (l *llamaCpp) estimateMemoryFromGGUF(ggufFile *parser.GGUFFile, contextSize int32, ngl uint64) inference.RequiredMemory {
estimate := ggufFile.EstimateLLaMACppRun(
parser.WithLLaMACppContextSize(contextSize),
parser.WithLLaMACppLogicalBatchSize(2048),
parser.WithLLaMACppOffloadLayers(ngl),
)
ram := uint64(estimate.Devices[0].Weight.Sum() + estimate.Devices[0].KVCache.Sum() + estimate.Devices[0].Computation.Sum())
var vram uint64
if len(estimate.Devices) > 1 {
vram = uint64(estimate.Devices[1].Weight.Sum() + estimate.Devices[1].KVCache.Sum() + estimate.Devices[1].Computation.Sum())
}

return inference.RequiredMemory{
RAM: ram,
VRAM: vram,
}
}

func (l *llamaCpp) parseLocalModel(model string) (*parser.GGUFFile, types.Config, error) {
bundle, err := l.modelManager.GetBundle(model)
if err != nil {
return nil, types.Config{}, fmt.Errorf("getting model(%s): %w", model, err)
}
modelGGUF, err := parser.ParseGGUFFile(bundle.GGUFPath())
if err != nil {
return nil, types.Config{}, fmt.Errorf("parsing gguf(%s): %w", bundle.GGUFPath(), err)
}
return modelGGUF, bundle.RuntimeConfig(), nil
}

func (l *llamaCpp) parseRemoteModel(ctx context.Context, model string) (*parser.GGUFFile, types.Config, error) {
mdl, err := l.modelManager.GetRemote(ctx, model)
if err != nil {
return nil, types.Config{}, fmt.Errorf("getting remote model(%s): %w", model, err)
}
layers, err := mdl.Layers()
if err != nil {
return nil, types.Config{}, fmt.Errorf("getting layers of model(%s): %w", model, err)
}
ggufLayers := getGGUFLayers(layers)
if len(ggufLayers) != 1 {
return nil, types.Config{}, fmt.Errorf(
"remote memory estimation only supported for models with single GGUF layer, found %d layers", len(ggufLayers),
)
}
ggufDigest, err := ggufLayers[0].Digest()
if err != nil {
return nil, types.Config{}, fmt.Errorf("getting digest of GGUF layer for model(%s): %w", model, err)
}
if ggufDigest.String() == "" {
return nil, types.Config{}, fmt.Errorf("model(%s) has no GGUF layer", model)
}
blobURL, err := l.modelManager.GetRemoteBlobURL(model, ggufDigest)
if err != nil {
return nil, types.Config{}, fmt.Errorf("getting GGUF blob URL for model(%s): %w", model, err)
}
tok, err := l.modelManager.BearerTokenForModel(ctx, model)
if err != nil {
return nil, types.Config{}, fmt.Errorf("getting bearer token for model(%s): %w", model, err)
}
mdlGguf, err := parser.ParseGGUFFileRemote(ctx, blobURL, parser.UseBearerAuth(tok))
if err != nil {
return nil, types.Config{}, fmt.Errorf("parsing GGUF for model(%s): %w", model, err)
}
config, err := mdl.Config()
if err != nil {
return nil, types.Config{}, fmt.Errorf("getting config for model(%s): %w", model, err)
}
return mdlGguf, config, nil
}

func getGGUFLayers(layers []v1.Layer) []v1.Layer {
var filtered []v1.Layer
for _, layer := range layers {
mt, err := layer.MediaType()
if err != nil {
continue
}
if mt == types.MediaTypeGGUF {
filtered = append(filtered, layer)
}
}
return filtered
}

func (l *llamaCpp) checkGPUSupport(ctx context.Context) bool {
binPath := l.vendoredServerStoragePath
if l.updatedLlamaCpp {
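
For anyone who needs the estimation logic elsewhere, a sketch mirroring the removed estimateMemoryFromGGUF/parseLocalModel path, built only from the gguf-parser-go calls visible in the deleted code. The model.gguf path in main is a placeholder, and the 4096 context, 2048 logical batch, and 999 offload-layer values are the defaults that appear in the diff:

package main

import (
	"fmt"

	parser "github.com/gpustack/gguf-parser-go"
)

// estimate mirrors the removed estimateMemoryFromGGUF: device 0 of the
// llama.cpp run estimate is treated as host RAM, device 1 (if present)
// as GPU VRAM, each summed over weights, KV cache, and compute buffers.
func estimate(ggufPath string, contextSize int32, offloadLayers uint64) (ram, vram uint64, err error) {
	f, err := parser.ParseGGUFFile(ggufPath)
	if err != nil {
		return 0, 0, fmt.Errorf("parsing gguf(%s): %w", ggufPath, err)
	}
	e := f.EstimateLLaMACppRun(
		parser.WithLLaMACppContextSize(contextSize),
		parser.WithLLaMACppLogicalBatchSize(2048),
		parser.WithLLaMACppOffloadLayers(offloadLayers),
	)
	ram = uint64(e.Devices[0].Weight.Sum() + e.Devices[0].KVCache.Sum() + e.Devices[0].Computation.Sum())
	if len(e.Devices) > 1 {
		vram = uint64(e.Devices[1].Weight.Sum() + e.Devices[1].KVCache.Sum() + e.Devices[1].Computation.Sum())
	}
	return ram, vram, nil
}

func main() {
	ram, vram, err := estimate("model.gguf", 4096, 999) // 999 ≈ offload all layers, as in the removed code
	if err != nil {
		panic(err)
	}
	fmt.Printf("estimated RAM: %d bytes, VRAM: %d bytes\n", ram, vram)
}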