diff --git a/pkg/gpuinfo/gpuinfo.go b/pkg/gpuinfo/gpuinfo.go
deleted file mode 100644
index 3bc8f66ee..000000000
--- a/pkg/gpuinfo/gpuinfo.go
+++ /dev/null
@@ -1,17 +0,0 @@
-package gpuinfo
-
-type GPUInfo struct {
-	// modelRuntimeInstallPath is the location where DMR installed it's llama-server
-	// and accompanying tools
-	modelRuntimeInstallPath string
-}
-
-func New(modelRuntimeInstallPath string) *GPUInfo {
-	return &GPUInfo{
-		modelRuntimeInstallPath: modelRuntimeInstallPath,
-	}
-}
-
-func (g *GPUInfo) GetVRAMSize() (uint64, error) {
-	return getVRAMSize(g.modelRuntimeInstallPath)
-}
diff --git a/pkg/gpuinfo/memory_darwin_cgo.go b/pkg/gpuinfo/memory_darwin_cgo.go
deleted file mode 100644
index 95a20e3da..000000000
--- a/pkg/gpuinfo/memory_darwin_cgo.go
+++ /dev/null
@@ -1,19 +0,0 @@
-//go:build darwin && cgo
-
-package gpuinfo
-
-/*
-#cgo LDFLAGS: -framework Metal
-#include "metal.h"
-*/
-import "C"
-import "errors"
-
-// getVRAMSize returns total system GPU memory in bytes
-func getVRAMSize(_ string) (uint64, error) {
-	vramSize := C.getVRAMSize()
-	if vramSize == 0 {
-		return 0, errors.New("could not get metal VRAM size")
-	}
-	return uint64(vramSize), nil
-}
diff --git a/pkg/gpuinfo/memory_darwin_nocgo.go b/pkg/gpuinfo/memory_darwin_nocgo.go
deleted file mode 100644
index 915af4487..000000000
--- a/pkg/gpuinfo/memory_darwin_nocgo.go
+++ /dev/null
@@ -1,10 +0,0 @@
-//go:build darwin && !cgo
-
-package gpuinfo
-
-import "errors"
-
-// getVRAMSize returns total system GPU memory in bytes
-func getVRAMSize(_ string) (uint64, error) {
-	return 0, errors.New("unimplemented without cgo")
-}
diff --git a/pkg/gpuinfo/memory_linux_cgo.go b/pkg/gpuinfo/memory_linux_cgo.go
deleted file mode 100644
index 041219ed2..000000000
--- a/pkg/gpuinfo/memory_linux_cgo.go
+++ /dev/null
@@ -1,19 +0,0 @@
-//go:build linux && cgo
-
-package gpuinfo
-
-/*
-#cgo LDFLAGS: -ldl
-#include "nvidia.h"
-*/
-import "C"
-import "errors"
-
-// getVRAMSize returns total system GPU memory in bytes
-func getVRAMSize(_ string) (uint64, error) {
-	vramSize := C.getVRAMSize()
-	if vramSize == 0 {
-		return 0, errors.New("could not get nvidia VRAM size")
-	}
-	return uint64(vramSize), nil
-}
diff --git a/pkg/gpuinfo/memory_linux_nocgo.go b/pkg/gpuinfo/memory_linux_nocgo.go
deleted file mode 100644
index abe74c18e..000000000
--- a/pkg/gpuinfo/memory_linux_nocgo.go
+++ /dev/null
@@ -1,10 +0,0 @@
-//go:build linux && !cgo
-
-package gpuinfo
-
-import "errors"
-
-// getVRAMSize returns total system GPU memory in bytes
-func getVRAMSize(_ string) (uint64, error) {
-	return 0, errors.New("unimplemented without cgo")
-}
diff --git a/pkg/gpuinfo/memory_windows.go b/pkg/gpuinfo/memory_windows.go
deleted file mode 100644
index f627ed130..000000000
--- a/pkg/gpuinfo/memory_windows.go
+++ /dev/null
@@ -1,41 +0,0 @@
-package gpuinfo
-
-import (
-	"bufio"
-	"context"
-	"errors"
-	"os/exec"
-	"path/filepath"
-	"runtime"
-	"strconv"
-	"strings"
-	"time"
-)
-
-// getVRAMSize returns total system GPU memory in bytes
-func getVRAMSize(modelRuntimeInstallPath string) (uint64, error) {
-	if runtime.GOARCH == "arm64" {
-		// TODO(p1-0tr): For now, on windows/arm64, stick to the old behaviour. This will
-		// require backend.GetRequiredMemoryForModel to return 1 as well.
-		return 1, nil
-	}
-
-	nvGPUInfoBin := filepath.Join(modelRuntimeInstallPath, "bin", "com.docker.nv-gpu-info.exe")
-
-	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
-	defer cancel()
-	cmd := exec.CommandContext(ctx, nvGPUInfoBin)
-	out, err := cmd.CombinedOutput()
-	if err != nil {
-		return 0, err
-	}
-	sc := bufio.NewScanner(strings.NewReader(string(out)))
-	for sc.Scan() {
-		vram, found := strings.CutPrefix(sc.Text(), "GPU[0]: dedicated memory:")
-		if found {
-			vram = strings.TrimSpace(vram)
-			return strconv.ParseUint(vram, 10, 64)
-		}
-	}
-	return 0, errors.New("unexpected nv-gpu-info output format")
-}
diff --git a/pkg/gpuinfo/metal.h b/pkg/gpuinfo/metal.h
deleted file mode 100644
index d7e96a5e9..000000000
--- a/pkg/gpuinfo/metal.h
+++ /dev/null
@@ -1,5 +0,0 @@
-//go:build darwin
-
-#include <stddef.h>
-
-size_t getVRAMSize();
\ No newline at end of file
diff --git a/pkg/gpuinfo/metal.m b/pkg/gpuinfo/metal.m
deleted file mode 100644
index edcfce1ec..000000000
--- a/pkg/gpuinfo/metal.m
+++ /dev/null
@@ -1,15 +0,0 @@
-//go:build darwin
-
-#include <Metal/Metal.h>
-
-#include "metal.h"
-
-size_t getVRAMSize() {
-    id<MTLDevice> device = MTLCreateSystemDefaultDevice();
-    if (device) {
-        size_t vramsz = [device recommendedMaxWorkingSetSize];
-        [device release];
-        return vramsz;
-    }
-    return 0;
-}
\ No newline at end of file
diff --git a/pkg/gpuinfo/nvidia.c b/pkg/gpuinfo/nvidia.c
deleted file mode 100644
index e00aeb189..000000000
--- a/pkg/gpuinfo/nvidia.c
+++ /dev/null
@@ -1,71 +0,0 @@
-//go:build linux
-
-#include "nvidia.h"
-
-typedef enum {
-    NVML_SUCCESS = 0
-} nvmlReturn_t;
-
-typedef struct {
-    unsigned long long total;
-    unsigned long long free;
-    unsigned long long used;
-} nvmlMemory_t;
-
-typedef void* nvmlDevice_t;
-
-size_t getVRAMSize() {
-    void* handle;
-    nvmlReturn_t (*nvmlInit)(void);
-    nvmlReturn_t (*nvmlShutdown)(void);
-    nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int index, nvmlDevice_t* device);
-    nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t device, nvmlMemory_t* memory);
-
-    nvmlReturn_t result;
-    nvmlDevice_t device;
-    nvmlMemory_t memory;
-
-    // Try to load libnvidia-ml.so.1 first, then fallback to libnvidia-ml.so
-    handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY);
-    if (!handle) {
-        handle = dlopen("libnvidia-ml.so", RTLD_LAZY);
-        if (!handle) {
-            return 0;
-        }
-    }
-
-    // Load required functions
-    nvmlInit = dlsym(handle, "nvmlInit");
-    nvmlShutdown = dlsym(handle, "nvmlShutdown");
-    nvmlDeviceGetHandleByIndex = dlsym(handle, "nvmlDeviceGetHandleByIndex");
-    nvmlDeviceGetMemoryInfo = dlsym(handle, "nvmlDeviceGetMemoryInfo");
-
-    if (!nvmlInit || !nvmlShutdown || !nvmlDeviceGetHandleByIndex || !nvmlDeviceGetMemoryInfo) {
-        dlclose(handle);
-        return 0;
-    }
-
-    result = nvmlInit();
-    if (result != NVML_SUCCESS) {
-        dlclose(handle);
-        return 0;
-    }
-
-    result = nvmlDeviceGetHandleByIndex(0, &device);
-    if (result != NVML_SUCCESS) {
-        nvmlShutdown();
-        dlclose(handle);
-        return 0;
-    }
-
-    result = nvmlDeviceGetMemoryInfo(device, &memory);
-    if (result != NVML_SUCCESS) {
-        nvmlShutdown();
-        dlclose(handle);
-        return 0;
-    }
-
-    nvmlShutdown();
-    dlclose(handle);
-    return memory.total;
-}
\ No newline at end of file
diff --git a/pkg/gpuinfo/nvidia.h b/pkg/gpuinfo/nvidia.h
deleted file mode 100644
index 302673b5e..000000000
--- a/pkg/gpuinfo/nvidia.h
+++ /dev/null
@@ -1,6 +0,0 @@
-//go:build linux
-
-#include <stddef.h>
-#include <dlfcn.h>
-
-size_t getVRAMSize();
\ No newline at end of file
diff --git a/pkg/inference/backend.go b/pkg/inference/backend.go
index 36b7580a1..716de07a4 100644
--- a/pkg/inference/backend.go
+++ b/pkg/inference/backend.go
@@ -159,7 +159,4 @@ type Backend interface {
 	Status() string
 	// GetDiskUsage returns the disk usage of the backend.
 	GetDiskUsage() (int64, error)
-	// GetRequiredMemoryForModel returns the required working memory for a given
-	// model.
-	GetRequiredMemoryForModel(ctx context.Context, model string, config *BackendConfiguration) (RequiredMemory, error)
 }
diff --git a/pkg/inference/backends/llamacpp/llamacpp.go b/pkg/inference/backends/llamacpp/llamacpp.go
index 8c8ba6f76..b7fbabba8 100644
--- a/pkg/inference/backends/llamacpp/llamacpp.go
+++ b/pkg/inference/backends/llamacpp/llamacpp.go
@@ -14,11 +14,8 @@ import (
 	"strconv"
 	"strings"
 
-	"github.com/docker/model-runner/pkg/distribution/types"
-	v1 "github.com/docker/model-runner/pkg/go-containerregistry/pkg/v1"
-	parser "github.com/gpustack/gguf-parser-go"
-
 	"github.com/docker/model-runner/pkg/diskusage"
+	"github.com/docker/model-runner/pkg/distribution/types"
 	"github.com/docker/model-runner/pkg/inference"
 	"github.com/docker/model-runner/pkg/inference/backends"
 	"github.com/docker/model-runner/pkg/inference/config"
@@ -192,143 +189,6 @@ func (l *llamaCpp) GetDiskUsage() (int64, error) {
 	return size, nil
 }
 
-func (l *llamaCpp) GetRequiredMemoryForModel(ctx context.Context, model string, config *inference.BackendConfiguration) (inference.RequiredMemory, error) {
-	mdlGguf, mdlConfig, err := l.parseModel(ctx, model)
-	if err != nil {
-		return inference.RequiredMemory{}, &inference.ErrGGUFParse{Err: err}
-	}
-
-	configuredContextSize := GetContextSize(mdlConfig, config)
-	contextSize := int32(4096) // default context size
-	if configuredContextSize != nil {
-		contextSize = *configuredContextSize
-	}
-
-	var ngl uint64
-	if l.gpuSupported {
-		ngl = 999
-		if runtime.GOOS == "windows" && runtime.GOARCH == "arm64" && mdlConfig.Quantization != "Q4_0" {
-			ngl = 0 // only Q4_0 models can be accelerated on Adreno
-		}
-	}
-
-	memory := l.estimateMemoryFromGGUF(mdlGguf, contextSize, ngl)
-
-	if config != nil && config.Speculative != nil && config.Speculative.DraftModel != "" {
-		draftGguf, _, err := l.parseModel(ctx, config.Speculative.DraftModel)
-		if err != nil {
-			return inference.RequiredMemory{}, fmt.Errorf("estimating draft model memory: %w", &inference.ErrGGUFParse{Err: err})
-		}
-		draftMemory := l.estimateMemoryFromGGUF(draftGguf, contextSize, ngl)
-		memory.RAM += draftMemory.RAM
-		memory.VRAM += draftMemory.VRAM
-	}
-
-	if runtime.GOOS == "windows" && runtime.GOARCH == "arm64" {
-		memory.VRAM = 1
-	}
-
-	return memory, nil
-}
-
-// parseModel parses a model (local or remote) and returns the GGUF file and config.
-func (l *llamaCpp) parseModel(ctx context.Context, model string) (*parser.GGUFFile, types.Config, error) {
-	inStore, err := l.modelManager.InStore(model)
-	if err != nil {
-		return nil, types.Config{}, fmt.Errorf("checking if model is in local store: %w", err)
-	}
-	if inStore {
-		return l.parseLocalModel(model)
-	}
-	return l.parseRemoteModel(ctx, model)
-}
-
-// estimateMemoryFromGGUF estimates memory requirements from a parsed GGUF file.
-func (l *llamaCpp) estimateMemoryFromGGUF(ggufFile *parser.GGUFFile, contextSize int32, ngl uint64) inference.RequiredMemory {
-	estimate := ggufFile.EstimateLLaMACppRun(
-		parser.WithLLaMACppContextSize(contextSize),
-		parser.WithLLaMACppLogicalBatchSize(2048),
-		parser.WithLLaMACppOffloadLayers(ngl),
-	)
-	ram := uint64(estimate.Devices[0].Weight.Sum() + estimate.Devices[0].KVCache.Sum() + estimate.Devices[0].Computation.Sum())
-	var vram uint64
-	if len(estimate.Devices) > 1 {
-		vram = uint64(estimate.Devices[1].Weight.Sum() + estimate.Devices[1].KVCache.Sum() + estimate.Devices[1].Computation.Sum())
-	}
-
-	return inference.RequiredMemory{
-		RAM:  ram,
-		VRAM: vram,
-	}
-}
-
-func (l *llamaCpp) parseLocalModel(model string) (*parser.GGUFFile, types.Config, error) {
-	bundle, err := l.modelManager.GetBundle(model)
-	if err != nil {
-		return nil, types.Config{}, fmt.Errorf("getting model(%s): %w", model, err)
-	}
-	modelGGUF, err := parser.ParseGGUFFile(bundle.GGUFPath())
-	if err != nil {
-		return nil, types.Config{}, fmt.Errorf("parsing gguf(%s): %w", bundle.GGUFPath(), err)
-	}
-	return modelGGUF, bundle.RuntimeConfig(), nil
-}
-
-func (l *llamaCpp) parseRemoteModel(ctx context.Context, model string) (*parser.GGUFFile, types.Config, error) {
-	mdl, err := l.modelManager.GetRemote(ctx, model)
-	if err != nil {
-		return nil, types.Config{}, fmt.Errorf("getting remote model(%s): %w", model, err)
-	}
-	layers, err := mdl.Layers()
-	if err != nil {
-		return nil, types.Config{}, fmt.Errorf("getting layers of model(%s): %w", model, err)
-	}
-	ggufLayers := getGGUFLayers(layers)
-	if len(ggufLayers) != 1 {
-		return nil, types.Config{}, fmt.Errorf(
-			"remote memory estimation only supported for models with single GGUF layer, found %d layers", len(ggufLayers),
-		)
-	}
-	ggufDigest, err := ggufLayers[0].Digest()
-	if err != nil {
-		return nil, types.Config{}, fmt.Errorf("getting digest of GGUF layer for model(%s): %w", model, err)
-	}
-	if ggufDigest.String() == "" {
-		return nil, types.Config{}, fmt.Errorf("model(%s) has no GGUF layer", model)
-	}
-	blobURL, err := l.modelManager.GetRemoteBlobURL(model, ggufDigest)
-	if err != nil {
-		return nil, types.Config{}, fmt.Errorf("getting GGUF blob URL for model(%s): %w", model, err)
-	}
-	tok, err := l.modelManager.BearerTokenForModel(ctx, model)
-	if err != nil {
-		return nil, types.Config{}, fmt.Errorf("getting bearer token for model(%s): %w", model, err)
-	}
-	mdlGguf, err := parser.ParseGGUFFileRemote(ctx, blobURL, parser.UseBearerAuth(tok))
-	if err != nil {
-		return nil, types.Config{}, fmt.Errorf("parsing GGUF for model(%s): %w", model, err)
-	}
-	config, err := mdl.Config()
-	if err != nil {
-		return nil, types.Config{}, fmt.Errorf("getting config for model(%s): %w", model, err)
-	}
-	return mdlGguf, config, nil
-}
-
-func getGGUFLayers(layers []v1.Layer) []v1.Layer {
-	var filtered []v1.Layer
-	for _, layer := range layers {
-		mt, err := layer.MediaType()
-		if err != nil {
-			continue
-		}
-		if mt == types.MediaTypeGGUF {
-			filtered = append(filtered, layer)
-		}
-	}
-	return filtered
-}
-
 func (l *llamaCpp) checkGPUSupport(ctx context.Context) bool {
 	binPath := l.vendoredServerStoragePath
 	if l.updatedLlamaCpp {
diff --git a/pkg/inference/backends/mlx/mlx.go b/pkg/inference/backends/mlx/mlx.go
index 420c7c5f3..ea4b71105 100644
--- a/pkg/inference/backends/mlx/mlx.go
+++ b/pkg/inference/backends/mlx/mlx.go
@@ -138,12 +138,3 @@ func (m *mlx) GetDiskUsage() (int64, error) {
 	// It's installed via pip in the system Python environment
 	return 0, nil
 }
-
-func (m *mlx) GetRequiredMemoryForModel(ctx context.Context, model string, config *inference.BackendConfiguration) (inference.RequiredMemory, error) {
-	// TODO: Implement accurate memory estimation based on model size.
-	// MLX runs on unified memory architecture (Apple Silicon), so memory estimation
-	// will need to account for the unified nature of RAM and VRAM on Apple Silicon.
-	// Returning an error prevents the scheduler from making incorrect decisions based
-	// on placeholder values.
-	return inference.RequiredMemory{}, errors.New("not implemented")
-}
diff --git a/pkg/inference/backends/vllm/vllm.go b/pkg/inference/backends/vllm/vllm.go
index aa50bb280..0155707e9 100644
--- a/pkg/inference/backends/vllm/vllm.go
+++ b/pkg/inference/backends/vllm/vllm.go
@@ -165,17 +165,6 @@ func (v *vLLM) GetDiskUsage() (int64, error) {
 	return size, nil
 }
 
-func (v *vLLM) GetRequiredMemoryForModel(_ context.Context, _ string, _ *inference.BackendConfiguration) (inference.RequiredMemory, error) {
-	if !platform.SupportsVLLM() {
-		return inference.RequiredMemory{}, errors.New("not implemented")
-	}
-
-	return inference.RequiredMemory{
-		RAM:  1,
-		VRAM: 1,
-	}, nil
-}
-
 func (v *vLLM) binaryPath() string {
 	return filepath.Join(vllmDir, "vllm")
 }
diff --git a/pkg/inference/scheduling/loader_test.go b/pkg/inference/scheduling/loader_test.go
index 7ac5841b2..ffa5b666e 100644
--- a/pkg/inference/scheduling/loader_test.go
+++ b/pkg/inference/scheduling/loader_test.go
@@ -39,10 +39,6 @@ func (m *mockBackend) GetDiskUsage() (int64, error) {
 	return 0, nil
 }
 
-func (m *mockBackend) GetRequiredMemoryForModel(ctx context.Context, model string, config *inference.BackendConfiguration) (inference.RequiredMemory, error) {
-	return m.requiredMemory, nil
-}
-
 func (m *mockBackend) UsesExternalModelManagement() bool {
 	return m.usesExternalModelMgmt
 }