diff --git a/Dockerfile b/Dockerfile
index b7f028abe..67a7f91aa 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -27,7 +27,7 @@ COPY --link . .
 # Build the Go binary (static build)
 RUN --mount=type=cache,target=/go/pkg/mod \
     --mount=type=cache,target=/root/.cache/go-build \
-    CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o model-runner ./main.go
+    CGO_ENABLED=1 GOOS=linux go build -ldflags="-s -w" -o model-runner ./main.go

 # --- Get llama.cpp binary ---
 FROM docker/docker-model-backend-llamacpp:${LLAMA_SERVER_VERSION}-${LLAMA_SERVER_VARIANT} AS llama-server
diff --git a/Makefile b/Makefile
index 41fd154c3..93f90fd15 100644
--- a/Makefile
+++ b/Makefile
@@ -17,7 +17,7 @@ LLAMA_ARGS ?=

 # Build the Go application
 build:
-	CGO_ENABLED=0 go build -ldflags="-s -w" -o $(APP_NAME) ./main.go
+	CGO_ENABLED=1 go build -ldflags="-s -w" -o $(APP_NAME) ./main.go

 # Run the application locally
 run: build
diff --git a/go.mod b/go.mod
index 9b95317a8..2d93f6f36 100644
--- a/go.mod
+++ b/go.mod
@@ -29,6 +29,8 @@ require (
 	github.com/docker/cli v27.5.0+incompatible // indirect
 	github.com/docker/distribution v2.8.3+incompatible // indirect
 	github.com/docker/docker-credential-helpers v0.8.2 // indirect
+	github.com/elastic/go-sysinfo v1.15.3 // indirect
+	github.com/elastic/go-windows v1.0.2 // indirect
 	github.com/felixge/httpsnoop v1.0.4 // indirect
 	github.com/go-logr/logr v1.4.2 // indirect
 	github.com/go-logr/stdr v1.2.2 // indirect
@@ -45,6 +47,7 @@ require (
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
+	github.com/prometheus/procfs v0.15.1 // indirect
 	github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 // indirect
 	github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d // indirect
 	github.com/vbatts/tar-split v0.11.6 // indirect
diff --git a/go.sum b/go.sum
index 0aeec9e91..d008aff19 100644
--- a/go.sum
+++ b/go.sum
@@ -40,6 +40,10 @@ github.com/docker/docker-credential-helpers v0.8.2 h1:bX3YxiGzFP5sOXWc3bTPEXdEaZ
 github.com/docker/docker-credential-helpers v0.8.2/go.mod h1:P3ci7E3lwkZg6XiHdRKft1KckHiO9a2rNtyFbZ/ry9M=
 github.com/docker/model-distribution v0.0.0-20250710123110-a633223e127e h1:qBkjP4A20f3RXvtstitIPiStQ4p+bK8xcjosrXLBQZ0=
 github.com/docker/model-distribution v0.0.0-20250710123110-a633223e127e/go.mod h1:dThpO9JoG5Px3i+rTluAeZcqLGw8C0qepuEL4gL2o/c=
+github.com/elastic/go-sysinfo v1.15.3 h1:W+RnmhKFkqPTCRoFq2VCTmsT4p/fwpo+3gKNQsn1XU0=
+github.com/elastic/go-sysinfo v1.15.3/go.mod h1:K/cNrqYTDrSoMh2oDkYEMS2+a72GRxMvNP+GC+vRIlo=
+github.com/elastic/go-windows v1.0.2 h1:yoLLsAsV5cfg9FLhZ9EXZ2n2sQFKeDYrHenkcivY4vI=
+github.com/elastic/go-windows v1.0.2/go.mod h1:bGcDpBzXgYSqM0Gx3DM4+UxFj300SZLixie9u9ixLM8=
 github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
 github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
 github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
@@ -108,6 +112,8 @@ github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNw
 github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
 github.com/prometheus/common v0.64.0 h1:pdZeA+g617P7oGv1CzdTzyeShxAGrTBsolKNOLQPGO4=
 github.com/prometheus/common v0.64.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8=
+github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
+github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
 github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
 github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
 github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 h1:18kd+8ZUlt/ARXhljq+14TwAoKa61q6dX8jtwOf6DH8=
diff --git a/main.go b/main.go
index ffa4dde21..287b64b80 100644
--- a/main.go
+++ b/main.go
@@ -10,6 +10,7 @@ import (
 	"strings"
 	"syscall"

+	"github.com/docker/model-runner/pkg/gpuinfo"
 	"github.com/docker/model-runner/pkg/inference"
 	"github.com/docker/model-runner/pkg/inference/backends/llamacpp"
 	"github.com/docker/model-runner/pkg/inference/config"
@@ -89,6 +90,8 @@ func main() {
 		log.Fatalf("unable to initialize %s backend: %v", llamacpp.Name, err)
 	}

+	gpuInfo := gpuinfo.New(llamaServerPath)
+
 	scheduler := scheduling.NewScheduler(
 		log,
 		map[string]inference.Backend{llamacpp.Name: llamaCppBackend},
@@ -102,6 +105,7 @@ func main() {
 			"",
 			false,
 		),
+		gpuInfo,
 	)

 	router := routing.NewNormalizedServeMux()
diff --git a/pkg/gpuinfo/gpuinfo.go b/pkg/gpuinfo/gpuinfo.go
new file mode 100644
index 000000000..3bc8f66ee
--- /dev/null
+++ b/pkg/gpuinfo/gpuinfo.go
@@ -0,0 +1,17 @@
+package gpuinfo
+
+type GPUInfo struct {
+	// modelRuntimeInstallPath is the location where DMR installed its llama-server
+	// and accompanying tools
+	modelRuntimeInstallPath string
+}
+
+func New(modelRuntimeInstallPath string) *GPUInfo {
+	return &GPUInfo{
+		modelRuntimeInstallPath: modelRuntimeInstallPath,
+	}
+}
+
+func (g *GPUInfo) GetVRAMSize() (uint64, error) {
+	return getVRAMSize(g.modelRuntimeInstallPath)
+}
diff --git a/pkg/gpuinfo/memory_darwin.go b/pkg/gpuinfo/memory_darwin.go
new file mode 100644
index 000000000..73677ca07
--- /dev/null
+++ b/pkg/gpuinfo/memory_darwin.go
@@ -0,0 +1,17 @@
+package gpuinfo
+
+/*
+#cgo LDFLAGS: -framework Metal
+#include "metal.h"
+*/
+import "C"
+import "errors"
+
+// getVRAMSize returns total system GPU memory in bytes
+func getVRAMSize(_ string) (uint64, error) {
+	vramSize := C.getVRAMSize()
+	if vramSize == 0 {
+		return 0, errors.New("could not get metal VRAM size")
+	}
+	return uint64(vramSize), nil
+}
diff --git a/pkg/gpuinfo/memory_linux.go b/pkg/gpuinfo/memory_linux.go
new file mode 100644
index 000000000..94525711c
--- /dev/null
+++ b/pkg/gpuinfo/memory_linux.go
@@ -0,0 +1,17 @@
+package gpuinfo
+
+/*
+#cgo LDFLAGS: -ldl
+#include "nvidia.h"
+*/
+import "C"
+import "errors"
+
+// getVRAMSize returns total system GPU memory in bytes
+func getVRAMSize(_ string) (uint64, error) {
+	vramSize := C.getVRAMSize()
+	if vramSize == 0 {
+		return 0, errors.New("could not get nvidia VRAM size")
+	}
+	return uint64(vramSize), nil
+}
diff --git a/pkg/gpuinfo/memory_windows.go b/pkg/gpuinfo/memory_windows.go
new file mode 100644
index 000000000..7ca9a0e49
--- /dev/null
+++ b/pkg/gpuinfo/memory_windows.go
@@ -0,0 +1,40 @@
+package gpuinfo
+
+import (
+	"bufio"
+	"context"
+	"errors"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"strconv"
+	"strings"
+	"time"
+)
+
+// getVRAMSize returns total system GPU memory in bytes
+func getVRAMSize(modelRuntimeInstallPath string) (uint64, error) {
+	if runtime.GOARCH == "arm64" {
+		// TODO(p1-0tr): For now, on windows/arm64, stick to the old behaviour. This will
+		// require backend.GetRequiredMemoryForModel to return 1 as well.
+		return 1, nil
+	}
+
+	nvGPUInfoBin := filepath.Join(modelRuntimeInstallPath, "bin", "com.docker.nv-gpu-info.exe")
+
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+	out, err := exec.CommandContext(ctx, nvGPUInfoBin).CombinedOutput()
+	if err != nil {
+		return 0, err
+	}
+	sc := bufio.NewScanner(strings.NewReader(string(out)))
+	for sc.Scan() {
+		vram, found := strings.CutPrefix(sc.Text(), "GPU[0]: dedicated memory:")
+		if found {
+			vram = strings.TrimSpace(vram)
+			return strconv.ParseUint(vram, 10, 64)
+		}
+	}
+	return 0, errors.New("unexpected nv-gpu-info output format")
+}
diff --git a/pkg/gpuinfo/metal.h b/pkg/gpuinfo/metal.h
new file mode 100644
index 000000000..d7e96a5e9
--- /dev/null
+++ b/pkg/gpuinfo/metal.h
@@ -0,0 +1,5 @@
+//go:build darwin
+
+#include <stddef.h>
+
+size_t getVRAMSize();
\ No newline at end of file
diff --git a/pkg/gpuinfo/metal.m b/pkg/gpuinfo/metal.m
new file mode 100644
index 000000000..edcfce1ec
--- /dev/null
+++ b/pkg/gpuinfo/metal.m
@@ -0,0 +1,15 @@
+//go:build darwin
+
+#include <Metal/Metal.h>
+
+#include "metal.h"
+
+size_t getVRAMSize() {
+    id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+    if (device) {
+        size_t vramsz = [device recommendedMaxWorkingSetSize];
+        [device release];
+        return vramsz;
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/pkg/gpuinfo/nvidia.c b/pkg/gpuinfo/nvidia.c
new file mode 100644
index 000000000..e00aeb189
--- /dev/null
+++ b/pkg/gpuinfo/nvidia.c
@@ -0,0 +1,71 @@
+//go:build linux
+
+#include "nvidia.h"
+
+typedef enum {
+    NVML_SUCCESS = 0
+} nvmlReturn_t;
+
+typedef struct {
+    unsigned long long total;
+    unsigned long long free;
+    unsigned long long used;
+} nvmlMemory_t;
+
+typedef void* nvmlDevice_t;
+
+size_t getVRAMSize() {
+    void* handle;
+    nvmlReturn_t (*nvmlInit)(void);
+    nvmlReturn_t (*nvmlShutdown)(void);
+    nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int index, nvmlDevice_t* device);
+    nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t device, nvmlMemory_t* memory);
+
+    nvmlReturn_t result;
+    nvmlDevice_t device;
+    nvmlMemory_t memory;
+
+    // Try to load libnvidia-ml.so.1 first, then fall back to libnvidia-ml.so
+    handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY);
+    if (!handle) {
+        handle = dlopen("libnvidia-ml.so", RTLD_LAZY);
+        if (!handle) {
+            return 0;
+        }
+    }
+
+    // Load required functions
+    nvmlInit = dlsym(handle, "nvmlInit");
+    nvmlShutdown = dlsym(handle, "nvmlShutdown");
+    nvmlDeviceGetHandleByIndex = dlsym(handle, "nvmlDeviceGetHandleByIndex");
+    nvmlDeviceGetMemoryInfo = dlsym(handle, "nvmlDeviceGetMemoryInfo");
+
+    if (!nvmlInit || !nvmlShutdown || !nvmlDeviceGetHandleByIndex || !nvmlDeviceGetMemoryInfo) {
+        dlclose(handle);
+        return 0;
+    }
+
+    result = nvmlInit();
+    if (result != NVML_SUCCESS) {
+        dlclose(handle);
+        return 0;
+    }
+
+    result = nvmlDeviceGetHandleByIndex(0, &device);
+    if (result != NVML_SUCCESS) {
+        nvmlShutdown();
+        dlclose(handle);
+        return 0;
+    }
+
+    result = nvmlDeviceGetMemoryInfo(device, &memory);
+    if (result != NVML_SUCCESS) {
+        nvmlShutdown();
+        dlclose(handle);
+        return 0;
+    }
+
+    nvmlShutdown();
+    dlclose(handle);
+    return memory.total;
+}
\ No newline at end of file
diff --git a/pkg/gpuinfo/nvidia.h b/pkg/gpuinfo/nvidia.h
new file mode 100644
index 000000000..302673b5e
--- /dev/null
+++ b/pkg/gpuinfo/nvidia.h
@@ -0,0 +1,6 @@
+//go:build linux
+
+#include <stddef.h>
+#include <dlfcn.h>
+
+size_t getVRAMSize();
\ No newline at end of file
diff --git a/pkg/inference/backend.go b/pkg/inference/backend.go
index 0eae5d4cc..944ec126a 100644
--- a/pkg/inference/backend.go
+++ b/pkg/inference/backend.go
@@ -34,6 +34,11 @@ type BackendConfiguration struct {
 	RuntimeFlags []string `json:"runtime-flags,omitempty"`
 }

+type RequiredMemory struct {
+	RAM  uint64
+	VRAM uint64 // TODO(p1-0tr): for now assume we are working with single GPU set-ups
+}
+
 // Backend is the interface implemented by inference engine backends. Backend
 // implementations need not be safe for concurrent invocation of the following
 // methods, though their underlying server implementations do need to support
@@ -76,4 +81,7 @@ type Backend interface {
 	Status() string
 	// GetDiskUsage returns the disk usage of the backend.
 	GetDiskUsage() (int64, error)
+	// GetRequiredMemoryForModel returns the required working memory for a given
+	// model.
+	GetRequiredMemoryForModel(model string, config *BackendConfiguration) (*RequiredMemory, error)
 }
diff --git a/pkg/inference/backends/llamacpp/llamacpp.go b/pkg/inference/backends/llamacpp/llamacpp.go
index f745320d0..09de11f5d 100644
--- a/pkg/inference/backends/llamacpp/llamacpp.go
+++ b/pkg/inference/backends/llamacpp/llamacpp.go
@@ -1,6 +1,7 @@
 package llamacpp

 import (
+	"bufio"
 	"context"
 	"errors"
 	"fmt"
@@ -10,9 +11,12 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"regexp"
 	"runtime"
 	"strings"

+	parser "github.com/gpustack/gguf-parser-go"
+
 	"github.com/docker/model-runner/pkg/diskusage"
 	"github.com/docker/model-runner/pkg/inference"
 	"github.com/docker/model-runner/pkg/inference/config"
@@ -44,6 +48,8 @@ type llamaCpp struct {
 	status string
 	// config is the configuration for the llama.cpp backend.
 	config config.BackendConfig
+	// gpuSupported indicates whether the underlying llama-server is built with GPU support.
+	gpuSupported bool
 }

 // New creates a new llama.cpp-based backend.
@@ -116,6 +122,9 @@ func (l *llamaCpp) Install(ctx context.Context, httpClient *http.Client) error {
 		l.updatedLlamaCpp = true
 	}

+	l.gpuSupported = l.checkGPUSupport(ctx)
+	l.log.Infof("installed llama-server with gpuSupport=%t", l.gpuSupported)
+
 	return nil
 }

@@ -213,3 +222,86 @@ func (l *llamaCpp) GetDiskUsage() (int64, error) {
 	}
 	return size, nil
 }
+
+func (l *llamaCpp) GetRequiredMemoryForModel(model string, config *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
+	mdl, err := l.modelManager.GetModel(model)
+	if err != nil {
+		return nil, fmt.Errorf("getting model(%s): %w", model, err)
+	}
+	mdlPath, err := mdl.GGUFPath()
+	if err != nil {
+		return nil, fmt.Errorf("getting gguf path for model(%s): %w", model, err)
+	}
+	mdlGguf, err := parser.ParseGGUFFile(mdlPath)
+	if err != nil {
+		return nil, fmt.Errorf("parsing gguf(%s): %w", mdlPath, err)
+	}
+	mdlConfig, err := mdl.Config()
+	if err != nil {
+		return nil, fmt.Errorf("accessing model(%s) config: %w", model, err)
+	}
+
+	contextSize := GetContextSize(&mdlConfig, config)
+
+	ngl := uint64(0)
+	if l.gpuSupported {
+		ngl = 100
+		if runtime.GOOS == "windows" && runtime.GOARCH == "arm64" && mdlConfig.Quantization != "Q4_0" {
+			ngl = 0 // only Q4_0 models can be accelerated on Adreno
+		}
+	}
+
+	// TODO(p1-0tr): for now assume we are running on a single GPU - Devices[1];
+	// sum up weights + KV cache + context for an estimate of the total GPU memory needed
+	// while running inference with the given model
+	estimate := mdlGguf.EstimateLLaMACppRun(parser.WithLLaMACppContextSize(int32(contextSize)),
+		// TODO(p1-0tr): add logic for resolving other param values, instead of hardcoding them
+		parser.WithLLaMACppLogicalBatchSize(2048),
+		parser.WithLLaMACppOffloadLayers(ngl))
+	ram := uint64(estimate.Devices[0].Weight.Sum() + estimate.Devices[0].KVCache.Sum() + estimate.Devices[0].Computation.Sum())
+	var vram uint64
+	if len(estimate.Devices) > 1 {
+		vram = uint64(estimate.Devices[1].Weight.Sum() + estimate.Devices[1].KVCache.Sum() + estimate.Devices[1].Computation.Sum())
+	}
+
+	if runtime.GOOS == "windows" && runtime.GOARCH == "arm64" {
+		// TODO(p1-0tr): For now, on windows/arm64, stick to the old behaviour of allowing
+		// one model at a time. This workaround requires gpuinfo.GetVRAMSize to return 1.
+		vram = 1
+	}
+
+	return &inference.RequiredMemory{
+		RAM:  ram,
+		VRAM: vram,
+	}, nil
+}
+
+func (l *llamaCpp) checkGPUSupport(ctx context.Context) bool {
+	binPath := l.vendoredServerStoragePath
+	if l.updatedLlamaCpp {
+		binPath = l.updatedServerStoragePath
+	}
+	out, err := exec.CommandContext(
+		ctx,
+		filepath.Join(binPath, "com.docker.llama-server"),
+		"--list-devices",
+	).CombinedOutput()
+	if err != nil {
+		l.log.Warnf("Failed to determine if llama-server is built with GPU support: %s", err)
+		return false
+	}
+	sc := bufio.NewScanner(strings.NewReader(string(out)))
+	expectDev := false
+	devRe := regexp.MustCompile(`\s{2}.*:\s`)
+	ndevs := 0
+	for sc.Scan() {
+		if expectDev {
+			if devRe.MatchString(sc.Text()) {
+				ndevs++
+			}
+		} else {
+			expectDev = strings.HasPrefix(sc.Text(), "Available devices:")
+		}
+	}
+	return ndevs > 0
+}
diff --git a/pkg/inference/backends/llamacpp/llamacpp_config.go b/pkg/inference/backends/llamacpp/llamacpp_config.go
index 5c8822d3a..becc3a1bc 100644
--- a/pkg/inference/backends/llamacpp/llamacpp_config.go
+++ b/pkg/inference/backends/llamacpp/llamacpp_config.go
@@ -57,16 +57,10 @@ func (c *Config) GetArgs(model types.Model, socket string, mode inference.Backen
 		args = append(args, "--embeddings")
 	}

-	// Add arguments from model config
-	if modelCfg.ContextSize != nil {
-		args = append(args, "--ctx-size", strconv.FormatUint(*modelCfg.ContextSize, 10))
-	}
+	args = append(args, "--ctx-size", strconv.FormatUint(GetContextSize(&modelCfg, config), 10))

 	// Add arguments from backend config
 	if config != nil {
-		if config.ContextSize > 0 && !containsArg(args, "--ctx-size") {
-			args = append(args, "--ctx-size", strconv.FormatInt(config.ContextSize, 10))
-		}
 		args = append(args, config.RuntimeFlags...)
 	}

@@ -79,6 +73,19 @@ func (c *Config) GetArgs(model types.Model, socket string, mode inference.Backen
 	return args, nil
 }

+func GetContextSize(modelCfg *types.Config, backendCfg *inference.BackendConfiguration) uint64 {
+	// Model config takes precedence
+	if modelCfg != nil && modelCfg.ContextSize != nil {
+		return *modelCfg.ContextSize
+	}
+	// else use backend config
+	if backendCfg != nil && backendCfg.ContextSize > 0 {
+		return uint64(backendCfg.ContextSize)
+	}
+	// finally return default
+	return 4096 // llama.cpp default
+}
+
 // containsArg checks if the given argument is already in the args slice.
 func containsArg(args []string, arg string) bool {
 	for _, a := range args {
diff --git a/pkg/inference/backends/llamacpp/llamacpp_config_test.go b/pkg/inference/backends/llamacpp/llamacpp_config_test.go
index 4b145ca6b..f01300916 100644
--- a/pkg/inference/backends/llamacpp/llamacpp_config_test.go
+++ b/pkg/inference/backends/llamacpp/llamacpp_config_test.go
@@ -91,6 +91,7 @@ func TestGetArgs(t *testing.T) {
 				"--metrics",
 				"--model", modelPath,
 				"--host", socket,
+				"--ctx-size", "4096",
 			},
 		},
 		{
@@ -106,6 +107,7 @@ func TestGetArgs(t *testing.T) {
 				"--model", modelPath,
 				"--host", socket,
 				"--embeddings",
+				"--ctx-size", "4096",
 			},
 		},
 		{
@@ -165,6 +167,7 @@ func TestGetArgs(t *testing.T) {
 				"--model", modelPath,
 				"--host", socket,
 				"--embeddings",
+				"--ctx-size", "4096",
 				"--some", "flag", // model config takes precedence
 			},
 		},
diff --git a/pkg/inference/backends/mlx/mlx.go b/pkg/inference/backends/mlx/mlx.go
index d6cf86e09..2bae36730 100644
--- a/pkg/inference/backends/mlx/mlx.go
+++ b/pkg/inference/backends/mlx/mlx.go
@@ -62,3 +62,7 @@ func (m *mlx) Status() string {
 func (m *mlx) GetDiskUsage() (int64, error) {
 	return 0, nil
 }
+
+func (m *mlx) GetRequiredMemoryForModel(model string, config *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
+	return nil, errors.New("not implemented")
+}
diff --git a/pkg/inference/backends/vllm/vllm.go b/pkg/inference/backends/vllm/vllm.go
index c03c367ad..86334d4e6 100644
--- a/pkg/inference/backends/vllm/vllm.go
+++ b/pkg/inference/backends/vllm/vllm.go
@@ -62,3 +62,7 @@ func (v *vLLM) Status() string {
 func (v *vLLM) GetDiskUsage() (int64, error) {
 	return 0, nil
 }
+
+func (v *vLLM) GetRequiredMemoryForModel(model string, config *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
+	return nil, errors.New("not implemented")
+}
diff --git a/pkg/inference/scheduling/loader.go b/pkg/inference/scheduling/loader.go
index 8efee1455..5ac78e412 100644
--- a/pkg/inference/scheduling/loader.go
+++ b/pkg/inference/scheduling/loader.go
@@ -10,10 +10,12 @@ import (
 	"time"

 	"github.com/docker/model-runner/pkg/environment"
+	"github.com/docker/model-runner/pkg/gpuinfo"
 	"github.com/docker/model-runner/pkg/inference"
 	"github.com/docker/model-runner/pkg/inference/models"
 	"github.com/docker/model-runner/pkg/logging"
 	"github.com/docker/model-runner/pkg/metrics"
+	"github.com/elastic/go-sysinfo"
 )

 const (
@@ -71,7 +73,7 @@ type loader struct {
 	// runnerIdleTimeout is the loader-specific default runner idle timeout.
 	runnerIdleTimeout time.Duration
 	// totalMemory is the total system memory allocated to the loader.
-	totalMemory uint64
+	totalMemory inference.RequiredMemory
 	// idleCheck is used to signal the run loop when timestamps have updated.
 	idleCheck chan struct{}
 	// guard is a sempahore controlling access to all subsequent fields. It is
@@ -82,7 +84,7 @@ type loader struct {
 	// loadsEnabled signals that loads are currently enabled.
 	loadsEnabled bool
 	// availableMemory is the available portion of the loader's total memory.
-	availableMemory uint64
+	availableMemory inference.RequiredMemory
 	// waiters is the set of signal channels associated with waiting loaders. We
 	// use a set of signaling channels (instead of a sync.Cond) to enable
 	// polling. Each signaling channel should be buffered (with size 1).
@@ -95,7 +97,7 @@ type loader struct {
 	// references maps slot indices to reference counts.
 	references []uint
 	// allocations maps slot indices to memory allocation sizes.
-	allocations []uint64
+	allocations []inference.RequiredMemory
 	// timestamps maps slot indices to last usage times. Values in this slice
 	// are only valid if the corresponding reference count is zero.
 	timestamps []time.Time
@@ -111,6 +113,7 @@ func newLoader(
 	backends map[string]inference.Backend,
 	modelManager *models.Manager,
 	openAIRecorder *metrics.OpenAIRecorder,
+	gpuInfo *gpuinfo.GPUInfo,
 ) *loader {
 	// Compute the number of runner slots to allocate. Because of RAM and VRAM
 	// limitations, it's unlikely that we'll ever be able to fully populate
@@ -132,20 +135,31 @@ func newLoader(
 	}

 	// Compute the amount of available memory.
-	//
-	// TODO: For now, we treat the system as having memory size 1 and all models
-	// as having size 1 (and thus we'll only load a single model at a time).
-	// However, the loader is designed to use "real" values for each and to
-	// schedule appropriately. Thus, we should switch to polling the system
-	// VRAM size here (and potentially even reserving a portion of it) and
-	// computing model size through estimation (using parameter count and
-	// quantization data type size).
-	//
-	// HACK: On GPU-enabled cloud engines, we'll bump this to 2. We can remove
-	// this once we have VRAM estimation.
-	totalMemory := uint64(1)
-	if isGPUEnabledCloudEnvironment {
-		totalMemory = 2
+	// TODO(p1-0tr): improve error handling
+	vramSize, err := gpuInfo.GetVRAMSize()
+	if err != nil {
+		vramSize = 1
+		log.Warnf("Could not read VRAM size: %s", err)
+	} else {
+		log.Infof("Running on system with %dMB VRAM", vramSize/1024/1024)
+	}
+	ramSize := uint64(1)
+	hostInfo, err := sysinfo.Host()
+	if err != nil {
+		log.Warnf("Could not read host info: %s", err)
+	} else {
+		ram, err := hostInfo.Memory()
+		if err != nil {
+			log.Warnf("Could not read host RAM size: %s", err)
+		} else {
+			ramSize = ram.Total
+			log.Infof("Running on system with %dMB RAM", ramSize/1024/1024)
+		}
+	}
+
+	totalMemory := inference.RequiredMemory{
+		RAM:  ramSize,
+		VRAM: vramSize,
 	}

 	// Create the loader.
@@ -162,7 +176,7 @@
 		runners:        make(map[runnerKey]runnerInfo, nSlots),
 		slots:          make([]*runner, nSlots),
 		references:     make([]uint, nSlots),
-		allocations:    make([]uint64, nSlots),
+		allocations:    make([]inference.RequiredMemory, nSlots),
 		timestamps:     make([]time.Time, nSlots),
 		runnerConfigs:  make(map[runnerKey]inference.BackendConfiguration),
 		openAIRecorder: openAIRecorder,
@@ -219,8 +233,9 @@ func (l *loader) evict(idleOnly bool) int {
 		)
 		l.slots[runnerInfo.slot].terminate()
 		l.slots[runnerInfo.slot] = nil
-		l.availableMemory += l.allocations[runnerInfo.slot]
-		l.allocations[runnerInfo.slot] = 0
+		l.availableMemory.RAM += l.allocations[runnerInfo.slot].RAM
+		l.availableMemory.VRAM += l.allocations[runnerInfo.slot].VRAM
+		l.allocations[runnerInfo.slot] = inference.RequiredMemory{RAM: 0, VRAM: 0}
 		l.timestamps[runnerInfo.slot] = time.Time{}
 		delete(l.runners, r)
 	}
@@ -240,8 +255,9 @@ func (l *loader) evictRunner(backend, model string, mode inference.BackendMode)
 	)
 	l.slots[runnerInfo.slot].terminate()
 	l.slots[runnerInfo.slot] = nil
-	l.availableMemory += l.allocations[runnerInfo.slot]
-	l.allocations[runnerInfo.slot] = 0
+	l.availableMemory.RAM += l.allocations[runnerInfo.slot].RAM
+	l.availableMemory.VRAM += l.allocations[runnerInfo.slot].VRAM
+	l.allocations[runnerInfo.slot] = inference.RequiredMemory{RAM: 0, VRAM: 0}
 	l.timestamps[runnerInfo.slot] = time.Time{}
 	delete(l.runners, r)
 }
@@ -399,15 +415,24 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string

 	// Estimate the amount of memory that will be used by the model and check
 	// that we're even capable of loading it.
-	//
-	// TODO: For now, we treat the system as having memory size 1 and all models
-	// as having size 1 (and thus we'll only load a single model at a time).
-	// However, the loader is designed to use "real" values for each and to
-	// schedule appropriately. Thus, we should switch to computing model size
-	// here through estimation (using parameter count and quantization data type
-	// size).
-	memory := uint64(1)
-	if memory > l.totalMemory {
+	var runnerConfig *inference.BackendConfiguration
+	if rc, ok := l.runnerConfigs[runnerKey{backendName, modelID, mode}]; ok {
+		runnerConfig = &rc
+	}
+	memory, err := backend.GetRequiredMemoryForModel(modelID, runnerConfig)
+	if err != nil {
+		return nil, err
+	}
+	l.log.Infof("Loading %s, which will require %dMB RAM and %dMB VRAM", modelID, memory.RAM/1024/1024, memory.VRAM/1024/1024)
+	if l.totalMemory.RAM == 1 {
+		l.log.Warnf("RAM size unknown; assuming the model fits, but loading only one model at a time.")
+		memory.RAM = 1
+	}
+	if l.totalMemory.VRAM == 1 {
+		l.log.Warnf("VRAM size unknown; assuming the model fits, but loading only one model at a time.")
+		memory.VRAM = 1
+	}
+	if memory.RAM > l.totalMemory.RAM || memory.VRAM > l.totalMemory.VRAM {
 		return nil, errModelTooBig
 	}

@@ -454,12 +479,12 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string

 	// If there's not sufficient memory or all slots are full, then try
 	// evicting unused runners.
-	if memory > l.availableMemory || len(l.runners) == len(l.slots) {
+	if memory.RAM > l.availableMemory.RAM || memory.VRAM > l.availableMemory.VRAM || len(l.runners) == len(l.slots) {
 		l.evict(false)
 	}

 	// If there's sufficient memory and a free slot, then find the slot.
-	if memory <= l.availableMemory && len(l.runners) < len(l.slots) {
+	if memory.RAM <= l.availableMemory.RAM && memory.VRAM <= l.availableMemory.VRAM && len(l.runners) < len(l.slots) {
 		for s, runner := range l.slots {
 			if runner == nil {
 				slot = s
@@ -499,11 +524,13 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
 	}

 	// Perform registration and return the runner.
-	l.availableMemory -= memory
+	l.availableMemory.RAM -= memory.RAM
+	l.availableMemory.VRAM -= memory.VRAM
 	l.runners[runnerKey{backendName, modelID, mode}] = runnerInfo{slot, modelRef}
 	l.slots[slot] = runner
 	l.references[slot] = 1
-	l.allocations[slot] = memory
+	l.allocations[slot].RAM = memory.RAM
+	l.allocations[slot].VRAM = memory.VRAM
 	return runner, nil
 }

diff --git a/pkg/inference/scheduling/scheduler.go b/pkg/inference/scheduling/scheduler.go
index 23477d12c..05d7926b4 100644
--- a/pkg/inference/scheduling/scheduler.go
+++ b/pkg/inference/scheduling/scheduler.go
@@ -13,6 +13,7 @@ import (
 	"time"

 	"github.com/docker/model-distribution/distribution"
+	"github.com/docker/model-runner/pkg/gpuinfo"
 	"github.com/docker/model-runner/pkg/inference"
 	"github.com/docker/model-runner/pkg/inference/models"
 	"github.com/docker/model-runner/pkg/logging"
@@ -55,6 +56,7 @@ func NewScheduler(
 	httpClient *http.Client,
 	allowedOrigins []string,
 	tracker *metrics.Tracker,
+	gpuInfo *gpuinfo.GPUInfo,
 ) *Scheduler {
 	openAIRecorder := metrics.NewOpenAIRecorder(log.WithField("component", "openai-recorder"), modelManager)

@@ -65,7 +67,7 @@ func NewScheduler(
 		defaultBackend: defaultBackend,
 		modelManager:   modelManager,
 		installer:      newInstaller(log, backends, httpClient),
-		loader:         newLoader(log, backends, modelManager, openAIRecorder),
+		loader:         newLoader(log, backends, modelManager, openAIRecorder, gpuInfo),
 		router:         http.NewServeMux(),
 		tracker:        tracker,
 		openAIRecorder: openAIRecorder,
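
Usage sketch (not part of the patch): a minimal standalone program showing how the pieces introduced above are meant to compose — the gpuinfo probe supplies a VRAM budget, and scheduling admission compares it against a RequiredMemory estimate, mirroring the checks in newLoader and load. The fits helper, the hard-coded install path, and the example RAM/estimate numbers are illustrative assumptions only; the package paths and types come from this diff.

package main

import (
	"fmt"
	"log"

	"github.com/docker/model-runner/pkg/gpuinfo"
	"github.com/docker/model-runner/pkg/inference"
)

// fits is a hypothetical helper mirroring the loader's admission check:
// a model is loadable only if both its RAM and VRAM requirements fit the budget.
func fits(required, available inference.RequiredMemory) bool {
	return required.RAM <= available.RAM && required.VRAM <= available.VRAM
}

func main() {
	// The install path normally comes from the runner's configuration
	// (llamaServerPath in main.go); this literal is only for illustration.
	gpu := gpuinfo.New("/opt/model-runner")

	vram, err := gpu.GetVRAMSize()
	if err != nil {
		// Same fallback the loader uses: treat VRAM as "unknown" (size 1),
		// which effectively limits scheduling to one model at a time.
		log.Printf("could not read VRAM size: %v", err)
		vram = 1
	}

	// Assumed budget and estimate, purely for demonstration.
	budget := inference.RequiredMemory{RAM: 16 << 30, VRAM: vram}
	estimate := inference.RequiredMemory{RAM: 2 << 30, VRAM: 6 << 30}

	fmt.Printf("VRAM budget: %d MiB, model fits: %t\n", budget.VRAM/1024/1024, fits(estimate, budget))
}

In the real code path, the estimate comes from Backend.GetRequiredMemoryForModel (backed by gguf-parser-go in the llama.cpp backend) rather than a literal, and the loader additionally tracks availableMemory per slot as shown in the loader.go hunks above.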