@@ -10,10 +10,10 @@ import (
1010 "os/exec"
1111 "path/filepath"
1212 "runtime"
13+ "strconv"
1314
1415 "github.com/docker/model-runner/pkg/diskusage"
1516 "github.com/docker/model-runner/pkg/inference"
16- "github.com/docker/model-runner/pkg/inference/config"
1717 "github.com/docker/model-runner/pkg/inference/models"
1818 "github.com/docker/model-runner/pkg/logging"
1919)
@@ -39,8 +39,6 @@ type llamaCpp struct {
 	updatedServerStoragePath string
 	// status is the state that the llama.cpp backend is in.
 	status string
-	// config is the configuration for the llama.cpp backend.
-	config config.BackendConfig
 }
 
 // New creates a new llama.cpp-based backend.
@@ -50,20 +48,13 @@ func New(
 	serverLog logging.Logger,
 	vendoredServerStoragePath string,
 	updatedServerStoragePath string,
-	conf config.BackendConfig,
 ) (inference.Backend, error) {
-	// If no config is provided, use the default configuration
-	if conf == nil {
-		conf = NewDefaultLlamaCppConfig()
-	}
-
 	return &llamaCpp{
 		log:                       log,
 		modelManager:              modelManager,
 		serverLog:                 serverLog,
 		vendoredServerStoragePath: vendoredServerStoragePath,
 		updatedServerStoragePath:  updatedServerStoragePath,
-		config:                    conf,
 	}, nil
 }
 
@@ -124,6 +115,11 @@ func (l *llamaCpp) Run(ctx context.Context, socket, model string, mode inference
 		return fmt.Errorf("failed to get model path: %w", err)
 	}
 
+	modelDesc, err := l.modelManager.GetModel(model)
+	if err != nil {
+		return fmt.Errorf("failed to get model: %w", err)
+	}
+
 	if err := os.RemoveAll(socket); err != nil && !errors.Is(err, fs.ErrNotExist) {
 		l.log.Warnf("failed to remove socket file %s: %w\n", socket, err)
 		l.log.Warnln("llama.cpp may not be able to start")
@@ -133,13 +129,32 @@ func (l *llamaCpp) Run(ctx context.Context, socket, model string, mode inference
 	if l.updatedLlamaCpp {
 		binPath = l.updatedServerStoragePath
 	}
-
-	args := l.config.GetArgs(modelPath, socket, mode)
-	l.log.Infof("llamaCppArgs: %v", args)
+	llamaCppArgs := []string{"--model", modelPath, "--jinja", "--host", socket}
+	if mode == inference.BackendModeEmbedding {
+		llamaCppArgs = append(llamaCppArgs, "--embeddings")
+	}
+	if runtime.GOOS == "windows" && runtime.GOARCH == "arm64" {
+		// Using a thread count equal to core count results in bad performance, and there seems to be little to no gain
+		// in going beyond core_count/2.
+		// TODO(p1-0tr): dig into why the defaults don't work well on windows/arm64
+		nThreads := max(2, runtime.NumCPU()/2)
+		llamaCppArgs = append(llamaCppArgs, "--threads", strconv.Itoa(nThreads))
+
+		modelConfig, err := modelDesc.Config()
+		if err != nil {
+			return fmt.Errorf("failed to get model config: %w", err)
+		}
+		// The Adreno OpenCL implementation currently only supports Q4_0
+		if modelConfig.Quantization == "Q4_0" {
+			llamaCppArgs = append(llamaCppArgs, "-ngl", "100")
+		}
+	} else {
+		llamaCppArgs = append(llamaCppArgs, "-ngl", "100")
+	}
 	llamaCppProcess := exec.CommandContext(
 		ctx,
 		filepath.Join(binPath, "com.docker.llama-server"),
-		args...,
+		llamaCppArgs...,
 	)
 	llamaCppProcess.Cancel = func() error {
 		if runtime.GOOS == "windows" {
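
A minimal, hypothetical sketch of the flag construction introduced above, pulled out into a pure helper so the windows/arm64 thread heuristic and the Q4_0/Adreno branch can be exercised in isolation. buildLlamaCppArgs, the string stand-in for inference.BackendModeEmbedding, and the example paths are assumptions for illustration, not the repository's API.

// args_sketch.go - illustrative only; not part of the change above.
package main

import (
	"fmt"
	"runtime"
	"strconv"
)

// embeddingMode stands in for inference.BackendModeEmbedding.
const embeddingMode = "embedding"

// buildLlamaCppArgs mirrors the branching in Run: base flags, an optional
// --embeddings flag, a reduced thread count on windows/arm64, and GPU layer
// offload (-ngl 100) everywhere except non-Q4_0 models on windows/arm64.
func buildLlamaCppArgs(modelPath, socket, mode, goos, goarch, quantization string, numCPU int) []string {
	args := []string{"--model", modelPath, "--jinja", "--host", socket}
	if mode == embeddingMode {
		args = append(args, "--embeddings")
	}
	if goos == "windows" && goarch == "arm64" {
		// Half the core count, but never fewer than two worker threads
		// (max is the Go 1.21+ built-in).
		nThreads := max(2, numCPU/2)
		args = append(args, "--threads", strconv.Itoa(nThreads))
		// The Adreno OpenCL backend currently only handles Q4_0, so only
		// offload layers to the GPU for Q4_0 models.
		if quantization == "Q4_0" {
			args = append(args, "-ngl", "100")
		}
	} else {
		args = append(args, "-ngl", "100")
	}
	return args
}

func main() {
	// Flags for a hypothetical 12-core windows/arm64 machine running a Q4_0 model.
	fmt.Println(buildLlamaCppArgs("/models/llama.gguf", "/tmp/llama.sock", "completion",
		"windows", "arm64", "Q4_0", 12))

	// Flags for the current platform, in embedding mode.
	fmt.Println(buildLlamaCppArgs("/models/llama.gguf", "/tmp/llama.sock", embeddingMode,
		runtime.GOOS, runtime.GOARCH, "Q4_0", runtime.NumCPU()))
}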