
Commit 5bcf3e5

Author: Piotr Stankiewicz (committed)

WiP: Support runner configuration
Signed-off-by: Piotr Stankiewicz <[email protected]>

Parent: e3916bc

8 files changed (+84, -6 lines)


pkg/inference/backend.go

6 additions, 1 deletion

```diff
@@ -29,6 +29,11 @@ func (m BackendMode) String() string {
 	}
 }
 
+type BackendConfiguration struct {
+	ContextSize uint64
+	RawFlags    string
+}
+
 // Backend is the interface implemented by inference engine backends. Backend
 // implementations need not be safe for concurrent invocation of the following
 // methods, though their underlying server implementations do need to support
@@ -66,7 +71,7 @@ type Backend interface {
 	// to be loaded. Backends should not load multiple models at once and should
 	// instead load only the specified model. Backends should still respond to
 	// OpenAI API requests for other models with a 421 error code.
-	Run(ctx context.Context, socket, model string, mode BackendMode) error
+	Run(ctx context.Context, socket, model string, mode BackendMode, config *BackendConfiguration) error
 	// Status returns a description of the backend's state.
 	Status() string
 	// GetDiskUsage returns the disk usage of the backend.
```

pkg/inference/backends/llamacpp/llamacpp.go

9 additions, 1 deletion

```diff
@@ -11,6 +11,7 @@ import (
 	"os/exec"
 	"path/filepath"
 	"runtime"
+	"strconv"
 	"strings"
 
 	"github.com/docker/model-runner/pkg/diskusage"
@@ -120,7 +121,7 @@ func (l *llamaCpp) Install(ctx context.Context, httpClient *http.Client) error {
 }
 
 // Run implements inference.Backend.Run.
-func (l *llamaCpp) Run(ctx context.Context, socket, model string, mode inference.BackendMode) error {
+func (l *llamaCpp) Run(ctx context.Context, socket, model string, mode inference.BackendMode, config *inference.BackendConfiguration) error {
 	modelPath, err := l.modelManager.GetModelPath(model)
 	l.log.Infof("Model path: %s", modelPath)
 	if err != nil {
@@ -138,6 +139,13 @@ func (l *llamaCpp) Run(ctx context.Context, socket, model string, mode inference
 	}
 
 	args := l.config.GetArgs(modelPath, socket, mode)
+
+	if config != nil {
+		args = append(args, "--ctx-size", strconv.Itoa(int(config.ContextSize)))
+		// FIXME(p1-0tr): this needs to be parsed, to respect quoted values etc.
+		args = append(args, strings.Split(config.RawFlags, " ")...)
+	}
+
 	l.log.Infof("llamaCppArgs: %v", args)
 	llamaCppProcess := exec.CommandContext(
 		ctx,
```
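The FIXME above is real: `strings.Split(config.RawFlags, " ")` breaks any flag whose value contains spaces, quoted or not. A minimal quote-aware splitter might look like the sketch below; `splitRawFlags` is a hypothetical helper, not part of this commit, and a production version would also want escape handling and an error for unterminated quotes.

```go
package main

import (
	"fmt"
	"strings"
	"unicode"
)

// splitRawFlags splits raw on whitespace while keeping single- or
// double-quoted segments intact (the quotes themselves are stripped).
// Hypothetical replacement for the strings.Split call above.
func splitRawFlags(raw string) []string {
	var (
		args    []string
		current strings.Builder
		quote   rune // active quote character, or 0 when unquoted
	)
	for _, r := range raw {
		switch {
		case quote != 0: // inside a quoted segment
			if r == quote {
				quote = 0 // closing quote
			} else {
				current.WriteRune(r)
			}
		case r == '\'' || r == '"': // opening quote
			quote = r
		case unicode.IsSpace(r): // token boundary
			if current.Len() > 0 {
				args = append(args, current.String())
				current.Reset()
			}
		default:
			current.WriteRune(r)
		}
	}
	if current.Len() > 0 {
		args = append(args, current.String())
	}
	return args
}

func main() {
	fmt.Println(splitRawFlags(`--temp 0.2 --chat-template "{{ .Prompt }}"`))
	// Output: [--temp 0.2 --chat-template {{ .Prompt }}]
}
```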

pkg/inference/backends/mlx/mlx.go

1 addition, 1 deletion

```diff
@@ -49,7 +49,7 @@ func (m *mlx) Install(ctx context.Context, httpClient *http.Client) error {
 }
 
 // Run implements inference.Backend.Run.
-func (m *mlx) Run(ctx context.Context, socket, model string, mode inference.BackendMode) error {
+func (m *mlx) Run(ctx context.Context, socket, model string, mode inference.BackendMode, config *inference.BackendConfiguration) error {
 	// TODO: Implement.
 	m.log.Warn("MLX backend is not yet supported")
 	return errors.New("not implemented")
```

pkg/inference/backends/vllm/vllm.go

1 addition, 1 deletion

```diff
@@ -49,7 +49,7 @@ func (v *vLLM) Install(ctx context.Context, httpClient *http.Client) error {
 }
 
 // Run implements inference.Backend.Run.
-func (v *vLLM) Run(ctx context.Context, socket, model string, mode inference.BackendMode) error {
+func (v *vLLM) Run(ctx context.Context, socket, model string, mode inference.BackendMode, config *inference.BackendConfiguration) error {
 	// TODO: Implement.
 	v.log.Warn("vLLM backend is not yet supported")
 	return errors.New("not implemented")
```

pkg/inference/scheduling/api.go

7 additions

```diff
@@ -73,3 +73,10 @@ type UnloadRequest struct {
 type UnloadResponse struct {
 	UnloadedRunners int `json:"unloaded_runners"`
 }
+
+// ConfigureRequest specifies per-model runtime configuration options.
+type ConfigureRequest struct {
+	Model           string `json:"model"`
+	ContextSize     uint64 `json:"context-size"`
+	RawRuntimeFlags string `json:"raw-runtime-flags"`
+}
```
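For reference, the JSON tags above are kebab-case, so a serialized request looks like the output of this small sketch (the model name is a made-up example):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Mirrors scheduling.ConfigureRequest from this commit.
type ConfigureRequest struct {
	Model           string `json:"model"`
	ContextSize     uint64 `json:"context-size"`
	RawRuntimeFlags string `json:"raw-runtime-flags"`
}

func main() {
	b, err := json.Marshal(ConfigureRequest{
		Model:           "ai/llama3.2", // hypothetical model reference
		ContextSize:     8192,
		RawRuntimeFlags: "--temp 0.2",
	})
	if err != nil {
		panic(err)
	}
	fmt.Println(string(b))
	// {"model":"ai/llama3.2","context-size":8192,"raw-runtime-flags":"--temp 0.2"}
}
```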

pkg/inference/scheduling/loader.go

18 additions, 1 deletion

```diff
@@ -82,6 +82,8 @@ type loader struct {
 	// timestamps maps slot indices to last usage times. Values in this slice
 	// are only valid if the corresponding reference count is zero.
 	timestamps []time.Time
+	// runnerConfigs maps runner keys to their backend configurations.
+	runnerConfigs map[runnerKey]inference.BackendConfiguration
 }
 
 // newLoader creates a new loader.
@@ -122,6 +124,7 @@ func newLoader(
 		references:  make([]uint, nSlots),
 		allocations: make([]uint64, nSlots),
 		timestamps:  make([]time.Time, nSlots),
+		runnerConfigs: make(map[runnerKey]inference.BackendConfiguration),
 	}
 	l.guard <- struct{}{}
 	return l
@@ -214,9 +217,11 @@ func (l *loader) Unload(ctx context.Context, unload UnloadRequest) int {
 
 	return len(l.runners) - func() int {
 		if unload.All {
+			l.runnerConfigs = make(map[runnerKey]inference.BackendConfiguration)
 			return l.evict(false)
 		} else {
 			for _, model := range unload.Models {
+				delete(l.runnerConfigs, runnerKey{unload.Backend, model, inference.BackendModeCompletion})
 				// Evict both completion and embedding models. We should consider
 				// accepting a mode parameter in unload requests.
 				l.evictRunner(unload.Backend, model, inference.BackendModeCompletion)
@@ -413,9 +418,13 @@ func (l *loader) load(ctx context.Context, backendName, model string, mode infer
 
 	// If we've identified a slot, then we're ready to start a runner.
 	if slot >= 0 {
+		var runnerConfig *inference.BackendConfiguration
+		if rc, ok := l.runnerConfigs[runnerKey{backendName, model, mode}]; ok {
+			runnerConfig = &rc
+		}
 		// Create the runner.
 		l.log.Infof("Loading %s backend runner with model %s in %s mode", backendName, model, mode)
-		runner, err := run(l.log, backend, model, mode, slot)
+		runner, err := run(l.log, backend, model, mode, slot, runnerConfig)
 		if err != nil {
 			l.log.Warnf("Unable to start %s backend runner with model %s in %s mode: %v",
 				backendName, model, mode, err,
@@ -492,3 +501,11 @@ func (l *loader) release(runner *runner) {
 	// Signal waiters.
 	l.broadcast()
 }
+
+func (l *loader) setRunnerConfig(ctx context.Context, backendName, model string, mode inference.BackendMode, runnerConfig inference.BackendConfiguration) {
+	l.lock(ctx)
+	defer l.unlock()
+
+	l.log.Infof("Configuring %s runner for %s", backendName, model)
+	l.runnerConfigs[runnerKey{backendName, model, mode}] = runnerConfig
+}
```
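One subtlety in the `load` change above: Go does not allow taking the address of a map value, so the lookup copies the value into `rc` and passes `&rc`. The runner therefore holds a snapshot of the configuration; reconfiguring the map later does not mutate a runner that is already started. A tiny sketch of the idiom (names here are illustrative):

```go
package main

import "fmt"

type cfg struct{ ContextSize uint64 }

func main() {
	configs := map[string]cfg{"llama": {ContextSize: 4096}}

	// p := &configs["llama"] // compile error: cannot take the address of a map value

	if rc, ok := configs["llama"]; ok { // rc is a copy of the stored value
		p := &rc
		configs["llama"] = cfg{ContextSize: 8192} // does not affect *p
		fmt.Println(p.ContextSize)                // prints 4096
	}
}
```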

pkg/inference/scheduling/runner.go

2 additions, 1 deletion

```diff
@@ -73,6 +73,7 @@ func run(
 	model string,
 	mode inference.BackendMode,
 	slot int,
+	runnerConfig *inference.BackendConfiguration,
 ) (*runner, error) {
 	// Create a dialer / transport that target backend on the specified slot.
 	socket, err := RunnerSocketPath(slot)
@@ -152,7 +153,7 @@ func run(
 
 	// Start the backend run loop.
 	go func() {
-		if err := backend.Run(runCtx, socket, model, mode); err != nil {
+		if err := backend.Run(runCtx, socket, model, mode, runnerConfig); err != nil {
 			log.Warnf("Backend %s running model %s exited with error: %v",
 				backend.Name(), model, err,
 			)
```

pkg/inference/scheduling/scheduler.go

40 additions

```diff
@@ -112,6 +112,8 @@ func (s *Scheduler) routeHandlers(allowedOrigins []string) map[string]http.Handl
 	m["GET "+inference.InferencePrefix+"/ps"] = s.GetRunningBackends
 	m["GET "+inference.InferencePrefix+"/df"] = s.GetDiskUsage
 	m["POST "+inference.InferencePrefix+"/unload"] = s.Unload
+	m["POST "+inference.InferencePrefix+"/{backend}/configure"] = s.Configure
+	m["POST "+inference.InferencePrefix+"/configure"] = s.Configure
 	return m
 }
 
@@ -347,6 +349,44 @@ func (s *Scheduler) Unload(w http.ResponseWriter, r *http.Request) {
 	}
 }
 
+func (s *Scheduler) Configure(w http.ResponseWriter, r *http.Request) {
+	// Determine the requested backend and ensure that it's valid.
+	var backend inference.Backend
+	if b := r.PathValue("backend"); b == "" {
+		backend = s.defaultBackend
+	} else {
+		backend = s.backends[b]
+	}
+	if backend == nil {
+		http.Error(w, ErrBackendNotFound.Error(), http.StatusNotFound)
+		return
+	}
+
+	body, err := io.ReadAll(http.MaxBytesReader(w, r.Body, maximumOpenAIInferenceRequestSize))
+	if err != nil {
+		if _, ok := err.(*http.MaxBytesError); ok {
+			http.Error(w, "request too large", http.StatusBadRequest)
+		} else {
+			http.Error(w, "unknown error", http.StatusInternalServerError)
+		}
+		return
+	}
+
+	var configureRequest ConfigureRequest
+	if err := json.Unmarshal(body, &configureRequest); err != nil {
+		http.Error(w, "invalid request", http.StatusBadRequest)
+		return
+	}
+
+	var runnerConfig inference.BackendConfiguration
+	runnerConfig.ContextSize = configureRequest.ContextSize
+	runnerConfig.RawFlags = configureRequest.RawRuntimeFlags
+
+	s.loader.setRunnerConfig(r.Context(), backend.Name(), configureRequest.Model, inference.BackendModeCompletion, runnerConfig)
+
+	w.WriteHeader(http.StatusOK)
+}
+
 // ServeHTTP implements net/http.Handler.ServeHTTP.
 func (s *Scheduler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 	s.lock.Lock()
```