Commit b2a7495

get --runtime-args back
Parent: 8a8455c

11 files changed: +106, -15 lines

cmd/cli/commands/compose.go

Lines changed: 9 additions & 3 deletions
@@ -37,6 +37,7 @@ func newComposeCmd() *cobra.Command {
 func newUpCommand() *cobra.Command {
 	var models []string
 	var ctxSize int64
+	var rawRuntimeFlags string
 	var backend string
 	var draftModel string
 	var numTokens int
@@ -69,6 +70,9 @@ func newUpCommand() *cobra.Command {
 			if ctxSize > 0 {
 				sendInfo(fmt.Sprintf("Setting context size to %d", ctxSize))
 			}
+			if rawRuntimeFlags != "" {
+				sendInfo("Setting raw runtime flags to " + rawRuntimeFlags)
+			}

 			// Build speculative config if any speculative flags are set
 			var speculativeConfig *inference.SpeculativeDecodingConfig
@@ -89,10 +93,11 @@ func newUpCommand() *cobra.Command {
 					ContextSize: &size,
 					Speculative: speculativeConfig,
 				},
+				RawRuntimeFlags: rawRuntimeFlags,
 			}); err != nil {
-				configErrFmtString := "failed to configure backend for model %s with context-size %d"
-				_ = sendErrorf(configErrFmtString+": %v", model, ctxSize, err)
-				return fmt.Errorf(configErrFmtString+": %w", model, ctxSize, err)
+				configErrFmtString := "failed to configure backend for model %s with context-size %d and runtime-flags %s"
+				_ = sendErrorf(configErrFmtString+": %v", model, ctxSize, rawRuntimeFlags, err)
+				return fmt.Errorf(configErrFmtString+": %w", model, ctxSize, rawRuntimeFlags, err)
 			}
 			sendInfo("Successfully configured backend for model " + model)
 		}
@@ -114,6 +119,7 @@ func newUpCommand() *cobra.Command {
 	}
 	c.Flags().StringArrayVar(&models, "model", nil, "model to use")
 	c.Flags().Int64Var(&ctxSize, "context-size", -1, "context size for the model")
+	c.Flags().StringVar(&rawRuntimeFlags, "runtime-flags", "", "raw runtime flags to pass to the inference engine")
 	c.Flags().StringVar(&backend, "backend", llamacpp.Name, "inference backend to use")
 	c.Flags().StringVar(&draftModel, "speculative-draft-model", "", "draft model for speculative decoding")
 	c.Flags().IntVar(&numTokens, "speculative-num-tokens", 0, "number of tokens to predict speculatively")
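
The new error path wraps the underlying failure with %w, so the model name, context size, and raw runtime flags land in the message while the original error stays matchable. A minimal standalone sketch of that pattern (the sentinel error and values here are illustrative, not taken from the repository):

package main

import (
	"errors"
	"fmt"
)

// errBackendUnavailable is a hypothetical sentinel error, used only for illustration.
var errBackendUnavailable = errors.New("backend unavailable")

func main() {
	model, ctxSize, runtimeFlags := "ai/example-model", int64(4096), "--some flag"

	// Same format-string shape as in compose.go: %w keeps the cause inspectable.
	err := fmt.Errorf(
		"failed to configure backend for model %s with context-size %d and runtime-flags %s: %w",
		model, ctxSize, runtimeFlags, errBackendUnavailable,
	)

	fmt.Println(err)                                    // full, human-readable message
	fmt.Println(errors.Is(err, errBackendUnavailable)) // true: the wrapped cause survives %w
}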

cmd/cli/commands/configure.go

Lines changed: 19 additions & 6 deletions
@@ -11,15 +11,27 @@ func newConfigureCmd() *cobra.Command {
 	var flags ConfigureFlags

 	c := &cobra.Command{
-		Use:    "configure [--context-size=<n>] [--speculative-draft-model=<model>] [--hf_overrides=<json>] [--gpu-memory-utilization=<float>] [--mode=<mode>] [--think] MODEL",
+		Use:    "configure [--context-size=<n>] [--speculative-draft-model=<model>] [--hf_overrides=<json>] [--gpu-memory-utilization=<float>] [--mode=<mode>] [--think] MODEL [-- <runtime-flags...>]",
 		Short:  "Configure runtime options for a model",
 		Hidden: true,
 		Args: func(cmd *cobra.Command, args []string) error {
-			if len(args) != 1 {
-				return fmt.Errorf(
-					"Exactly one model must be specified, got %d: %v\n\n"+
-						"See 'docker model configure --help' for more information",
-					len(args), args)
+			argsBeforeDash := cmd.ArgsLenAtDash()
+			if argsBeforeDash == -1 {
+				// No "--" used, so we need exactly 1 total argument.
+				if len(args) != 1 {
+					return fmt.Errorf(
+						"Exactly one model must be specified, got %d: %v\n\n"+
+							"See 'docker model configure --help' for more information",
+						len(args), args)
+				}
+			} else {
+				// Has "--", so we need exactly 1 argument before it.
+				if argsBeforeDash != 1 {
+					return fmt.Errorf(
+						"Exactly one model must be specified before --, got %d\n\n"+
+							"See 'docker model configure --help' for more information",
+						argsBeforeDash)
+				}
 			}
 			return nil
 		},
@@ -29,6 +41,7 @@ func newConfigureCmd() *cobra.Command {
 			if err != nil {
 				return err
 			}
+			opts.RuntimeFlags = args[1:]
 			return desktopClient.ConfigureBackend(opts)
 		},
 		ValidArgsFunction: completion.ModelNames(getDesktopClient, -1),
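
The "--" handling leans on cobra: arguments after "--" are still delivered to RunE in args (without the "--" itself), and ArgsLenAtDash reports how many positional arguments appeared before it, so args[0] is the model and args[1:] are the pass-through runtime flags. A minimal self-contained sketch of that behavior (command wiring, model name, and flag values are illustrative only):

package main

import (
	"fmt"

	"github.com/spf13/cobra"
)

func main() {
	cmd := &cobra.Command{
		Use: "configure MODEL [-- <runtime-flags...>]",
		Args: func(cmd *cobra.Command, args []string) error {
			argsBeforeDash := cmd.ArgsLenAtDash()
			if argsBeforeDash == -1 && len(args) != 1 {
				return fmt.Errorf("exactly one model must be specified, got %d", len(args))
			}
			if argsBeforeDash != -1 && argsBeforeDash != 1 {
				return fmt.Errorf("exactly one model must be specified before --, got %d", argsBeforeDash)
			}
			return nil
		},
		RunE: func(cmd *cobra.Command, args []string) error {
			model := args[0]
			runtimeFlags := args[1:] // everything after "--"; the "--" itself is dropped by cobra
			fmt.Printf("model=%s runtimeFlags=%v\n", model, runtimeFlags)
			return nil
		},
	}

	cmd.SetArgs([]string{"ai/example-model", "--", "--flash-attn", "--threads", "8"})
	_ = cmd.Execute()
	// Output: model=ai/example-model runtimeFlags=[--flash-attn --threads 8]
}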

cmd/cli/docs/reference/docker_model_compose_up.yaml

Lines changed: 9 additions & 0 deletions
@@ -33,6 +33,15 @@ options:
       experimentalcli: false
       kubernetes: false
       swarm: false
+    - option: runtime-flags
+      value_type: string
+      description: raw runtime flags to pass to the inference engine
+      deprecated: false
+      hidden: false
+      experimental: false
+      experimentalcli: false
+      kubernetes: false
+      swarm: false
     - option: speculative-draft-model
       value_type: string
       description: draft model for speculative decoding

cmd/cli/docs/reference/docker_model_configure.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 command: docker model configure
 short: Configure runtime options for a model
 long: Configure runtime options for a model
-usage: docker model configure [--context-size=<n>] [--speculative-draft-model=<model>] [--hf_overrides=<json>] [--gpu-memory-utilization=<float>] [--mode=<mode>] [--think] MODEL
+usage: docker model configure [--context-size=<n>] [--speculative-draft-model=<model>] [--hf_overrides=<json>] [--gpu-memory-utilization=<float>] [--mode=<mode>] [--think] MODEL [-- <runtime-flags...>]
 pname: docker model
 plink: docker_model.yaml
 options:

pkg/inference/backend.go

Lines changed: 3 additions & 2 deletions
@@ -82,8 +82,9 @@ type LlamaCppConfig struct {

 type BackendConfiguration struct {
 	// Shared configuration across all backends
-	ContextSize *int32                     `json:"context-size,omitempty"`
-	Speculative *SpeculativeDecodingConfig `json:"speculative,omitempty"`
+	ContextSize  *int32                     `json:"context-size,omitempty"`
+	RuntimeFlags []string                   `json:"runtime-flags,omitempty"`
+	Speculative  *SpeculativeDecodingConfig `json:"speculative,omitempty"`

 	// Backend-specific configuration
 	VLLM *VLLMConfig `json:"vllm,omitempty"`

pkg/inference/backends/llamacpp/llamacpp_config.go

Lines changed: 5 additions & 0 deletions
@@ -79,6 +79,11 @@ func (c *Config) GetArgs(bundle types.ModelBundle, socket string, mode inference
 		args = append(args, "--ctx-size", strconv.FormatInt(int64(*contextSize), 10))
 	}

+	// Add arguments from backend config
+	if config != nil {
+		args = append(args, config.RuntimeFlags...)
+	}
+
 	// Add arguments for Multimodal projector or jinja (they are mutually exclusive)
 	if path := bundle.MMPROJPath(); path != "" {
 		args = append(args, "--mmproj", path)
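
The llama.cpp change is a straight pass-through: whatever RuntimeFlags the configuration carries are appended verbatim after the arguments the backend derives itself (model path, socket, --ctx-size, and so on), which is also what the new test case below asserts. A simplified, hypothetical stand-in for that assembly (buildArgs and its argument values are illustrative, not the repository's actual GetArgs):

package main

import (
	"fmt"
	"strconv"
)

// buildArgs mimics the pass-through: backend-derived flags first, then any
// user-supplied runtime flags appended verbatim at the end.
func buildArgs(modelPath, socket string, ctxSize int64, runtimeFlags []string) []string {
	args := []string{"--model", modelPath, "--host", socket}
	if ctxSize > 0 {
		args = append(args, "--ctx-size", strconv.FormatInt(ctxSize, 10))
	}
	args = append(args, runtimeFlags...) // same append as in the diff above
	return args
}

func main() {
	args := buildArgs("/models/example.gguf", "/tmp/llama.sock", 4096,
		[]string{"--some", "flag"})
	fmt.Println(args)
	// [--model /models/example.gguf --host /tmp/llama.sock --ctx-size 4096 --some flag]
}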

pkg/inference/backends/llamacpp/llamacpp_config_test.go

Lines changed: 17 additions & 0 deletions
@@ -225,6 +225,23 @@ func TestGetArgs(t *testing.T) {
 				"--jinja",
 			),
 		},
+		{
+			name: "raw flags from backend config",
+			mode: inference.BackendModeEmbedding,
+			bundle: &fakeBundle{
+				ggufPath: modelPath,
+			},
+			config: &inference.BackendConfiguration{
+				RuntimeFlags: []string{"--some", "flag"},
+			},
+			expected: append(slices.Clone(baseArgs),
+				"--model", modelPath,
+				"--host", socket,
+				"--embeddings",
+				"--some", "flag",
+				"--jinja",
+			),
+		},
 		{
 			name: "multimodal projector removes jinja",
 			mode: inference.BackendModeCompletion,

pkg/inference/backends/vllm/vllm_config.go

Lines changed: 5 additions & 1 deletion
@@ -56,7 +56,11 @@ func (c *Config) GetArgs(bundle types.ModelBundle, socket string, mode inference
 	if maxLen := GetMaxModelLen(bundle.RuntimeConfig(), config); maxLen != nil {
 		args = append(args, "--max-model-len", strconv.FormatInt(int64(*maxLen), 10))
 	}
-	// If nil, vLLM will automatically derive from the model config
+
+	// Add runtime flags from backend config
+	if config != nil {
+		args = append(args, config.RuntimeFlags...)
+	}

 	// Add vLLM-specific arguments from backend config
 	if config != nil && config.VLLM != nil {

pkg/inference/backends/vllm/vllm_config_test.go

Lines changed: 17 additions & 0 deletions
@@ -83,6 +83,23 @@ func TestGetArgs(t *testing.T) {
 				"8192",
 			},
 		},
+		{
+			name: "with runtime flags",
+			bundle: &mockModelBundle{
+				safetensorsPath: "/path/to/model",
+			},
+			config: &inference.BackendConfiguration{
+				RuntimeFlags: []string{"--gpu-memory-utilization", "0.9"},
+			},
+			expected: []string{
+				"serve",
+				"/path/to",
+				"--uds",
+				"/tmp/socket",
+				"--gpu-memory-utilization",
+				"0.9",
+			},
+		},
 		{
 			name: "with model context size (takes precedence)",
 			bundle: &mockModelBundle{

pkg/inference/scheduling/api.go

Lines changed: 3 additions & 2 deletions
@@ -93,7 +93,8 @@ type UnloadResponse struct {

 // ConfigureRequest specifies per-model runtime configuration options.
 type ConfigureRequest struct {
-	Model string                 `json:"model"`
-	Mode  *inference.BackendMode `json:"mode,omitempty"`
+	Model           string                 `json:"model"`
+	Mode            *inference.BackendMode `json:"mode,omitempty"`
+	RawRuntimeFlags string                 `json:"raw-runtime-flags,omitempty"`
 	inference.BackendConfiguration
 }
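
ConfigureRequest now carries two flavors of the same idea: RawRuntimeFlags, a single unparsed string (which the compose up path appears to send), and the RuntimeFlags []string promoted from the embedded BackendConfiguration (which configure fills from the arguments after "--"). Because the embedding is anonymous, Go's encoding/json flattens those fields into one object. A minimal sketch of the resulting wire shape, using simplified local copies of the types (field values are illustrative only):

package main

import (
	"encoding/json"
	"fmt"
)

// Simplified local stand-ins, re-declared here only to show the JSON shape;
// the real definitions live in pkg/inference and pkg/inference/scheduling.
type BackendConfiguration struct {
	ContextSize  *int32   `json:"context-size,omitempty"`
	RuntimeFlags []string `json:"runtime-flags,omitempty"`
}

type ConfigureRequest struct {
	Model           string `json:"model"`
	RawRuntimeFlags string `json:"raw-runtime-flags,omitempty"`
	BackendConfiguration
}

func main() {
	size := int32(4096)
	req := ConfigureRequest{
		Model:           "ai/example-model",
		RawRuntimeFlags: "--some flag", // single raw string, e.g. from --runtime-flags
		BackendConfiguration: BackendConfiguration{
			ContextSize:  &size,
			RuntimeFlags: []string{"--some", "flag"}, // pre-split, e.g. from `configure -- ...`
		},
	}

	out, _ := json.Marshal(req)
	fmt.Println(string(out))
	// {"model":"ai/example-model","raw-runtime-flags":"--some flag","context-size":4096,"runtime-flags":["--some","flag"]}
}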
