Commit b2a7495

get --runtime-args back
Parent: 8a8455c

11 files changed: +106, -15 lines

cmd/cli/commands/compose.go

Lines changed: 9 additions & 3 deletions
@@ -37,6 +37,7 @@ func newComposeCmd() *cobra.Command {
 func newUpCommand() *cobra.Command {
 	var models []string
 	var ctxSize int64
+	var rawRuntimeFlags string
 	var backend string
 	var draftModel string
 	var numTokens int
@@ -69,6 +70,9 @@ func newUpCommand() *cobra.Command {
 			if ctxSize > 0 {
 				sendInfo(fmt.Sprintf("Setting context size to %d", ctxSize))
 			}
+			if rawRuntimeFlags != "" {
+				sendInfo("Setting raw runtime flags to " + rawRuntimeFlags)
+			}

 			// Build speculative config if any speculative flags are set
 			var speculativeConfig *inference.SpeculativeDecodingConfig
@@ -89,10 +93,11 @@ func newUpCommand() *cobra.Command {
 					ContextSize: &size,
 					Speculative: speculativeConfig,
 				},
+				RawRuntimeFlags: rawRuntimeFlags,
 			}); err != nil {
-				configErrFmtString := "failed to configure backend for model %s with context-size %d"
-				_ = sendErrorf(configErrFmtString+": %v", model, ctxSize, err)
-				return fmt.Errorf(configErrFmtString+": %w", model, ctxSize, err)
+				configErrFmtString := "failed to configure backend for model %s with context-size %d and runtime-flags %s"
+				_ = sendErrorf(configErrFmtString+": %v", model, ctxSize, rawRuntimeFlags, err)
+				return fmt.Errorf(configErrFmtString+": %w", model, ctxSize, rawRuntimeFlags, err)
 			}
 			sendInfo("Successfully configured backend for model " + model)
 		}
@@ -114,6 +119,7 @@ func newUpCommand() *cobra.Command {
 	}
 	c.Flags().StringArrayVar(&models, "model", nil, "model to use")
 	c.Flags().Int64Var(&ctxSize, "context-size", -1, "context size for the model")
+	c.Flags().StringVar(&rawRuntimeFlags, "runtime-flags", "", "raw runtime flags to pass to the inference engine")
 	c.Flags().StringVar(&backend, "backend", llamacpp.Name, "inference backend to use")
 	c.Flags().StringVar(&draftModel, "speculative-draft-model", "", "draft model for speculative decoding")
 	c.Flags().IntVar(&numTokens, "speculative-num-tokens", 0, "number of tokens to predict speculatively")
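
The new error path wraps the underlying failure with %w, so the model name, context size, and raw runtime flags land in the message while the original error stays matchable. A minimal standalone sketch of that pattern (the sentinel error and values here are illustrative, not taken from the repository):

package main

import (
	"errors"
	"fmt"
)

// errBackendUnavailable is a hypothetical sentinel error, used only for illustration.
var errBackendUnavailable = errors.New("backend unavailable")

func main() {
	model, ctxSize, runtimeFlags := "ai/example-model", int64(4096), "--some flag"

	// Same format-string shape as in compose.go: %w keeps the cause inspectable.
	err := fmt.Errorf(
		"failed to configure backend for model %s with context-size %d and runtime-flags %s: %w",
		model, ctxSize, runtimeFlags, errBackendUnavailable,
	)

	fmt.Println(err)                                    // full, human-readable message
	fmt.Println(errors.Is(err, errBackendUnavailable)) // true: the wrapped cause survives %w
}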

cmd/cli/commands/configure.go

Lines changed: 19 additions & 6 deletions
@@ -11,15 +11,27 @@ func newConfigureCmd() *cobra.Command {
 	var flags ConfigureFlags

 	c := &cobra.Command{
-		Use:    "configure [--context-size=<n>] [--speculative-draft-model=<model>] [--hf_overrides=<json>] [--gpu-memory-utilization=<float>] [--mode=<mode>] [--think] MODEL",
+		Use:    "configure [--context-size=<n>] [--speculative-draft-model=<model>] [--hf_overrides=<json>] [--gpu-memory-utilization=<float>] [--mode=<mode>] [--think] MODEL [-- <runtime-flags...>]",
 		Short:  "Configure runtime options for a model",
 		Hidden: true,
 		Args: func(cmd *cobra.Command, args []string) error {
-			if len(args) != 1 {
-				return fmt.Errorf(
-					"Exactly one model must be specified, got %d: %v\n\n"+
-						"See 'docker model configure --help' for more information",
-					len(args), args)
+			argsBeforeDash := cmd.ArgsLenAtDash()
+			if argsBeforeDash == -1 {
+				// No "--" used, so we need exactly 1 total argument.
+				if len(args) != 1 {
+					return fmt.Errorf(
+						"Exactly one model must be specified, got %d: %v\n\n"+
+							"See 'docker model configure --help' for more information",
+						len(args), args)
+				}
+			} else {
+				// Has "--", so we need exactly 1 argument before it.
+				if argsBeforeDash != 1 {
+					return fmt.Errorf(
+						"Exactly one model must be specified before --, got %d\n\n"+
+							"See 'docker model configure --help' for more information",
+						argsBeforeDash)
+				}
 			}
 			return nil
 		},
@@ -29,6 +41,7 @@ func newConfigureCmd() *cobra.Command {
 			if err != nil {
 				return err
 			}
+			opts.RuntimeFlags = args[1:]
 			return desktopClient.ConfigureBackend(opts)
 		},
 		ValidArgsFunction: completion.ModelNames(getDesktopClient, -1),
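
The "--" handling leans on cobra: arguments after "--" are still delivered to RunE in args (without the "--" itself), and ArgsLenAtDash reports how many positional arguments appeared before it, so args[0] is the model and args[1:] are the pass-through runtime flags. A minimal self-contained sketch of that behavior (command wiring, model name, and flag values are illustrative only):

package main

import (
	"fmt"

	"github.com/spf13/cobra"
)

func main() {
	cmd := &cobra.Command{
		Use: "configure MODEL [-- <runtime-flags...>]",
		Args: func(cmd *cobra.Command, args []string) error {
			argsBeforeDash := cmd.ArgsLenAtDash()
			if argsBeforeDash == -1 && len(args) != 1 {
				return fmt.Errorf("exactly one model must be specified, got %d", len(args))
			}
			if argsBeforeDash != -1 && argsBeforeDash != 1 {
				return fmt.Errorf("exactly one model must be specified before --, got %d", argsBeforeDash)
			}
			return nil
		},
		RunE: func(cmd *cobra.Command, args []string) error {
			model := args[0]
			runtimeFlags := args[1:] // everything after "--"; the "--" itself is dropped by cobra
			fmt.Printf("model=%s runtimeFlags=%v\n", model, runtimeFlags)
			return nil
		},
	}

	cmd.SetArgs([]string{"ai/example-model", "--", "--flash-attn", "--threads", "8"})
	_ = cmd.Execute()
	// Output: model=ai/example-model runtimeFlags=[--flash-attn --threads 8]
}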

cmd/cli/docs/reference/docker_model_compose_up.yaml

Lines changed: 9 additions & 0 deletions
@@ -33,6 +33,15 @@ options:
       experimentalcli: false
       kubernetes: false
       swarm: false
+    - option: runtime-flags
+      value_type: string
+      description: raw runtime flags to pass to the inference engine
+      deprecated: false
+      hidden: false
+      experimental: false
+      experimentalcli: false
+      kubernetes: false
+      swarm: false
     - option: speculative-draft-model
       value_type: string
       description: draft model for speculative decoding

cmd/cli/docs/reference/docker_model_configure.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 command: docker model configure
 short: Configure runtime options for a model
 long: Configure runtime options for a model
-usage: docker model configure [--context-size=<n>] [--speculative-draft-model=<model>] [--hf_overrides=<json>] [--gpu-memory-utilization=<float>] [--mode=<mode>] [--think] MODEL
+usage: docker model configure [--context-size=<n>] [--speculative-draft-model=<model>] [--hf_overrides=<json>] [--gpu-memory-utilization=<float>] [--mode=<mode>] [--think] MODEL [-- <runtime-flags...>]
 pname: docker model
 plink: docker_model.yaml
 options:

pkg/inference/backend.go

Lines changed: 3 additions & 2 deletions
@@ -82,8 +82,9 @@ type LlamaCppConfig struct {

 type BackendConfiguration struct {
 	// Shared configuration across all backends
-	ContextSize *int32                     `json:"context-size,omitempty"`
-	Speculative *SpeculativeDecodingConfig `json:"speculative,omitempty"`
+	ContextSize  *int32                     `json:"context-size,omitempty"`
+	RuntimeFlags []string                   `json:"runtime-flags,omitempty"`
+	Speculative  *SpeculativeDecodingConfig `json:"speculative,omitempty"`

 	// Backend-specific configuration
 	VLLM *VLLMConfig `json:"vllm,omitempty"`

pkg/inference/backends/llamacpp/llamacpp_config.go

Lines changed: 5 additions & 0 deletions
@@ -79,6 +79,11 @@ func (c *Config) GetArgs(bundle types.ModelBundle, socket string, mode inference
 		args = append(args, "--ctx-size", strconv.FormatInt(int64(*contextSize), 10))
 	}

+	// Add arguments from backend config
+	if config != nil {
+		args = append(args, config.RuntimeFlags...)
+	}
+
 	// Add arguments for Multimodal projector or jinja (they are mutually exclusive)
 	if path := bundle.MMPROJPath(); path != "" {
 		args = append(args, "--mmproj", path)
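
The llama.cpp change is a straight pass-through: whatever RuntimeFlags the configuration carries are appended verbatim after the arguments the backend derives itself (model path, socket, --ctx-size, and so on), which is also what the new test case below asserts. A simplified, hypothetical stand-in for that assembly (buildArgs and its argument values are illustrative, not the repository's actual GetArgs):

package main

import (
	"fmt"
	"strconv"
)

// buildArgs mimics the pass-through: backend-derived flags first, then any
// user-supplied runtime flags appended verbatim at the end.
func buildArgs(modelPath, socket string, ctxSize int64, runtimeFlags []string) []string {
	args := []string{"--model", modelPath, "--host", socket}
	if ctxSize > 0 {
		args = append(args, "--ctx-size", strconv.FormatInt(ctxSize, 10))
	}
	args = append(args, runtimeFlags...) // same append as in the diff above
	return args
}

func main() {
	args := buildArgs("/models/example.gguf", "/tmp/llama.sock", 4096,
		[]string{"--some", "flag"})
	fmt.Println(args)
	// [--model /models/example.gguf --host /tmp/llama.sock --ctx-size 4096 --some flag]
}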

pkg/inference/backends/llamacpp/llamacpp_config_test.go

Lines changed: 17 additions & 0 deletions
@@ -225,6 +225,23 @@ func TestGetArgs(t *testing.T) {
 				"--jinja",
 			),
 		},
+		{
+			name: "raw flags from backend config",
+			mode: inference.BackendModeEmbedding,
+			bundle: &fakeBundle{
+				ggufPath: modelPath,
+			},
+			config: &inference.BackendConfiguration{
+				RuntimeFlags: []string{"--some", "flag"},
+			},
+			expected: append(slices.Clone(baseArgs),
+				"--model", modelPath,
+				"--host", socket,
+				"--embeddings",
+				"--some", "flag",
+				"--jinja",
+			),
+		},
 		{
 			name: "multimodal projector removes jinja",
 			mode: inference.BackendModeCompletion,

pkg/inference/backends/vllm/vllm_config.go

Lines changed: 5 additions & 1 deletion
@@ -56,7 +56,11 @@ func (c *Config) GetArgs(bundle types.ModelBundle, socket string, mode inference
 	if maxLen := GetMaxModelLen(bundle.RuntimeConfig(), config); maxLen != nil {
 		args = append(args, "--max-model-len", strconv.FormatInt(int64(*maxLen), 10))
 	}
-	// If nil, vLLM will automatically derive from the model config
+
+	// Add runtime flags from backend config
+	if config != nil {
+		args = append(args, config.RuntimeFlags...)
+	}

 	// Add vLLM-specific arguments from backend config
 	if config != nil && config.VLLM != nil {

pkg/inference/backends/vllm/vllm_config_test.go

Lines changed: 17 additions & 0 deletions
@@ -83,6 +83,23 @@ func TestGetArgs(t *testing.T) {
 				"8192",
 			},
 		},
+		{
+			name: "with runtime flags",
+			bundle: &mockModelBundle{
+				safetensorsPath: "/path/to/model",
+			},
+			config: &inference.BackendConfiguration{
+				RuntimeFlags: []string{"--gpu-memory-utilization", "0.9"},
+			},
+			expected: []string{
+				"serve",
+				"/path/to",
+				"--uds",
+				"/tmp/socket",
+				"--gpu-memory-utilization",
+				"0.9",
+			},
+		},
 		{
 			name: "with model context size (takes precedence)",
 			bundle: &mockModelBundle{

pkg/inference/scheduling/api.go

Lines changed: 3 additions & 2 deletions
@@ -93,7 +93,8 @@ type UnloadResponse struct {

 // ConfigureRequest specifies per-model runtime configuration options.
 type ConfigureRequest struct {
-	Model string                 `json:"model"`
-	Mode  *inference.BackendMode `json:"mode,omitempty"`
+	Model           string                 `json:"model"`
+	Mode            *inference.BackendMode `json:"mode,omitempty"`
+	RawRuntimeFlags string                 `json:"raw-runtime-flags,omitempty"`
 	inference.BackendConfiguration
 }
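
ConfigureRequest now carries two flavors of the same idea: RawRuntimeFlags, a single unparsed string (which the compose up path appears to send), and the RuntimeFlags []string promoted from the embedded BackendConfiguration (which configure fills from the arguments after "--"). Because the embedding is anonymous, Go's encoding/json flattens those fields into one object. A minimal sketch of the resulting wire shape, using simplified local copies of the types (field values are illustrative only):

package main

import (
	"encoding/json"
	"fmt"
)

// Simplified local stand-ins, re-declared here only to show the JSON shape;
// the real definitions live in pkg/inference and pkg/inference/scheduling.
type BackendConfiguration struct {
	ContextSize  *int32   `json:"context-size,omitempty"`
	RuntimeFlags []string `json:"runtime-flags,omitempty"`
}

type ConfigureRequest struct {
	Model           string `json:"model"`
	RawRuntimeFlags string `json:"raw-runtime-flags,omitempty"`
	BackendConfiguration
}

func main() {
	size := int32(4096)
	req := ConfigureRequest{
		Model:           "ai/example-model",
		RawRuntimeFlags: "--some flag", // single raw string, e.g. from --runtime-flags
		BackendConfiguration: BackendConfiguration{
			ContextSize:  &size,
			RuntimeFlags: []string{"--some", "flag"}, // pre-split, e.g. from `configure -- ...`
		},
	}

	out, _ := json.Marshal(req)
	fmt.Println(string(out))
	// {"model":"ai/example-model","raw-runtime-flags":"--some flag","context-size":4096,"runtime-flags":["--some","flag"]}
}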
