Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmd/cli/commands/configure.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ func newConfigureCmd() *cobra.Command {
var flags ConfigureFlags

c := &cobra.Command{
Use: "configure [--context-size=<n>] [--speculative-draft-model=<model>] [--hf_overrides=<json>] [--mode=<mode>] [--think] MODEL",
Use: "configure [--context-size=<n>] [--speculative-draft-model=<model>] [--hf_overrides=<json>] [--gpu-memory-utilization=<float>] [--mode=<mode>] [--think] MODEL",
Short: "Configure runtime options for a model",
Hidden: true,
Args: func(cmd *cobra.Command, args []string) error {
Expand Down
47 changes: 46 additions & 1 deletion cmd/cli/commands/configure_flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,37 @@ func (v *BoolPtrValue) IsBoolFlag() bool {
return true
}

// Float64PtrValue adapts a **float64 to the pflag.Value interface so a flag
// can distinguish "never set" (nil) from an explicitly supplied 0.0.
type Float64PtrValue struct {
	ptr **float64
}

// NewFloat64PtrValue wraps the given pointer in a Float64PtrValue.
func NewFloat64PtrValue(p **float64) *Float64PtrValue {
	return &Float64PtrValue{ptr: p}
}

// String renders the stored value, or "" when no value has been set.
func (v *Float64PtrValue) String() string {
	if v.ptr != nil && *v.ptr != nil {
		return strconv.FormatFloat(**v.ptr, 'f', -1, 64)
	}
	return ""
}

// Set parses s as a float64 and stores a pointer to the parsed value,
// returning the strconv error verbatim on malformed input.
func (v *Float64PtrValue) Set(s string) error {
	parsed, err := strconv.ParseFloat(s, 64)
	if err != nil {
		return err
	}
	*v.ptr = &parsed
	return nil
}

// Type reports the value type shown in pflag help output.
func (v *Float64PtrValue) Type() string {
	return "float64"
}

// ptr is a helper function to create a pointer to int32
func ptr(v int32) *int32 {
return &v
Expand All @@ -100,7 +131,8 @@ type ConfigureFlags struct {
NumTokens int
MinAcceptanceRate float64
// vLLM-specific flags
HFOverrides string
HFOverrides string
GPUMemoryUtilization *float64
// Think parameter for reasoning models
Think *bool
}
Expand All @@ -112,6 +144,7 @@ func (f *ConfigureFlags) RegisterFlags(cmd *cobra.Command) {
cmd.Flags().IntVar(&f.NumTokens, "speculative-num-tokens", 0, "number of tokens to predict speculatively")
cmd.Flags().Float64Var(&f.MinAcceptanceRate, "speculative-min-acceptance-rate", 0, "minimum acceptance rate for speculative decoding")
cmd.Flags().StringVar(&f.HFOverrides, "hf_overrides", "", "HuggingFace model config overrides (JSON) - vLLM only")
cmd.Flags().Var(NewFloat64PtrValue(&f.GPUMemoryUtilization), "gpu-memory-utilization", "fraction of GPU memory to use for the model executor (0.0-1.0) - vLLM only")
cmd.Flags().Var(NewBoolPtrValue(&f.Think), "think", "enable reasoning mode for thinking models")
cmd.Flags().StringVar(&f.Mode, "mode", "", "backend operation mode (completion, embedding, reranking)")
}
Expand Down Expand Up @@ -151,6 +184,18 @@ func (f *ConfigureFlags) BuildConfigureRequest(model string) (scheduling.Configu
req.VLLM.HFOverrides = hfo
}

// Set GPU memory utilization if provided (vLLM-specific)
if f.GPUMemoryUtilization != nil {
utilization := *f.GPUMemoryUtilization
if utilization < 0.0 || utilization > 1.0 {
return req, fmt.Errorf("--gpu-memory-utilization must be between 0.0 and 1.0, got %f", utilization)
}
if req.VLLM == nil {
req.VLLM = &inference.VLLMConfig{}
}
req.VLLM.GPUMemoryUtilization = f.GPUMemoryUtilization
}

// Set reasoning budget from --think flag
reasoningBudget := f.getReasoningBudget()
if reasoningBudget != nil {
Expand Down
120 changes: 120 additions & 0 deletions cmd/cli/commands/configure_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,126 @@ func TestConfigureCmdThinkFlag(t *testing.T) {
}
}

func TestConfigureCmdGPUMemoryUtilizationFlag(t *testing.T) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion (testing): Add a test case for non-float input to the --gpu-memory-utilization flag to exercise the error path in Float64PtrValue.Set.

Since Float64PtrValue.Set returns an error when strconv.ParseFloat fails, please add a subtest that calls cmd.Flags().Set("gpu-memory-utilization", "not-a-number") and asserts that an error is returned. This will exercise the error path and verify the custom flag’s parsing behavior end-to-end.

// Create the configure command
cmd := newConfigureCmd()

// Verify the --gpu-memory-utilization flag exists
gpuMemFlag := cmd.Flags().Lookup("gpu-memory-utilization")
if gpuMemFlag == nil {
t.Fatal("--gpu-memory-utilization flag not found")
}

// Verify the default value is empty (nil pointer)
if gpuMemFlag.DefValue != "" {
t.Errorf("Expected default gpu-memory-utilization value to be '' (nil), got '%s'", gpuMemFlag.DefValue)
}

// Verify the flag type
if gpuMemFlag.Value.Type() != "float64" {
t.Errorf("Expected gpu-memory-utilization flag type to be 'float64', got '%s'", gpuMemFlag.Value.Type())
}

// Test setting the flag value
err := cmd.Flags().Set("gpu-memory-utilization", "0.7")
if err != nil {
t.Errorf("Failed to set gpu-memory-utilization flag: %v", err)
}

// Verify the value was set
gpuMemValue := gpuMemFlag.Value.String()
if gpuMemValue != "0.7" {
t.Errorf("Expected gpu-memory-utilization flag value to be '0.7', got '%s'", gpuMemValue)
}
}

func TestGPUMemoryUtilizationBehavior(t *testing.T) {
// Helper to create float64 pointer
float64Ptr := func(f float64) *float64 { return &f }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This helper function to create a *float64 is also defined in pkg/inference/backends/vllm/vllm_config_test.go as float64ptr. To avoid code duplication across test files and improve maintainability, it would be beneficial to move such common test helpers into a shared test utility package.


tests := []struct {
name string
gpuMemValue *float64
expectError bool
expectGPUMemSet bool
expectedGPUMemUtil float64
}{
{
name: "default - not set (nil)",
gpuMemValue: nil,
expectError: false,
expectGPUMemSet: false,
},
{
name: "valid value 0.5",
gpuMemValue: float64Ptr(0.5),
expectError: false,
expectGPUMemSet: true,
expectedGPUMemUtil: 0.5,
},
{
name: "edge case 0.0",
gpuMemValue: float64Ptr(0.0),
expectError: false,
expectGPUMemSet: true,
expectedGPUMemUtil: 0.0,
},
{
name: "edge case 1.0",
gpuMemValue: float64Ptr(1.0),
expectError: false,
expectGPUMemSet: true,
expectedGPUMemUtil: 1.0,
},
{
name: "invalid - negative value",
gpuMemValue: float64Ptr(-0.1),
expectError: true,
},
{
name: "invalid - value > 1.0",
gpuMemValue: float64Ptr(1.5),
expectError: true,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
flags := ConfigureFlags{
GPUMemoryUtilization: tt.gpuMemValue,
}

req, err := flags.BuildConfigureRequest("test-model")

if tt.expectError {
if err == nil {
t.Fatal("Expected error but got none")
}
return
}

if err != nil {
t.Fatalf("Unexpected error: %v", err)
}

if tt.expectGPUMemSet {
// GPU memory utilization should be set
if req.VLLM == nil || req.VLLM.GPUMemoryUtilization == nil {
t.Fatal("Expected GPU memory utilization to be set")
}
if *req.VLLM.GPUMemoryUtilization != tt.expectedGPUMemUtil {
t.Errorf("Expected GPU memory utilization to be %f, got %f", tt.expectedGPUMemUtil, *req.VLLM.GPUMemoryUtilization)
}
} else {
// GPU memory utilization should NOT be set
if req.VLLM != nil && req.VLLM.GPUMemoryUtilization != nil {
t.Errorf("Expected GPU memory utilization to be nil when not set, got %f", *req.VLLM.GPUMemoryUtilization)
}
}
})
}
}

func TestThinkFlagBehavior(t *testing.T) {
// Helper to create bool pointer
boolPtr := func(b bool) *bool { return &b }
Expand Down
12 changes: 11 additions & 1 deletion cmd/cli/docs/reference/docker_model_configure.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
command: docker model configure
short: Configure runtime options for a model
long: Configure runtime options for a model
usage: docker model configure [--context-size=<n>] [--speculative-draft-model=<model>] [--hf_overrides=<json>] [--mode=<mode>] [--think] MODEL
usage: docker model configure [--context-size=<n>] [--speculative-draft-model=<model>] [--hf_overrides=<json>] [--gpu-memory-utilization=<float>] [--mode=<mode>] [--think] MODEL
pname: docker model
plink: docker_model.yaml
options:
Expand All @@ -14,6 +14,16 @@ options:
experimentalcli: false
kubernetes: false
swarm: false
- option: gpu-memory-utilization
value_type: float64
description: |
fraction of GPU memory to use for the model executor (0.0-1.0) - vLLM only
deprecated: false
hidden: false
experimental: false
experimentalcli: false
kubernetes: false
swarm: false
- option: hf_overrides
value_type: string
description: HuggingFace model config overrides (JSON) - vLLM only
Expand Down
4 changes: 4 additions & 0 deletions pkg/inference/backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ type VLLMConfig struct {
// HFOverrides contains HuggingFace model configuration overrides.
// This maps to vLLM's --hf-overrides flag which accepts a JSON dictionary.
HFOverrides HFOverrides `json:"hf-overrides,omitempty"`
// GPUMemoryUtilization sets the fraction of GPU memory to be used for the model executor.
// Must be between 0.0 and 1.0. If not specified, vLLM uses its default value of 0.9.
// This maps to vLLM's --gpu-memory-utilization flag.
GPUMemoryUtilization *float64 `json:"gpu-memory-utilization,omitempty"`
}

// LlamaCppConfig contains llama.cpp-specific configuration options.
Expand Down
9 changes: 9 additions & 0 deletions pkg/inference/backends/vllm/vllm_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,15 @@ func (c *Config) GetArgs(bundle types.ModelBundle, socket string, mode inference

// Add vLLM-specific arguments from backend config
if config != nil && config.VLLM != nil {
// Add GPU memory utilization if specified
if config.VLLM.GPUMemoryUtilization != nil {
utilization := *config.VLLM.GPUMemoryUtilization
if utilization < 0.0 || utilization > 1.0 {
return nil, fmt.Errorf("gpu-memory-utilization must be between 0.0 and 1.0, got %f", utilization)
}
Comment on lines +66 to +68
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The validation logic for the gpu-memory-utilization range is also present in cmd/cli/commands/configure_flags.go on lines 190-192. This duplication could lead to maintenance challenges, for instance, if the valid range changes. To improve maintainability, consider abstracting this validation into a shared function or using shared constants for the boundaries and error message. While validation at different layers can be useful, centralizing the core logic will make the code easier to manage.

args = append(args, "--gpu-memory-utilization", strconv.FormatFloat(utilization, 'f', -1, 64))
}

// Add HuggingFace overrides if specified
if len(config.VLLM.HFOverrides) > 0 {
hfOverridesJSON, err := json.Marshal(config.VLLM.HFOverrides)
Expand Down
Loading
Loading