diff --git a/cmd/cli/commands/configure.go b/cmd/cli/commands/configure.go
index 6c5da30d..b42910ba 100644
--- a/cmd/cli/commands/configure.go
+++ b/cmd/cli/commands/configure.go
@@ -11,7 +11,7 @@ func newConfigureCmd() *cobra.Command {
 	var flags ConfigureFlags
 
 	c := &cobra.Command{
-		Use:    "configure [--context-size=] [--speculative-draft-model=] [--hf_overrides=] [--mode=] [--think] MODEL",
+		Use:    "configure [--context-size=] [--speculative-draft-model=] [--hf_overrides=] [--gpu-memory-utilization=] [--mode=] [--think] MODEL",
 		Short:  "Configure runtime options for a model",
 		Hidden: true,
 		Args: func(cmd *cobra.Command, args []string) error {
diff --git a/cmd/cli/commands/configure_flags.go b/cmd/cli/commands/configure_flags.go
index abda4ffc..76991679 100644
--- a/cmd/cli/commands/configure_flags.go
+++ b/cmd/cli/commands/configure_flags.go
@@ -84,6 +84,37 @@ func (v *BoolPtrValue) IsBoolFlag() bool {
 	return true
 }
 
+// Float64PtrValue implements pflag.Value interface for *float64 pointers
+// This allows flags to have a nil default value instead of 0.0
+type Float64PtrValue struct {
+	ptr **float64
+}
+
+// NewFloat64PtrValue creates a new Float64PtrValue for the given pointer
+func NewFloat64PtrValue(p **float64) *Float64PtrValue {
+	return &Float64PtrValue{ptr: p}
+}
+
+func (v *Float64PtrValue) String() string {
+	if v.ptr == nil || *v.ptr == nil {
+		return ""
+	}
+	return strconv.FormatFloat(**v.ptr, 'f', -1, 64)
+}
+
+func (v *Float64PtrValue) Set(s string) error {
+	val, err := strconv.ParseFloat(s, 64)
+	if err != nil {
+		return err
+	}
+	*v.ptr = &val
+	return nil
+}
+
+func (v *Float64PtrValue) Type() string {
+	return "float64"
+}
+
 // ptr is a helper function to create a pointer to int32
 func ptr(v int32) *int32 {
 	return &v
@@ -100,7 +131,8 @@ type ConfigureFlags struct {
 	NumTokens         int
 	MinAcceptanceRate float64
 	// vLLM-specific flags
-	HFOverrides string
+	HFOverrides          string
+	GPUMemoryUtilization *float64
 	// Think parameter for reasoning models
 	Think *bool
 }
@@ -112,6 +144,7 @@ func (f *ConfigureFlags) RegisterFlags(cmd *cobra.Command) {
 	cmd.Flags().IntVar(&f.NumTokens, "speculative-num-tokens", 0, "number of tokens to predict speculatively")
 	cmd.Flags().Float64Var(&f.MinAcceptanceRate, "speculative-min-acceptance-rate", 0, "minimum acceptance rate for speculative decoding")
 	cmd.Flags().StringVar(&f.HFOverrides, "hf_overrides", "", "HuggingFace model config overrides (JSON) - vLLM only")
+	cmd.Flags().Var(NewFloat64PtrValue(&f.GPUMemoryUtilization), "gpu-memory-utilization", "fraction of GPU memory to use for the model executor (0.0-1.0) - vLLM only")
 	cmd.Flags().Var(NewBoolPtrValue(&f.Think), "think", "enable reasoning mode for thinking models")
 	cmd.Flags().StringVar(&f.Mode, "mode", "", "backend operation mode (completion, embedding, reranking)")
 }
@@ -151,6 +184,18 @@ func (f *ConfigureFlags) BuildConfigureRequest(model string) (scheduling.Configu
 		req.VLLM.HFOverrides = hfo
 	}
 
+	// Set GPU memory utilization if provided (vLLM-specific)
+	if f.GPUMemoryUtilization != nil {
+		utilization := *f.GPUMemoryUtilization
+		if utilization < 0.0 || utilization > 1.0 {
+			return req, fmt.Errorf("--gpu-memory-utilization must be between 0.0 and 1.0, got %f", utilization)
+		}
+		if req.VLLM == nil {
+			req.VLLM = &inference.VLLMConfig{}
+		}
+		req.VLLM.GPUMemoryUtilization = f.GPUMemoryUtilization
+	}
+
 	// Set reasoning budget from --think flag
 	reasoningBudget := f.getReasoningBudget()
 	if reasoningBudget != nil {
diff --git a/cmd/cli/commands/configure_test.go b/cmd/cli/commands/configure_test.go
index 99c238e4..40a705a1 100644
--- a/cmd/cli/commands/configure_test.go
+++ b/cmd/cli/commands/configure_test.go
@@ -128,6 +128,126 @@ func TestConfigureCmdThinkFlag(t *testing.T) {
 	}
 }
 
+func TestConfigureCmdGPUMemoryUtilizationFlag(t *testing.T) {
+	// Create the configure command
+	cmd := newConfigureCmd()
+
+	// Verify the --gpu-memory-utilization flag exists
+	gpuMemFlag := cmd.Flags().Lookup("gpu-memory-utilization")
+	if gpuMemFlag == nil {
+		t.Fatal("--gpu-memory-utilization flag not found")
+	}
+
+	// Verify the default value is empty (nil pointer)
+	if gpuMemFlag.DefValue != "" {
+		t.Errorf("Expected default gpu-memory-utilization value to be '' (nil), got '%s'", gpuMemFlag.DefValue)
+	}
+
+	// Verify the flag type
+	if gpuMemFlag.Value.Type() != "float64" {
+		t.Errorf("Expected gpu-memory-utilization flag type to be 'float64', got '%s'", gpuMemFlag.Value.Type())
+	}
+
+	// Test setting the flag value
+	err := cmd.Flags().Set("gpu-memory-utilization", "0.7")
+	if err != nil {
+		t.Errorf("Failed to set gpu-memory-utilization flag: %v", err)
+	}
+
+	// Verify the value was set
+	gpuMemValue := gpuMemFlag.Value.String()
+	if gpuMemValue != "0.7" {
+		t.Errorf("Expected gpu-memory-utilization flag value to be '0.7', got '%s'", gpuMemValue)
+	}
+}
+
+func TestGPUMemoryUtilizationBehavior(t *testing.T) {
+	// Helper to create float64 pointer
+	float64Ptr := func(f float64) *float64 { return &f }
+
+	tests := []struct {
+		name               string
+		gpuMemValue        *float64
+		expectError        bool
+		expectGPUMemSet    bool
+		expectedGPUMemUtil float64
+	}{
+		{
+			name:            "default - not set (nil)",
+			gpuMemValue:     nil,
+			expectError:     false,
+			expectGPUMemSet: false,
+		},
+		{
+			name:               "valid value 0.5",
+			gpuMemValue:        float64Ptr(0.5),
+			expectError:        false,
+			expectGPUMemSet:    true,
+			expectedGPUMemUtil: 0.5,
+		},
+		{
+			name:               "edge case 0.0",
+			gpuMemValue:        float64Ptr(0.0),
+			expectError:        false,
+			expectGPUMemSet:    true,
+			expectedGPUMemUtil: 0.0,
+		},
+		{
+			name:               "edge case 1.0",
+			gpuMemValue:        float64Ptr(1.0),
+			expectError:        false,
+			expectGPUMemSet:    true,
+			expectedGPUMemUtil: 1.0,
+		},
+		{
+			name:        "invalid - negative value",
+			gpuMemValue: float64Ptr(-0.1),
+			expectError: true,
+		},
+		{
+			name:        "invalid - value > 1.0",
+			gpuMemValue: float64Ptr(1.5),
+			expectError: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			flags := ConfigureFlags{
+				GPUMemoryUtilization: tt.gpuMemValue,
+			}
+
+			req, err := flags.BuildConfigureRequest("test-model")
+
+			if tt.expectError {
+				if err == nil {
+					t.Fatal("Expected error but got none")
+				}
+				return
+			}
+
+			if err != nil {
+				t.Fatalf("Unexpected error: %v", err)
+			}
+
+			if tt.expectGPUMemSet {
+				// GPU memory utilization should be set
+				if req.VLLM == nil || req.VLLM.GPUMemoryUtilization == nil {
+					t.Fatal("Expected GPU memory utilization to be set")
+				}
+				if *req.VLLM.GPUMemoryUtilization != tt.expectedGPUMemUtil {
+					t.Errorf("Expected GPU memory utilization to be %f, got %f", tt.expectedGPUMemUtil, *req.VLLM.GPUMemoryUtilization)
+				}
+			} else {
+				// GPU memory utilization should NOT be set
+				if req.VLLM != nil && req.VLLM.GPUMemoryUtilization != nil {
+					t.Errorf("Expected GPU memory utilization to be nil when not set, got %f", *req.VLLM.GPUMemoryUtilization)
+				}
+			}
+		})
+	}
+}
+
 func TestThinkFlagBehavior(t *testing.T) {
 	// Helper to create bool pointer
 	boolPtr := func(b bool) *bool { return &b }
diff --git a/cmd/cli/docs/reference/docker_model_configure.yaml b/cmd/cli/docs/reference/docker_model_configure.yaml
index f82527c9..4f941717 100644
--- a/cmd/cli/docs/reference/docker_model_configure.yaml
+++ b/cmd/cli/docs/reference/docker_model_configure.yaml
@@ -1,7 +1,7 @@
 command: docker model configure
 short: Configure runtime options for a model
 long: Configure runtime options for a model
-usage: docker model configure [--context-size=] [--speculative-draft-model=] [--hf_overrides=] [--mode=] [--think] MODEL
+usage: docker model configure [--context-size=] [--speculative-draft-model=] [--hf_overrides=] [--gpu-memory-utilization=] [--mode=] [--think] MODEL
 pname: docker model
 plink: docker_model.yaml
 options:
@@ -14,6 +14,16 @@ options:
     experimentalcli: false
     kubernetes: false
     swarm: false
+  - option: gpu-memory-utilization
+    value_type: float64
+    description: |
+        fraction of GPU memory to use for the model executor (0.0-1.0) - vLLM only
+    deprecated: false
+    hidden: false
+    experimental: false
+    experimentalcli: false
+    kubernetes: false
+    swarm: false
   - option: hf_overrides
     value_type: string
     description: HuggingFace model config overrides (JSON) - vLLM only
diff --git a/pkg/inference/backend.go b/pkg/inference/backend.go
index c4858b5d..4ab1ad79 100644
--- a/pkg/inference/backend.go
+++ b/pkg/inference/backend.go
@@ -67,6 +67,10 @@ type VLLMConfig struct {
 	// HFOverrides contains HuggingFace model configuration overrides.
 	// This maps to vLLM's --hf-overrides flag which accepts a JSON dictionary.
 	HFOverrides HFOverrides `json:"hf-overrides,omitempty"`
+	// GPUMemoryUtilization sets the fraction of GPU memory to be used for the model executor.
+	// Must be between 0.0 and 1.0. If not specified, vLLM uses its default value of 0.9.
+	// This maps to vLLM's --gpu-memory-utilization flag.
+	GPUMemoryUtilization *float64 `json:"gpu-memory-utilization,omitempty"`
 }
 
 // LlamaCppConfig contains llama.cpp-specific configuration options.
diff --git a/pkg/inference/backends/vllm/vllm_config.go b/pkg/inference/backends/vllm/vllm_config.go
index d1e692ae..92bb6fb4 100644
--- a/pkg/inference/backends/vllm/vllm_config.go
+++ b/pkg/inference/backends/vllm/vllm_config.go
@@ -60,6 +60,15 @@ func (c *Config) GetArgs(bundle types.ModelBundle, socket string, mode inference
 
 	// Add vLLM-specific arguments from backend config
 	if config != nil && config.VLLM != nil {
+		// Add GPU memory utilization if specified
+		if config.VLLM.GPUMemoryUtilization != nil {
+			utilization := *config.VLLM.GPUMemoryUtilization
+			if utilization < 0.0 || utilization > 1.0 {
+				return nil, fmt.Errorf("gpu-memory-utilization must be between 0.0 and 1.0, got %f", utilization)
+			}
+			args = append(args, "--gpu-memory-utilization", strconv.FormatFloat(utilization, 'f', -1, 64))
+		}
+
 		// Add HuggingFace overrides if specified
 		if len(config.VLLM.HFOverrides) > 0 {
 			hfOverridesJSON, err := json.Marshal(config.VLLM.HFOverrides)
diff --git a/pkg/inference/backends/vllm/vllm_config_test.go b/pkg/inference/backends/vllm/vllm_config_test.go
index afdd3196..e4538293 100644
--- a/pkg/inference/backends/vllm/vllm_config_test.go
+++ b/pkg/inference/backends/vllm/vllm_config_test.go
@@ -203,6 +203,135 @@ func TestGetArgs(t *testing.T) {
 				`{"model_type":"bert"}`,
 			},
 		},
+		{
+			name: "with GPU memory utilization 0.5",
+			bundle: &mockModelBundle{
+				safetensorsPath: "/path/to/model",
+			},
+			config: &inference.BackendConfiguration{
+				VLLM: &inference.VLLMConfig{
+					GPUMemoryUtilization: float64ptr(0.5),
+				},
+			},
+			expected: []string{
+				"serve",
+				"/path/to",
+				"--uds",
+				"/tmp/socket",
+				"--gpu-memory-utilization",
+				"0.5",
+			},
+		},
+		{
+			name: "with GPU memory utilization 0.0 (edge case)",
+			bundle: &mockModelBundle{
+				safetensorsPath: "/path/to/model",
+			},
+			config: &inference.BackendConfiguration{
+				VLLM: &inference.VLLMConfig{
+					GPUMemoryUtilization: float64ptr(0.0),
+				},
+			},
+			expected: []string{
+				"serve",
+				"/path/to",
+				"--uds",
+				"/tmp/socket",
+				"--gpu-memory-utilization",
+				"0",
+			},
+		},
+		{
+			name: "with GPU memory utilization 1.0 (edge case)",
+			bundle: &mockModelBundle{
+				safetensorsPath: "/path/to/model",
+			},
+			config: &inference.BackendConfiguration{
+				VLLM: &inference.VLLMConfig{
+					GPUMemoryUtilization: float64ptr(1.0),
+				},
+			},
+			expected: []string{
+				"serve",
+				"/path/to",
+				"--uds",
+				"/tmp/socket",
+				"--gpu-memory-utilization",
+				"1",
+			},
+		},
+		{
+			name: "with GPU memory utilization negative (invalid)",
+			bundle: &mockModelBundle{
+				safetensorsPath: "/path/to/model",
+			},
+			config: &inference.BackendConfiguration{
+				VLLM: &inference.VLLMConfig{
+					GPUMemoryUtilization: float64ptr(-0.1),
+				},
+			},
+			expectError: true,
+		},
+		{
+			name: "with GPU memory utilization > 1.0 (invalid)",
+			bundle: &mockModelBundle{
+				safetensorsPath: "/path/to/model",
+			},
+			config: &inference.BackendConfiguration{
+				VLLM: &inference.VLLMConfig{
+					GPUMemoryUtilization: float64ptr(1.5),
+				},
+			},
+			expectError: true,
+		},
+		{
+			name: "with GPU memory utilization and other parameters",
+			bundle: &mockModelBundle{
+				safetensorsPath: "/path/to/model",
+			},
+			config: &inference.BackendConfiguration{
+				ContextSize: int32ptr(8192),
+				VLLM: &inference.VLLMConfig{
+					GPUMemoryUtilization: float64ptr(0.7),
+					HFOverrides: inference.HFOverrides{
+						"architectures": []interface{}{"LlamaForCausalLM"},
+					},
+				},
+			},
+			expected: []string{
+				"serve",
+				"/path/to",
+				"--uds",
+				"/tmp/socket",
+				"--max-model-len",
+				"8192",
+				"--gpu-memory-utilization",
+				"0.7",
+				"--hf-overrides",
+				`{"architectures":["LlamaForCausalLM"]}`,
+			},
+		},
+		{
+			name: "without GPU memory utilization (should not add flag)",
+			bundle: &mockModelBundle{
+				safetensorsPath: "/path/to/model",
+			},
+			config: &inference.BackendConfiguration{
+				VLLM: &inference.VLLMConfig{
+					HFOverrides: inference.HFOverrides{
+						"model_type": "llama",
+					},
+				},
+			},
+			expected: []string{
+				"serve",
+				"/path/to",
+				"--uds",
+				"/tmp/socket",
+				"--hf-overrides",
+				`{"model_type":"llama"}`,
+			},
+		},
 	}
 
 	for _, tt := range tests {
@@ -290,3 +419,7 @@ func TestGetMaxModelLen(t *testing.T) {
 func int32ptr(n int32) *int32 {
 	return &n
 }
+
+func float64ptr(n float64) *float64 {
+	return &n
+}