Commit 90e631f (1 parent: ff48957)

add GPU memory utilization configuration for model executor

6 files changed: 313 additions, 2 deletions

cmd/cli/commands/configure.go (1 addition, 1 deletion)

@@ -11,7 +11,7 @@ func newConfigureCmd() *cobra.Command {
     var flags ConfigureFlags

     c := &cobra.Command{
-        Use:    "configure [--context-size=<n>] [--speculative-draft-model=<model>] [--hf_overrides=<json>] [--mode=<mode>] [--think] MODEL",
+        Use:    "configure [--context-size=<n>] [--speculative-draft-model=<model>] [--hf_overrides=<json>] [--gpu-memory-utilization=<float>] [--mode=<mode>] [--think] MODEL",
         Short:  "Configure runtime options for a model",
         Hidden: true,
         Args: func(cmd *cobra.Command, args []string) error {
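For orientation, a usage sketch of the extended subcommand; the `<cli>` binary name and model name below are placeholders, since the commit only defines the `configure` subcommand and its flags:

```sh
# Cap vLLM at 70% of GPU memory for this model (placeholder binary and model names).
<cli> configure --gpu-memory-utilization=0.7 my-org/my-model

# Omitting the flag leaves the setting unset, so vLLM falls back to its own default of 0.9.
<cli> configure my-org/my-model
```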

cmd/cli/commands/configure_flags.go (46 additions, 1 deletion)

@@ -84,6 +84,37 @@ func (v *BoolPtrValue) IsBoolFlag() bool {
     return true
 }

+// Float64PtrValue implements pflag.Value interface for *float64 pointers
+// This allows flags to have a nil default value instead of 0.0
+type Float64PtrValue struct {
+    ptr **float64
+}
+
+// NewFloat64PtrValue creates a new Float64PtrValue for the given pointer
+func NewFloat64PtrValue(p **float64) *Float64PtrValue {
+    return &Float64PtrValue{ptr: p}
+}
+
+func (v *Float64PtrValue) String() string {
+    if v.ptr == nil || *v.ptr == nil {
+        return ""
+    }
+    return strconv.FormatFloat(**v.ptr, 'f', -1, 64)
+}
+
+func (v *Float64PtrValue) Set(s string) error {
+    val, err := strconv.ParseFloat(s, 64)
+    if err != nil {
+        return err
+    }
+    *v.ptr = &val
+    return nil
+}
+
+func (v *Float64PtrValue) Type() string {
+    return "float64"
+}
+
 // ptr is a helper function to create a pointer to int32
 func ptr(v int32) *int32 {
     return &v

@@ -100,7 +131,8 @@ type ConfigureFlags struct {
     NumTokens         int
     MinAcceptanceRate float64
     // vLLM-specific flags
-    HFOverrides string
+    HFOverrides          string
+    GPUMemoryUtilization *float64
     // Think parameter for reasoning models
     Think *bool
 }

@@ -112,6 +144,7 @@ func (f *ConfigureFlags) RegisterFlags(cmd *cobra.Command) {
     cmd.Flags().IntVar(&f.NumTokens, "speculative-num-tokens", 0, "number of tokens to predict speculatively")
     cmd.Flags().Float64Var(&f.MinAcceptanceRate, "speculative-min-acceptance-rate", 0, "minimum acceptance rate for speculative decoding")
     cmd.Flags().StringVar(&f.HFOverrides, "hf_overrides", "", "HuggingFace model config overrides (JSON) - vLLM only")
+    cmd.Flags().Var(NewFloat64PtrValue(&f.GPUMemoryUtilization), "gpu-memory-utilization", "fraction of GPU memory to use for the model executor (0.0-1.0) - vLLM only")
     cmd.Flags().Var(NewBoolPtrValue(&f.Think), "think", "enable reasoning mode for thinking models")
     cmd.Flags().StringVar(&f.Mode, "mode", "", "backend operation mode (completion, embedding, reranking)")
 }

@@ -151,6 +184,18 @@ func (f *ConfigureFlags) BuildConfigureRequest(model string) (scheduling.Configu
         req.VLLM.HFOverrides = hfo
     }

+    // Set GPU memory utilization if provided (vLLM-specific)
+    if f.GPUMemoryUtilization != nil {
+        utilization := *f.GPUMemoryUtilization
+        if utilization < 0.0 || utilization > 1.0 {
+            return req, fmt.Errorf("--gpu-memory-utilization must be between 0.0 and 1.0, got %f", utilization)
+        }
+        if req.VLLM == nil {
+            req.VLLM = &inference.VLLMConfig{}
+        }
+        req.VLLM.GPUMemoryUtilization = f.GPUMemoryUtilization
+    }
+
     // Set reasoning budget from --think flag
     reasoningBudget := f.getReasoningBudget()
     if reasoningBudget != nil {
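The point of the `**float64` indirection above is that "flag not provided" stays distinguishable from an explicit `0.0`. A minimal, self-contained sketch of the same pattern against `spf13/pflag` directly; the flag-set name and demo arguments are invented for illustration:

```go
package main

import (
	"fmt"
	"strconv"

	"github.com/spf13/pflag"
)

// float64PtrValue mirrors the commit's Float64PtrValue: it writes through a
// **float64, so an unset flag leaves the target pointer nil instead of 0.0.
type float64PtrValue struct{ ptr **float64 }

func (v *float64PtrValue) String() string {
	if v.ptr == nil || *v.ptr == nil {
		return "" // unset: pflag reports an empty default
	}
	return strconv.FormatFloat(**v.ptr, 'f', -1, 64)
}

func (v *float64PtrValue) Set(s string) error {
	f, err := strconv.ParseFloat(s, 64)
	if err != nil {
		return err
	}
	*v.ptr = &f
	return nil
}

func (v *float64PtrValue) Type() string { return "float64" }

func main() {
	var util *float64
	fs := pflag.NewFlagSet("demo", pflag.ContinueOnError)
	fs.Var(&float64PtrValue{ptr: &util}, "gpu-memory-utilization", "fraction of GPU memory (0.0-1.0)")

	_ = fs.Parse([]string{}) // flag absent
	fmt.Println(util == nil) // true: absence is observable, not conflated with 0.0

	_ = fs.Parse([]string{"--gpu-memory-utilization=0.7"})
	fmt.Println(*util) // 0.7
}
```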

cmd/cli/commands/configure_test.go (120 additions, 0 deletions)

@@ -128,6 +128,126 @@ func TestConfigureCmdThinkFlag(t *testing.T) {
     }
 }

+func TestConfigureCmdGPUMemoryUtilizationFlag(t *testing.T) {
+    // Create the configure command
+    cmd := newConfigureCmd()
+
+    // Verify the --gpu-memory-utilization flag exists
+    gpuMemFlag := cmd.Flags().Lookup("gpu-memory-utilization")
+    if gpuMemFlag == nil {
+        t.Fatal("--gpu-memory-utilization flag not found")
+    }
+
+    // Verify the default value is empty (nil pointer)
+    if gpuMemFlag.DefValue != "" {
+        t.Errorf("Expected default gpu-memory-utilization value to be '' (nil), got '%s'", gpuMemFlag.DefValue)
+    }
+
+    // Verify the flag type
+    if gpuMemFlag.Value.Type() != "float64" {
+        t.Errorf("Expected gpu-memory-utilization flag type to be 'float64', got '%s'", gpuMemFlag.Value.Type())
+    }
+
+    // Test setting the flag value
+    err := cmd.Flags().Set("gpu-memory-utilization", "0.7")
+    if err != nil {
+        t.Errorf("Failed to set gpu-memory-utilization flag: %v", err)
+    }
+
+    // Verify the value was set
+    gpuMemValue := gpuMemFlag.Value.String()
+    if gpuMemValue != "0.7" {
+        t.Errorf("Expected gpu-memory-utilization flag value to be '0.7', got '%s'", gpuMemValue)
+    }
+}
+
+func TestGPUMemoryUtilizationBehavior(t *testing.T) {
+    // Helper to create float64 pointer
+    float64Ptr := func(f float64) *float64 { return &f }
+
+    tests := []struct {
+        name               string
+        gpuMemValue        *float64
+        expectError        bool
+        expectGPUMemSet    bool
+        expectedGPUMemUtil float64
+    }{
+        {
+            name:            "default - not set (nil)",
+            gpuMemValue:     nil,
+            expectError:     false,
+            expectGPUMemSet: false,
+        },
+        {
+            name:               "valid value 0.5",
+            gpuMemValue:        float64Ptr(0.5),
+            expectError:        false,
+            expectGPUMemSet:    true,
+            expectedGPUMemUtil: 0.5,
+        },
+        {
+            name:               "edge case 0.0",
+            gpuMemValue:        float64Ptr(0.0),
+            expectError:        false,
+            expectGPUMemSet:    true,
+            expectedGPUMemUtil: 0.0,
+        },
+        {
+            name:               "edge case 1.0",
+            gpuMemValue:        float64Ptr(1.0),
+            expectError:        false,
+            expectGPUMemSet:    true,
+            expectedGPUMemUtil: 1.0,
+        },
+        {
+            name:        "invalid - negative value",
+            gpuMemValue: float64Ptr(-0.1),
+            expectError: true,
+        },
+        {
+            name:        "invalid - value > 1.0",
+            gpuMemValue: float64Ptr(1.5),
+            expectError: true,
+        },
+    }
+
+    for _, tt := range tests {
+        t.Run(tt.name, func(t *testing.T) {
+            flags := ConfigureFlags{
+                GPUMemoryUtilization: tt.gpuMemValue,
+            }
+
+            req, err := flags.BuildConfigureRequest("test-model")
+
+            if tt.expectError {
+                if err == nil {
+                    t.Fatal("Expected error but got none")
+                }
+                return
+            }
+
+            if err != nil {
+                t.Fatalf("Unexpected error: %v", err)
+            }
+
+            if tt.expectGPUMemSet {
+                // GPU memory utilization should be set
+                if req.VLLM == nil || req.VLLM.GPUMemoryUtilization == nil {
+                    t.Fatal("Expected GPU memory utilization to be set")
+                }
+                if *req.VLLM.GPUMemoryUtilization != tt.expectedGPUMemUtil {
+                    t.Errorf("Expected GPU memory utilization to be %f, got %f", tt.expectedGPUMemUtil, *req.VLLM.GPUMemoryUtilization)
+                }
+            } else {
+                // GPU memory utilization should NOT be set
+                if req.VLLM != nil && req.VLLM.GPUMemoryUtilization != nil {
+                    t.Errorf("Expected GPU memory utilization to be nil when not set, got %f", *req.VLLM.GPUMemoryUtilization)
+                }
+            }
+        })
+    }
+}
+
 func TestThinkFlagBehavior(t *testing.T) {
     // Helper to create bool pointer
     boolPtr := func(b bool) *bool { return &b }

pkg/inference/backend.go (4 additions, 0 deletions)

@@ -67,6 +67,10 @@ type VLLMConfig struct {
     // HFOverrides contains HuggingFace model configuration overrides.
     // This maps to vLLM's --hf-overrides flag which accepts a JSON dictionary.
     HFOverrides HFOverrides `json:"hf-overrides,omitempty"`
+    // GPUMemoryUtilization sets the fraction of GPU memory to be used for the model executor.
+    // Must be between 0.0 and 1.0. If not specified, vLLM uses its default value of 0.9.
+    // This maps to vLLM's --gpu-memory-utilization flag.
+    GPUMemoryUtilization *float64 `json:"gpu-memory-utilization,omitempty"`
 }

 // LlamaCppConfig contains llama.cpp-specific configuration options.
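To see why the field is a pointer with `omitempty`, here is a small sketch, with the struct trimmed to the one field: a nil pointer drops the key from the serialized config so vLLM's own default applies downstream, while an explicit `0.0`, which the validation range permits, still round-trips:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// vllmConfig is a trimmed stand-in for the VLLMConfig field added above.
type vllmConfig struct {
	GPUMemoryUtilization *float64 `json:"gpu-memory-utilization,omitempty"`
}

func main() {
	// nil pointer: the key is omitted entirely, so vLLM's default (0.9) applies.
	unset, _ := json.Marshal(vllmConfig{})
	fmt.Println(string(unset)) // {}

	// An explicit 0.0 is a non-nil pointer, so it survives omitempty.
	z := 0.0
	zero, _ := json.Marshal(vllmConfig{GPUMemoryUtilization: &z})
	fmt.Println(string(zero)) // {"gpu-memory-utilization":0}

	u := 0.7
	set, _ := json.Marshal(vllmConfig{GPUMemoryUtilization: &u})
	fmt.Println(string(set)) // {"gpu-memory-utilization":0.7}
}
```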

pkg/inference/backends/vllm/vllm_config.go (9 additions, 0 deletions)

@@ -60,6 +60,15 @@ func (c *Config) GetArgs(bundle types.ModelBundle, socket string, mode inference

     // Add vLLM-specific arguments from backend config
     if config != nil && config.VLLM != nil {
+        // Add GPU memory utilization if specified
+        if config.VLLM.GPUMemoryUtilization != nil {
+            utilization := *config.VLLM.GPUMemoryUtilization
+            if utilization < 0.0 || utilization > 1.0 {
+                return nil, fmt.Errorf("gpu-memory-utilization must be between 0.0 and 1.0, got %f", utilization)
+            }
+            args = append(args, "--gpu-memory-utilization", strconv.FormatFloat(utilization, 'f', -1, 64))
+        }
+
         // Add HuggingFace overrides if specified
         if len(config.VLLM.HFOverrides) > 0 {
             hfOverridesJSON, err := json.Marshal(config.VLLM.HFOverrides)
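The argument is rendered with `strconv.FormatFloat(utilization, 'f', -1, 64)`; precision -1 means the fewest digits that still round-trip to the same float64, which is why the tests below expect "0.5", "0", and "1" rather than zero-padded forms. A quick runnable check:

```go
package main

import (
	"fmt"
	"strconv"
)

func main() {
	// Precision -1: shortest decimal form that parses back to the same float64.
	for _, u := range []float64{0.5, 0.0, 1.0, 0.7} {
		fmt.Println(strconv.FormatFloat(u, 'f', -1, 64))
	}
	// Prints: 0.5, 0, 1, 0.7 (one per line)
}
```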

pkg/inference/backends/vllm/vllm_config_test.go (133 additions, 0 deletions)

@@ -203,6 +203,135 @@ func TestGetArgs(t *testing.T) {
                 `{"model_type":"bert"}`,
             },
         },
+        {
+            name: "with GPU memory utilization 0.5",
+            bundle: &mockModelBundle{
+                safetensorsPath: "/path/to/model",
+            },
+            config: &inference.BackendConfiguration{
+                VLLM: &inference.VLLMConfig{
+                    GPUMemoryUtilization: float64ptr(0.5),
+                },
+            },
+            expected: []string{
+                "serve",
+                "/path/to",
+                "--uds",
+                "/tmp/socket",
+                "--gpu-memory-utilization",
+                "0.5",
+            },
+        },
+        {
+            name: "with GPU memory utilization 0.0 (edge case)",
+            bundle: &mockModelBundle{
+                safetensorsPath: "/path/to/model",
+            },
+            config: &inference.BackendConfiguration{
+                VLLM: &inference.VLLMConfig{
+                    GPUMemoryUtilization: float64ptr(0.0),
+                },
+            },
+            expected: []string{
+                "serve",
+                "/path/to",
+                "--uds",
+                "/tmp/socket",
+                "--gpu-memory-utilization",
+                "0",
+            },
+        },
+        {
+            name: "with GPU memory utilization 1.0 (edge case)",
+            bundle: &mockModelBundle{
+                safetensorsPath: "/path/to/model",
+            },
+            config: &inference.BackendConfiguration{
+                VLLM: &inference.VLLMConfig{
+                    GPUMemoryUtilization: float64ptr(1.0),
+                },
+            },
+            expected: []string{
+                "serve",
+                "/path/to",
+                "--uds",
+                "/tmp/socket",
+                "--gpu-memory-utilization",
+                "1",
+            },
+        },
+        {
+            name: "with GPU memory utilization negative (invalid)",
+            bundle: &mockModelBundle{
+                safetensorsPath: "/path/to/model",
+            },
+            config: &inference.BackendConfiguration{
+                VLLM: &inference.VLLMConfig{
+                    GPUMemoryUtilization: float64ptr(-0.1),
+                },
+            },
+            expectError: true,
+        },
+        {
+            name: "with GPU memory utilization > 1.0 (invalid)",
+            bundle: &mockModelBundle{
+                safetensorsPath: "/path/to/model",
+            },
+            config: &inference.BackendConfiguration{
+                VLLM: &inference.VLLMConfig{
+                    GPUMemoryUtilization: float64ptr(1.5),
+                },
+            },
+            expectError: true,
+        },
+        {
+            name: "with GPU memory utilization and other parameters",
+            bundle: &mockModelBundle{
+                safetensorsPath: "/path/to/model",
+            },
+            config: &inference.BackendConfiguration{
+                ContextSize: int32ptr(8192),
+                VLLM: &inference.VLLMConfig{
+                    GPUMemoryUtilization: float64ptr(0.7),
+                    HFOverrides: inference.HFOverrides{
+                        "architectures": []interface{}{"LlamaForCausalLM"},
+                    },
+                },
+            },
+            expected: []string{
+                "serve",
+                "/path/to",
+                "--uds",
+                "/tmp/socket",
+                "--max-model-len",
+                "8192",
+                "--gpu-memory-utilization",
+                "0.7",
+                "--hf-overrides",
+                `{"architectures":["LlamaForCausalLM"]}`,
+            },
+        },
+        {
+            name: "without GPU memory utilization (should not add flag)",
+            bundle: &mockModelBundle{
+                safetensorsPath: "/path/to/model",
+            },
+            config: &inference.BackendConfiguration{
+                VLLM: &inference.VLLMConfig{
+                    HFOverrides: inference.HFOverrides{
+                        "model_type": "llama",
+                    },
+                },
+            },
+            expected: []string{
+                "serve",
+                "/path/to",
+                "--uds",
+                "/tmp/socket",
+                "--hf-overrides",
+                `{"model_type":"llama"}`,
+            },
+        },
     }

     for _, tt := range tests {

@@ -290,3 +419,7 @@ func TestGetMaxModelLen(t *testing.T) {
 func int32ptr(n int32) *int32 {
     return &n
 }
+
+func float64ptr(n float64) *float64 {
+    return &n
+}
