
Commit bf8d9e7

feat(vllm): enhance argument handling for vLLM backend configuration
1 parent 0bb2f02 commit bf8d9e7

File tree

3 files changed: +271 -8 lines changed

pkg/inference/backends/vllm/vllm.go

Lines changed: 8 additions & 7 deletions
@@ -93,7 +93,7 @@ func (v *vLLM) Install(_ context.Context, _ *http.Client) error {
 	return nil
 }
 
-func (v *vLLM) Run(ctx context.Context, socket, model string, modelRef string, _ inference.BackendMode, _ *inference.BackendConfiguration) error {
+func (v *vLLM) Run(ctx context.Context, socket, model string, modelRef string, mode inference.BackendMode, backendConfig *inference.BackendConfiguration) error {
 	if !platform.SupportsVLLM() {
 		v.log.Warn("vLLM backend is not yet supported")
 		return errors.New("not implemented")
@@ -109,13 +109,14 @@ func (v *vLLM) Run(ctx context.Context, socket, model string, modelRef string, _
 		v.log.Warnln("vLLM may not be able to start")
 	}
 
-	args := []string{
-		"serve",
-		filepath.Dir(bundle.SafetensorsPath()),
-		"--uds", socket,
-		"--served-model-name", modelRef,
+	// Get arguments from config
+	args, err := v.config.GetArgs(bundle, socket, mode, backendConfig)
+	if err != nil {
+		return fmt.Errorf("failed to get vLLM arguments: %w", err)
 	}
-	// TODO: Add inference.BackendConfiguration.
+
+	// Add served model name
+	args = append(args, "--served-model-name", modelRef)
 
 	v.log.Infof("vLLM args: %v", args)
 	tailBuf := tailbuffer.NewTailBuffer(1024)
Lines changed: 72 additions & 1 deletion
@@ -1,10 +1,81 @@
 package vllm
 
+import (
+	"fmt"
+	"strconv"
+
+	"github.com/docker/model-runner/pkg/distribution/types"
+	"github.com/docker/model-runner/pkg/inference"
+)
+
 // Config is the configuration for the vLLM backend.
 type Config struct {
+	// Args are the base arguments that are always included.
+	Args []string
 }
 
 // NewDefaultVLLMConfig creates a new VLLMConfig with default values.
 func NewDefaultVLLMConfig() *Config {
-	return &Config{}
+	return &Config{
+		Args: []string{},
+	}
+}
+
+// GetArgs implements BackendConfig.GetArgs.
+func (c *Config) GetArgs(bundle types.ModelBundle, socket string, mode inference.BackendMode, config *inference.BackendConfiguration) ([]string, error) {
+	// Start with the arguments from VLLMConfig
+	args := append([]string{}, c.Args...)
+
+	// Add the serve command and model path (use directory for safetensors)
+	modelPath := bundle.SafetensorsPath()
+	if modelPath != "" {
+		// vLLM expects the directory containing the safetensors files
+		args = append(args, "serve", modelPath)
+	} else {
+		return nil, fmt.Errorf("safetensors path required by vLLM backend")
+	}
+
+	// Add socket arguments
+	args = append(args, "--uds", socket)
+
+	// Add mode-specific arguments
+	switch mode {
+	case inference.BackendModeCompletion:
+		// Default mode for vLLM
+	case inference.BackendModeEmbedding:
+		// vLLM doesn't have a specific embedding flag like llama.cpp
+		// Embedding models are detected automatically
+	default:
+		return nil, fmt.Errorf("unsupported backend mode %q", mode)
+	}
+
+	// Add max-model-len if specified in model config or backend config
+	if maxLen := GetMaxModelLen(bundle.RuntimeConfig(), config); maxLen != nil {
+		args = append(args, "--max-model-len", strconv.FormatUint(*maxLen, 10))
+	}
+	// If nil, vLLM will automatically derive from the model config
+
+	// Add arguments from backend config
+	if config != nil {
+		args = append(args, config.RuntimeFlags...)
+	}
+
+	return args, nil
+}
+
+// GetMaxModelLen returns the max model length (context size) from model config or backend config.
+// Model config takes precedence over backend config.
+// Returns nil if neither is specified (vLLM will auto-derive from model).
+func GetMaxModelLen(modelCfg types.Config, backendCfg *inference.BackendConfiguration) *uint64 {
+	// Model config takes precedence
+	if modelCfg.ContextSize != nil {
+		return modelCfg.ContextSize
+	}
+	// else use backend config
+	if backendCfg != nil && backendCfg.ContextSize > 0 {
+		val := uint64(backendCfg.ContextSize)
+		return &val
+	}
+	// Return nil to let vLLM auto-derive from model config
+	return nil
 }
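
Taken together, GetArgs and GetMaxModelLen give the Run path a single place to resolve the vLLM command line. Below is a minimal usage sketch, not part of the commit: the bundleStub type, paths, and flag values are hypothetical, the vllm import path is inferred from the file layout above, and the stub assumes types.ModelBundle exposes the same methods as the test mock that follows.

package main

import (
	"fmt"

	"github.com/docker/model-runner/pkg/distribution/types"
	"github.com/docker/model-runner/pkg/inference"
	"github.com/docker/model-runner/pkg/inference/backends/vllm"
)

// bundleStub is a hypothetical stand-in for a real model bundle.
type bundleStub struct{}

func (bundleStub) GGUFPath() string            { return "" }
func (bundleStub) SafetensorsPath() string     { return "/models/example" }
func (bundleStub) ChatTemplatePath() string    { return "" }
func (bundleStub) MMPROJPath() string          { return "" }
func (bundleStub) RuntimeConfig() types.Config { return types.Config{} } // no model-level ContextSize
func (bundleStub) RootDir() string             { return "/models" }

func main() {
	cfg := vllm.NewDefaultVLLMConfig()
	backendCfg := &inference.BackendConfiguration{
		ContextSize:  8192,                                        // applies because the model config has none
		RuntimeFlags: []string{"--gpu-memory-utilization", "0.9"}, // forwarded verbatim by GetArgs
	}

	args, err := cfg.GetArgs(bundleStub{}, "/run/vllm.sock", inference.BackendModeCompletion, backendCfg)
	if err != nil {
		panic(err)
	}
	fmt.Println(args)
	// [serve /models/example --uds /run/vllm.sock --max-model-len 8192 --gpu-memory-utilization 0.9]
}

Because the model's own runtime config takes precedence, setting ContextSize in the bundle (as the last test case below does) would override the 8192 from the backend configuration.
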
Lines changed: 191 additions & 0 deletions
@@ -0,0 +1,191 @@
+package vllm
+
+import (
+	"testing"
+
+	"github.com/docker/model-runner/pkg/distribution/types"
+	"github.com/docker/model-runner/pkg/inference"
+)
+
+type mockModelBundle struct {
+	safetensorsPath string
+	runtimeConfig   types.Config
+}
+
+func (m *mockModelBundle) GGUFPath() string {
+	return ""
+}
+
+func (m *mockModelBundle) SafetensorsPath() string {
+	return m.safetensorsPath
+}
+
+func (m *mockModelBundle) ChatTemplatePath() string {
+	return ""
+}
+
+func (m *mockModelBundle) MMPROJPath() string {
+	return ""
+}
+
+func (m *mockModelBundle) RuntimeConfig() types.Config {
+	return m.runtimeConfig
+}
+
+func (m *mockModelBundle) RootDir() string {
+	return "/path/to/bundle"
+}
+
+func TestGetArgs(t *testing.T) {
+	tests := []struct {
+		name     string
+		config   *inference.BackendConfiguration
+		bundle   *mockModelBundle
+		expected []string
+	}{
+		{
+			name: "basic args without context size",
+			bundle: &mockModelBundle{
+				safetensorsPath: "/path/to/model",
+			},
+			config: nil,
+			expected: []string{
+				"serve",
+				"/path/to/model",
+				"--uds",
+				"/tmp/socket",
+			},
+		},
+		{
+			name: "with backend context size",
+			bundle: &mockModelBundle{
+				safetensorsPath: "/path/to/model",
+			},
+			config: &inference.BackendConfiguration{
+				ContextSize: 8192,
+			},
+			expected: []string{
+				"serve",
+				"/path/to/model",
+				"--uds",
+				"/tmp/socket",
+				"--max-model-len",
+				"8192",
+			},
+		},
+		{
+			name: "with runtime flags",
+			bundle: &mockModelBundle{
+				safetensorsPath: "/path/to/model",
+			},
+			config: &inference.BackendConfiguration{
+				RuntimeFlags: []string{"--gpu-memory-utilization", "0.9"},
+			},
+			expected: []string{
+				"serve",
+				"/path/to/model",
+				"--uds",
+				"/tmp/socket",
+				"--gpu-memory-utilization",
+				"0.9",
+			},
+		},
+		{
+			name: "with model context size (takes precedence)",
+			bundle: &mockModelBundle{
+				safetensorsPath: "/path/to/model",
+				runtimeConfig: types.Config{
+					ContextSize: ptrUint64(16384),
+				},
+			},
+			config: &inference.BackendConfiguration{
+				ContextSize: 8192,
+			},
+			expected: []string{
+				"serve",
+				"/path/to/model",
+				"--uds",
+				"/tmp/socket",
+				"--max-model-len",
+				"16384",
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			config := NewDefaultVLLMConfig()
+			args, err := config.GetArgs(tt.bundle, "/tmp/socket", inference.BackendModeCompletion, tt.config)
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+
+			if len(args) != len(tt.expected) {
+				t.Fatalf("expected %d args, got %d\nexpected: %v\ngot: %v", len(tt.expected), len(args), tt.expected, args)
+			}
+
+			for i, arg := range args {
+				if arg != tt.expected[i] {
+					t.Errorf("arg[%d]: expected %q, got %q", i, tt.expected[i], arg)
+				}
+			}
+		})
+	}
+}
+
+func TestGetMaxModelLen(t *testing.T) {
+	tests := []struct {
+		name          string
+		modelCfg      types.Config
+		backendCfg    *inference.BackendConfiguration
+		expectedValue *uint64
+	}{
+		{
+			name:          "no config",
+			modelCfg:      types.Config{},
+			backendCfg:    nil,
+			expectedValue: nil,
+		},
+		{
+			name:     "backend config only",
+			modelCfg: types.Config{},
+			backendCfg: &inference.BackendConfiguration{
+				ContextSize: 4096,
+			},
+			expectedValue: ptrUint64(4096),
+		},
+		{
+			name: "model config only",
+			modelCfg: types.Config{
+				ContextSize: ptrUint64(8192),
+			},
+			backendCfg:    nil,
+			expectedValue: ptrUint64(8192),
+		},
+		{
+			name: "model config takes precedence",
+			modelCfg: types.Config{
+				ContextSize: ptrUint64(16384),
+			},
+			backendCfg: &inference.BackendConfiguration{
+				ContextSize: 4096,
+			},
+			expectedValue: ptrUint64(16384),
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := GetMaxModelLen(tt.modelCfg, tt.backendCfg)
+			if (result == nil) != (tt.expectedValue == nil) {
+				t.Errorf("expected nil=%v, got nil=%v", tt.expectedValue == nil, result == nil)
+			} else if result != nil && *result != *tt.expectedValue {
+				t.Errorf("expected %d, got %d", *tt.expectedValue, *result)
+			}
+		})
+	}
+}
+
+func ptrUint64(v uint64) *uint64 {
+	return &v
+}
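
These table-driven tests live in the same package as the backend, so they should run with the standard Go tooling from the repository root, e.g. go test ./pkg/inference/backends/vllm/...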
