Skip to content

Commit 173245d

Browse files
committed
Add vLLM backend support for high-throughput GPU inference
Add vLLM as a third backend option alongside llama-cpp and diffusers, enabling HuggingFace safetensors model inference via vLLM on NVIDIA GPUs. - Add BackendVLLM constant and wire through backend selection, OCI tag resolution, alias mapping, and metadata generation - Install Python base dependencies + gcc/libc6-dev for Triton JIT compilation - Install CUDA apt packages (libcublas, cuda-cudart) for vLLM runtime - Support HuggingFace repo-level downloads (huggingface://namespace/model) in addition to existing single-file downloads - Add build-time patches for pre-built vLLM backend image compatibility (flash_attn ABI fix, AsyncLLM API update) - Add validation: vLLM requires CUDA runtime, amd64-only - Add test aikitfile, unit tests, GPU CI workflow matrix entry, and docs Validated end-to-end on NVIDIA A100 80GB with Qwen2.5-0.5B-Instruct.
1 parent d004f96 commit 173245d

File tree

13 files changed

+246
-15
lines changed

13 files changed

+246
-15
lines changed

.github/workflows/test-docker-gpu.yaml

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ on:
1212
- all
1313
- llama-cuda
1414
- diffusers
15+
- vllm
1516

1617
permissions: read-all
1718

@@ -23,7 +24,7 @@ jobs:
2324
fail-fast: false
2425
max-parallel: 1
2526
matrix:
26-
backend: ${{ inputs.backend == 'all' && fromJson('["llama-cuda", "diffusers"]') || fromJson(format('["{0}"]', inputs.backend)) }}
27+
backend: ${{ inputs.backend == 'all' && fromJson('["llama-cuda", "diffusers", "vllm"]') || fromJson(format('["{0}"]', inputs.backend)) }}
2728
steps:
2829
- name: cleanup workspace
2930
run: |
@@ -85,6 +86,20 @@ jobs:
8586
if: matrix.backend == 'diffusers'
8687
run: docker cp testmodel:/tmp/generated/content/images /tmp
8788

89+
- name: run test (vllm)
90+
if: matrix.backend == 'vllm'
91+
run: |
92+
result=$(curl --fail --retry 30 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
93+
"model": "Qwen2.5-0.5B-Instruct",
94+
"messages": [{"role": "user", "content": "explain kubernetes in a sentence"}]
95+
}')
96+
echo $result
97+
98+
choices=$(echo "$result" | jq '.choices')
99+
if [ -z "$choices" ]; then
100+
exit 1
101+
fi
102+
88103
- name: save logs
89104
if: always()
90105
run: docker logs testmodel > /tmp/docker-${{ matrix.backend }}.log

pkg/aikit2llb/inference/backend.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ func getBackendTag(backend, runtime string, platform specs.Platform) string {
2525
backendMap := map[string]string{
2626
utils.BackendDiffusers: "diffusers",
2727
utils.BackendLlamaCpp: "llama-cpp",
28+
utils.BackendVLLM: "vllm",
2829
}
2930

3031
backendName, exists := backendMap[backend]
@@ -43,6 +44,8 @@ func getBackendTag(backend, runtime string, platform specs.Platform) string {
4344
switch backendName {
4445
case "diffusers":
4546
return fmt.Sprintf("%s-gpu-nvidia-cuda-12-diffusers", baseTag)
47+
case "vllm":
48+
return fmt.Sprintf("%s-gpu-nvidia-cuda-12-vllm", baseTag)
4649
case defaultBackendName:
4750
return fmt.Sprintf("%s-gpu-nvidia-cuda-12-llama-cpp", baseTag)
4851
default:
@@ -67,6 +70,7 @@ func getBackendAlias(backend string) string {
6770
aliasMap := map[string]string{
6871
utils.BackendDiffusers: "diffusers",
6972
utils.BackendLlamaCpp: "llama-cpp",
73+
utils.BackendVLLM: "vllm",
7074
}
7175

7276
if alias, exists := aliasMap[backend]; exists {
@@ -88,6 +92,8 @@ func getBackendName(backend, runtime string, platform specs.Platform) string {
8892
switch backend {
8993
case utils.BackendDiffusers:
9094
return "cuda12-diffusers"
95+
case utils.BackendVLLM:
96+
return "cuda12-vllm"
9197
case utils.BackendLlamaCpp:
9298
return cuda12LlamaCppBackend
9399
default:
@@ -108,6 +114,9 @@ func installBackend(backend string, c *config.InferenceConfig, platform specs.Pl
108114
if backend == utils.BackendDiffusers {
109115
merge = installDiffusersDependencies(s, merge)
110116
}
117+
if backend == utils.BackendVLLM {
118+
merge = installVLLMDependencies(s, merge)
119+
}
111120

112121
// Build the OCI image reference
113122
ociImage := fmt.Sprintf("%s:%s", utils.BackendOCIRegistry, tag)
@@ -143,6 +152,19 @@ func installBackend(backend string, c *config.InferenceConfig, platform specs.Pl
143152
llb.WithCustomName(fmt.Sprintf("Creating metadata.json for backend %s", backendName)),
144153
)
145154

155+
// Apply workarounds for the pre-built vLLM backend image.
156+
if backend == utils.BackendVLLM {
157+
// Remove broken flash_attn package (PyTorch ABI incompatibility).
158+
// Patch backend.py to use the current vLLM AsyncLLM API
159+
// (get_model_config() was replaced by the model_config property).
160+
s = s.Run(utils.Shf(
161+
"rm -rf %[1]s/venv/lib/python*/site-packages/flash_attn* && "+
162+
"sed -i 's/await self.llm.get_model_config()/self.llm.model_config/' %[1]s/backend.py",
163+
backendDir),
164+
llb.WithCustomNamef("Patching vLLM backend %s for compatibility", backendName),
165+
).Root()
166+
}
167+
146168
diff := llb.Diff(savedState, s)
147169
return llb.Merge([]llb.State{merge, diff})
148170
}

pkg/aikit2llb/inference/backend_test.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,15 @@ func TestGetBackendTag(t *testing.T) {
4343
},
4444
want: fmt.Sprintf("%s-gpu-nvidia-cuda-12-diffusers", localAIVersion),
4545
},
46+
{
47+
name: "CUDA vllm",
48+
backend: utils.BackendVLLM,
49+
runtime: utils.RuntimeNVIDIA,
50+
platform: specs.Platform{
51+
Architecture: utils.PlatformAMD64,
52+
},
53+
want: fmt.Sprintf("%s-gpu-nvidia-cuda-12-vllm", localAIVersion),
54+
},
4655
{
4756
name: "Apple Silicon llama-cpp",
4857
backend: utils.BackendLlamaCpp,
@@ -155,6 +164,11 @@ func TestGetBackendAlias(t *testing.T) {
155164
backend: utils.BackendLlamaCpp,
156165
want: "llama-cpp",
157166
},
167+
{
168+
name: "vllm backend",
169+
backend: utils.BackendVLLM,
170+
want: "vllm",
171+
},
158172
{
159173
name: "unknown backend defaults to llama-cpp",
160174
backend: "unknown",
@@ -212,6 +226,15 @@ func TestGetBackendName(t *testing.T) {
212226
},
213227
want: "cuda12-diffusers",
214228
},
229+
{
230+
name: "CUDA vllm",
231+
backend: utils.BackendVLLM,
232+
runtime: utils.RuntimeNVIDIA,
233+
platform: specs.Platform{
234+
Architecture: utils.PlatformAMD64,
235+
},
236+
want: "cuda12-vllm",
237+
},
215238
{
216239
name: "Apple Silicon llama-cpp",
217240
backend: utils.BackendLlamaCpp,

pkg/aikit2llb/inference/convert.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package inference
33
import (
44
"fmt"
55
"net/url"
6+
"slices"
67
"strings"
78

89
"github.com/kaito-project/aikit/pkg/aikit/config"
@@ -119,8 +120,8 @@ func installCuda(c *config.InferenceConfig, s llb.State, merge llb.State) (llb.S
119120
// running apt-get update twice due to nvidia repo
120121
s = s.Run(utils.Sh("apt-get update && apt-get install --no-install-recommends -y ca-certificates && apt-get update"), llb.IgnoreCache).Root()
121122

122-
// default llama.cpp backend is being used
123-
if len(c.Backends) == 0 {
123+
// install cuda libraries for llama-cpp (default) and vllm backends
124+
if len(c.Backends) == 0 || slices.Contains(c.Backends, utils.BackendLlamaCpp) || slices.Contains(c.Backends, utils.BackendVLLM) {
124125
// install cuda libraries and pciutils for gpu detection
125126
s = s.Run(utils.Shf("apt-get install -y --no-install-recommends pciutils libcublas-%[1]s cuda-cudart-%[1]s && apt-get clean", cudaVersion)).Root()
126127
// TODO: clean up /var/lib/dpkg/status

pkg/aikit2llb/inference/download.go

Lines changed: 38 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
const (
1919
orasImage = "ghcr.io/oras-project/oras:v1.2.0"
2020
ollamaRegistryURL = "registry.ollama.ai"
21+
hfCLIImage = "ghcr.io/kaito-project/aikit/hf-cli:latest"
2122
)
2223

2324
// handleOCI handles OCI artifact downloading and processing.
@@ -151,24 +152,51 @@ func ParseHuggingFaceURL(source string) (string, string, error) {
151152
}
152153

153154
// handleHuggingFace handles Hugging Face model downloads with branch support.
155+
// Supports both single-file downloads (huggingface://namespace/model/file) and
156+
// full repo downloads (huggingface://namespace/model) for backends like vLLM.
154157
func handleHuggingFace(source string, s llb.State) (llb.State, error) {
155-
// Translate the Hugging Face URL, extracting the branch if provided
158+
// Try single-file download first (3+ parts)
156159
hfURL, modelName, err := ParseHuggingFaceURL(source)
160+
if err == nil {
161+
// Single-file download via HTTP
162+
opts := []llb.HTTPOption{llb.Filename(modelName)}
163+
m := llb.HTTP(hfURL, opts...)
164+
modelPath := fmt.Sprintf("/models/%s", modelName)
165+
s = s.File(
166+
llb.Copy(m, modelName, modelPath, createCopyOptions()...),
167+
llb.WithCustomName("Copying "+modelName+" from Hugging Face to "+modelPath),
168+
)
169+
return s, nil
170+
}
171+
172+
// Fall back to full repo download (2 parts: namespace/model)
173+
spec, err := ParseHuggingFaceSpec(source)
157174
if err != nil {
158-
return llb.State{}, err
175+
return llb.State{}, fmt.Errorf("invalid Hugging Face URL format: %w", err)
159176
}
160177

161-
// Perform the HTTP download
162-
opts := []llb.HTTPOption{llb.Filename(modelName)}
163-
m := llb.HTTP(hfURL, opts...)
178+
dlScript := fmt.Sprintf(`set -euo pipefail
179+
if [ -f /run/secrets/hf-token ]; then export HF_TOKEN="$(cat /run/secrets/hf-token)"; fi
180+
mkdir -p /out
181+
hf download %s/%s --revision %s --local-dir /out
182+
rm -rf /out/.cache || true
183+
find /out -type f -name '*.lock' -delete || true
184+
`, spec.Namespace, spec.Model, spec.Revision)
164185

165-
// Determine the model path in the /models directory
166-
modelPath := fmt.Sprintf("/models/%s", modelName)
186+
runOpts := []llb.RunOption{
187+
llb.Args([]string{"bash", "-c", dlScript}),
188+
llb.AddSecret("/run/secrets/hf-token", llb.SecretID("hf-token"), llb.SecretOptional),
189+
llb.WithCustomNamef("Downloading HuggingFace repo %s/%s", spec.Namespace, spec.Model),
190+
}
191+
run := llb.Image(hfCLIImage).Run(runOpts...)
167192

168-
// Copy the downloaded file to the desired location
193+
modelDir := fmt.Sprintf("/models/%s", spec.Model)
169194
s = s.File(
170-
llb.Copy(m, modelName, modelPath, createCopyOptions()...),
171-
llb.WithCustomName("Copying "+modelName+" from Hugging Face to "+modelPath),
195+
llb.Copy(run.Root(), "/out/", modelDir+"/", &llb.CopyInfo{
196+
CopyDirContentsOnly: true,
197+
CreateDestPath: true,
198+
}),
199+
llb.WithCustomNamef("Copying HuggingFace repo %s/%s to %s", spec.Namespace, spec.Model, modelDir),
172200
)
173201
return s, nil
174202
}

pkg/aikit2llb/inference/vllm.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
package inference
2+
3+
import (
4+
"github.com/kaito-project/aikit/pkg/utils"
5+
"github.com/moby/buildkit/client/llb"
6+
)
7+
8+
// installVLLMDependencies installs Python dependencies and a C compiler required for vLLM backend.
9+
// vLLM's Triton kernels need a C compiler (gcc) for JIT compilation at runtime.
10+
func installVLLMDependencies(s llb.State, merge llb.State) llb.State {
11+
merge = installPythonBaseDependencies(s, merge)
12+
13+
savedState := s
14+
s = s.Run(utils.Sh("apt-get update && apt-get install --no-install-recommends -y gcc libc6-dev && apt-get clean"),
15+
llb.WithCustomName("Installing C compiler for vLLM Triton JIT"),
16+
).Root()
17+
18+
diff := llb.Diff(savedState, s)
19+
return llb.Merge([]llb.State{merge, diff})
20+
}
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
package inference
2+
3+
import (
4+
"testing"
5+
6+
"github.com/moby/buildkit/client/llb"
7+
)
8+
9+
func TestInstallVLLMDependencies(t *testing.T) {
10+
// Create a simple base state for testing
11+
baseState := llb.Image("ubuntu:22.04")
12+
mergeState := baseState
13+
14+
// Call the function to install dependencies
15+
// This should execute without panicking
16+
defer func() {
17+
if r := recover(); r != nil {
18+
t.Errorf("installVLLMDependencies panicked: %v", r)
19+
}
20+
}()
21+
22+
result := installVLLMDependencies(baseState, mergeState)
23+
24+
// The function should return a valid LLB state
25+
// We can't easily test the actual installation without running BuildKit,
26+
// but we can verify the function executes without panicking
27+
_ = result // Use the result to avoid unused variable warning
28+
}

pkg/build/build.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -473,6 +473,10 @@ func validateInferenceConfig(c *config.InferenceConfig) error {
473473
return errors.New("diffusers backend only supports nvidia cuda runtime. please add 'runtime: cuda' to your aikitfile.yaml")
474474
}
475475

476+
if slices.Contains(c.Backends, utils.BackendVLLM) && c.Runtime != utils.RuntimeNVIDIA {
477+
return errors.New("vllm backend only supports nvidia cuda runtime. please add 'runtime: cuda' to your aikitfile.yaml")
478+
}
479+
476480
if c.Runtime == utils.RuntimeAppleSilicon && len(c.Backends) > 0 {
477481
for _, backend := range c.Backends {
478482
if backend != utils.BackendLlamaCpp {
@@ -481,7 +485,7 @@ func validateInferenceConfig(c *config.InferenceConfig) error {
481485
}
482486
}
483487

484-
backends := []string{utils.BackendLlamaCpp, utils.BackendDiffusers}
488+
backends := []string{utils.BackendLlamaCpp, utils.BackendDiffusers, utils.BackendVLLM}
485489
for _, b := range c.Backends {
486490
if !slices.Contains(backends, b) {
487491
return errors.Errorf("backend %s is not supported", b)

pkg/build/build_test.go

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,35 @@ func Test_validateConfig(t *testing.T) {
8080
}},
8181
wantErr: true,
8282
},
83+
{
84+
name: "valid vllm backend with cuda runtime",
85+
args: args{c: &config.InferenceConfig{
86+
APIVersion: "v1alpha1",
87+
Runtime: "cuda",
88+
Backends: []string{"vllm"},
89+
Models: []config.Model{
90+
{
91+
Name: "test",
92+
Source: "foo",
93+
},
94+
},
95+
}},
96+
wantErr: false,
97+
},
98+
{
99+
name: "vllm backend requires cuda runtime",
100+
args: args{c: &config.InferenceConfig{
101+
APIVersion: "v1alpha1",
102+
Backends: []string{"vllm"},
103+
Models: []config.Model{
104+
{
105+
Name: "test",
106+
Source: "foo",
107+
},
108+
},
109+
}},
110+
wantErr: true,
111+
},
83112
{
84113
name: "invalid backend name",
85114
args: args{c: &config.InferenceConfig{
@@ -157,6 +186,17 @@ func Test_validateBackendPlatformCompatibility(t *testing.T) {
157186
},
158187
wantErr: false,
159188
},
189+
{
190+
name: "vllm backend with arm64 platform - should fail",
191+
config: &config.InferenceConfig{
192+
APIVersion: "v1alpha1",
193+
Backends: []string{"vllm"},
194+
},
195+
targetPlatforms: []*specs.Platform{
196+
{Architecture: "arm64", OS: "linux"},
197+
},
198+
wantErr: true,
199+
},
160200
}
161201
for _, tt := range tests {
162202
t.Run(tt.name, func(t *testing.T) {

pkg/utils/const.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ const (
66

77
BackendDiffusers = "diffusers"
88
BackendLlamaCpp = "llama-cpp"
9+
BackendVLLM = "vllm"
910

1011
BackendOCIRegistry = "quay.io/go-skynet/local-ai-backends"
1112

0 commit comments

Comments (0)