Skip to content

Commit 173245d

Browse files
committed
Add vLLM backend support for high-throughput GPU inference
Add vLLM as a third backend option alongside llama-cpp and diffusers, enabling HuggingFace safetensors model inference via vLLM on NVIDIA GPUs. - Add BackendVLLM constant and wire through backend selection, OCI tag resolution, alias mapping, and metadata generation - Install Python base dependencies + gcc/libc6-dev for Triton JIT compilation - Install CUDA apt packages (libcublas, cuda-cudart) for vLLM runtime - Support HuggingFace repo-level downloads (huggingface://namespace/model) in addition to existing single-file downloads - Add build-time patches for pre-built vLLM backend image compatibility (flash_attn ABI fix, AsyncLLM API update) - Add validation: vLLM requires CUDA runtime, amd64-only - Add test aikitfile, unit tests, GPU CI workflow matrix entry, and docs Validated end-to-end on NVIDIA A100 80GB with Qwen2.5-0.5B-Instruct.
1 parent d004f96 commit 173245d

File tree

13 files changed

+246
-15
lines changed

13 files changed

+246
-15
lines changed

.github/workflows/test-docker-gpu.yaml

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ on:
1212
- all
1313
- llama-cuda
1414
- diffusers
15+
- vllm
1516

1617
permissions: read-all
1718

@@ -23,7 +24,7 @@ jobs:
2324
fail-fast: false
2425
max-parallel: 1
2526
matrix:
26-
backend: ${{ inputs.backend == 'all' && fromJson('["llama-cuda", "diffusers"]') || fromJson(format('["{0}"]', inputs.backend)) }}
27+
backend: ${{ inputs.backend == 'all' && fromJson('["llama-cuda", "diffusers", "vllm"]') || fromJson(format('["{0}"]', inputs.backend)) }}
2728
steps:
2829
- name: cleanup workspace
2930
run: |
@@ -85,6 +86,20 @@ jobs:
8586
if: matrix.backend == 'diffusers'
8687
run: docker cp testmodel:/tmp/generated/content/images /tmp
8788

89+
- name: run test (vllm)
90+
if: matrix.backend == 'vllm'
91+
run: |
92+
result=$(curl --fail --retry 30 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
93+
"model": "Qwen2.5-0.5B-Instruct",
94+
"messages": [{"role": "user", "content": "explain kubernetes in a sentence"}]
95+
}')
96+
echo $result
97+
98+
choices=$(echo "$result" | jq '.choices')
99+
if [ -z "$choices" ]; then
100+
exit 1
101+
fi
102+
88103
- name: save logs
89104
if: always()
90105
run: docker logs testmodel > /tmp/docker-${{ matrix.backend }}.log

pkg/aikit2llb/inference/backend.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ func getBackendTag(backend, runtime string, platform specs.Platform) string {
2525
backendMap := map[string]string{
2626
utils.BackendDiffusers: "diffusers",
2727
utils.BackendLlamaCpp: "llama-cpp",
28+
utils.BackendVLLM: "vllm",
2829
}
2930

3031
backendName, exists := backendMap[backend]
@@ -43,6 +44,8 @@ func getBackendTag(backend, runtime string, platform specs.Platform) string {
4344
switch backendName {
4445
case "diffusers":
4546
return fmt.Sprintf("%s-gpu-nvidia-cuda-12-diffusers", baseTag)
47+
case "vllm":
48+
return fmt.Sprintf("%s-gpu-nvidia-cuda-12-vllm", baseTag)
4649
case defaultBackendName:
4750
return fmt.Sprintf("%s-gpu-nvidia-cuda-12-llama-cpp", baseTag)
4851
default:
@@ -67,6 +70,7 @@ func getBackendAlias(backend string) string {
6770
aliasMap := map[string]string{
6871
utils.BackendDiffusers: "diffusers",
6972
utils.BackendLlamaCpp: "llama-cpp",
73+
utils.BackendVLLM: "vllm",
7074
}
7175

7276
if alias, exists := aliasMap[backend]; exists {
@@ -88,6 +92,8 @@ func getBackendName(backend, runtime string, platform specs.Platform) string {
8892
switch backend {
8993
case utils.BackendDiffusers:
9094
return "cuda12-diffusers"
95+
case utils.BackendVLLM:
96+
return "cuda12-vllm"
9197
case utils.BackendLlamaCpp:
9298
return cuda12LlamaCppBackend
9399
default:
@@ -108,6 +114,9 @@ func installBackend(backend string, c *config.InferenceConfig, platform specs.Pl
108114
if backend == utils.BackendDiffusers {
109115
merge = installDiffusersDependencies(s, merge)
110116
}
117+
if backend == utils.BackendVLLM {
118+
merge = installVLLMDependencies(s, merge)
119+
}
111120

112121
// Build the OCI image reference
113122
ociImage := fmt.Sprintf("%s:%s", utils.BackendOCIRegistry, tag)
@@ -143,6 +152,19 @@ func installBackend(backend string, c *config.InferenceConfig, platform specs.Pl
143152
llb.WithCustomName(fmt.Sprintf("Creating metadata.json for backend %s", backendName)),
144153
)
145154

155+
// Apply workarounds for the pre-built vLLM backend image.
156+
if backend == utils.BackendVLLM {
157+
// Remove broken flash_attn package (PyTorch ABI incompatibility).
158+
// Patch backend.py to use the current vLLM AsyncLLM API
159+
// (get_model_config() was replaced by the model_config property).
160+
s = s.Run(utils.Shf(
161+
"rm -rf %[1]s/venv/lib/python*/site-packages/flash_attn* && "+
162+
"sed -i 's/await self.llm.get_model_config()/self.llm.model_config/' %[1]s/backend.py",
163+
backendDir),
164+
llb.WithCustomNamef("Patching vLLM backend %s for compatibility", backendName),
165+
).Root()
166+
}
167+
146168
diff := llb.Diff(savedState, s)
147169
return llb.Merge([]llb.State{merge, diff})
148170
}

pkg/aikit2llb/inference/backend_test.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,15 @@ func TestGetBackendTag(t *testing.T) {
4343
},
4444
want: fmt.Sprintf("%s-gpu-nvidia-cuda-12-diffusers", localAIVersion),
4545
},
46+
{
47+
name: "CUDA vllm",
48+
backend: utils.BackendVLLM,
49+
runtime: utils.RuntimeNVIDIA,
50+
platform: specs.Platform{
51+
Architecture: utils.PlatformAMD64,
52+
},
53+
want: fmt.Sprintf("%s-gpu-nvidia-cuda-12-vllm", localAIVersion),
54+
},
4655
{
4756
name: "Apple Silicon llama-cpp",
4857
backend: utils.BackendLlamaCpp,
@@ -155,6 +164,11 @@ func TestGetBackendAlias(t *testing.T) {
155164
backend: utils.BackendLlamaCpp,
156165
want: "llama-cpp",
157166
},
167+
{
168+
name: "vllm backend",
169+
backend: utils.BackendVLLM,
170+
want: "vllm",
171+
},
158172
{
159173
name: "unknown backend defaults to llama-cpp",
160174
backend: "unknown",
@@ -212,6 +226,15 @@ func TestGetBackendName(t *testing.T) {
212226
},
213227
want: "cuda12-diffusers",
214228
},
229+
{
230+
name: "CUDA vllm",
231+
backend: utils.BackendVLLM,
232+
runtime: utils.RuntimeNVIDIA,
233+
platform: specs.Platform{
234+
Architecture: utils.PlatformAMD64,
235+
},
236+
want: "cuda12-vllm",
237+
},
215238
{
216239
name: "Apple Silicon llama-cpp",
217240
backend: utils.BackendLlamaCpp,

pkg/aikit2llb/inference/convert.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package inference
33
import (
44
"fmt"
55
"net/url"
6+
"slices"
67
"strings"
78

89
"github.com/kaito-project/aikit/pkg/aikit/config"
@@ -119,8 +120,8 @@ func installCuda(c *config.InferenceConfig, s llb.State, merge llb.State) (llb.S
119120
// running apt-get update twice due to nvidia repo
120121
s = s.Run(utils.Sh("apt-get update && apt-get install --no-install-recommends -y ca-certificates && apt-get update"), llb.IgnoreCache).Root()
121122

122-
// default llama.cpp backend is being used
123-
if len(c.Backends) == 0 {
123+
// install cuda libraries for llama-cpp (default) and vllm backends
124+
if len(c.Backends) == 0 || slices.Contains(c.Backends, utils.BackendLlamaCpp) || slices.Contains(c.Backends, utils.BackendVLLM) {
124125
// install cuda libraries and pciutils for gpu detection
125126
s = s.Run(utils.Shf("apt-get install -y --no-install-recommends pciutils libcublas-%[1]s cuda-cudart-%[1]s && apt-get clean", cudaVersion)).Root()
126127
// TODO: clean up /var/lib/dpkg/status

pkg/aikit2llb/inference/download.go

Lines changed: 38 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
const (
1919
orasImage = "ghcr.io/oras-project/oras:v1.2.0"
2020
ollamaRegistryURL = "registry.ollama.ai"
21+
hfCLIImage = "ghcr.io/kaito-project/aikit/hf-cli:latest"
2122
)
2223

2324
// handleOCI handles OCI artifact downloading and processing.
@@ -151,24 +152,51 @@ func ParseHuggingFaceURL(source string) (string, string, error) {
151152
}
152153

153154
// handleHuggingFace handles Hugging Face model downloads with branch support.
155+
// Supports both single-file downloads (huggingface://namespace/model/file) and
156+
// full repo downloads (huggingface://namespace/model) for backends like vLLM.
154157
func handleHuggingFace(source string, s llb.State) (llb.State, error) {
155-
// Translate the Hugging Face URL, extracting the branch if provided
158+
// Try single-file download first (3+ parts)
156159
hfURL, modelName, err := ParseHuggingFaceURL(source)
160+
if err == nil {
161+
// Single-file download via HTTP
162+
opts := []llb.HTTPOption{llb.Filename(modelName)}
163+
m := llb.HTTP(hfURL, opts...)
164+
modelPath := fmt.Sprintf("/models/%s", modelName)
165+
s = s.File(
166+
llb.Copy(m, modelName, modelPath, createCopyOptions()...),
167+
llb.WithCustomName("Copying "+modelName+" from Hugging Face to "+modelPath),
168+
)
169+
return s, nil
170+
}
171+
172+
// Fall back to full repo download (2 parts: namespace/model)
173+
spec, err := ParseHuggingFaceSpec(source)
157174
if err != nil {
158-
return llb.State{}, err
175+
return llb.State{}, fmt.Errorf("invalid Hugging Face URL format: %w", err)
159176
}
160177

161-
// Perform the HTTP download
162-
opts := []llb.HTTPOption{llb.Filename(modelName)}
163-
m := llb.HTTP(hfURL, opts...)
178+
dlScript := fmt.Sprintf(`set -euo pipefail
179+
if [ -f /run/secrets/hf-token ]; then export HF_TOKEN="$(cat /run/secrets/hf-token)"; fi
180+
mkdir -p /out
181+
hf download %s/%s --revision %s --local-dir /out
182+
rm -rf /out/.cache || true
183+
find /out -type f -name '*.lock' -delete || true
184+
`, spec.Namespace, spec.Model, spec.Revision)
164185

165-
// Determine the model path in the /models directory
166-
modelPath := fmt.Sprintf("/models/%s", modelName)
186+
runOpts := []llb.RunOption{
187+
llb.Args([]string{"bash", "-c", dlScript}),
188+
llb.AddSecret("/run/secrets/hf-token", llb.SecretID("hf-token"), llb.SecretOptional),
189+
llb.WithCustomNamef("Downloading HuggingFace repo %s/%s", spec.Namespace, spec.Model),
190+
}
191+
run := llb.Image(hfCLIImage).Run(runOpts...)
167192

168-
// Copy the downloaded file to the desired location
193+
modelDir := fmt.Sprintf("/models/%s", spec.Model)
169194
s = s.File(
170-
llb.Copy(m, modelName, modelPath, createCopyOptions()...),
171-
llb.WithCustomName("Copying "+modelName+" from Hugging Face to "+modelPath),
195+
llb.Copy(run.Root(), "/out/", modelDir+"/", &llb.CopyInfo{
196+
CopyDirContentsOnly: true,
197+
CreateDestPath: true,
198+
}),
199+
llb.WithCustomNamef("Copying HuggingFace repo %s/%s to %s", spec.Namespace, spec.Model, modelDir),
172200
)
173201
return s, nil
174202
}

pkg/aikit2llb/inference/vllm.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
package inference
2+
3+
import (
4+
"github.com/kaito-project/aikit/pkg/utils"
5+
"github.com/moby/buildkit/client/llb"
6+
)
7+
8+
// installVLLMDependencies installs Python dependencies and a C compiler required for vLLM backend.
9+
// vLLM's Triton kernels need a C compiler (gcc) for JIT compilation at runtime.
10+
func installVLLMDependencies(s llb.State, merge llb.State) llb.State {
11+
merge = installPythonBaseDependencies(s, merge)
12+
13+
savedState := s
14+
s = s.Run(utils.Sh("apt-get update && apt-get install --no-install-recommends -y gcc libc6-dev && apt-get clean"),
15+
llb.WithCustomName("Installing C compiler for vLLM Triton JIT"),
16+
).Root()
17+
18+
diff := llb.Diff(savedState, s)
19+
return llb.Merge([]llb.State{merge, diff})
20+
}
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
package inference
2+
3+
import (
4+
"testing"
5+
6+
"github.com/moby/buildkit/client/llb"
7+
)
8+
9+
func TestInstallVLLMDependencies(t *testing.T) {
10+
// Create a simple base state for testing
11+
baseState := llb.Image("ubuntu:22.04")
12+
mergeState := baseState
13+
14+
// Call the function to install dependencies
15+
// This should execute without panicking
16+
defer func() {
17+
if r := recover(); r != nil {
18+
t.Errorf("installVLLMDependencies panicked: %v", r)
19+
}
20+
}()
21+
22+
result := installVLLMDependencies(baseState, mergeState)
23+
24+
// The function should return a valid LLB state
25+
// We can't easily test the actual installation without running BuildKit,
26+
// but we can verify the function executes without panicking
27+
_ = result // Use the result to avoid unused variable warning
28+
}

pkg/build/build.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -473,6 +473,10 @@ func validateInferenceConfig(c *config.InferenceConfig) error {
473473
return errors.New("diffusers backend only supports nvidia cuda runtime. please add 'runtime: cuda' to your aikitfile.yaml")
474474
}
475475

476+
if slices.Contains(c.Backends, utils.BackendVLLM) && c.Runtime != utils.RuntimeNVIDIA {
477+
return errors.New("vllm backend only supports nvidia cuda runtime. please add 'runtime: cuda' to your aikitfile.yaml")
478+
}
479+
476480
if c.Runtime == utils.RuntimeAppleSilicon && len(c.Backends) > 0 {
477481
for _, backend := range c.Backends {
478482
if backend != utils.BackendLlamaCpp {
@@ -481,7 +485,7 @@ func validateInferenceConfig(c *config.InferenceConfig) error {
481485
}
482486
}
483487

484-
backends := []string{utils.BackendLlamaCpp, utils.BackendDiffusers}
488+
backends := []string{utils.BackendLlamaCpp, utils.BackendDiffusers, utils.BackendVLLM}
485489
for _, b := range c.Backends {
486490
if !slices.Contains(backends, b) {
487491
return errors.Errorf("backend %s is not supported", b)

pkg/build/build_test.go

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,35 @@ func Test_validateConfig(t *testing.T) {
8080
}},
8181
wantErr: true,
8282
},
83+
{
84+
name: "valid vllm backend with cuda runtime",
85+
args: args{c: &config.InferenceConfig{
86+
APIVersion: "v1alpha1",
87+
Runtime: "cuda",
88+
Backends: []string{"vllm"},
89+
Models: []config.Model{
90+
{
91+
Name: "test",
92+
Source: "foo",
93+
},
94+
},
95+
}},
96+
wantErr: false,
97+
},
98+
{
99+
name: "vllm backend requires cuda runtime",
100+
args: args{c: &config.InferenceConfig{
101+
APIVersion: "v1alpha1",
102+
Backends: []string{"vllm"},
103+
Models: []config.Model{
104+
{
105+
Name: "test",
106+
Source: "foo",
107+
},
108+
},
109+
}},
110+
wantErr: true,
111+
},
83112
{
84113
name: "invalid backend name",
85114
args: args{c: &config.InferenceConfig{
@@ -157,6 +186,17 @@ func Test_validateBackendPlatformCompatibility(t *testing.T) {
157186
},
158187
wantErr: false,
159188
},
189+
{
190+
name: "vllm backend with arm64 platform - should fail",
191+
config: &config.InferenceConfig{
192+
APIVersion: "v1alpha1",
193+
Backends: []string{"vllm"},
194+
},
195+
targetPlatforms: []*specs.Platform{
196+
{Architecture: "arm64", OS: "linux"},
197+
},
198+
wantErr: true,
199+
},
160200
}
161201
for _, tt := range tests {
162202
t.Run(tt.name, func(t *testing.T) {

pkg/utils/const.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ const (
66

77
BackendDiffusers = "diffusers"
88
BackendLlamaCpp = "llama-cpp"
9+
BackendVLLM = "vllm"
910

1011
BackendOCIRegistry = "quay.io/go-skynet/local-ai-backends"
1112

0 commit comments

Comments (0)