
Commit 4d8940a

Merge pull request #477 from denyszhak/feature/add-sglang-backend
Add SGLang backend
2 parents 9eaff19 + 0b08b2c commit 4d8940a

18 files changed, +769 -31 lines

.github/workflows/promote-to-latest.yml

Lines changed: 6 additions & 1 deletion
@@ -5,7 +5,7 @@ on:
   workflow_dispatch:
     inputs:
       version:
-        description: 'version'
+        description: "version"
         required: true
         type: string
 
@@ -42,6 +42,11 @@ jobs:
           echo "Promoting vLLM CUDA images"
           crane tag "docker/model-runner:${{ inputs.version }}-vllm-cuda" "latest-vllm-cuda"
 
+      - name: Promote SGLang CUDA images
+        run: |
+          echo "Promoting SGLang CUDA images"
+          crane tag "docker/model-runner:${{ inputs.version }}-sglang-cuda" "latest-sglang-cuda"
+
       - name: Promote ROCm images
         run: |
           echo "Promoting ROCm images"

.github/workflows/release.yml

Lines changed: 32 additions & 5 deletions
@@ -5,28 +5,33 @@ on:
   workflow_dispatch:
     inputs:
       pushLatest:
-        description: 'Tag images produced by this job as latest'
+        description: "Tag images produced by this job as latest"
         required: false
         type: boolean
         default: false
       releaseTag:
-        description: 'Release tag'
+        description: "Release tag"
         required: false
         type: string
         default: "test"
       llamaServerVersion:
-        description: 'llama-server version'
+        description: "llama-server version"
         required: false
         type: string
         default: "latest"
       vllmVersion:
-        description: 'vLLM version'
+        description: "vLLM version"
        required: false
         type: string
         default: "0.12.0"
+      sglangVersion:
+        description: "SGLang version"
+        required: false
+        type: string
+        default: "0.4.0"
       # This can be removed once we have llama.cpp built for MUSA and CANN.
       buildMusaCann:
-        description: 'Build MUSA and CANN images'
+        description: "Build MUSA and CANN images"
         required: false
         type: boolean
         default: false
@@ -76,6 +81,12 @@ jobs:
             echo "docker/model-runner:latest-vllm-cuda" >> "$GITHUB_OUTPUT"
           fi
           echo 'EOF' >> "$GITHUB_OUTPUT"
+          echo "sglang-cuda<<EOF" >> "$GITHUB_OUTPUT"
+          echo "docker/model-runner:${{ inputs.releaseTag }}-sglang-cuda" >> "$GITHUB_OUTPUT"
+          if [ "${{ inputs.pushLatest }}" == "true" ]; then
+            echo "docker/model-runner:latest-sglang-cuda" >> "$GITHUB_OUTPUT"
+          fi
+          echo 'EOF' >> "$GITHUB_OUTPUT"
           echo "rocm<<EOF" >> "$GITHUB_OUTPUT"
           echo "docker/model-runner:${{ inputs.releaseTag }}-rocm" >> "$GITHUB_OUTPUT"
           if [ "${{ inputs.pushLatest }}" == "true" ]; then
@@ -155,6 +166,22 @@ jobs:
           provenance: mode=max
           tags: ${{ steps.tags.outputs.vllm-cuda }}
 
+      - name: Build SGLang CUDA image
+        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83
+        with:
+          file: Dockerfile
+          target: final-sglang
+          platforms: linux/amd64
+          build-args: |
+            "LLAMA_SERVER_VERSION=${{ inputs.llamaServerVersion }}"
+            "LLAMA_SERVER_VARIANT=cuda"
+            "BASE_IMAGE=nvidia/cuda:12.9.0-runtime-ubuntu24.04"
+            "SGLANG_VERSION=${{ inputs.sglangVersion }}"
+          push: true
+          sbom: true
+          provenance: mode=max
+          tags: ${{ steps.tags.outputs.sglang-cuda }}
+
       - name: Build ROCm image
         uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83
         with:

Dockerfile

Lines changed: 50 additions & 7 deletions
@@ -33,7 +33,13 @@ COPY --link . .
 # Build the Go binary (static build)
 RUN --mount=type=cache,target=/go/pkg/mod \
     --mount=type=cache,target=/root/.cache/go-build \
-    CGO_ENABLED=1 GOOS=linux go build -ldflags="-s -w" -o model-runner ./main.go
+    CGO_ENABLED=1 GOOS=linux go build -ldflags="-s -w" -o model-runner .
+
+# Build the Go binary for SGLang (without vLLM)
+FROM builder AS builder-sglang
+RUN --mount=type=cache,target=/go/pkg/mod \
+    --mount=type=cache,target=/root/.cache/go-build \
+    CGO_ENABLED=1 GOOS=linux go build -tags=novllm -ldflags="-s -w" -o model-runner .
 
 # --- Get llama.cpp binary ---
 FROM docker/docker-model-backend-llamacpp:${LLAMA_SERVER_VERSION}-${LLAMA_SERVER_VARIANT} AS llama-server
@@ -97,21 +103,58 @@ USER modelrunner
 
 # Install uv and vLLM as modelrunner user
 RUN curl -LsSf https://astral.sh/uv/install.sh | sh \
-  && ~/.local/bin/uv venv --python /usr/bin/python3 /opt/vllm-env \
-  && if [ "$TARGETARCH" = "amd64" ]; then \
-  WHEEL_ARCH="manylinux_2_31_x86_64"; \
-  WHEEL_URL="https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}%2B${VLLM_CUDA_VERSION}-${VLLM_PYTHON_TAG}-${WHEEL_ARCH}.whl"; \
-  ~/.local/bin/uv pip install --python /opt/vllm-env/bin/python "$WHEEL_URL"; \
+    && ~/.local/bin/uv venv --python /usr/bin/python3 /opt/vllm-env \
+    && if [ "$TARGETARCH" = "amd64" ]; then \
+        WHEEL_ARCH="manylinux_2_31_x86_64"; \
+        WHEEL_URL="https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}%2B${VLLM_CUDA_VERSION}-${VLLM_PYTHON_TAG}-${WHEEL_ARCH}.whl"; \
+        ~/.local/bin/uv pip install --python /opt/vllm-env/bin/python "$WHEEL_URL"; \
     else \
-  ~/.local/bin/uv pip install --python /opt/vllm-env/bin/python "vllm==${VLLM_VERSION}"; \
+        ~/.local/bin/uv pip install --python /opt/vllm-env/bin/python "vllm==${VLLM_VERSION}"; \
     fi
 
 RUN /opt/vllm-env/bin/python -c "import vllm; print(vllm.__version__)" > /opt/vllm-env/version
 
+# --- SGLang variant ---
+FROM llamacpp AS sglang
+
+ARG SGLANG_VERSION=0.5.6
+
+USER root
+
+# Install CUDA toolkit 13 for nvcc (needed for flashinfer JIT compilation)
+RUN apt update && apt install -y \
+    python3 python3-venv python3-dev \
+    curl ca-certificates build-essential \
+    libnuma1 libnuma-dev numactl ninja-build \
+    wget gnupg \
+    && wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb \
+    && dpkg -i cuda-keyring_1.1-1_all.deb \
+    && apt update && apt install -y cuda-toolkit-13-0 \
+    && rm cuda-keyring_1.1-1_all.deb \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN mkdir -p /opt/sglang-env && chown -R modelrunner:modelrunner /opt/sglang-env
+
+USER modelrunner
+
+# Set CUDA paths for nvcc (needed during flashinfer compilation)
+ENV PATH=/usr/local/cuda-13.0/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/local/cuda-13.0/lib64:$LD_LIBRARY_PATH
+
+# Install uv and SGLang as modelrunner user
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh \
+    && ~/.local/bin/uv venv --python /usr/bin/python3 /opt/sglang-env \
+    && ~/.local/bin/uv pip install --python /opt/sglang-env/bin/python "sglang==${SGLANG_VERSION}"
+
+RUN /opt/sglang-env/bin/python -c "import sglang; print(sglang.__version__)" > /opt/sglang-env/version
 FROM llamacpp AS final-llamacpp
 # Copy the built binary from builder
 COPY --from=builder /app/model-runner /app/model-runner
 
 FROM vllm AS final-vllm
 # Copy the built binary from builder
 COPY --from=builder /app/model-runner /app/model-runner
+
+FROM sglang AS final-sglang
+# Copy the built binary from builder-sglang (without vLLM)
+COPY --from=builder-sglang /app/model-runner /app/model-runner

Makefile

Lines changed: 32 additions & 2 deletions
@@ -7,6 +7,8 @@ BASE_IMAGE := ubuntu:24.04
 VLLM_BASE_IMAGE := nvidia/cuda:13.0.2-runtime-ubuntu24.04
 DOCKER_IMAGE := docker/model-runner:latest
 DOCKER_IMAGE_VLLM := docker/model-runner:latest-vllm-cuda
+DOCKER_IMAGE_SGLANG := docker/model-runner:latest-sglang
+DOCKER_IMAGE_SGLANG_CUDA := docker/model-runner:latest-sglang-cuda
 DOCKER_TARGET ?= final-llamacpp
 PORT := 8080
 MODELS_PATH := $(shell pwd)/models-store
@@ -31,13 +33,13 @@ LICENSE ?=
 BUILD_DMR ?= 1
 
 # Main targets
-.PHONY: build run clean test integration-tests test-docker-ce-installation docker-build docker-build-multiplatform docker-run docker-build-vllm docker-run-vllm docker-run-impl help validate lint model-distribution-tool
+.PHONY: build run clean test integration-tests test-docker-ce-installation docker-build docker-build-multiplatform docker-run docker-build-vllm docker-run-vllm docker-build-sglang docker-run-sglang docker-build-sglang-cuda docker-run-sglang-cuda docker-run-impl help validate lint model-distribution-tool
 # Default target
 .DEFAULT_GOAL := build
 
 # Build the Go application
 build:
-	CGO_ENABLED=1 go build -ldflags="-s -w" -o $(APP_NAME) ./main.go
+	CGO_ENABLED=1 go build -ldflags="-s -w" -o $(APP_NAME) .
 
 # Build model-distribution-tool
 model-distribution-tool:
@@ -116,6 +118,30 @@ docker-build-vllm:
 docker-run-vllm: docker-build-vllm
	@$(MAKE) -s docker-run-impl DOCKER_IMAGE=$(DOCKER_IMAGE_VLLM)
 
+# Build SGLang Docker image (CPU variant)
+docker-build-sglang:
+	@$(MAKE) docker-build \
+		DOCKER_TARGET=final-sglang \
+		DOCKER_IMAGE=$(DOCKER_IMAGE_SGLANG) \
+		LLAMA_SERVER_VARIANT=cpu \
+		BASE_IMAGE=$(BASE_IMAGE)
+
+# Run SGLang Docker container (CPU variant) with TCP port access and mounted model storage
+docker-run-sglang: docker-build-sglang
+	@$(MAKE) -s docker-run-impl DOCKER_IMAGE=$(DOCKER_IMAGE_SGLANG)
+
+# Build SGLang Docker image (CUDA variant)
+docker-build-sglang-cuda:
+	@$(MAKE) docker-build \
+		DOCKER_TARGET=final-sglang \
+		DOCKER_IMAGE=$(DOCKER_IMAGE_SGLANG_CUDA) \
+		LLAMA_SERVER_VARIANT=cuda \
+		BASE_IMAGE=$(VLLM_BASE_IMAGE)
+
+# Run SGLang Docker container (CUDA variant) with TCP port access and mounted model storage
+docker-run-sglang-cuda: docker-build-sglang-cuda
+	@$(MAKE) -s docker-run-impl DOCKER_IMAGE=$(DOCKER_IMAGE_SGLANG_CUDA)
+
 # Common implementation for running Docker container
 docker-run-impl:
	@echo ""
@@ -178,6 +204,10 @@ help:
	@echo "  docker-run - Run in Docker container with TCP port access and mounted model storage"
	@echo "  docker-build-vllm - Build vLLM Docker image"
	@echo "  docker-run-vllm - Run vLLM Docker container"
+	@echo "  docker-build-sglang - Build SGLang Docker image (CPU)"
+	@echo "  docker-run-sglang - Run SGLang Docker container (CPU)"
+	@echo "  docker-build-sglang-cuda - Build SGLang Docker image (CUDA)"
+	@echo "  docker-run-sglang-cuda - Run SGLang Docker container (CUDA)"
	@echo "  help - Show this help message"
	@echo ""
	@echo "Model distribution tool targets:"

backends_vllm.go

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+//go:build !novllm
+
+package main
+
+import (
+	"github.com/docker/model-runner/pkg/inference"
+	"github.com/docker/model-runner/pkg/inference/backends/vllm"
+	"github.com/docker/model-runner/pkg/inference/models"
+	"github.com/sirupsen/logrus"
+)
+
+func initVLLMBackend(log *logrus.Logger, modelManager *models.Manager) (inference.Backend, error) {
+	return vllm.New(
+		log,
+		modelManager,
+		log.WithFields(logrus.Fields{"component": vllm.Name}),
+		nil,
+	)
+}
+
+func registerVLLMBackend(backends map[string]inference.Backend, backend inference.Backend) {
+	backends[vllm.Name] = backend
+}

backends_vllm_stub.go

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+//go:build novllm
+
+package main
+
+import (
+	"github.com/docker/model-runner/pkg/inference"
+	"github.com/docker/model-runner/pkg/inference/models"
+	"github.com/sirupsen/logrus"
+)
+
+func initVLLMBackend(log *logrus.Logger, modelManager *models.Manager) (inference.Backend, error) {
+	return nil, nil
+}
+
+func registerVLLMBackend(backends map[string]inference.Backend, backend inference.Backend) {
+	// No-op when vLLM is disabled
+}

main.go

Lines changed: 20 additions & 11 deletions
@@ -14,6 +14,7 @@ import (
 	"github.com/docker/model-runner/pkg/inference"
 	"github.com/docker/model-runner/pkg/inference/backends/llamacpp"
 	"github.com/docker/model-runner/pkg/inference/backends/mlx"
+	"github.com/docker/model-runner/pkg/inference/backends/sglang"
 	"github.com/docker/model-runner/pkg/inference/backends/vllm"
 	"github.com/docker/model-runner/pkg/inference/config"
 	"github.com/docker/model-runner/pkg/inference/models"
@@ -106,12 +107,7 @@ func main() {
 		log.Fatalf("unable to initialize %s backend: %v", llamacpp.Name, err)
 	}
 
-	vllmBackend, err := vllm.New(
-		log,
-		modelManager,
-		log.WithFields(logrus.Fields{"component": vllm.Name}),
-		nil,
-	)
+	vllmBackend, err := initVLLMBackend(log, modelManager)
 	if err != nil {
 		log.Fatalf("unable to initialize %s backend: %v", vllm.Name, err)
 	}
@@ -126,13 +122,26 @@ func main() {
 		log.Fatalf("unable to initialize %s backend: %v", mlx.Name, err)
 	}
 
+	sglangBackend, err := sglang.New(
+		log,
+		modelManager,
+		log.WithFields(logrus.Fields{"component": sglang.Name}),
+		nil,
+	)
+	if err != nil {
+		log.Fatalf("unable to initialize %s backend: %v", sglang.Name, err)
+	}
+
+	backends := map[string]inference.Backend{
+		llamacpp.Name: llamaCppBackend,
+		mlx.Name:      mlxBackend,
+		sglang.Name:   sglangBackend,
+	}
+	registerVLLMBackend(backends, vllmBackend)
+
 	scheduler := scheduling.NewScheduler(
 		log,
-		map[string]inference.Backend{
-			llamacpp.Name: llamaCppBackend,
-			vllm.Name:     vllmBackend,
-			mlx.Name:      mlxBackend,
-		},
+		backends,
 		llamaCppBackend,
 		modelManager,
 		http.DefaultClient,
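
The effect of the build-tag split is easiest to see at the two call sites above. A short annotated sketch of that path (identifiers taken from this diff; the comments describing each build variant are editorial, not from the source):

	// Default build: backends_vllm.go is compiled, so this returns a real
	// vLLM backend. With -tags=novllm (as in the builder-sglang stage of the
	// Dockerfile), the stub is compiled instead and this returns (nil, nil).
	vllmBackend, err := initVLLMBackend(log, modelManager)
	if err != nil {
		log.Fatalf("unable to initialize %s backend: %v", vllm.Name, err)
	}

	backends := map[string]inference.Backend{
		llamacpp.Name: llamaCppBackend,
		mlx.Name:      mlxBackend,
		sglang.Name:   sglangBackend,
	}

	// Default build: adds backends[vllm.Name] = vllmBackend.
	// novllm build: a no-op, so the scheduler never sees a vLLM entry.
	registerVLLMBackend(backends, vllmBackend)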

pkg/inference/backend.go

Lines changed: 4 additions & 0 deletions
@@ -132,6 +132,10 @@ type Backend interface {
 	// external model management system and false if the backend uses the shared
 	// model manager.
 	UsesExternalModelManagement() bool
+	// UsesTCP returns true if the backend uses TCP for communication instead
+	// of Unix sockets. When true, the scheduler will create a TCP transport
+	// and pass a "host:port" address to Run instead of a Unix socket path.
+	UsesTCP() bool
 	// Install ensures that the backend is installed. It should return a nil
 	// error if installation succeeds or if the backend is already installed.
 	// The provided HTTP client should be used for any HTTP operations.
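
Neither of the two backends updated below opts in; the new SGLang backend is the intended TCP user, but its implementation is not among the files shown here. A hedged sketch of what such an opt-in would presumably look like (the receiver type name is assumed for illustration):

// Assumed sketch only - the sglang backend's actual code is not in this diff.
// A TCP-based backend would advertise itself to the scheduler like this:
func (s *sglangBackend) UsesTCP() bool {
	// The scheduler then creates a TCP transport and passes a "host:port"
	// address to Run instead of a Unix socket path.
	return true
}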

pkg/inference/backends/llamacpp/llamacpp.go

Lines changed: 5 additions & 0 deletions
@@ -86,6 +86,11 @@ func (l *llamaCpp) UsesExternalModelManagement() bool {
 	return false
 }
 
+// UsesTCP implements inference.Backend.UsesTCP.
+func (l *llamaCpp) UsesTCP() bool {
+	return false
+}
+
 // Install implements inference.Backend.Install.
 func (l *llamaCpp) Install(ctx context.Context, httpClient *http.Client) error {
 	l.updatedLlamaCpp = false

pkg/inference/backends/mlx/mlx.go

Lines changed: 5 additions & 0 deletions
@@ -65,6 +65,11 @@ func (m *mlx) UsesExternalModelManagement() bool {
 	return false
 }
 
+// UsesTCP implements inference.Backend.UsesTCP.
+func (m *mlx) UsesTCP() bool {
+	return false
+}
+
 // Install implements inference.Backend.Install.
 func (m *mlx) Install(ctx context.Context, httpClient *http.Client) error {
 	if !platform.SupportsMLX() {
