Skip to content

Commit b600f7e

Browse files
committed
upgrade cuda to ver 13
1 parent d554c58 commit b600f7e

File tree

2 files changed

+31
-12
lines changed

2 files changed

+31
-12
lines changed

Dockerfile

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -103,37 +103,53 @@ USER modelrunner
103103

104104
# Install uv and vLLM as modelrunner user
105105
RUN curl -LsSf https://astral.sh/uv/install.sh | sh \
106-
&& ~/.local/bin/uv venv --python /usr/bin/python3 /opt/vllm-env \
107-
&& if [ "$TARGETARCH" = "amd64" ]; then \
108-
WHEEL_ARCH="manylinux_2_31_x86_64"; \
109-
WHEEL_URL="https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}%2B${VLLM_CUDA_VERSION}-${VLLM_PYTHON_TAG}-${WHEEL_ARCH}.whl"; \
110-
~/.local/bin/uv pip install --python /opt/vllm-env/bin/python "$WHEEL_URL"; \
106+
&& ~/.local/bin/uv venv --python /usr/bin/python3 /opt/vllm-env \
107+
&& if [ "$TARGETARCH" = "amd64" ]; then \
108+
WHEEL_ARCH="manylinux_2_31_x86_64"; \
109+
WHEEL_URL="https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}%2B${VLLM_CUDA_VERSION}-${VLLM_PYTHON_TAG}-${WHEEL_ARCH}.whl"; \
110+
~/.local/bin/uv pip install --python /opt/vllm-env/bin/python "$WHEEL_URL"; \
111111
else \
112-
~/.local/bin/uv pip install --python /opt/vllm-env/bin/python "vllm==${VLLM_VERSION}"; \
112+
~/.local/bin/uv pip install --python /opt/vllm-env/bin/python "vllm==${VLLM_VERSION}"; \
113113
fi
114114

115115
RUN /opt/vllm-env/bin/python -c "import vllm; print(vllm.__version__)" > /opt/vllm-env/version
116116

117117
# --- SGLang variant ---
118118
FROM llamacpp AS sglang
119119

120-
ARG SGLANG_VERSION=0.4.0
120+
ARG SGLANG_VERSION=0.5.6
121121

122122
USER root
123123

124-
RUN apt update && apt install -y python3 python3-venv python3-dev curl ca-certificates build-essential && rm -rf /var/lib/apt/lists/*
124+
# Install CUDA toolkit 13 for nvcc (needed for flashinfer JIT compilation)
125+
# CUDA 13 supports Blackwell (B200) architecture with compute_100a
126+
# and Hopper (H200) architecture
127+
# Plus other SGLang dependencies
128+
RUN apt update && apt install -y \
129+
python3 python3-venv python3-dev \
130+
curl ca-certificates build-essential \
131+
libnuma1 libnuma-dev numactl ninja-build \
132+
wget gnupg \
133+
&& wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb \
134+
&& dpkg -i cuda-keyring_1.1-1_all.deb \
135+
&& apt update && apt install -y cuda-toolkit-13-0 \
136+
&& rm cuda-keyring_1.1-1_all.deb \
137+
&& rm -rf /var/lib/apt/lists/*
125138

126139
RUN mkdir -p /opt/sglang-env && chown -R modelrunner:modelrunner /opt/sglang-env
127140

128141
USER modelrunner
129142

143+
# Set CUDA paths for nvcc (needed during flashinfer compilation)
144+
ENV PATH=/usr/local/cuda-13.0/bin:$PATH
145+
ENV LD_LIBRARY_PATH=/usr/local/cuda-13.0/lib64:$LD_LIBRARY_PATH
146+
130147
# Install uv and SGLang as modelrunner user
131148
RUN curl -LsSf https://astral.sh/uv/install.sh | sh \
132-
&& ~/.local/bin/uv venv --python /usr/bin/python3 /opt/sglang-env \
133-
&& ~/.local/bin/uv pip install --python /opt/sglang-env/bin/python "sglang==${SGLANG_VERSION}"
149+
&& ~/.local/bin/uv venv --python /usr/bin/python3 /opt/sglang-env \
150+
&& ~/.local/bin/uv pip install --python /opt/sglang-env/bin/python "sglang==${SGLANG_VERSION}"
134151

135152
RUN /opt/sglang-env/bin/python -c "import sglang; print(sglang.__version__)" > /opt/sglang-env/version
136-
137153
FROM llamacpp AS final-llamacpp
138154
# Copy the built binary from builder
139155
COPY --from=builder /app/model-runner /app/model-runner

pkg/inference/backends/sglang/sglang.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,10 @@ func (s *sglang) Run(ctx context.Context, socket, model string, modelRef string,
171171

172172
// Add served model name and weight version
173173
if model != "" {
174-
args = append(args, "--served-model-name", model)
174+
// SGLang 0.5.6+ doesn't allow colons in served-model-name (reserved for LoRA syntax)
175+
// Replace colons with underscores to sanitize the model name
176+
sanitizedModel := strings.ReplaceAll(model, ":", "_")
177+
args = append(args, "--served-model-name", sanitizedModel)
175178
}
176179
if modelRef != "" {
177180
args = append(args, "--weight-version", modelRef)

0 commit comments

Comments
 (0)