Skip to content

Commit 0f2d45c

Browse files
DevakiBolleneni and DevakiBolleneni authored
v2 : Sglang 0.5.6 EC2 (#5586)
* initial commit * update docker file and rename sglang sagemaker * update sglang sagemaker test environment * add ec2 tests and update sagemaker tests * remove trailing space * trigger workflow * trigger workflow * update concurrency group * update docker command * fix typo * fix typo * fix typo * trigger workflow --------- Co-authored-by: DevakiBolleneni <devakib@amazon.com>
1 parent 12e1a39 commit 0f2d45c

File tree

3 files changed

+226
-21
lines changed

3 files changed

+226
-21
lines changed

.github/workflows/pr-sglang.yml

Lines changed: 201 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ env:
2121
CUDA_VERSION: "cu129"
2222
OS_VERSION: "ubuntu22.04"
2323

24+
# SGLang EC2
25+
PROD_EC2_IMAGE: sglang:0.5-gpu-py312-ec2
26+
2427
# SGLang SageMaker
2528
PROD_SAGEMAKER_IMAGE: sglang:0.5-gpu-py312
2629

@@ -82,18 +85,195 @@ jobs:
8285
test-change:
8386
- "test/sglang/**"
8487
88+
# ======================================================
89+
# =============== SGLang EC2 jobs ======================
90+
# ======================================================
91+
build-sglang-ec2-image:
  # Builds the `sglang-ec2` Dockerfile target. Runs only when the
  # check-changes job reports a build-relevant change.
  needs:
    - check-changes
  if: needs.check-changes.outputs.build-change == 'true'
  runs-on:
    # One runner label; the folded scalar joins the three parts with single
    # spaces, exactly as the multi-line plain scalar did.
    - >-
      codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
      fleet:x86-build-runner
      buildspec-override:true
  concurrency:
    # At most one in-flight build per pull request; newer runs cancel older ones.
    cancel-in-progress: true
    group: ${{ github.workflow }}-build-sglang-ec2-image-${{ github.event.pull_request.number }}
  outputs:
    # URI of the freshly built image, as reported by the `build` step.
    ci-image: ${{ steps.build.outputs.image-uri }}
  steps:
    - uses: actions/checkout@v5

    - id: build
      name: Build image
      uses: ./.github/actions/build-image
      with:
        # What to build.
        framework: ${{ env.FRAMEWORK }}
        framework-version: ${{ env.SGLANG_VERSION }}
        container-type: ${{ env.CONTAINER_TYPE }}
        target: sglang-ec2
        dockerfile-path: docker/${{ env.FRAMEWORK }}/Dockerfile
        base-image: lmsysorg/sglang:v${{ env.SGLANG_VERSION }}-${{ env.CUDA_VERSION }}-amd64
        # Where to push it.
        aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
        aws-region: ${{ vars.AWS_REGION }}
        # PR-scoped tag; the `-ec2-` infix distinguishes it from the SageMaker build.
        tag-pr: ${{ env.FRAMEWORK }}-${{ env.SGLANG_VERSION }}-gpu-${{ env.PYTHON_VERSION }}-${{ env.CUDA_VERSION }}-${{ env.OS_VERSION }}-ec2-pr-${{ github.event.pull_request.number }}
119+
120+
set-ec2-test-environment:
  # Decides which EC2 image the downstream test jobs should pull:
  #   - the image built in this run, when build-sglang-ec2-image succeeded;
  #   - otherwise the released image named by env.PROD_EC2_IMAGE.
  # Exposes the chosen account id and image URI as job outputs.
  needs: [check-changes, build-sglang-ec2-image]
  # always() keeps this job eligible when the build job was skipped;
  # !failure() / !cancelled() still stop it after a failed or cancelled build.
  if: |
    always() && !failure() && !cancelled() &&
    (needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.test-change == 'true')
  runs-on: ubuntu-latest
  concurrency:
    group: ${{ github.workflow }}-set-ec2-test-environment-${{ github.event.pull_request.number }}
    cancel-in-progress: true
  outputs:
    aws-account-id: ${{ steps.set-env.outputs.AWS_ACCOUNT_ID }}
    image-uri: ${{ steps.set-env.outputs.IMAGE_URI }}
  steps:
    - name: Checkout code
      # Same major version as the other jobs in this workflow.
      uses: actions/checkout@v5

    - name: Set test environment
      id: set-env
      run: |
        if [[ "${{ needs.build-sglang-ec2-image.result }}" == "success" ]]; then
          # Test the image that was just built for this PR.
          AWS_ACCOUNT_ID="${{ vars.CI_AWS_ACCOUNT_ID }}"
          IMAGE_URI="${{ needs.build-sglang-ec2-image.outputs.ci-image }}"
        else
          # No fresh build: fall back to the released EC2 image.
          AWS_ACCOUNT_ID="${{ vars.PROD_AWS_ACCOUNT_ID }}"
          IMAGE_URI="${{ vars.PROD_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/${{ env.PROD_EC2_IMAGE }}"
        fi

        echo "Image URI to test: ${IMAGE_URI}"
        echo "AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID}" >> "${GITHUB_OUTPUT}"
        echo "IMAGE_URI=${IMAGE_URI}" >> "${GITHUB_OUTPUT}"
150+
151+
sglang-ec2-local-benchmark-test:
  # Starts the EC2 image as an SGLang server on a GPU runner and drives it
  # with sglang.bench_serving using the ShareGPT dataset.
  needs: [build-sglang-ec2-image, set-ec2-test-environment]
  # A plain `success()` would skip this job whenever the build job is skipped,
  # so the released-image fallback chosen by set-ec2-test-environment could
  # never be tested. Gate on that job's result instead, as
  # sglang-sagemaker-endpoint-test does.
  if: |
    always() && !failure() && !cancelled() &&
    needs.set-ec2-test-environment.result == 'success'
  runs-on:
    - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
      fleet:x86-g6xl-runner
      buildspec-override:true
  concurrency:
    group: ${{ github.workflow }}-sglang-ec2-local-benchmark-test-${{ github.event.pull_request.number }}
    cancel-in-progress: true
  steps:
    - name: Checkout DLC source
      uses: actions/checkout@v5

    - name: Container pull
      uses: ./.github/actions/ecr-authenticate
      with:
        aws-account-id: ${{ needs.set-ec2-test-environment.outputs.aws-account-id }}
        aws-region: ${{ vars.AWS_REGION }}
        image-uri: ${{ needs.set-ec2-test-environment.outputs.image-uri }}

    - name: Setup for SGLang datasets
      run: |
        # Download the dataset only if it is not already present on the runner.
        mkdir -p "${TEST_ARTIFACTS_DIRECTORY}/dataset"
        if [ ! -f "${TEST_ARTIFACTS_DIRECTORY}/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" ]; then
          echo "Downloading ShareGPT dataset..."
          wget -P "${TEST_ARTIFACTS_DIRECTORY}/dataset" https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
        else
          echo "ShareGPT dataset already exists. Skipping download."
        fi

    - name: Start container
      env:
        # Supplied through the step environment so the secret is never
        # spliced into the shell script or the docker command line.
        HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      run: |
        # `-e HF_TOKEN` (no value) forwards the variable from this step's env.
        # Arguments after the image URI are passed to the image entrypoint.
        CONTAINER_ID=$(docker run -d -it --rm --gpus=all \
          -v "${HOME}/.cache/huggingface:/root/.cache/huggingface" \
          -v "${TEST_ARTIFACTS_DIRECTORY}/dataset:/dataset" \
          -p 30000:30000 \
          -e HF_TOKEN \
          ${{ needs.set-ec2-test-environment.outputs.image-uri }} \
          --model-path Qwen/Qwen3-0.6B \
          --reasoning-parser qwen3 \
          --host 127.0.0.1 \
          --port 30000)
        echo "CONTAINER_ID=${CONTAINER_ID}" >> "${GITHUB_ENV}"
        echo "Waiting for serving endpoint startup ..."
        # Fixed grace period for startup; the logs are dumped afterwards for
        # debugging.
        sleep 120s
        docker logs "${CONTAINER_ID}"

    - name: Run SGLang tests
      run: |
        # Runs inside the container, so 127.0.0.1 is the server's own loopback.
        docker exec "${CONTAINER_ID}" python3 -m sglang.bench_serving \
          --backend sglang \
          --host 127.0.0.1 --port 30000 \
          --num-prompts 1000 \
          --model Qwen/Qwen3-0.6B \
          --dataset-name sharegpt \
          --dataset-path /dataset/ShareGPT_V3_unfiltered_cleaned_split.json
208+
209+
sglang-ec2-upstream-test:
  # Runs part of the upstream SGLang test suite (checked out at the matching
  # release tag) inside the EC2 image on a GPU runner.
  needs: [build-sglang-ec2-image, set-ec2-test-environment]
  # A plain `success()` would skip this job whenever the build job is skipped,
  # so the released-image fallback chosen by set-ec2-test-environment could
  # never be tested. Gate on that job's result instead, as
  # sglang-sagemaker-endpoint-test does.
  if: |
    always() && !failure() && !cancelled() &&
    needs.set-ec2-test-environment.result == 'success'
  runs-on:
    - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
      fleet:x86-g6exl-runner
      buildspec-override:true
  concurrency:
    group: ${{ github.workflow }}-sglang-ec2-upstream-test-${{ github.event.pull_request.number }}
    cancel-in-progress: true
  steps:
    - name: Checkout DLC source
      uses: actions/checkout@v5

    - name: Container pull
      uses: ./.github/actions/ecr-authenticate
      with:
        aws-account-id: ${{ needs.set-ec2-test-environment.outputs.aws-account-id }}
        aws-region: ${{ vars.AWS_REGION }}
        image-uri: ${{ needs.set-ec2-test-environment.outputs.image-uri }}

    - name: Checkout SGLang tests
      uses: actions/checkout@v5
      with:
        repository: sgl-project/sglang
        # Upstream tag matching the SGLang version under test.
        ref: v${{ env.SGLANG_VERSION }}
        path: sglang_source

    - name: Start container
      env:
        # Supplied through the step environment so the secret is never
        # spliced into the shell script or the docker command line.
        HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      run: |
        # The entrypoint is overridden with bash so the container idles and
        # later steps can `docker exec` into it.
        # `-e HF_TOKEN` (no value) forwards the variable from this step's env.
        CONTAINER_ID=$(docker run -d -it --rm --gpus=all --entrypoint /bin/bash \
          -v "${HOME}/.cache/huggingface:/root/.cache/huggingface" \
          -v ./sglang_source:/workdir --workdir /workdir \
          -e HF_TOKEN \
          ${{ needs.set-ec2-test-environment.outputs.image-uri }})
        echo "CONTAINER_ID=${CONTAINER_ID}" >> "${GITHUB_ENV}"

    - name: Setup for SGLang tests
      run: |
        docker exec "${CONTAINER_ID}" sh -c '
          set -eux

          bash scripts/ci/ci_install_dependency.sh
        '

    - name: Run SGLang tests
      run: |
        docker exec "${CONTAINER_ID}" sh -c '
          set -eux
          nvidia-smi

          # SRT backend Test
          cd /workdir/test
          python3 run_suite.py --hw cuda --suite stage-a-test-1
        '
264+
85265
# ======================================================
86266
# =============== SGLang SageMaker jobs ================
87267
# ======================================================
88-
build-sglang-image:
268+
build-sglang-sagemaker-image:
89269
needs: [check-changes]
90270
if: needs.check-changes.outputs.build-change == 'true'
91271
runs-on:
92272
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
93273
fleet:x86-build-runner
94274
buildspec-override:true
95275
concurrency:
96-
group: ${{ github.workflow }}-build-sglang-image-${{ github.event.pull_request.number }}
276+
group: ${{ github.workflow }}-build-sglang-sagemaker-image-${{ github.event.pull_request.number }}
97277
cancel-in-progress: true
98278
outputs:
99279
ci-image: ${{ steps.build.outputs.image-uri }}
@@ -114,14 +294,14 @@ jobs:
114294
tag-pr: ${{ env.FRAMEWORK }}-${{ env.SGLANG_VERSION }}-gpu-${{ env.PYTHON_VERSION }}-${{ env.CUDA_VERSION }}-${{ env.OS_VERSION }}-sagemaker-pr-${{ github.event.pull_request.number }}
115295
dockerfile-path: docker/${{ env.FRAMEWORK }}/Dockerfile
116296

117-
set-test-environment:
118-
needs: [check-changes, build-sglang-image]
297+
set-sagemaker-test-environment:
298+
needs: [check-changes, build-sglang-sagemaker-image]
119299
if: |
120300
always() && !failure() && !cancelled() &&
121301
(needs.check-changes.outputs.build-change == 'true' || needs.check-changes.outputs.test-change == 'true')
122302
runs-on: ubuntu-latest
123303
concurrency:
124-
group: ${{ github.workflow }}-set-test-environment-${{ github.event.pull_request.number }}
304+
group: ${{ github.workflow }}-set-sagemaker-test-environment-${{ github.event.pull_request.number }}
125305
cancel-in-progress: true
126306
outputs:
127307
aws-account-id: ${{ steps.set-env.outputs.AWS_ACCOUNT_ID }}
@@ -133,9 +313,9 @@ jobs:
133313
- name: Set test environment
134314
id: set-env
135315
run: |
136-
if [[ "${{ needs.build-sglang-image.result }}" == "success" ]]; then
316+
if [[ "${{ needs.build-sglang-sagemaker-image.result }}" == "success" ]]; then
137317
AWS_ACCOUNT_ID=${{ vars.CI_AWS_ACCOUNT_ID }}
138-
IMAGE_URI=${{ needs.build-sglang-image.outputs.ci-image }}
318+
IMAGE_URI=${{ needs.build-sglang-sagemaker-image.outputs.ci-image }}
139319
else
140320
AWS_ACCOUNT_ID=${{ vars.PROD_AWS_ACCOUNT_ID }}
141321
IMAGE_URI=${{ vars.PROD_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/${{ env.PROD_SAGEMAKER_IMAGE }}
@@ -145,15 +325,15 @@ jobs:
145325
echo "AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID}" >> ${GITHUB_OUTPUT}
146326
echo "IMAGE_URI=${IMAGE_URI}" >> ${GITHUB_OUTPUT}
147327
148-
sglang-local-benchmark-test:
149-
needs: [build-sglang-image, set-test-environment]
328+
sglang-sagemaker-local-benchmark-test:
329+
needs: [build-sglang-sagemaker-image, set-sagemaker-test-environment]
150330
if: success()
151331
runs-on:
152332
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
153333
fleet:x86-g6xl-runner
154334
buildspec-override:true
155335
concurrency:
156-
group: ${{ github.workflow }}-sglang-local-benchmark-test-${{ github.event.pull_request.number }}
336+
group: ${{ github.workflow }}-sglang-sagemaker-local-benchmark-test-${{ github.event.pull_request.number }}
157337
cancel-in-progress: true
158338
steps:
159339
- name: Checkout DLC source
@@ -162,9 +342,9 @@ jobs:
162342
- name: Container pull
163343
uses: ./.github/actions/ecr-authenticate
164344
with:
165-
aws-account-id: ${{ needs.set-test-environment.outputs.aws-account-id }}
345+
aws-account-id: ${{ needs.set-sagemaker-test-environment.outputs.aws-account-id }}
166346
aws-region: ${{ vars.AWS_REGION }}
167-
image-uri: ${{ needs.set-test-environment.outputs.image-uri }}
347+
image-uri: ${{ needs.set-sagemaker-test-environment.outputs.image-uri }}
168348

169349
- name: Setup for SGLang datasets
170350
run: |
@@ -187,7 +367,7 @@ jobs:
187367
-e SM_SGLANG_HOST=127.0.0.1 \
188368
-e SM_SGLANG_PORT=30000 \
189369
-e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} \
190-
${{ needs.set-test-environment.outputs.image-uri }})
370+
${{ needs.set-sagemaker-test-environment.outputs.image-uri }})
191371
echo "CONTAINER_ID=${CONTAINER_ID}" >> ${GITHUB_ENV}
192372
echo "Waiting for serving endpoint startup ..."
193373
sleep 120s
@@ -203,8 +383,8 @@ jobs:
203383
--dataset-name sharegpt \
204384
--dataset-path /dataset/ShareGPT_V3_unfiltered_cleaned_split.json
205385
206-
sglang-upstream-test:
207-
needs: [build-sglang-image, set-test-environment]
386+
sglang-sagemaker-upstream-test:
387+
needs: [build-sglang-sagemaker-image, set-sagemaker-test-environment]
208388
if: success()
209389
runs-on:
210390
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
@@ -220,9 +400,9 @@ jobs:
220400
- name: Container pull
221401
uses: ./.github/actions/ecr-authenticate
222402
with:
223-
aws-account-id: ${{ needs.set-test-environment.outputs.aws-account-id }}
403+
aws-account-id: ${{ needs.set-sagemaker-test-environment.outputs.aws-account-id }}
224404
aws-region: ${{ vars.AWS_REGION }}
225-
image-uri: ${{ needs.set-test-environment.outputs.image-uri }}
405+
image-uri: ${{ needs.set-sagemaker-test-environment.outputs.image-uri }}
226406

227407
- name: Checkout SGLang tests
228408
uses: actions/checkout@v5
@@ -237,7 +417,7 @@ jobs:
237417
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
238418
-v ./sglang_source:/workdir --workdir /workdir \
239419
-e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} \
240-
${{ needs.set-test-environment.outputs.image-uri }})
420+
${{ needs.set-sagemaker-test-environment.outputs.image-uri }})
241421
echo "CONTAINER_ID=${CONTAINER_ID}" >> ${GITHUB_ENV}
242422
243423
- name: Setup for SGLang tests
@@ -260,10 +440,10 @@ jobs:
260440
'
261441
262442
sglang-sagemaker-endpoint-test:
263-
needs: [set-test-environment]
443+
needs: [set-sagemaker-test-environment]
264444
if: |
265445
always() && !failure() && !cancelled() &&
266-
needs.set-test-environment.result == 'success'
446+
needs.set-sagemaker-test-environment.result == 'success'
267447
runs-on:
268448
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
269449
fleet:default-runner
@@ -286,4 +466,4 @@ jobs:
286466
run: |
287467
source .venv/bin/activate
288468
cd test/
289-
python3 -m pytest -vs -rA --image-uri ${{ needs.set-test-environment.outputs.image-uri }} sglang/sagemaker
469+
python3 -m pytest -vs -rA --image-uri ${{ needs.set-sagemaker-test-environment.outputs.image-uri }} sglang/sagemaker

docker/sglang/Dockerfile

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,4 +106,23 @@ RUN rm -rf /tmp/*
106106
COPY ./scripts/sglang/sagemaker_entrypoint.sh /usr/bin/serve
107107
RUN chmod +x /usr/bin/serve
108108

109+
ENTRYPOINT ["/usr/bin/serve"]
110+
111+
# =======================================================
112+
# ====================== EC2 ============================
113+
# =======================================================
114+
115+
FROM base AS sglang-ec2

# NOTE(review): CACHE_REFRESH is not referenced below; presumably it is a
# build-arg knob for invalidating the cached upgrade layer. Confirm with the
# build action.
ARG CACHE_REFRESH=0

# Put every installed cuda/nvidia/libnv* package on hold, then upgrade the
# remaining OS packages.
RUN dpkg -l \
      | grep -E "cuda|nvidia|libnv" \
      | awk '{print $2}' \
      | xargs apt-mark hold \
    && apt-get update \
    && apt-get upgrade -y \
    && apt-get clean

# Clear out /tmp.
RUN rm -rf /tmp/*

# Install the EC2 entrypoint script as /usr/bin/serve and make it executable.
COPY ./scripts/sglang/dockerd_entrypoint.sh /usr/bin/serve
RUN chmod +x /usr/bin/serve

ENTRYPOINT ["/usr/bin/serve"]
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#!/usr/bin/env bash
# Container entrypoint: run the telemetry hook, then start the SGLang server
# with whatever arguments were passed to the container.

# Best-effort telemetry. There is no existence check: a missing or failing
# script is tolerated because output is discarded and the exit status ignored.
bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true

# `exec` replaces this shell with the server process, so signals sent to the
# container's main process (e.g. SIGTERM from `docker stop`) reach the server
# directly instead of stopping at an intermediate bash.
exec python3 -m sglang.launch_server "$@"

0 commit comments

Comments
 (0)