Skip to content

Commit 3996f18

Browse files
authored
Merge branch 'main' into optimize-max-tokens
Signed-off-by: bin.pan <[email protected]>
2 parents 54735f3 + a629b86 commit 3996f18

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

54 files changed

+2424
-696
lines changed

.github/workflows/nightly-ci.yml

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
name: Nightly CI
2+
3+
on:
4+
schedule:
5+
- cron: '0 8 * * *' # Every day at 12:00 AM PST (08:00 UTC)
6+
workflow_dispatch:
7+
8+
jobs:
9+
vllm:
10+
strategy:
11+
fail-fast: false
12+
matrix:
13+
platform:
14+
- { arch: amd64, runner: gpu-l40-amd64 }
15+
- { arch: arm64, runner: cpu-arm-r8g-4xlarge }
16+
name: vllm (${{ matrix.platform.arch }})
17+
runs-on: ${{ matrix.platform.runner }}
18+
steps:
19+
- name: Checkout code
20+
uses: actions/checkout@v4
21+
- name: Build vLLM Docker Image
22+
id: build-vllm
23+
uses: ./.github/actions/docker-build
24+
with:
25+
framework: vllm
26+
target: runtime
27+
platform: linux/${{ matrix.platform.arch }}
28+
base_image_tag: ${{ matrix.platform.arch == 'arm64' && '25.06-cuda12.9-devel-ubuntu24.04' || '' }}
29+
runtime_image_tag: ${{ matrix.platform.arch == 'arm64' && '12.9.0-runtime-ubuntu24.04' || '' }}
30+
cuda_version: ${{ matrix.platform.arch == 'arm64' && '129' || '' }}
31+
torch_backend: ${{ matrix.platform.arch == 'arm64' && 'cu129' || '' }}
32+
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
33+
ci_token: ${{ secrets.CI_TOKEN }}
34+
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
35+
sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
36+
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
37+
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
38+
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
39+
image_tag: nightly-vllm-${{ matrix.platform.arch }}
40+
- name: Tag and Push vLLM Nightly Image
41+
uses: ./.github/actions/docker-tag-push
42+
with:
43+
local_image: ${{ steps.build-vllm.outputs.image_tag }}
44+
# Tag the image nightly
45+
push_tag: ai-dynamo/dynamo:nightly-vllm-${{ matrix.platform.arch }}
46+
aws_push: 'false'
47+
azure_push: 'true'
48+
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
49+
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
50+
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
51+
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
52+
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
53+
- name: Run unit tests
54+
if: ${{ matrix.platform.arch != 'arm64' }}
55+
uses: ./.github/actions/pytest
56+
with:
57+
image_tag: nightly-vllm-${{ matrix.platform.arch }}
58+
pytest_marks: "vllm and unit"
59+
framework: "vllm"
60+
test_type: "unit"
61+
platform_arch: ${{ matrix.platform.arch }}
62+
- name: Run e2e tests
63+
if: ${{ matrix.platform.arch != 'arm64' }}
64+
uses: ./.github/actions/pytest
65+
with:
66+
image_tag: nightly-vllm-${{ matrix.platform.arch }}
67+
pytest_marks: "nightly and vllm and gpu_1"
68+
framework: "vllm"
69+
test_type: "e2e"
70+
platform_arch: ${{ matrix.platform.arch }}

benchmarks/profiler/profile_sla.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
2828
from benchmarks.profiler.utils.plot import (
2929
plot_decode_performance,
30+
plot_pd_joint_results,
3031
plot_prefill_performance,
3132
)
3233
from benchmarks.profiler.utils.profile_cache import (
@@ -280,14 +281,10 @@ async def run_profile(args):
280281
prefill_thpt_per_gpu.append(args.isl / ttft / num_gpus * 1000)
281282

282283
# Plot the results as a 2D scatter plot
284+
prefill_results = None
283285
if prefill_num_gpus and prefill_ttft and prefill_thpt_per_gpu:
284-
plot_prefill_performance(
285-
prefill_num_gpus,
286-
prefill_ttft,
287-
prefill_thpt_per_gpu,
288-
args.ttft,
289-
args.output_dir,
290-
)
286+
prefill_results = (prefill_num_gpus, prefill_ttft, prefill_thpt_per_gpu)
287+
plot_prefill_performance(prefill_results, args.ttft, args.output_dir)
291288

292289
# then profile decode
293290
decode_num_gpus = []
@@ -476,6 +473,11 @@ async def run_profile(args):
476473
if decode_results:
477474
plot_decode_performance(decode_results, args.itl, args.output_dir)
478475

476+
if prefill_results and decode_results:
477+
plot_pd_joint_results(
478+
args.isl, args.osl, prefill_results, decode_results, args.output_dir
479+
)
480+
479481
if args.dry_run:
480482
logger.info("Skipping recommendations in dry run mode")
481483
else:
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
5+
def compute_pareto(x, y):
6+
"""
7+
compute the pareto front (top-left is better) for the given x and y values
8+
return sorted lists of the x and y values for the pareto front
9+
"""
10+
# Validate inputs
11+
if x is None or y is None:
12+
return [], []
13+
14+
if len(x) != len(y):
15+
raise ValueError("x and y must have the same length")
16+
17+
if len(x) == 0:
18+
return [], []
19+
20+
# Build point list and sort by x asc, then y desc so we prefer smaller x and larger y.
21+
points = list(zip(x, y))
22+
points.sort(key=lambda p: (p[0], -p[1]))
23+
24+
# Single pass to keep only non-dominated points (minimize x, maximize y).
25+
pareto = []
26+
max_y = float("-inf")
27+
for px, py in points:
28+
if py > max_y:
29+
pareto.append((px, py))
30+
max_y = py
31+
32+
# Return sorted by x ascending for convenience
33+
pareto.sort(key=lambda p: (p[0], p[1]))
34+
xs = [px for px, _ in pareto]
35+
ys = [py for _, py in pareto]
36+
return xs, ys

benchmarks/profiler/utils/plot.py

Lines changed: 49 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
from matplotlib import cm
2121
from scipy.interpolate import griddata
2222

23+
from benchmarks.profiler.utils.pareto import compute_pareto
24+
2325
logger = logging.getLogger(__name__)
2426
logger.setLevel(logging.INFO)
2527
console_handler = logging.StreamHandler()
@@ -31,19 +33,16 @@
3133
logger.addHandler(console_handler)
3234

3335

34-
def plot_prefill_performance(
35-
prefill_num_gpu, prefill_ttft, prefill_thpt_per_gpu, target_ttft, output_dir
36-
):
36+
def plot_prefill_performance(prefill_results, target_ttft, output_dir):
3737
"""
3838
Plot prefill performance as a 2D scatter plot with GPU count annotations.
3939
4040
Args:
41-
prefill_num_gpu: list of GPU counts
42-
prefill_ttft: list of time to first token values
43-
prefill_thpt_per_gpu: list of throughput per GPU values
41+
prefill_results: tuple of (prefill_num_gpu, prefill_ttft, prefill_thpt_per_gpu)
4442
target_ttft: target TTFT value for the vertical line
4543
output_dir: directory to save the plot
4644
"""
45+
prefill_num_gpu, prefill_ttft, prefill_thpt_per_gpu = prefill_results
4746
plt.figure(figsize=(10, 6))
4847
plt.scatter(prefill_ttft, prefill_thpt_per_gpu, s=100)
4948
for i, num_gpu in enumerate(prefill_num_gpu):
@@ -252,3 +251,47 @@ def plot_decode_3d_surface(
252251
logger.info(f"Saving throughput surface plot to {thpt_plot_path}")
253252
plt.savefig(thpt_plot_path, dpi=300, bbox_inches="tight")
254253
plt.close()
254+
255+
256+
def plot_pd_joint_results(isl, osl, prefill_results, decode_results, output_dir):
257+
GPU_COST_PER_HOUR = 3.0 # $3/hour
258+
259+
# compute pareto front for prefill
260+
p_ttft, p_thpt = compute_pareto(prefill_results[1], prefill_results[2])
261+
262+
# compute pareto front for decode
263+
_d_itl, _d_thpt = [], []
264+
for _d_result in decode_results:
265+
_d_itl.extend(_d_result[1])
266+
_d_thpt.extend(_d_result[2])
267+
d_itl, d_thpt = compute_pareto(_d_itl, _d_thpt)
268+
269+
# convert to cost per thousand requests
270+
p_ttft = np.array(p_ttft)
271+
p_thpt = np.array(p_thpt)
272+
d_itl = np.array(d_itl)
273+
d_thpt = np.array(d_thpt)
274+
275+
tokens_per_user = []
276+
cost = []
277+
ttft = []
278+
for _p_ttft, _p_thpt in zip(p_ttft, p_thpt):
279+
ttft.append(_p_ttft)
280+
prefill_cost = isl * 1000 / _p_thpt * GPU_COST_PER_HOUR / 3600
281+
tokens_per_user.append(1000 / d_itl)
282+
cost.append(osl * 1000 / d_thpt * GPU_COST_PER_HOUR / 3600 + prefill_cost)
283+
284+
# plot
285+
plt.figure(figsize=(12, 10))
286+
plt.title(
287+
f"Cost Per 1000 i{isl}o{osl} requests (GPU/hour = ${GPU_COST_PER_HOUR}) Under Different SLA"
288+
)
289+
for _tokens_per_user, _cost, _ttft in zip(tokens_per_user, cost, ttft):
290+
line = plt.plot(_tokens_per_user, _cost, label=f"TTFT: {_ttft:.2f}ms")[0]
291+
plt.scatter(_tokens_per_user, _cost, marker="x", s=100, color=line.get_color())
292+
plt.xlabel("Tokens per User")
293+
plt.ylabel("Cost ($)")
294+
plt.grid(True)
295+
plt.legend()
296+
plt.savefig(f"{output_dir}/cost_sla.png", dpi=300)
297+
plt.close()

0 commit comments

Comments
 (0)