Skip to content

Commit 3996f18

Browse files
authored
Merge branch 'main' into optimize-max-tokens
Signed-off-by: bin.pan <[email protected]>
2 parents 54735f3 + a629b86 commit 3996f18

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

54 files changed

+2424
-696
lines changed

.github/workflows/nightly-ci.yml

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
name: Nightly CI
2+
3+
on:
4+
schedule:
5+
- cron: '0 8 * * *' # Every day at 12:00 AM PST (08:00 UTC)
6+
workflow_dispatch:
7+
8+
jobs:
9+
vllm:
10+
strategy:
11+
fail-fast: false
12+
matrix:
13+
platform:
14+
- { arch: amd64, runner: gpu-l40-amd64 }
15+
- { arch: arm64, runner: cpu-arm-r8g-4xlarge }
16+
name: vllm (${{ matrix.platform.arch }})
17+
runs-on: ${{ matrix.platform.runner }}
18+
steps:
19+
- name: Checkout code
20+
uses: actions/checkout@v4
21+
- name: Build vLLM Docker Image
22+
id: build-vllm
23+
uses: ./.github/actions/docker-build
24+
with:
25+
framework: vllm
26+
target: runtime
27+
platform: linux/${{ matrix.platform.arch }}
28+
base_image_tag: ${{ matrix.platform.arch == 'arm64' && '25.06-cuda12.9-devel-ubuntu24.04' || '' }}
29+
runtime_image_tag: ${{ matrix.platform.arch == 'arm64' && '12.9.0-runtime-ubuntu24.04' || '' }}
30+
cuda_version: ${{ matrix.platform.arch == 'arm64' && '129' || '' }}
31+
torch_backend: ${{ matrix.platform.arch == 'arm64' && 'cu129' || '' }}
32+
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
33+
ci_token: ${{ secrets.CI_TOKEN }}
34+
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
35+
sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
36+
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
37+
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
38+
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
39+
image_tag: nightly-vllm-${{ matrix.platform.arch }}
40+
- name: Tag and Push vLLM Nightly Image
41+
uses: ./.github/actions/docker-tag-push
42+
with:
43+
local_image: ${{ steps.build-vllm.outputs.image_tag }}
44+
# Tag the image nightly
45+
push_tag: ai-dynamo/dynamo:nightly-vllm-${{ matrix.platform.arch }}
46+
aws_push: 'false'
47+
azure_push: 'true'
48+
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
49+
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
50+
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
51+
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
52+
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
53+
- name: Run unit tests
54+
if: ${{ matrix.platform.arch != 'arm64' }}
55+
uses: ./.github/actions/pytest
56+
with:
57+
image_tag: nightly-vllm-${{ matrix.platform.arch }}
58+
pytest_marks: "vllm and unit"
59+
framework: "vllm"
60+
test_type: "unit"
61+
platform_arch: ${{ matrix.platform.arch }}
62+
- name: Run e2e tests
63+
if: ${{ matrix.platform.arch != 'arm64' }}
64+
uses: ./.github/actions/pytest
65+
with:
66+
image_tag: nightly-vllm-${{ matrix.platform.arch }}
67+
pytest_marks: "nightly and vllm and gpu_1"
68+
framework: "vllm"
69+
test_type: "e2e"
70+
platform_arch: ${{ matrix.platform.arch }}

benchmarks/profiler/profile_sla.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
2828
from benchmarks.profiler.utils.plot import (
2929
plot_decode_performance,
30+
plot_pd_joint_results,
3031
plot_prefill_performance,
3132
)
3233
from benchmarks.profiler.utils.profile_cache import (
@@ -280,14 +281,10 @@ async def run_profile(args):
280281
prefill_thpt_per_gpu.append(args.isl / ttft / num_gpus * 1000)
281282

282283
# Plot the results as a 2D scatter plot
284+
prefill_results = None
283285
if prefill_num_gpus and prefill_ttft and prefill_thpt_per_gpu:
284-
plot_prefill_performance(
285-
prefill_num_gpus,
286-
prefill_ttft,
287-
prefill_thpt_per_gpu,
288-
args.ttft,
289-
args.output_dir,
290-
)
286+
prefill_results = (prefill_num_gpus, prefill_ttft, prefill_thpt_per_gpu)
287+
plot_prefill_performance(prefill_results, args.ttft, args.output_dir)
291288

292289
# then profile decode
293290
decode_num_gpus = []
@@ -476,6 +473,11 @@ async def run_profile(args):
476473
if decode_results:
477474
plot_decode_performance(decode_results, args.itl, args.output_dir)
478475

476+
if prefill_results and decode_results:
477+
plot_pd_joint_results(
478+
args.isl, args.osl, prefill_results, decode_results, args.output_dir
479+
)
480+
479481
if args.dry_run:
480482
logger.info("Skipping recommendations in dry run mode")
481483
else:
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
5+
def compute_pareto(x, y):
6+
"""
7+
compute the pareto front (top-left is better) for the given x and y values
8+
return sorted lists of the x and y values for the pareto front
9+
"""
10+
# Validate inputs
11+
if x is None or y is None:
12+
return [], []
13+
14+
if len(x) != len(y):
15+
raise ValueError("x and y must have the same length")
16+
17+
if len(x) == 0:
18+
return [], []
19+
20+
# Build point list and sort by x asc, then y desc so we prefer smaller x and larger y.
21+
points = list(zip(x, y))
22+
points.sort(key=lambda p: (p[0], -p[1]))
23+
24+
# Single pass to keep only non-dominated points (minimize x, maximize y).
25+
pareto = []
26+
max_y = float("-inf")
27+
for px, py in points:
28+
if py > max_y:
29+
pareto.append((px, py))
30+
max_y = py
31+
32+
# Return sorted by x ascending for convenience
33+
pareto.sort(key=lambda p: (p[0], p[1]))
34+
xs = [px for px, _ in pareto]
35+
ys = [py for _, py in pareto]
36+
return xs, ys

benchmarks/profiler/utils/plot.py

Lines changed: 49 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
from matplotlib import cm
2121
from scipy.interpolate import griddata
2222

23+
from benchmarks.profiler.utils.pareto import compute_pareto
24+
2325
logger = logging.getLogger(__name__)
2426
logger.setLevel(logging.INFO)
2527
console_handler = logging.StreamHandler()
@@ -31,19 +33,16 @@
3133
logger.addHandler(console_handler)
3234

3335

34-
def plot_prefill_performance(
35-
prefill_num_gpu, prefill_ttft, prefill_thpt_per_gpu, target_ttft, output_dir
36-
):
36+
def plot_prefill_performance(prefill_results, target_ttft, output_dir):
3737
"""
3838
Plot prefill performance as a 2D scatter plot with GPU count annotations.
3939
4040
Args:
41-
prefill_num_gpu: list of GPU counts
42-
prefill_ttft: list of time to first token values
43-
prefill_thpt_per_gpu: list of throughput per GPU values
41+
prefill_results: tuple of (prefill_num_gpu, prefill_ttft, prefill_thpt_per_gpu)
4442
target_ttft: target TTFT value for the vertical line
4543
output_dir: directory to save the plot
4644
"""
45+
prefill_num_gpu, prefill_ttft, prefill_thpt_per_gpu = prefill_results
4746
plt.figure(figsize=(10, 6))
4847
plt.scatter(prefill_ttft, prefill_thpt_per_gpu, s=100)
4948
for i, num_gpu in enumerate(prefill_num_gpu):
@@ -252,3 +251,47 @@ def plot_decode_3d_surface(
252251
logger.info(f"Saving throughput surface plot to {thpt_plot_path}")
253252
plt.savefig(thpt_plot_path, dpi=300, bbox_inches="tight")
254253
plt.close()
254+
255+
256+
def plot_pd_joint_results(isl, osl, prefill_results, decode_results, output_dir):
257+
GPU_COST_PER_HOUR = 3.0 # $3/hour
258+
259+
# compute pareto front for prefill
260+
p_ttft, p_thpt = compute_pareto(prefill_results[1], prefill_results[2])
261+
262+
# compute pareto front for decode
263+
_d_itl, _d_thpt = [], []
264+
for _d_result in decode_results:
265+
_d_itl.extend(_d_result[1])
266+
_d_thpt.extend(_d_result[2])
267+
d_itl, d_thpt = compute_pareto(_d_itl, _d_thpt)
268+
269+
# convert to cost per thousand requests
270+
p_ttft = np.array(p_ttft)
271+
p_thpt = np.array(p_thpt)
272+
d_itl = np.array(d_itl)
273+
d_thpt = np.array(d_thpt)
274+
275+
tokens_per_user = []
276+
cost = []
277+
ttft = []
278+
for _p_ttft, _p_thpt in zip(p_ttft, p_thpt):
279+
ttft.append(_p_ttft)
280+
prefill_cost = isl * 1000 / _p_thpt * GPU_COST_PER_HOUR / 3600
281+
tokens_per_user.append(1000 / d_itl)
282+
cost.append(osl * 1000 / d_thpt * GPU_COST_PER_HOUR / 3600 + prefill_cost)
283+
284+
# plot
285+
plt.figure(figsize=(12, 10))
286+
plt.title(
287+
f"Cost Per 1000 i{isl}o{osl} requests (GPU/hour = ${GPU_COST_PER_HOUR}) Under Different SLA"
288+
)
289+
for _tokens_per_user, _cost, _ttft in zip(tokens_per_user, cost, ttft):
290+
line = plt.plot(_tokens_per_user, _cost, label=f"TTFT: {_ttft:.2f}ms")[0]
291+
plt.scatter(_tokens_per_user, _cost, marker="x", s=100, color=line.get_color())
292+
plt.xlabel("Tokens per User")
293+
plt.ylabel("Cost ($)")
294+
plt.grid(True)
295+
plt.legend()
296+
plt.savefig(f"{output_dir}/cost_sla.png", dpi=300)
297+
plt.close()

0 commit comments

Comments
 (0)