Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions benchmarks/profiler/webapp/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Package entry point for the Dynamo SLA Profiler webapp."""

import importlib.metadata

from benchmarks.profiler.webapp.main import main

# Version is resolved from the installed "aiconfigurator" distribution metadata.
__version__ = importlib.metadata.version("aiconfigurator")

# NOTE(review): the original called main() unconditionally at import time,
# which launches the webapp whenever ANY submodule of this package is imported
# (e.g. benchmarks.profiler.webapp.core.orchestrator imports from this
# package). Guard the launch so plain imports stay side-effect free; start the
# app explicitly via `python -m benchmarks.profiler.webapp.main` or a console
# entry point instead.
if __name__ == "__main__":
    main()
10 changes: 10 additions & 0 deletions benchmarks/profiler/webapp/core/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Core functionality for the Dynamo SLA Profiler webapp.

This package contains:
- constants: Shared constants and configuration
- orchestrator: Pipeline coordinating profiling, plot generation, and tables
- profiling: Performance profiling logic using AI Configurator
"""
153 changes: 153 additions & 0 deletions benchmarks/profiler/webapp/core/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Constants and configuration for the Dynamo SLA Profiler webapp.
"""

# Table headers for different performance metrics
# Prefill table: one row per profiled GPU count (TTFT = Time To First Token).
PREFILL_TABLE_HEADERS = [
"GPUs",
"TTFT (ms)",
"Throughput (tokens/s/GPU)",
]

# Decode table: one row per profiled GPU count (ITL = Inter Token Latency).
DECODE_TABLE_HEADERS = [
"GPUs",
"ITL (ms)",
"Throughput (tokens/s/GPU)",
]

# Cost table: combines prefill/decode SLA metrics with a dollar cost column.
COST_TABLE_HEADERS = [
"TTFT (ms)",
"Prefill Thpt (tokens/s/GPU)",
"ITL (ms)",
"Decode Thpt (tokens/s/GPU)",
"Tokens/User",
"Cost ($)",
]

# Backend version mapping
# Keys match the entries of INFERENCE_BACKENDS below; values are the version
# strings selectable per backend (trtllm listed newest-first).
BACKEND_VERSIONS = {
"trtllm": ["1.0.0", "0.20.0", "0.19.0", "0.18.0"],
"vllm": ["0.10.0"],
"sglang": ["0.4.5"],
}

# Supported GPU systems
GPU_SYSTEMS = [
"H100_SXM",
"H200_SXM",
"A100_SXM",
"A100_PCIE",
]

# Supported inference backends
INFERENCE_BACKENDS = ["vllm", "sglang", "trtllm"]

# GPU count options
# Selectable per-engine GPU counts; the max list extends the min list with 16.
MIN_GPU_OPTIONS = [1, 2, 4, 8]
MAX_GPU_OPTIONS = [1, 2, 4, 8, 16]

# Default decode interpolation granularity
DEFAULT_DECODE_INTERPOLATION_GRANULARITY = 6

# CSS styles for custom table rendering
# Raw <style> block injected alongside the generated table HTML; the
# rgba(255, 255, 255, ...) colors target a dark theme.
TABLE_CSS = """
<style>
.dynamo-table-wrapper {
overflow-x: auto;
margin-top: 0.5rem;
}
.dynamo-table {
width: 100%;
border-collapse: collapse;
font-size: 0.95rem;
}
.dynamo-table thead {
background: rgba(255, 255, 255, 0.05);
text-transform: uppercase;
letter-spacing: 0.02em;
}
.dynamo-table th,
.dynamo-table td {
padding: 0.55rem 0.75rem;
text-align: left;
border-bottom: 1px solid rgba(255, 255, 255, 0.08);
}
.dynamo-table tbody tr:hover {
background: rgba(255, 255, 255, 0.08);
}
.dynamo-table-empty {
text-align: center;
padding: 0.85rem 0;
opacity: 0.7;
}
</style>
"""

# Default configuration YAML placeholder
# NOTE(review): the YAML nesting indentation appears flattened in this view
# (spec/services/Frontend/... should be indented) — verify the literal matches
# a valid DynamoGraphDeployment manifest before shipping.
DEFAULT_CONFIG_YAML = """apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: vllm-disagg
spec:
services:
Frontend:
dynamoNamespace: vllm-disagg
componentType: frontend
replicas: 1
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag"""

# Plot interaction instructions
# Markdown blurb rendered above the plots explaining selection behavior.
PLOT_INTERACTION_INSTRUCTIONS = """
**How to interact with plots:**
- **Hover** over points to see detailed information
- **Click** points to select them (click again to deselect)
- **Multiple selection**: Click multiple points with shift key or select tools from the top right corner to compare specific configurations
- The table below each plot will filter to show only selected points, or all points if none are selected
"""

# Tab descriptions
# One markdown description per results tab (prefill / decode / cost).
PREFILL_TAB_DESCRIPTION = """
**Prefill Performance**: Interactive plot showing the relationship between Time to First Token (TTFT)
and throughput per GPU for different GPU counts. **Click points to select/deselect** (multi-select enabled).
Table shows selected points, or all points if none selected.
"""

DECODE_TAB_DESCRIPTION = """
**Decode Performance**: Interactive plot showing the relationship between Inter Token Latency (ITL)
and throughput per GPU for different GPU counts. **Click points to select/deselect** (multi-select enabled).
Table shows selected points, or all points if none selected.
"""

COST_TAB_DESCRIPTION = """
**Cost Analysis**: Interactive plot showing the cost per 1000 requests under different SLA configurations.
Lower curves represent better cost efficiency for the same throughput. **Click points to select/deselect** (multi-select enabled).
Table shows selected points, or all points if none selected.
"""

# Plotly color palette
# First six colors of the standard "category10" qualitative palette.
PLOTLY_COLORS = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b"]

# Plotly dark theme configuration
# Keyword arguments spread into the figure layout: transparent backgrounds,
# a dimmed modebar, and a semi-transparent legend pinned to the top-left.
PLOTLY_DARK_THEME = {
"template": "plotly_dark",
"plot_bgcolor": "rgba(0, 0, 0, 0)",
"paper_bgcolor": "rgba(0, 0, 0, 0)",
"modebar": dict(
bgcolor="rgba(0, 0, 0, 0)",
color="rgba(255, 255, 255, 0.5)",
activecolor="rgba(255, 255, 255, 0.9)",
),
"legend": dict(
yanchor="top",
y=0.99,
xanchor="left",
x=0.01,
bgcolor="rgba(0, 0, 0, 0.5)",
font=dict(color="white"),
),
}
185 changes: 185 additions & 0 deletions benchmarks/profiler/webapp/core/orchestrator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Orchestration logic for generating performance plots.

This module contains the main pipeline that coordinates profiling,
plot generation, and table building.
"""

from benchmarks.profiler.webapp.core.profiling import (
format_status_message,
generate_gpu_configurations,
initialize_ai_configurator,
profile_decode_performance,
profile_prefill_performance,
validate_inputs,
)
from benchmarks.profiler.webapp.ui.plots import (
plot_cost_sla_interactive,
plot_decode_performance_interactive,
plot_prefill_performance_interactive,
)
from benchmarks.profiler.webapp.ui.tables import build_all_tables, get_empty_tables


def _failure(message, empty_tables):
    """Build the failure variant of generate_plots' 7-tuple result.

    Args:
        message: Status/error string to surface in the UI.
        empty_tables: (prefill_html, decode_html, cost_html) placeholders,
            as returned by get_empty_tables().

    Returns:
        (None, None, None, message, prefill_html, decode_html, cost_html) —
        same shape as generate_plots' success result, with no plots.
    """
    empty_prefill_html, empty_decode_html, empty_cost_html = empty_tables
    return (
        None,
        None,
        None,
        message,
        empty_prefill_html,
        empty_decode_html,
        empty_cost_html,
    )


def generate_plots(
    aic_model_name: str,
    backend: str,
    config_yaml: str,
    use_aic: bool,
    aic_backend: str,
    aic_backend_version: str,
    aic_system: str,
    min_num_gpus_per_engine: int,
    max_num_gpus_per_engine: int,
    num_gpus_per_node: int,
    gpu_cost_per_hour: float,
    isl: int,
    osl: int,
    max_context_length: int,
    ttft: float,
    itl: float,
):
    """
    Generate performance plots using AI Configurator estimation.

    This function profiles LLM inference performance by:
    1. Estimating prefill performance (TTFT) across different GPU counts
    2. Estimating decode performance (ITL) at various concurrency levels
    3. Computing cost-vs-SLA tradeoffs based on GPU pricing

    Args:
        aic_model_name: Model name for AI Configurator (e.g., "QWEN3_32B")
        backend: Inference backend (vllm, sglang, trtllm) - for reference only
        config_yaml: YAML configuration string from UI (reserved for future use)
        use_aic: Whether to use AI Configurator (must be True for webapp)
        aic_backend: Backend for AI Configurator estimation
        aic_backend_version: Version of the backend
        aic_system: GPU system (e.g., "H200_SXM")
        min_num_gpus_per_engine: Minimum TP size to profile
        max_num_gpus_per_engine: Maximum TP size to profile
        num_gpus_per_node: GPUs per node (for MoE models, unused for dense)
        gpu_cost_per_hour: Cost per GPU per hour in dollars
        isl: Input sequence length
        osl: Output sequence length
        max_context_length: Maximum context length (currently unused)
        ttft: Target TTFT in milliseconds (for visualization)
        itl: Target ITL in milliseconds (for visualization)

    Returns:
        Tuple of (prefill_plot, decode_plot, cost_plot, status_message,
        prefill_table_html, decode_table_html, cost_table_html)
    """
    # Placeholder table HTML, reused by every failure path via _failure().
    empty_tables = get_empty_tables()

    try:
        # Validate inputs before doing any expensive work.
        is_valid, error_msg = validate_inputs(
            use_aic, aic_model_name, aic_system, aic_backend_version
        )
        if not is_valid:
            return _failure(error_msg, empty_tables)

        # Initialize AI Configurator for the requested model/system/backend.
        ai_configurator = initialize_ai_configurator(
            aic_model_name, aic_system, aic_backend, aic_backend_version
        )

        # Enumerate the GPU counts (TP sizes) to profile.
        profile_num_gpus = generate_gpu_configurations(
            min_num_gpus_per_engine, max_num_gpus_per_engine
        )
        if not profile_num_gpus:
            return _failure(
                "❌ No valid GPU configurations to profile", empty_tables
            )

        # Profile prefill performance (TTFT vs. throughput per GPU count).
        prefill_results = profile_prefill_performance(
            ai_configurator, profile_num_gpus, isl
        )
        # prefill_results[0] presumably holds the primary data series; an
        # empty first element means profiling produced nothing usable —
        # TODO confirm against profile_prefill_performance's return shape.
        if not prefill_results[0]:
            return _failure("❌ Failed to generate prefill results", empty_tables)

        # Profile decode performance (ITL at various concurrency levels).
        decode_results = profile_decode_performance(
            ai_configurator, profile_num_gpus, isl, osl
        )
        if not decode_results:
            return _failure("❌ Failed to generate decode results", empty_tables)

        # Build the three interactive plots; ttft/itl are passed as SLA
        # targets for visualization only.
        prefill_plot = plot_prefill_performance_interactive(prefill_results, ttft)
        decode_plot = plot_decode_performance_interactive(decode_results, itl)
        cost_plot = plot_cost_sla_interactive(
            isl, osl, prefill_results, decode_results, gpu_cost_per_hour
        )

        # Human-readable summary of what was profiled.
        status_msg = format_status_message(
            profile_num_gpus, prefill_results, gpu_cost_per_hour
        )

        # Render the HTML tables backing each plot.
        prefill_table_html, decode_table_html, cost_table_html = build_all_tables(
            prefill_results, decode_results, isl, osl, gpu_cost_per_hour
        )

        return (
            prefill_plot,
            decode_plot,
            cost_plot,
            status_msg,
            prefill_table_html,
            decode_table_html,
            cost_table_html,
        )

    except Exception as e:
        # Catch-all boundary: surface the full traceback in the UI status
        # message rather than crashing the webapp.
        import traceback

        error_msg = f"❌ Error generating plots:\n{str(e)}\n\n{traceback.format_exc()}"
        return _failure(error_msg, empty_tables)
Loading
Loading