Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions benchmarks/profiler/webapp/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Package entry point for the Dynamo SLA Profiler webapp."""

import importlib.metadata

from benchmarks.profiler.webapp.main import main

# Version is resolved from the installed "aiconfigurator" distribution metadata.
__version__ = importlib.metadata.version("aiconfigurator")

# NOTE(review): the original called main() unconditionally at import time,
# which launches the webapp whenever ANY submodule of this package is imported
# (e.g. benchmarks.profiler.webapp.core.orchestrator imports from this
# package). Guard the launch so plain imports stay side-effect free; start the
# app explicitly via `python -m benchmarks.profiler.webapp.main` or a console
# entry point instead.
if __name__ == "__main__":
    main()
10 changes: 10 additions & 0 deletions benchmarks/profiler/webapp/core/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Core functionality for the Dynamo SLA Profiler webapp.

This package contains:
- constants: Shared constants and configuration
- orchestrator: Pipeline coordinating profiling, plot generation, and tables
- profiling: Performance profiling logic using AI Configurator
"""
153 changes: 153 additions & 0 deletions benchmarks/profiler/webapp/core/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Constants and configuration for the Dynamo SLA Profiler webapp.
"""

# Table headers for different performance metrics
# Prefill table: one row per profiled GPU count (TTFT = Time To First Token).
PREFILL_TABLE_HEADERS = [
"GPUs",
"TTFT (ms)",
"Throughput (tokens/s/GPU)",
]

# Decode table: one row per profiled GPU count (ITL = Inter Token Latency).
DECODE_TABLE_HEADERS = [
"GPUs",
"ITL (ms)",
"Throughput (tokens/s/GPU)",
]

# Cost table: combines prefill/decode SLA metrics with a dollar cost column.
COST_TABLE_HEADERS = [
"TTFT (ms)",
"Prefill Thpt (tokens/s/GPU)",
"ITL (ms)",
"Decode Thpt (tokens/s/GPU)",
"Tokens/User",
"Cost ($)",
]

# Backend version mapping
# Keys match the entries of INFERENCE_BACKENDS below; values are the version
# strings selectable per backend (trtllm listed newest-first).
BACKEND_VERSIONS = {
"trtllm": ["1.0.0", "0.20.0", "0.19.0", "0.18.0"],
"vllm": ["0.10.0"],
"sglang": ["0.4.5"],
}

# Supported GPU systems
GPU_SYSTEMS = [
"H100_SXM",
"H200_SXM",
"A100_SXM",
"A100_PCIE",
]

# Supported inference backends
INFERENCE_BACKENDS = ["vllm", "sglang", "trtllm"]

# GPU count options
# Selectable per-engine GPU counts; the max list extends the min list with 16.
MIN_GPU_OPTIONS = [1, 2, 4, 8]
MAX_GPU_OPTIONS = [1, 2, 4, 8, 16]

# Default decode interpolation granularity
DEFAULT_DECODE_INTERPOLATION_GRANULARITY = 6

# CSS styles for custom table rendering
# Raw <style> block injected alongside the generated table HTML; the
# rgba(255, 255, 255, ...) colors target a dark theme.
TABLE_CSS = """
<style>
.dynamo-table-wrapper {
overflow-x: auto;
margin-top: 0.5rem;
}
.dynamo-table {
width: 100%;
border-collapse: collapse;
font-size: 0.95rem;
}
.dynamo-table thead {
background: rgba(255, 255, 255, 0.05);
text-transform: uppercase;
letter-spacing: 0.02em;
}
.dynamo-table th,
.dynamo-table td {
padding: 0.55rem 0.75rem;
text-align: left;
border-bottom: 1px solid rgba(255, 255, 255, 0.08);
}
.dynamo-table tbody tr:hover {
background: rgba(255, 255, 255, 0.08);
}
.dynamo-table-empty {
text-align: center;
padding: 0.85rem 0;
opacity: 0.7;
}
</style>
"""

# Default configuration YAML placeholder
# NOTE(review): the YAML nesting indentation appears flattened in this view
# (spec/services/Frontend/... should be indented) — verify the literal matches
# a valid DynamoGraphDeployment manifest before shipping.
DEFAULT_CONFIG_YAML = """apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: vllm-disagg
spec:
services:
Frontend:
dynamoNamespace: vllm-disagg
componentType: frontend
replicas: 1
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag"""

# Plot interaction instructions
# Markdown blurb rendered above the plots explaining selection behavior.
PLOT_INTERACTION_INSTRUCTIONS = """
**How to interact with plots:**
- **Hover** over points to see detailed information
- **Click** points to select them (click again to deselect)
- **Multiple selection**: Click multiple points with shift key or select tools from the top right corner to compare specific configurations
- The table below each plot will filter to show only selected points, or all points if none are selected
"""

# Tab descriptions
# One markdown description per results tab (prefill / decode / cost).
PREFILL_TAB_DESCRIPTION = """
**Prefill Performance**: Interactive plot showing the relationship between Time to First Token (TTFT)
and throughput per GPU for different GPU counts. **Click points to select/deselect** (multi-select enabled).
Table shows selected points, or all points if none selected.
"""

DECODE_TAB_DESCRIPTION = """
**Decode Performance**: Interactive plot showing the relationship between Inter Token Latency (ITL)
and throughput per GPU for different GPU counts. **Click points to select/deselect** (multi-select enabled).
Table shows selected points, or all points if none selected.
"""

COST_TAB_DESCRIPTION = """
**Cost Analysis**: Interactive plot showing the cost per 1000 requests under different SLA configurations.
Lower curves represent better cost efficiency for the same throughput. **Click points to select/deselect** (multi-select enabled).
Table shows selected points, or all points if none selected.
"""

# Plotly color palette
# First six colors of the standard "category10" qualitative palette.
PLOTLY_COLORS = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b"]

# Plotly dark theme configuration
# Keyword arguments spread into the figure layout: transparent backgrounds,
# a dimmed modebar, and a semi-transparent legend pinned to the top-left.
PLOTLY_DARK_THEME = {
"template": "plotly_dark",
"plot_bgcolor": "rgba(0, 0, 0, 0)",
"paper_bgcolor": "rgba(0, 0, 0, 0)",
"modebar": dict(
bgcolor="rgba(0, 0, 0, 0)",
color="rgba(255, 255, 255, 0.5)",
activecolor="rgba(255, 255, 255, 0.9)",
),
"legend": dict(
yanchor="top",
y=0.99,
xanchor="left",
x=0.01,
bgcolor="rgba(0, 0, 0, 0.5)",
font=dict(color="white"),
),
}
185 changes: 185 additions & 0 deletions benchmarks/profiler/webapp/core/orchestrator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Orchestration logic for generating performance plots.

This module contains the main pipeline that coordinates profiling,
plot generation, and table building.
"""

from benchmarks.profiler.webapp.core.profiling import (
format_status_message,
generate_gpu_configurations,
initialize_ai_configurator,
profile_decode_performance,
profile_prefill_performance,
validate_inputs,
)
from benchmarks.profiler.webapp.ui.plots import (
plot_cost_sla_interactive,
plot_decode_performance_interactive,
plot_prefill_performance_interactive,
)
from benchmarks.profiler.webapp.ui.tables import build_all_tables, get_empty_tables


def _failure(message, empty_tables):
    """Build the failure variant of generate_plots' 7-tuple result.

    Args:
        message: Status/error string to surface in the UI.
        empty_tables: (prefill_html, decode_html, cost_html) placeholders,
            as returned by get_empty_tables().

    Returns:
        (None, None, None, message, prefill_html, decode_html, cost_html) —
        same shape as generate_plots' success result, with no plots.
    """
    empty_prefill_html, empty_decode_html, empty_cost_html = empty_tables
    return (
        None,
        None,
        None,
        message,
        empty_prefill_html,
        empty_decode_html,
        empty_cost_html,
    )


def generate_plots(
    aic_model_name: str,
    backend: str,
    config_yaml: str,
    use_aic: bool,
    aic_backend: str,
    aic_backend_version: str,
    aic_system: str,
    min_num_gpus_per_engine: int,
    max_num_gpus_per_engine: int,
    num_gpus_per_node: int,
    gpu_cost_per_hour: float,
    isl: int,
    osl: int,
    max_context_length: int,
    ttft: float,
    itl: float,
):
    """
    Generate performance plots using AI Configurator estimation.

    This function profiles LLM inference performance by:
    1. Estimating prefill performance (TTFT) across different GPU counts
    2. Estimating decode performance (ITL) at various concurrency levels
    3. Computing cost-vs-SLA tradeoffs based on GPU pricing

    Args:
        aic_model_name: Model name for AI Configurator (e.g., "QWEN3_32B")
        backend: Inference backend (vllm, sglang, trtllm) - for reference only
        config_yaml: YAML configuration string from UI (reserved for future use)
        use_aic: Whether to use AI Configurator (must be True for webapp)
        aic_backend: Backend for AI Configurator estimation
        aic_backend_version: Version of the backend
        aic_system: GPU system (e.g., "H200_SXM")
        min_num_gpus_per_engine: Minimum TP size to profile
        max_num_gpus_per_engine: Maximum TP size to profile
        num_gpus_per_node: GPUs per node (for MoE models, unused for dense)
        gpu_cost_per_hour: Cost per GPU per hour in dollars
        isl: Input sequence length
        osl: Output sequence length
        max_context_length: Maximum context length (currently unused)
        ttft: Target TTFT in milliseconds (for visualization)
        itl: Target ITL in milliseconds (for visualization)

    Returns:
        Tuple of (prefill_plot, decode_plot, cost_plot, status_message,
        prefill_table_html, decode_table_html, cost_table_html)
    """
    # Placeholder table HTML, reused by every failure path via _failure().
    empty_tables = get_empty_tables()

    try:
        # Validate inputs before doing any expensive work.
        is_valid, error_msg = validate_inputs(
            use_aic, aic_model_name, aic_system, aic_backend_version
        )
        if not is_valid:
            return _failure(error_msg, empty_tables)

        # Initialize AI Configurator for the requested model/system/backend.
        ai_configurator = initialize_ai_configurator(
            aic_model_name, aic_system, aic_backend, aic_backend_version
        )

        # Enumerate the GPU counts (TP sizes) to profile.
        profile_num_gpus = generate_gpu_configurations(
            min_num_gpus_per_engine, max_num_gpus_per_engine
        )
        if not profile_num_gpus:
            return _failure(
                "❌ No valid GPU configurations to profile", empty_tables
            )

        # Profile prefill performance (TTFT vs. throughput per GPU count).
        prefill_results = profile_prefill_performance(
            ai_configurator, profile_num_gpus, isl
        )
        # prefill_results[0] presumably holds the primary data series; an
        # empty first element means profiling produced nothing usable —
        # TODO confirm against profile_prefill_performance's return shape.
        if not prefill_results[0]:
            return _failure("❌ Failed to generate prefill results", empty_tables)

        # Profile decode performance (ITL at various concurrency levels).
        decode_results = profile_decode_performance(
            ai_configurator, profile_num_gpus, isl, osl
        )
        if not decode_results:
            return _failure("❌ Failed to generate decode results", empty_tables)

        # Build the three interactive plots; ttft/itl are passed as SLA
        # targets for visualization only.
        prefill_plot = plot_prefill_performance_interactive(prefill_results, ttft)
        decode_plot = plot_decode_performance_interactive(decode_results, itl)
        cost_plot = plot_cost_sla_interactive(
            isl, osl, prefill_results, decode_results, gpu_cost_per_hour
        )

        # Human-readable summary of what was profiled.
        status_msg = format_status_message(
            profile_num_gpus, prefill_results, gpu_cost_per_hour
        )

        # Render the HTML tables backing each plot.
        prefill_table_html, decode_table_html, cost_table_html = build_all_tables(
            prefill_results, decode_results, isl, osl, gpu_cost_per_hour
        )

        return (
            prefill_plot,
            decode_plot,
            cost_plot,
            status_msg,
            prefill_table_html,
            decode_table_html,
            cost_table_html,
        )

    except Exception as e:
        # Catch-all boundary: surface the full traceback in the UI status
        # message rather than crashing the webapp.
        import traceback

        error_msg = f"❌ Error generating plots:\n{str(e)}\n\n{traceback.format_exc()}"
        return _failure(error_msg, empty_tables)
Loading
Loading