diff --git a/benchmarks/profiler/webapp/__init__.py b/benchmarks/profiler/webapp/__init__.py
new file mode 100644
index 0000000000..1b4f510e5f
--- /dev/null
+++ b/benchmarks/profiler/webapp/__init__.py
@@ -0,0 +1,7 @@
+import importlib.metadata
+
+from benchmarks.profiler.webapp.main import main
+
+# Report the installed "aiconfigurator" distribution's version as this package's version.
+__version__ = importlib.metadata.version("aiconfigurator")
+
+# NOTE(review): this call runs whenever the package is imported, which includes
+# importing any submodule under benchmarks.profiler.webapp. main.py already has its
+# own `if __name__ == "__main__"` entry point -- confirm the import-time launch is
+# intended.
+main()
diff --git a/benchmarks/profiler/webapp/core/__init__.py b/benchmarks/profiler/webapp/core/__init__.py
new file mode 100644
index 0000000000..dce7de9dda
--- /dev/null
+++ b/benchmarks/profiler/webapp/core/__init__.py
@@ -0,0 +1,10 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Core functionality for the Dynamo SLA Profiler webapp.
+
+This package contains:
+- constants: Shared constants and configuration
+- orchestrator: Pipeline that coordinates profiling, plot generation, and table building
+- profiling: Performance profiling logic using AI Configurator
+"""
diff --git a/benchmarks/profiler/webapp/core/constants.py b/benchmarks/profiler/webapp/core/constants.py
new file mode 100644
index 0000000000..08557007fd
--- /dev/null
+++ b/benchmarks/profiler/webapp/core/constants.py
@@ -0,0 +1,153 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Constants and configuration for the Dynamo SLA Profiler webapp.
+"""
+
+# Column headers for the result tables, one list per metric.
+# NOTE: static/utils.js keeps identical lists in its `tableHeaders` object;
+# edit both places together.
+PREFILL_TABLE_HEADERS = [
+ "GPUs",
+ "TTFT (ms)",
+ "Throughput (tokens/s/GPU)",
+]
+
+DECODE_TABLE_HEADERS = [
+ "GPUs",
+ "ITL (ms)",
+ "Throughput (tokens/s/GPU)",
+]
+
+COST_TABLE_HEADERS = [
+ "TTFT (ms)",
+ "Prefill Thpt (tokens/s/GPU)",
+ "ITL (ms)",
+ "Decode Thpt (tokens/s/GPU)",
+ "Tokens/User",
+ "Cost ($)",
+]
+
+# Backend name -> selectable versions. ui/handlers.py looks a backend up here when
+# the backend dropdown changes and selects the FIRST entry of the list.
+BACKEND_VERSIONS = {
+ "trtllm": ["1.0.0", "0.20.0", "0.19.0", "0.18.0"],
+ "vllm": ["0.10.0"],
+ "sglang": ["0.4.5"],
+}
+
+# Supported GPU systems
+GPU_SYSTEMS = [
+ "H100_SXM",
+ "H200_SXM",
+ "A100_SXM",
+ "A100_PCIE",
+]
+
+# Supported inference backends
+INFERENCE_BACKENDS = ["vllm", "sglang", "trtllm"]
+
+# GPU count options (presumably the choices offered for the min/max GPUs-per-engine
+# inputs; the UI code that consumes them is not visible here -- confirm).
+MIN_GPU_OPTIONS = [1, 2, 4, 8]
+MAX_GPU_OPTIONS = [1, 2, 4, 8, 16]
+
+# Default for the `decode_interpolation_granularity` parameter of
+# core/profiling.py:profile_decode_performance.
+DEFAULT_DECODE_INTERPOLATION_GRANULARITY = 6
+
+# CSS styles for custom table rendering; ui/app.py injects this via gr.HTML(TABLE_CSS).
+# NOTE(review): the value shown here is only blank lines -- confirm the style
+# markup was not lost.
+TABLE_CSS = """
+
+"""
+
+# Default configuration YAML placeholder
+# NOTE(review): nested YAML keys depend on indentation; verify the indentation of
+# this literal is what the deployment schema expects.
+DEFAULT_CONFIG_YAML = """apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+ name: vllm-disagg
+spec:
+ services:
+ Frontend:
+ dynamoNamespace: vllm-disagg
+ componentType: frontend
+ replicas: 1
+ extraPodSpec:
+ mainContainer:
+ image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag"""
+
+# Plot interaction instructions (Markdown); rendered by ui/app.py above the results.
+PLOT_INTERACTION_INSTRUCTIONS = """
+**How to interact with plots:**
+- **Hover** over points to see detailed information
+- **Click** points to select them (click again to deselect)
+- **Multiple selection**: Click multiple points with shift key or select tools from the top right corner to compare specific configurations
+- The table below each plot will filter to show only selected points, or all points if none are selected
+"""
+
+# Tab descriptions (Markdown), one per results tab.
+PREFILL_TAB_DESCRIPTION = """
+**Prefill Performance**: Interactive plot showing the relationship between Time to First Token (TTFT)
+and throughput per GPU for different GPU counts. **Click points to select/deselect** (multi-select enabled).
+Table shows selected points, or all points if none selected.
+"""
+
+DECODE_TAB_DESCRIPTION = """
+**Decode Performance**: Interactive plot showing the relationship between Inter Token Latency (ITL)
+and throughput per GPU for different GPU counts. **Click points to select/deselect** (multi-select enabled).
+Table shows selected points, or all points if none selected.
+"""
+
+COST_TAB_DESCRIPTION = """
+**Cost Analysis**: Interactive plot showing the cost per 1000 requests under different SLA configurations.
+Lower curves represent better cost efficiency for the same throughput. **Click points to select/deselect** (multi-select enabled).
+Table shows selected points, or all points if none selected.
+"""
+
+# Plotly color palette (hex colors; presumably assigned per trace by the plotting
+# code -- the consumer is not visible here).
+PLOTLY_COLORS = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b"]
+
+# Plotly dark theme configuration.
+# ui/plots.py spreads this dict into fig.update_layout(**PLOTLY_DARK_THEME), so every
+# key must be a valid layout argument and must not collide with the keyword
+# arguments passed alongside it there (title, xaxis_title, hovermode, ...).
+PLOTLY_DARK_THEME = {
+ "template": "plotly_dark",
+ "plot_bgcolor": "rgba(0, 0, 0, 0)",
+ "paper_bgcolor": "rgba(0, 0, 0, 0)",
+ "modebar": dict(
+ bgcolor="rgba(0, 0, 0, 0)",
+ color="rgba(255, 255, 255, 0.5)",
+ activecolor="rgba(255, 255, 255, 0.9)",
+ ),
+ "legend": dict(
+ yanchor="top",
+ y=0.99,
+ xanchor="left",
+ x=0.01,
+ bgcolor="rgba(0, 0, 0, 0.5)",
+ font=dict(color="white"),
+ ),
+}
diff --git a/benchmarks/profiler/webapp/core/orchestrator.py b/benchmarks/profiler/webapp/core/orchestrator.py
new file mode 100644
index 0000000000..c857cf8c07
--- /dev/null
+++ b/benchmarks/profiler/webapp/core/orchestrator.py
@@ -0,0 +1,185 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Orchestration logic for generating performance plots.
+
+This module contains the main pipeline that coordinates profiling,
+plot generation, and table building.
+"""
+
+from benchmarks.profiler.webapp.core.profiling import (
+ format_status_message,
+ generate_gpu_configurations,
+ initialize_ai_configurator,
+ profile_decode_performance,
+ profile_prefill_performance,
+ validate_inputs,
+)
+from benchmarks.profiler.webapp.ui.plots import (
+ plot_cost_sla_interactive,
+ plot_decode_performance_interactive,
+ plot_prefill_performance_interactive,
+)
+from benchmarks.profiler.webapp.ui.tables import build_all_tables, get_empty_tables
+
+
def generate_plots(
    aic_model_name: str,
    backend: str,
    config_yaml: str,
    use_aic: bool,
    aic_backend: str,
    aic_backend_version: str,
    aic_system: str,
    min_num_gpus_per_engine: int,
    max_num_gpus_per_engine: int,
    num_gpus_per_node: int,
    gpu_cost_per_hour: float,
    isl: int,
    osl: int,
    max_context_length: int,
    ttft: float,
    itl: float,
):
    """
    Run the estimate -> plot -> table pipeline for one set of UI inputs.

    Steps, in order: validate the inputs, build the AI Configurator estimator,
    derive the GPU counts to sweep, estimate prefill and decode performance,
    then render the three plots, the status text and the three HTML tables.
    Any failure (including an unexpected exception) is reported through the
    status string rather than raised.

    Args:
        aic_model_name: Model name for AI Configurator (e.g., "QWEN3_32B")
        backend: Inference backend; not read by this function
        config_yaml: YAML configuration string from the UI; not read by this function
        use_aic: Whether AI Configurator mode is enabled (validation rejects False)
        aic_backend: Backend for AI Configurator estimation
        aic_backend_version: Version of the backend
        aic_system: GPU system (e.g., "H200_SXM")
        min_num_gpus_per_engine: Smallest GPU count to profile
        max_num_gpus_per_engine: Largest GPU count to profile
        num_gpus_per_node: GPUs per node; not read by this function
        gpu_cost_per_hour: Cost per GPU per hour in dollars
        isl: Input sequence length
        osl: Output sequence length
        max_context_length: Maximum context length; not read by this function
        ttft: Target TTFT in milliseconds, passed to the prefill plot
        itl: Target ITL in milliseconds, passed to the decode plot

    Returns:
        Tuple of (prefill_plot, decode_plot, cost_plot, status_message,
        prefill_table_html, decode_table_html, cost_table_html). On failure the
        three plots are None and the tables are the empty placeholders.
    """
    blank_prefill, blank_decode, blank_cost = get_empty_tables()

    def _failure(message):
        # Every failure path returns the same shape: no plots, message, empty tables.
        return (
            None,
            None,
            None,
            message,
            blank_prefill,
            blank_decode,
            blank_cost,
        )

    try:
        ok, problem = validate_inputs(
            use_aic, aic_model_name, aic_system, aic_backend_version
        )
        if not ok:
            return _failure(problem)

        estimator = initialize_ai_configurator(
            aic_model_name, aic_system, aic_backend, aic_backend_version
        )

        gpu_counts = generate_gpu_configurations(
            min_num_gpus_per_engine, max_num_gpus_per_engine
        )
        if not gpu_counts:
            return _failure("❌ No valid GPU configurations to profile")

        prefill_results = profile_prefill_performance(estimator, gpu_counts, isl)
        # prefill_results[0] is the list of GPU counts that produced a result.
        if not prefill_results[0]:
            return _failure("❌ Failed to generate prefill results")

        decode_results = profile_decode_performance(estimator, gpu_counts, isl, osl)
        if not decode_results:
            return _failure("❌ Failed to generate decode results")

        prefill_plot = plot_prefill_performance_interactive(prefill_results, ttft)
        decode_plot = plot_decode_performance_interactive(decode_results, itl)
        cost_plot = plot_cost_sla_interactive(
            isl, osl, prefill_results, decode_results, gpu_cost_per_hour
        )

        status_msg = format_status_message(
            gpu_counts, prefill_results, gpu_cost_per_hour
        )

        prefill_table_html, decode_table_html, cost_table_html = build_all_tables(
            prefill_results, decode_results, isl, osl, gpu_cost_per_hour
        )
    except Exception as exc:
        import traceback

        # The full traceback is surfaced in the UI status box on purpose.
        return _failure(
            f"❌ Error generating plots:\n{str(exc)}\n\n{traceback.format_exc()}"
        )

    return (
        prefill_plot,
        decode_plot,
        cost_plot,
        status_msg,
        prefill_table_html,
        decode_table_html,
        cost_table_html,
    )
diff --git a/benchmarks/profiler/webapp/core/profiling.py b/benchmarks/profiler/webapp/core/profiling.py
new file mode 100644
index 0000000000..c78aa242bf
--- /dev/null
+++ b/benchmarks/profiler/webapp/core/profiling.py
@@ -0,0 +1,201 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Performance profiling logic for the Dynamo SLA Profiler webapp.
+
+This module handles the actual performance estimation using AI Configurator,
+including prefill and decode performance profiling.
+"""
+
+import math
+
+from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
+from benchmarks.profiler.utils.profile_decode import get_num_request_range
+from benchmarks.profiler.webapp.core.constants import (
+ DEFAULT_DECODE_INTERPOLATION_GRANULARITY,
+)
+
+
def validate_inputs(use_aic, aic_model_name, aic_system, aic_backend_version):
    """
    Check that the AI Configurator inputs needed by the webapp are present.

    Args:
        use_aic: Whether AI Configurator is enabled
        aic_model_name: Model name for AI Configurator
        aic_system: GPU system name
        aic_backend_version: Backend version

    Returns:
        Tuple of (is_valid, error_message); error_message is None when valid.
    """
    error = None
    if not use_aic:
        error = "❌ Web UI requires AI Configurator mode"
    elif not all((aic_model_name, aic_system, aic_backend_version)):
        # Any empty/None field counts as missing.
        error = "❌ Missing required AI Configurator parameters"

    return error is None, error
+
+
def initialize_ai_configurator(
    aic_model_name, aic_system, aic_backend, aic_backend_version
):
    """
    Build the AIConfiguratorPerfEstimator used for all estimates.

    Args:
        aic_model_name: Model name for AI Configurator
        aic_system: GPU system (e.g., "H200_SXM"); lower-cased before being passed on
        aic_backend: Backend for AI Configurator estimation
        aic_backend_version: Version of the backend

    Returns:
        AIConfiguratorPerfEstimator instance
    """
    system_key = aic_system.lower()
    estimator = AIConfiguratorPerfEstimator(
        aic_model_name, system_key, aic_backend, aic_backend_version
    )
    return estimator
+
+
def generate_gpu_configurations(min_num_gpus, max_num_gpus):
    """
    Generate GPU counts to profile (powers of 2 for dense models).

    Args:
        min_num_gpus: Minimum number of GPUs (inclusive)
        max_num_gpus: Maximum number of GPUs (inclusive)

    Returns:
        Ascending list of the powers of two in [min_num_gpus, max_num_gpus].
        The list is empty when the range contains no power of two, including
        when max_num_gpus is zero or negative or min_num_gpus > max_num_gpus.
    """
    profile_num_gpus = []
    # Walk the powers of two directly instead of going through math.log2, which
    # raises ValueError ("math domain error") for a non-positive maximum.
    count = 1
    while count <= max_num_gpus:
        if count >= min_num_gpus:
            profile_num_gpus.append(count)
        count *= 2
    return profile_num_gpus
+
+
def profile_prefill_performance(ai_configurator, profile_num_gpus, isl):
    """
    Estimate prefill latency and per-GPU throughput for each GPU count.

    Args:
        ai_configurator: AIConfiguratorPerfEstimator instance
        profile_num_gpus: List of GPU counts to profile
        isl: Input sequence length

    Returns:
        Tuple of three parallel lists (num_gpus_list, ttft_list, thpt_per_gpu_list)
    """

    def _measure(tp):
        # One estimate per GPU count; the GPU count is passed as tp_size.
        ttft_ms = ai_configurator.estimate_prefill_perf(isl, tp_size=tp)[
            "context_latency"
        ]
        # tokens/second/GPU: the * 1000 factor treats the latency as milliseconds.
        tokens_per_s_per_gpu = isl / ttft_ms * 1000 / tp
        return tp, ttft_ms, tokens_per_s_per_gpu

    rows = [_measure(tp) for tp in profile_num_gpus]

    return (
        [row[0] for row in rows],
        [row[1] for row in rows],
        [row[2] for row in rows],
    )
+
+
def profile_decode_performance(
    ai_configurator,
    profile_num_gpus,
    isl,
    osl,
    decode_interpolation_granularity=DEFAULT_DECODE_INTERPOLATION_GRANULARITY,
):
    """
    Estimate decode latency/throughput over a sweep of request counts per GPU count.

    Args:
        ai_configurator: AIConfiguratorPerfEstimator instance
        profile_num_gpus: List of GPU counts to profile
        isl: Input sequence length
        osl: Output sequence length
        decode_interpolation_granularity: Granularity passed to get_num_request_range

    Returns:
        List of tuples (num_gpus, itl_list, thpt_per_gpu_list); GPU counts whose
        sweep produced no samples are omitted.
    """
    # Dense (non-MoE) models use an attention data-parallel size of 1.
    dense_attention_dp = 1
    results = []

    for tp in profile_num_gpus:
        batch_limit = ai_configurator.get_max_batch_size(isl, osl, tp_size=tp)
        request_counts = get_num_request_range(
            dense_attention_dp,
            batch_limit,
            decode_interpolation_granularity,
        )

        # The inner generator is lazy, so each estimate is read before the next
        # one is requested.
        estimates = (
            ai_configurator.estimate_perf(
                isl, osl, num_request, mode="decode", tp_size=tp
            )
            for num_request in request_counts
        )
        samples = [(est["tpot"], est["tokens/s/gpu"]) for est in estimates]

        if not samples:
            continue

        itl_series = [itl for itl, _ in samples]
        thpt_series = [thpt for _, thpt in samples]
        results.append((tp, itl_series, thpt_series))

    return results
+
+
def format_status_message(profile_num_gpus, prefill_results, gpu_cost_per_hour):
    """
    Build the multi-line success message shown in the status box.

    Args:
        profile_num_gpus: List of GPU counts profiled
        prefill_results: Tuple (num_gpus_list, ttft_list, thpt_per_gpu_list)
        gpu_cost_per_hour: Cost per GPU per hour

    Returns:
        Formatted status message string
    """
    gpu_counts, ttft_values, _ = prefill_results

    # Lowest TTFT wins; on ties the first occurrence is reported.
    fastest_ttft = min(ttft_values)
    fastest_gpus = gpu_counts[ttft_values.index(fastest_ttft)]

    lines = [
        "✅ Plots generated successfully!",
        f"📊 Profiled {len(profile_num_gpus)} GPU configurations: {profile_num_gpus}",
        f"⚡ Best prefill: {fastest_ttft:.1f}ms TTFT at {fastest_gpus} GPUs",
        f"💰 GPU Cost: ${gpu_cost_per_hour:.2f}/hour",
    ]
    return "\n".join(lines)
diff --git a/benchmarks/profiler/webapp/main.py b/benchmarks/profiler/webapp/main.py
new file mode 100644
index 0000000000..ce9e563cd3
--- /dev/null
+++ b/benchmarks/profiler/webapp/main.py
@@ -0,0 +1,26 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Main entry point for the Dynamo SLA Profiler web application.
+
+This webapp provides an interactive interface for profiling LLM inference performance
+using AI Configurator estimates.
+"""
+
+from benchmarks.profiler.webapp.ui.app import build_interface
+
+
def main():
    """
    Launch the Dynamo SLA Profiler webapp.

    Reads static/utils.js from the directory containing this module, wraps it in
    an arrow function so it can be passed as the interface's `js` argument, builds
    the interface and starts the server.
    """
    from pathlib import Path

    # Resolve the script relative to this file rather than the current working
    # directory, so the app can be started from anywhere.
    js_path = Path(__file__).resolve().parent / "static" / "utils.js"
    js_source = js_path.read_text(encoding="utf-8")
    custom_js = f"()=>{{{js_source}}}"

    # Build and launch the interface
    demo = build_interface(custom_js)
    # NOTE(review): "0.0.0.0" listens on every network interface -- confirm this
    # exposure is intended for the environments where the profiler runs.
    demo.launch(server_name="0.0.0.0")


if __name__ == "__main__":
    main()
diff --git a/benchmarks/profiler/webapp/static/utils.js b/benchmarks/profiler/webapp/static/utils.js
new file mode 100644
index 0000000000..4390769ffa
--- /dev/null
+++ b/benchmarks/profiler/webapp/static/utils.js
@@ -0,0 +1,308 @@
// Keys ("<traceUid>:<pointIndex>") of the points currently selected, per plot type.
const selectedPointKeys = { prefill: [], decode: [], cost: [] };

// Every data point collected from each plot, stored as { key, values } rows.
const allDataPoints = { prefill: [], decode: [], cost: [] };

// Per plot type: point key -> its { key, values } row.
const pointDataLookup = { prefill: {}, decode: {}, cost: {} };

// Table column headers per plot type.
const tableHeaders = {
    prefill: [
        "GPUs",
        "TTFT (ms)",
        "Throughput (tokens/s/GPU)"
    ],
    decode: [
        "GPUs",
        "ITL (ms)",
        "Throughput (tokens/s/GPU)"
    ],
    cost: [
        "TTFT (ms)",
        "Prefill Thpt (tokens/s/GPU)",
        "ITL (ms)",
        "Decode Thpt (tokens/s/GPU)",
        "Tokens/User",
        "Cost ($)"
    ]
};
+
// Return the trace's `uid`, or "trace-<fallbackIndex>" when the trace is missing
// or has no (truthy) uid.
function getTraceUid(trace, fallbackIndex) {
    const fallback = `trace-${fallbackIndex}`;
    return (trace && trace.uid) || fallback;
}
+
// Build the "<traceUid>:<pointIndex>" key that identifies one point of one trace.
function makePointKey(traceUid, pointIndex) {
    return "".concat(traceUid, ":", pointIndex);
}
+
// Rows to show in a plot's table: the selected points' values when something is
// selected, otherwise the values of every known point.
function getDisplayRows(plotType) {
    const selected = selectedPointKeys[plotType];
    const hasSelection = Boolean(selected) && selected.length !== 0;

    if (hasSelection) {
        const lookup = pointDataLookup[plotType] || {};
        const rows = [];
        selected.forEach((key) => {
            const entry = lookup[key];
            // Keys without a stored row are dropped.
            if (entry) {
                rows.push(entry.values);
            }
        });
        return rows;
    }

    return allDataPoints[plotType].map((row) => row.values);
}
+
// Collect point keys for every index listed in each trace's `selectedpoints`.
// When `lookup` is given, only keys present (truthy) in it are kept.
function computeSelectedKeys(plotDiv, lookup) {
    const traces = plotDiv && plotDiv.data;
    if (!traces) {
        return [];
    }

    const keys = [];
    traces.forEach((trace, traceIdx) => {
        const picked = trace ? trace.selectedpoints : null;
        if (!Array.isArray(picked) || picked.length === 0) {
            return;
        }

        const traceUid = getTraceUid(trace, traceIdx);
        picked.forEach((pointIndex) => {
            const key = makePointKey(traceUid, pointIndex);
            const known = !lookup || lookup[key];
            if (known) {
                keys.push(key);
            }
        });
    });

    return keys;
}
+
// Convert one customdata entry into a plain array of cell values:
// null/undefined -> [], array -> shallow copy, iterable object -> Array.from,
// other object -> its values, anything else -> single-element array.
function normalizeRow(row) {
    if (row === null || row === undefined) {
        return [];
    }
    if (Array.isArray(row)) {
        return row.slice();
    }
    if (typeof row !== "object") {
        return [row];
    }
    return typeof row[Symbol.iterator] === "function"
        ? Array.from(row)
        : Object.values(row);
}
+
// Render one cell value as text: empty for null/undefined, integers as-is,
// other finite numbers with three decimals, everything else via string conversion.
function formatCell(value) {
    if (value == null) {
        return "";
    }

    const isFiniteNumber = typeof value === "number" && Number.isFinite(value);
    if (!isFiniteNumber) {
        return `${value}`;
    }

    return Number.isInteger(value) ? value.toString() : value.toFixed(3);
}
+
+// Build the markup for a data table from a header list and an array of rows.
+// Each row is normalized with normalizeRow and padded/truncated to the header
+// count (or to its own length when there are no headers); cells are rendered
+// with formatCell. With no rows a single placeholder message is emitted.
+// NOTE(review): the template literals below contain no HTML tags as shown here --
+// the markup looks like it was stripped; confirm against the real file.
+// NOTE(review): header and cell text is interpolated without escaping and the
+// result is assigned to innerHTML by updateDataTable -- confirm the values can
+// never contain markup.
+function renderTableHTML(headers, rows) {
+ const safeHeaders = headers || [];
+ const headerCells = safeHeaders.map((header) => `
${header} | `).join("");
+
+ let bodyHtml = "";
+ if (!rows || rows.length === 0) {
+ bodyHtml = `| No data selected yet. Click points on the plot to populate this table. |
`;
+ } else {
+ bodyHtml = rows
+ .map((row) => {
+ const normalized = normalizeRow(row);
+ const length = safeHeaders.length > 0 ? safeHeaders.length : normalized.length;
+ const cells = Array.from({ length }, (_, idx) => {
+ const value = normalized[idx];
+ return `${formatCell(value)} | `;
+ });
+ return `${cells.join("")}
`;
+ })
+ .join("");
+ }
+
+ return `
+
+
+ ${headerCells}
+ ${bodyHtml}
+
+
+ `;
+}
+
// Replace the contents of the element with id `tableId` by a table rendered from
// `data`, using the headers registered for `plotType`. Logs and does nothing when
// the element does not exist.
function updateDataTable(tableId, data, plotType) {
    const container = document.getElementById(tableId);

    if (container) {
        const headers = tableHeaders[plotType] || [];
        container.innerHTML = renderTableHTML(headers, data);
        console.log(`Updated table ${tableId} with ${data ? data.length : 0} rows`);
        return;
    }

    console.log(`Table container ${tableId} not found`);
}
+
// Ask Plotly to re-fit every rendered graph on the page to its container.
// Does nothing per graph while window.Plotly is unavailable.
function resizePlotlyGraphs() {
    const plots = document.querySelectorAll('.js-plotly-plot');
    console.log(`Found ${plots.length} Plotly graphs`);

    plots.forEach((plot, i) => {
        if (!window.Plotly || !plot) {
            return;
        }
        window.Plotly.relayout(plot, {autosize: true});
        console.log(`Resized plot ${i}`);
    });
}
+
// Wire one plot to its data table.
//
// Polls every 500 ms until the element `#plotId` contains a rendered Plotly graph,
// then:
//   * harvests every trace's `customdata` into allDataPoints / pointDataLookup,
//   * re-harvests after plot, restyle and relayout events,
//   * mirrors the plot's current selection into the table on click, box/lasso
//     selection and deselect.
//
// plotId:   id of the element wrapping the plot
// tableId:  id of the element that receives the rendered table
// plotType: "prefill" | "decode" | "cost" (key into the module-level stores)
function setupPlotClickHandler(plotId, tableId, plotType) {
    const RETRY_DELAY_MS = 500;

    const attemptSetup = () => {
        const plotContainer = document.querySelector(`#${plotId}`);
        if (!plotContainer) {
            console.log(`Plot ${plotId} not found, retrying...`);
            setTimeout(attemptSetup, RETRY_DELAY_MS);
            return;
        }

        const plotDiv = plotContainer.querySelector('.js-plotly-plot');
        if (!plotDiv) {
            console.log(`Plotly div not found in ${plotId}, retrying...`);
            setTimeout(attemptSetup, RETRY_DELAY_MS);
            return;
        }

        console.log(`Setting up handlers for ${plotId}`);

        const headers = tableHeaders[plotType] || [];

        // Recompute the selected keys from the plot's traces and redraw the table.
        const syncSelection = (source) => {
            const lookup = pointDataLookup[plotType] || {};
            const keys = computeSelectedKeys(plotDiv, lookup);
            selectedPointKeys[plotType] = keys;
            updateDataTable(tableId, getDisplayRows(plotType), plotType);
            console.log(`Selection synced for ${plotType} (${source || 'update'}): ${keys.length} point(s)`);
        };

        // Rebuild the row store from every trace's customdata.
        const refreshAllDataPoints = () => {
            if (!plotDiv || !plotDiv.data) {
                return;
            }

            const rows = [];
            const lookup = {};
            plotDiv.data.forEach((trace, traceIdx) => {
                if (!trace || !trace.customdata) {
                    return;
                }

                const traceUid = getTraceUid(trace, traceIdx);

                trace.customdata.forEach((item, pointIndex) => {
                    const normalized = normalizeRow(item);
                    if (normalized.length === 0) {
                        return;
                    }

                    // Align each row to the header count so table columns line up.
                    const alignedRow = headers.length
                        ? headers.map((_, idx) => normalized[idx])
                        : normalized;

                    const key = makePointKey(traceUid, pointIndex);
                    const rowObj = { key, values: alignedRow };
                    rows.push(rowObj);
                    lookup[key] = rowObj;
                });
            });

            // Only touch the stores (and the table) when the data actually changed;
            // this handler fires on every afterplot/restyle/relayout event.
            const newHash = JSON.stringify(rows.map((row) => [row.key, row.values]));
            if (plotDiv.__dynamo_data_hash !== newHash) {
                plotDiv.__dynamo_data_hash = newHash;
                allDataPoints[plotType] = rows;
                pointDataLookup[plotType] = lookup;
                syncSelection('data-refresh');
                console.log(`Stored ${rows.length} data points for ${plotType}`);
            }
        };

        refreshAllDataPoints();

        // The event API is attached by Plotly; if it is not there yet, try again
        // later instead of throwing on the first `.on(...)` call. All handlers are
        // registered together so the plot is never left half wired.
        if (typeof plotDiv.on !== 'function') {
            console.log(`Plotly events not ready on ${plotId}, retrying...`);
            setTimeout(attemptSetup, RETRY_DELAY_MS);
            return;
        }

        plotDiv.on('plotly_afterplot', refreshAllDataPoints);
        plotDiv.on('plotly_restyle', refreshAllDataPoints);
        plotDiv.on('plotly_relayout', refreshAllDataPoints);

        plotDiv.on('plotly_click', function(data) {
            console.log(`Click detected on ${plotId}`, data);
            if (data.points && data.points.length > 0) {
                // Defer so the read happens after the click handler returns.
                setTimeout(() => syncSelection('click'), 0);
            }
        });

        plotDiv.on('plotly_selected', function(eventData) {
            if (!eventData || !eventData.points) {
                return;
            }

            syncSelection('selection-tool');
        });

        plotDiv.on('plotly_deselect', function() {
            syncSelection('deselect');
        });

        console.log(`Handlers configured for ${plotId}`);
    };

    setTimeout(attemptSetup, RETRY_DELAY_MS);
}
+
// Deferred start-up: one second after this script runs, hook tab switches to a
// plot resize, watch tab panels for visibility changes, resize once, and wire each
// plot to its table.
setTimeout(() => {
    // Resize shortly after a tab button is clicked.
    const scheduleResize = () => {
        setTimeout(resizePlotlyGraphs, 150);
    };
    document.querySelectorAll('button[role="tab"]').forEach((button) => {
        button.addEventListener('click', scheduleResize);
    });

    // Resize whenever a tab panel's style/class/hidden attribute changes.
    const panelObserver = new MutationObserver(() => {
        resizePlotlyGraphs();
    });
    const watchedAttributes = {
        attributes: true,
        attributeFilter: ['style', 'class', 'hidden']
    };
    document.querySelectorAll('[role="tabpanel"]').forEach((panel) => {
        panelObserver.observe(panel, watchedAttributes);
    });

    // Initial resize
    resizePlotlyGraphs();

    // Wire every plot to its table: [plot element id, table element id, plot type].
    const plotBindings = [
        ['prefill_plot', 'prefill_table', 'prefill'],
        ['decode_plot', 'decode_table', 'decode'],
        ['cost_plot', 'cost_table', 'cost']
    ];
    plotBindings.forEach(([plotId, tableId, plotType]) => {
        setupPlotClickHandler(plotId, tableId, plotType);
    });
}, 1000);

// Also resize on window resize
window.addEventListener('resize', resizePlotlyGraphs);
\ No newline at end of file
diff --git a/benchmarks/profiler/webapp/ui/__init__.py b/benchmarks/profiler/webapp/ui/__init__.py
new file mode 100644
index 0000000000..02040289f5
--- /dev/null
+++ b/benchmarks/profiler/webapp/ui/__init__.py
@@ -0,0 +1,13 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+UI components for the Dynamo SLA Profiler webapp.
+
+This package contains:
+- app: Gradio interface builder that assembles the UI components
+- settings: Model, hardware, and SLA configuration UI components
+- results: Results tabs with plots and tables
+- handlers: Event handlers for UI interactions
+- plots: Interactive Plotly plotting functions
+- tables: Table building and data preparation utilities
+"""
diff --git a/benchmarks/profiler/webapp/ui/app.py b/benchmarks/profiler/webapp/ui/app.py
new file mode 100644
index 0000000000..b76db0bd20
--- /dev/null
+++ b/benchmarks/profiler/webapp/ui/app.py
@@ -0,0 +1,99 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Gradio application interface builder for the Dynamo SLA Profiler.
+
+This module builds the complete Gradio interface by assembling
+all UI components and setting up event handlers.
+"""
+
+import gradio as gr
+
+from benchmarks.profiler.webapp.core.constants import (
+ PLOT_INTERACTION_INSTRUCTIONS,
+ TABLE_CSS,
+)
+from benchmarks.profiler.webapp.core.orchestrator import generate_plots
+from benchmarks.profiler.webapp.ui.handlers import setup_event_handlers
+from benchmarks.profiler.webapp.ui.results import create_results_tabs
+from benchmarks.profiler.webapp.ui.settings import (
+ create_hardware_settings,
+ create_model_settings,
+ create_sla_settings,
+)
+from benchmarks.profiler.webapp.ui.tables import get_empty_tables
+
+
def build_interface(custom_js: str = None) -> gr.Blocks:
    """
    Assemble the SLA Profiler page and wire its event handlers.

    The page has a settings column (model, hardware and SLA sections, the
    generate button and a status box) next to a results column (instructions and
    the result tabs). All created components are collected in one dict that is
    handed to setup_event_handlers together with generate_plots.

    Args:
        custom_js: Optional custom JavaScript to inject into the interface

    Returns:
        Configured Gradio Blocks interface
    """
    # Settings sections, in display order: (heading, factory returning components).
    settings_sections = (
        ("### Dynamo Settings", create_model_settings),
        ("### Hardware Settings", create_hardware_settings),
        ("### SLA Settings", create_sla_settings),
    )

    with gr.Blocks(title="Dynamo SLA Profiler", js=custom_js) as demo:
        # Header
        gr.Markdown("# Dynamo SLA Profiler")
        gr.Markdown(
            "Generate performance plots using AI Configurator to estimate profiling results. "
            "Configure the parameters below and click 'Generate Plots' to see the results."
        )
        gr.HTML(TABLE_CSS)

        # Placeholder table HTML shown before any results exist.
        blank_prefill, blank_decode, blank_cost = get_empty_tables()

        # Every component the event handlers need, keyed by name.
        components = {}

        with gr.Row():
            # Left panel: settings, generate button, status.
            with gr.Column(scale=1):
                for heading, make_section in settings_sections:
                    gr.Markdown(heading)
                    components.update(make_section())

                components["generate_btn"] = gr.Button(
                    "Generate Performance Plots", variant="primary", size="lg"
                )
                components["status"] = gr.Textbox(
                    label="Status",
                    value="Ready to generate plots",
                    interactive=False,
                    show_label=False,
                    lines=5,
                )

            # Right panel: results.
            with gr.Column(min_width=700):
                gr.Markdown("### Performance Results")
                gr.Markdown(PLOT_INTERACTION_INSTRUCTIONS)
                components.update(
                    create_results_tabs(blank_prefill, blank_decode, blank_cost)
                )

        # The handlers also register a page-load event on the Blocks object itself.
        components["demo"] = demo

        setup_event_handlers(components, generate_plots)

    return demo
diff --git a/benchmarks/profiler/webapp/ui/handlers.py b/benchmarks/profiler/webapp/ui/handlers.py
new file mode 100644
index 0000000000..26b221fd35
--- /dev/null
+++ b/benchmarks/profiler/webapp/ui/handlers.py
@@ -0,0 +1,87 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Event handlers for UI interactions in the Dynamo SLA Profiler webapp.
+
+This module sets up all event handlers for buttons, dropdowns, and other interactive elements.
+"""
+
+import gradio as gr
+
+from benchmarks.profiler.webapp.core.constants import BACKEND_VERSIONS
+
+
def setup_event_handlers(components, generate_plots_fn):
    """
    Attach all UI event handlers.

    Registers `generate_plots_fn` on the generate button and on page load, toggles
    the AI Configurator backend fields with the `use_aic` control, and refreshes
    the version choices when the AI Configurator backend changes.

    Args:
        components: Dictionary of all UI components
        generate_plots_fn: The generate_plots function to call

    Returns:
        None
    """
    # Component keys in the positional order generate_plots expects its arguments.
    input_keys = (
        "aic_model_name",
        "backend",
        "config_yaml",
        "use_aic",
        "aic_backend",
        "aic_backend_version",
        "aic_system",
        "min_num_gpus_per_engine",
        "max_num_gpus_per_engine",
        "num_gpus_per_node",
        "gpu_cost_per_hour",
        "isl",
        "osl",
        "max_context_length",
        "ttft",
        "itl",
    )
    # Component keys in the order of generate_plots' returned tuple.
    output_keys = (
        "prefill_plot",
        "decode_plot",
        "cost_plot",
        "status",
        "prefill_table",
        "decode_table",
        "cost_table",
    )
    wiring = dict(
        fn=generate_plots_fn,
        inputs=[components[key] for key in input_keys],
        outputs=[components[key] for key in output_keys],
    )

    # Same pipeline on button click and on initial page load (default values).
    components["generate_btn"].click(**wiring)
    components["demo"].load(**wiring)

    def toggle_aic_fields(enabled):
        # One visibility update per output component.
        return gr.update(visible=enabled), gr.update(visible=enabled)

    components["use_aic"].change(
        fn=toggle_aic_fields,
        inputs=[components["use_aic"]],
        outputs=[components["aic_backend"], components["aic_backend_version"]],
    )

    def update_backend_versions(backend):
        # Unknown backends fall back to a single "1.0.0" choice.
        versions = BACKEND_VERSIONS.get(backend, ["1.0.0"])
        return gr.update(choices=versions, value=versions[0])

    components["aic_backend"].change(
        fn=update_backend_versions,
        inputs=[components["aic_backend"]],
        outputs=[components["aic_backend_version"]],
    )
diff --git a/benchmarks/profiler/webapp/ui/plots.py b/benchmarks/profiler/webapp/ui/plots.py
new file mode 100644
index 0000000000..29fa1694c4
--- /dev/null
+++ b/benchmarks/profiler/webapp/ui/plots.py
@@ -0,0 +1,293 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Interactive plotting functions for Gradio webapp using Plotly.
+
+This module provides interactive versions of the profiler plots using Plotly,
+which integrates seamlessly with Gradio's gr.Plot component.
+"""
+
+import numpy as np
+import plotly.graph_objects as go
+
+from benchmarks.profiler.utils.parato import compute_parato
+from benchmarks.profiler.webapp.core.constants import PLOTLY_COLORS, PLOTLY_DARK_THEME
+
+
+def _configure_dark_theme(fig, title, xaxis_title, yaxis_title):
+    """
+    Style a Plotly figure in place with the shared dark theme.
+
+    Sets a centered title, the axis titles, hover/click behaviour and the
+    ``PLOTLY_DARK_THEME`` layout options, then enables a translucent grey
+    grid on both axes.
+
+    Args:
+        fig: Plotly Figure object (modified in place)
+        title: Plot title; titles of 60 or more characters use font size 16
+            instead of 18
+        xaxis_title: X-axis title
+        yaxis_title: Y-axis title
+    """
+    if len(title) < 60:
+        title_font_size = 18
+    else:
+        title_font_size = 16
+
+    title_spec = dict(
+        text=title,
+        x=0.5,
+        xanchor="center",
+        font=dict(size=title_font_size),
+    )
+
+    fig.update_layout(
+        title=title_spec,
+        xaxis_title=xaxis_title,
+        yaxis_title=yaxis_title,
+        hovermode="closest",
+        showlegend=True,
+        autosize=True,
+        clickmode="event+select",  # Enable click selection
+        **PLOTLY_DARK_THEME,
+    )
+
+    # Both axes share the same grid styling.
+    grid_style = dict(
+        showgrid=True,
+        gridwidth=1,
+        gridcolor="rgba(128, 128, 128, 0.3)",
+    )
+    fig.update_xaxes(**grid_style)
+    fig.update_yaxes(**grid_style)
+
+
+def _add_target_line(fig, target_value, label, max_y):
+    """
+    Draw a dashed red vertical reference line on a plot.
+
+    Args:
+        fig: Plotly Figure object (a new trace is appended to it)
+        target_value: X-coordinate of the vertical line
+        label: Legend name and hover text for the line
+        max_y: Maximum Y value of the plotted data; the line runs from 0 to
+            1.1 times this value
+    """
+    line_top = max_y * 1.1
+    reference_line = go.Scatter(
+        x=[target_value, target_value],
+        y=[0, line_top],
+        mode="lines",
+        line={"color": "red", "width": 2, "dash": "dash"},
+        name=label,
+        hovertemplate=f"{label}",
+    )
+    fig.add_trace(reference_line)
+
+
+def _configure_selection_style(fig, mode, selected_color="red", selected_size=16):
+    """
+    Set how selected and unselected markers look on traces of a given mode.
+
+    Only traces whose ``mode`` equals the given value are updated.
+
+    Args:
+        fig: Plotly Figure object (modified in place)
+        mode: Trace mode to match (e.g., "markers+text", "lines+markers")
+        selected_color: Color for selected markers
+        selected_size: Size for selected markers
+    """
+    # Unselected markers are dimmed slightly more when the mode includes text.
+    dimmed_opacity = 0.4 if "text" in mode else 0.5
+
+    fig.update_traces(
+        selected={"marker": {"color": selected_color, "size": selected_size}},
+        unselected={"marker": {"opacity": dimmed_opacity}},
+        selector={"mode": mode},
+    )
+
+
+def plot_prefill_performance_interactive(
+    prefill_results: tuple, target_ttft: float
+) -> go.Figure:
+    """
+    Create interactive Plotly plot for prefill performance.
+
+    Each GPU configuration is drawn as one labelled marker (TTFT on the
+    x-axis, per-GPU throughput on the y-axis), with a dashed vertical line
+    at the target TTFT.
+
+    Args:
+        prefill_results: Tuple of (num_gpus_list, ttft_list, thpt_per_gpu_list)
+        target_ttft: Target TTFT in milliseconds (for reference line)
+
+    Returns:
+        Plotly Figure object for Gradio gr.Plot
+    """
+    num_gpus_list, ttft_list, thpt_per_gpu_list = prefill_results
+
+    fig = go.Figure()
+
+    # Add scatter plot for data points with custom data
+    fig.add_trace(
+        go.Scatter(
+            x=ttft_list,
+            y=thpt_per_gpu_list,
+            mode="markers+text",
+            marker=dict(size=12, color="blue", line=dict(width=2, color="darkblue")),
+            text=[f"{n} GPU(s)" for n in num_gpus_list],
+            textposition="top center",
+            textfont=dict(size=10),
+            name="GPU Configurations",
+            # Plotly hover templates use HTML-style <br> line breaks; the
+            # trailing <extra></extra> hides the secondary hover box.
+            hovertemplate=(
+                "<b>%{text}</b><br>"
+                "TTFT: %{x:.2f} ms<br>"
+                "Throughput: %{y:.2f} tokens/s/GPU<br>"
+                "<extra></extra>"
+            ),
+            # Per-point (num_gpus, ttft, throughput) triples attached to the trace
+            customdata=list(zip(num_gpus_list, ttft_list, thpt_per_gpu_list)),
+        )
+    )
+
+    # Add target TTFT line; fall back to a nominal height when there is no data
+    max_thpt = max(thpt_per_gpu_list, default=1000)
+    _add_target_line(fig, target_ttft, f"Target TTFT: {target_ttft} ms", max_thpt)
+
+    # Apply dark theme and configure layout
+    _configure_dark_theme(
+        fig,
+        "Prefill Performance",
+        "Time to First Token (ms)",
+        "Prefill Throughput per GPU (tokens/s/GPU)",
+    )
+
+    # Configure selection appearance
+    _configure_selection_style(
+        fig, "markers+text", selected_color="red", selected_size=16
+    )
+
+    return fig
+
+
+def plot_decode_performance_interactive(
+    decode_results: list, target_itl: float
+) -> go.Figure:
+    """
+    Create interactive Plotly plot for decode performance.
+
+    Each GPU configuration becomes one line+marker trace (ITL on the x-axis,
+    per-GPU throughput on the y-axis), with a dashed vertical line at the
+    target ITL.
+
+    Args:
+        decode_results: List of tuples (num_gpus, itl_list, thpt_per_gpu_list)
+        target_itl: Target ITL in milliseconds (for reference line)
+
+    Returns:
+        Plotly Figure object for Gradio gr.Plot
+    """
+    fig = go.Figure()
+
+    # Plot each GPU configuration
+    for idx, (num_gpus, itl_list, thpt_per_gpu_list) in enumerate(decode_results):
+        # Cycle through the palette when there are more traces than colors
+        color = PLOTLY_COLORS[idx % len(PLOTLY_COLORS)]
+        # Prepare custom data for each point
+        customdata = [
+            [num_gpus, itl, thpt] for itl, thpt in zip(itl_list, thpt_per_gpu_list)
+        ]
+
+        fig.add_trace(
+            go.Scatter(
+                x=itl_list,
+                y=thpt_per_gpu_list,
+                mode="lines+markers",
+                marker=dict(size=8, color=color),
+                line=dict(color=color, width=2),
+                name=f"{num_gpus} GPU(s)",
+                # Plotly hover templates use HTML-style <br> line breaks; the
+                # trailing <extra></extra> hides the secondary hover box.
+                hovertemplate=(
+                    f"<b>{num_gpus} GPU(s)</b><br>"
+                    "ITL: %{x:.2f} ms<br>"
+                    "Throughput: %{y:.2f} tokens/s/GPU<br>"
+                    "<extra></extra>"
+                ),
+                customdata=customdata,
+            )
+        )
+
+    # Add target ITL line, sized to the highest throughput across all traces;
+    # fall back to a nominal height when there is no data
+    max_thpt = max(
+        (thpt for _, _, thpt_list in decode_results for thpt in thpt_list),
+        default=1000,
+    )
+    _add_target_line(fig, target_itl, f"Target ITL: {target_itl} ms", max_thpt)
+
+    # Apply dark theme and configure layout
+    _configure_dark_theme(
+        fig,
+        "Decode Performance",
+        "Inter Token Latency (ms)",
+        "Decode Throughput per GPU (tokens/s/GPU)",
+    )
+
+    # Configure selection appearance for markers
+    _configure_selection_style(
+        fig, "lines+markers", selected_color="yellow", selected_size=12
+    )
+
+    return fig
+
+
+def plot_cost_sla_interactive(
+    isl: int,
+    osl: int,
+    prefill_results: tuple,
+    decode_results: list,
+    gpu_cost_per_hour: float = 3.0,
+) -> go.Figure:
+    """
+    Create interactive Plotly plot for cost vs SLA analysis.
+
+    One trace is drawn per point on the prefill Pareto front; each trace
+    sweeps the decode Pareto front, plotting tokens per user (1000 / ITL)
+    against the combined prefill + decode cost of 1000 requests.
+
+    Args:
+        isl: Input sequence length
+        osl: Output sequence length
+        prefill_results: Tuple of (num_gpus, ttft, thpt_per_gpu) for prefill
+        decode_results: List of tuples (num_gpus, itl_list, thpt_per_gpu_list) for decode
+        gpu_cost_per_hour: Cost per GPU per hour in dollars (default: 3.0)
+
+    Returns:
+        Plotly Figure object for Gradio gr.Plot
+    """
+    # Compute Pareto fronts
+    p_ttft, p_thpt = compute_parato(prefill_results[1], prefill_results[2])
+
+    # Pool the decode points of every GPU configuration before taking the front
+    all_itl, all_thpt = [], []
+    for decode_result in decode_results:
+        all_itl.extend(decode_result[1])
+        all_thpt.extend(decode_result[2])
+    d_itl, d_thpt = compute_parato(all_itl, all_thpt)
+
+    # Convert to numpy arrays for element-wise operations
+    p_ttft = np.array(p_ttft)
+    p_thpt = np.array(p_thpt)
+    d_itl = np.array(d_itl)
+    d_thpt = np.array(d_thpt)
+
+    # These depend only on the decode front, so compute them once rather
+    # than once per prefill point.
+    tokens_per_user_array = 1000 / d_itl
+    decode_cost_array = osl * 1000 / d_thpt * gpu_cost_per_hour / 3600
+
+    fig = go.Figure()
+
+    for idx, (_p_ttft, _p_thpt) in enumerate(zip(p_ttft, p_thpt)):
+        # Prefill cost is constant along this TTFT curve
+        prefill_cost = isl * 1000 / _p_thpt * gpu_cost_per_hour / 3600
+        cost_array = decode_cost_array + prefill_cost
+
+        # Cycle through the palette when there are more traces than colors
+        color = PLOTLY_COLORS[idx % len(PLOTLY_COLORS)]
+
+        # Prepare custom data for each point
+        customdata = [
+            [_p_ttft, _p_thpt, float(itl), float(thpt), float(tokens), float(cost)]
+            for itl, thpt, tokens, cost in zip(
+                d_itl, d_thpt, tokens_per_user_array, cost_array
+            )
+        ]
+
+        # Add line plot for this TTFT curve
+        fig.add_trace(
+            go.Scatter(
+                x=tokens_per_user_array,
+                y=cost_array,
+                mode="lines+markers",
+                marker=dict(size=10, symbol="x", color=color, line=dict(width=2)),
+                line=dict(color=color, width=2),
+                name=f"TTFT: {_p_ttft:.2f}ms",
+                # Plotly hover templates use HTML-style <br> line breaks; the
+                # trailing <extra></extra> hides the secondary hover box.
+                hovertemplate=(
+                    f"<b>TTFT: {_p_ttft:.2f}ms</b><br>"
+                    "Tokens/User: %{x:.2f}<br>"
+                    "Cost: $%{y:.3f}<br>"
+                    "<extra></extra>"
+                ),
+                customdata=customdata,
+            )
+        )
+
+    # Apply dark theme and configure layout
+    title = f"Cost Per 1000 i{isl}o{osl} requests (GPU/hour = ${gpu_cost_per_hour:.2f}) Under Different SLA"
+    _configure_dark_theme(fig, title, "Tokens per User", "Cost ($)")
+
+    # Configure selection appearance for markers
+    _configure_selection_style(
+        fig, "lines+markers", selected_color="yellow", selected_size=14
+    )
+
+    return fig
diff --git a/benchmarks/profiler/webapp/ui/results.py b/benchmarks/profiler/webapp/ui/results.py
new file mode 100644
index 0000000000..2496f5cf1d
--- /dev/null
+++ b/benchmarks/profiler/webapp/ui/results.py
@@ -0,0 +1,77 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+UI components for results display in the Dynamo SLA Profiler webapp.
+
+This module provides functions to build the results tabs with plots and tables.
+"""
+
+import gradio as gr
+
+from benchmarks.profiler.webapp.core.constants import (
+ COST_TAB_DESCRIPTION,
+ DECODE_TAB_DESCRIPTION,
+ PREFILL_TAB_DESCRIPTION,
+)
+
+
+def create_results_tabs(empty_prefill_html, empty_decode_html, empty_cost_html):
+    """
+    Build the prefill, decode and cost result tabs, each with a plot and a table.
+
+    Args:
+        empty_prefill_html: Initial (empty) prefill table HTML
+        empty_decode_html: Initial (empty) decode table HTML
+        empty_cost_html: Initial (empty) cost table HTML
+
+    Returns:
+        Dictionary of Gradio components keyed by "<name>_plot" and
+        "<name>_table" for each of prefill, decode and cost
+    """
+
+    def _build_tab(tab_title, description, initial_table_html, plot_id, table_id):
+        # Every tab stacks: plot, description, "Data Points" heading, HTML table.
+        with gr.Tab(tab_title):
+            plot = gr.Plot(
+                label=tab_title,
+                show_label=False,
+                elem_id=plot_id,
+            )
+            gr.Markdown(description)
+            gr.Markdown("#### Data Points")
+            table = gr.HTML(
+                value=initial_table_html,
+                elem_id=table_id,
+            )
+        return plot, table
+
+    prefill_plot, prefill_table = _build_tab(
+        "Prefill Performance",
+        PREFILL_TAB_DESCRIPTION,
+        empty_prefill_html,
+        "prefill_plot",
+        "prefill_table",
+    )
+    decode_plot, decode_table = _build_tab(
+        "Decode Performance",
+        DECODE_TAB_DESCRIPTION,
+        empty_decode_html,
+        "decode_plot",
+        "decode_table",
+    )
+    cost_plot, cost_table = _build_tab(
+        "Cost vs SLA",
+        COST_TAB_DESCRIPTION,
+        empty_cost_html,
+        "cost_plot",
+        "cost_table",
+    )
+
+    return {
+        "prefill_plot": prefill_plot,
+        "decode_plot": decode_plot,
+        "cost_plot": cost_plot,
+        "prefill_table": prefill_table,
+        "decode_table": decode_table,
+        "cost_table": cost_table,
+    }
diff --git a/benchmarks/profiler/webapp/ui/settings.py b/benchmarks/profiler/webapp/ui/settings.py
new file mode 100644
index 0000000000..2ea129ee00
--- /dev/null
+++ b/benchmarks/profiler/webapp/ui/settings.py
@@ -0,0 +1,192 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+UI components for settings panels in the Dynamo SLA Profiler webapp.
+
+This module provides functions to build the settings UI sections:
+- Model and backend configuration
+- Hardware configuration (GPUs, cost)
+- SLA parameters (ISL, OSL, TTFT, ITL)
+"""
+
+import gradio as gr
+from aiconfigurator.sdk.common import SupportedModels
+
+from benchmarks.profiler.webapp.core.constants import (
+ BACKEND_VERSIONS,
+ DEFAULT_CONFIG_YAML,
+ GPU_SYSTEMS,
+ INFERENCE_BACKENDS,
+ MAX_GPU_OPTIONS,
+ MIN_GPU_OPTIONS,
+)
+
+
+def create_model_settings():
+ """
+ Create the model and backend settings UI.
+
+ Builds, inside a gr.Group, a model dropdown, an inference backend
+ dropdown, a YAML config textbox, a "Use AI Configurator" checkbox and
+ the AI Configurator backend / backend-version dropdowns.
+
+ Returns:
+ Dictionary of Gradio components keyed by "aic_model_name", "backend",
+ "config_yaml", "use_aic", "aic_backend" and "aic_backend_version"
+ """
+ with gr.Group():
+ with gr.Row():
+ # Model choices are the keys of aiconfigurator's SupportedModels;
+ # the first key is preselected
+ supported_models = list(SupportedModels.keys())
+ aic_model_name = gr.Dropdown(
+ label="Model",
+ choices=supported_models,
+ value=supported_models[0],
+ info="Model to profile",
+ )
+
+ backend = gr.Dropdown(
+ label="Backend",
+ choices=INFERENCE_BACKENDS,
+ value="trtllm",
+ info="Inference backend",
+ )
+
+ # DEFAULT_CONFIG_YAML is passed as the placeholder; no initial value is set here
+ config_yaml = gr.Textbox(
+ label="Config (YAML)",
+ placeholder=DEFAULT_CONFIG_YAML,
+ lines=5,
+ info="DynamoGraphDeployment YAML configuration",
+ )
+
+ use_aic = gr.Checkbox(
+ label="Use AI Configurator",
+ value=True,
+ info="Use AI Configurator to estimate performance",
+ )
+
+ with gr.Row():
+ aic_backend = gr.Dropdown(
+ label="AI Configurator Backend",
+ choices=INFERENCE_BACKENDS,
+ value="trtllm",
+ info="Backend for AI Configurator estimation",
+ visible=True,
+ )
+
+ # Initial choices are the trtllm versions, matching aic_backend's
+ # default of "trtllm"; allow_custom_value=True permits values not
+ # listed in choices (per Gradio's Dropdown API)
+ aic_backend_version = gr.Dropdown(
+ label="AI Configurator Backend Version",
+ choices=BACKEND_VERSIONS["trtllm"],
+ value="0.20.0",
+ info="Backend version for AI Configurator",
+ allow_custom_value=True,
+ visible=True,
+ )
+
+ return {
+ "aic_model_name": aic_model_name,
+ "backend": backend,
+ "config_yaml": config_yaml,
+ "use_aic": use_aic,
+ "aic_backend": aic_backend,
+ "aic_backend_version": aic_backend_version,
+ }
+
+
+def create_hardware_settings():
+ """
+ Create the hardware configuration UI.
+
+ Builds, inside a gr.Group, a GPU system dropdown, a cost-per-GPU-hour
+ number field, min/max GPUs-per-engine dropdowns and a GPUs-per-node
+ number field.
+
+ Returns:
+ Dictionary of Gradio components keyed by "aic_system",
+ "gpu_cost_per_hour", "min_num_gpus_per_engine",
+ "max_num_gpus_per_engine" and "num_gpus_per_node"
+ """
+ with gr.Group():
+ with gr.Row():
+ # Defaults: H200_SXM system at $3.00 per GPU hour
+ aic_system = gr.Dropdown(
+ label="System",
+ choices=GPU_SYSTEMS,
+ value="H200_SXM",
+ info="Target GPU system",
+ )
+
+ gpu_cost_per_hour = gr.Number(
+ label="Cost per GPU Hour ($)",
+ value=3.0,
+ info="Cost per GPU per hour in dollars",
+ )
+
+ with gr.Row():
+ # Default GPUs-per-engine range is 1 to 4; the choice lists come
+ # from MIN_GPU_OPTIONS / MAX_GPU_OPTIONS
+ min_num_gpus_per_engine = gr.Dropdown(
+ label="Min GPUs per Engine",
+ choices=MIN_GPU_OPTIONS,
+ value=1,
+ info="Minimum number of GPUs (TP size)",
+ )
+
+ max_num_gpus_per_engine = gr.Dropdown(
+ label="Max GPUs per Engine",
+ choices=MAX_GPU_OPTIONS,
+ value=4,
+ info="Maximum number of GPUs (TP size)",
+ )
+
+ num_gpus_per_node = gr.Number(
+ label="GPUs per Node",
+ value=8,
+ info="Number of GPUs per node (for MoE models)",
+ )
+
+ return {
+ "aic_system": aic_system,
+ "gpu_cost_per_hour": gpu_cost_per_hour,
+ "min_num_gpus_per_engine": min_num_gpus_per_engine,
+ "max_num_gpus_per_engine": max_num_gpus_per_engine,
+ "num_gpus_per_node": num_gpus_per_node,
+ }
+
+
+def create_sla_settings():
+ """
+ Create the SLA configuration UI.
+
+ Builds, inside a gr.Group, number fields for input/output sequence
+ length, max context length, and the target TTFT and ITL.
+
+ Returns:
+ Dictionary of Gradio components keyed by "isl", "osl",
+ "max_context_length", "ttft" and "itl"
+ """
+ with gr.Group():
+ with gr.Row():
+ # precision=0 makes these integer fields (per Gradio's Number API)
+ isl = gr.Number(
+ label="Input Sequence Length (ISL)",
+ value=5000,
+ precision=0,
+ info="Target input sequence length",
+ )
+
+ osl = gr.Number(
+ label="Output Sequence Length (OSL)",
+ value=50,
+ precision=0,
+ info="Target output sequence length",
+ )
+
+ with gr.Row():
+ max_context_length = gr.Number(
+ label="Max Context Length",
+ value=8192,
+ precision=0,
+ info="Maximum context length supported by the model",
+ )
+
+ # Latency targets are in milliseconds and take no precision argument
+ ttft = gr.Number(
+ label="Target TTFT (ms)",
+ value=50.0,
+ info="Target Time To First Token in milliseconds",
+ )
+
+ itl = gr.Number(
+ label="Target ITL (ms)",
+ value=10.0,
+ info="Target Inter Token Latency in milliseconds",
+ )
+
+ return {
+ "isl": isl,
+ "osl": osl,
+ "max_context_length": max_context_length,
+ "ttft": ttft,
+ "itl": itl,
+ }
diff --git a/benchmarks/profiler/webapp/ui/tables.py b/benchmarks/profiler/webapp/ui/tables.py
new file mode 100644
index 0000000000..247cf5f7c4
--- /dev/null
+++ b/benchmarks/profiler/webapp/ui/tables.py
@@ -0,0 +1,187 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Table building and data preparation utilities for the Dynamo SLA Profiler webapp.
+"""
+
+from numbers import Integral, Real
+
+import numpy as np
+
+from benchmarks.profiler.utils.parato import compute_parato
+from benchmarks.profiler.webapp.core.constants import (
+ COST_TABLE_HEADERS,
+ DECODE_TABLE_HEADERS,
+ PREFILL_TABLE_HEADERS,
+)
+
+
+def _format_cell(value):
+    """
+    Format a cell value for display in HTML table.
+
+    Booleans become a check or cross mark, integers (including NumPy
+    integer scalars) are printed as-is, other real numbers are printed with
+    three decimal places, and anything else falls back to ``str()``.
+
+    Args:
+        value: Cell value of any type
+
+    Returns:
+        Display string for the cell
+    """
+    # bool must be tested first: it is a subclass of int.
+    if isinstance(value, bool):
+        return "✅" if value else "❌"
+    # Integral (rather than int) also matches NumPy integer scalars, which
+    # would otherwise be rendered like floats ("4.000").
+    if isinstance(value, Integral):
+        return f"{value}"
+    if isinstance(value, Real):
+        return f"{value:.3f}"
+    return str(value)
+
+
+def build_table_html(headers, rows):
+    """
+    Build an HTML table from headers and rows.
+
+    When ``rows`` is empty the body holds a single placeholder row that
+    spans every column.
+
+    Args:
+        headers: List of header strings
+        rows: List of row data (each row is a list of values)
+
+    Returns:
+        HTML string containing the table
+    """
+    # NOTE(review): only the table structure is emitted here; if the webapp
+    # stylesheet targets class names on the wrapper/table/cells, add them back.
+    header_html = "".join(f"<th>{header}</th>" for header in headers)
+
+    if not rows:
+        # Single placeholder cell stretched across all columns
+        body_html = (
+            f"<tr><td colspan='{len(headers)}'>"
+            "No data selected yet. Click points on the plot to populate this table."
+            "</td></tr>"
+        )
+    else:
+        body_html = "".join(
+            "<tr>" + "".join(f"<td>{_format_cell(cell)}</td>" for cell in row) + "</tr>"
+            for row in rows
+        )
+
+    return (
+        "<div>"
+        "<table>"
+        f"<thead><tr>{header_html}</tr></thead>"
+        f"<tbody>{body_html}</tbody>"
+        "</table>"
+        "</div>"
+    )
+
+
+def get_empty_tables():
+    """Return the (prefill, decode, cost) table HTML strings with no data rows."""
+    header_sets = (
+        PREFILL_TABLE_HEADERS,
+        DECODE_TABLE_HEADERS,
+        COST_TABLE_HEADERS,
+    )
+    return tuple(build_table_html(headers, []) for headers in header_sets)
+
+
+def prepare_prefill_table_data(prefill_results):
+    """
+    Turn prefill profiling results into table rows.
+
+    Args:
+        prefill_results: Tuple of (num_gpus_list, ttft_list, thpt_per_gpu_list)
+
+    Returns:
+        List of [num_gpus, ttft, throughput] rows, with TTFT and throughput
+        rounded to three decimals
+    """
+    gpu_counts, ttfts, throughputs = prefill_results
+
+    rows = []
+    for gpu_count, ttft_ms, throughput in zip(gpu_counts, ttfts, throughputs):
+        rows.append([gpu_count, round(ttft_ms, 3), round(throughput, 3)])
+    return rows
+
+
+def prepare_decode_table_data(decode_results):
+    """
+    Flatten decode profiling results into table rows.
+
+    Args:
+        decode_results: List of tuples (num_gpus, itl_list, thpt_list)
+
+    Returns:
+        List of [num_gpus, itl, throughput] rows (one per measured point),
+        with ITL and throughput rounded to three decimals
+    """
+    return [
+        [gpu_count, round(itl_ms, 3), round(throughput, 3)]
+        for gpu_count, itl_values, thpt_values in decode_results
+        for itl_ms, throughput in zip(itl_values, thpt_values)
+    ]
+
+
+def prepare_cost_table_data(
+    isl, osl, prefill_results, decode_results, gpu_cost_per_hour
+):
+    """
+    Prepare table data for cost analysis.
+
+    Pairs every point on the prefill Pareto front with every point on the
+    decode Pareto front and reports, for each pair, the tokens per user
+    (1000 / ITL) and the combined prefill + decode cost.
+
+    Args:
+        isl: Input sequence length
+        osl: Output sequence length
+        prefill_results: Tuple of (num_gpus, ttft, thpt_per_gpu) for prefill
+        decode_results: List of tuples (num_gpus, itl_list, thpt_per_gpu_list) for decode
+        gpu_cost_per_hour: Cost per GPU per hour in dollars
+
+    Returns:
+        List of rows [ttft, prefill_thpt, itl, decode_thpt, tokens_per_user,
+        cost], each value rounded to three decimals
+    """
+    # Compute Pareto fronts
+    p_ttft, p_thpt = compute_parato(prefill_results[1], prefill_results[2])
+
+    # Pool the decode points of every GPU configuration before taking the front
+    all_itl, all_thpt = [], []
+    for decode_result in decode_results:
+        all_itl.extend(decode_result[1])
+        all_thpt.extend(decode_result[2])
+    d_itl, d_thpt = compute_parato(all_itl, all_thpt)
+
+    # Convert to numpy arrays
+    p_ttft = np.array(p_ttft)
+    p_thpt = np.array(p_thpt)
+    d_itl = np.array(d_itl)
+    d_thpt = np.array(d_thpt)
+
+    # These depend only on the decode front, so compute them once rather
+    # than once per prefill point.
+    tokens_per_user_array = 1000 / d_itl
+    decode_cost_array = osl * 1000 / d_thpt * gpu_cost_per_hour / 3600
+
+    # Calculate cost data
+    table_data = []
+    for prefill_ttft, prefill_thpt in zip(p_ttft, p_thpt):
+        prefill_cost = isl * 1000 / prefill_thpt * gpu_cost_per_hour / 3600
+        cost_array = decode_cost_array + prefill_cost
+
+        for itl, thpt, tokens_per_user, cost in zip(
+            d_itl, d_thpt, tokens_per_user_array, cost_array
+        ):
+            table_data.append(
+                [
+                    round(float(prefill_ttft), 3),
+                    round(float(prefill_thpt), 3),
+                    round(float(itl), 3),
+                    round(float(thpt), 3),
+                    round(float(tokens_per_user), 3),
+                    round(float(cost), 3),
+                ]
+            )
+
+    return table_data
+
+
+def build_all_tables(prefill_results, decode_results, isl, osl, gpu_cost_per_hour):
+    """
+    Render the prefill, decode and cost tables from profiling results.
+
+    Args:
+        prefill_results: Prefill profiling results
+        decode_results: Decode profiling results
+        isl: Input sequence length
+        osl: Output sequence length
+        gpu_cost_per_hour: Cost per GPU per hour
+
+    Returns:
+        Tuple of (prefill_table_html, decode_table_html, cost_table_html)
+    """
+    # Row data for all three tables is prepared first, then rendered in order.
+    tables = [
+        (PREFILL_TABLE_HEADERS, prepare_prefill_table_data(prefill_results)),
+        (DECODE_TABLE_HEADERS, prepare_decode_table_data(decode_results)),
+        (
+            COST_TABLE_HEADERS,
+            prepare_cost_table_data(
+                isl, osl, prefill_results, decode_results, gpu_cost_per_hour
+            ),
+        ),
+    ]
+    return tuple(build_table_html(headers, rows) for headers, rows in tables)
diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml
index d99b7c611c..f1aeb8fba8 100644
--- a/benchmarks/pyproject.toml
+++ b/benchmarks/pyproject.toml
@@ -48,6 +48,8 @@ dependencies = [
"types-tabulate",
"transformers",
"pytest-mypy",
+ "gradio>=5.49.1",
+ "plotly>=6.4.0",
]
[project.scripts]