diff --git a/benchmarks/profiler/webapp/__init__.py b/benchmarks/profiler/webapp/__init__.py new file mode 100644 index 0000000000..1b4f510e5f --- /dev/null +++ b/benchmarks/profiler/webapp/__init__.py @@ -0,0 +1,7 @@ +import importlib.metadata + +from benchmarks.profiler.webapp.main import main + +__version__ = importlib.metadata.version("aiconfigurator") + +main() diff --git a/benchmarks/profiler/webapp/core/__init__.py b/benchmarks/profiler/webapp/core/__init__.py new file mode 100644 index 0000000000..dce7de9dda --- /dev/null +++ b/benchmarks/profiler/webapp/core/__init__.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Core functionality for the Dynamo SLA Profiler webapp. + +This package contains: +- constants: Shared constants and configuration +- profiling: Performance profiling logic using AI Configurator +""" diff --git a/benchmarks/profiler/webapp/core/constants.py b/benchmarks/profiler/webapp/core/constants.py new file mode 100644 index 0000000000..08557007fd --- /dev/null +++ b/benchmarks/profiler/webapp/core/constants.py @@ -0,0 +1,153 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Constants and configuration for the Dynamo SLA Profiler webapp. 
+""" + +# Table headers for different performance metrics +PREFILL_TABLE_HEADERS = [ + "GPUs", + "TTFT (ms)", + "Throughput (tokens/s/GPU)", +] + +DECODE_TABLE_HEADERS = [ + "GPUs", + "ITL (ms)", + "Throughput (tokens/s/GPU)", +] + +COST_TABLE_HEADERS = [ + "TTFT (ms)", + "Prefill Thpt (tokens/s/GPU)", + "ITL (ms)", + "Decode Thpt (tokens/s/GPU)", + "Tokens/User", + "Cost ($)", +] + +# Backend version mapping +BACKEND_VERSIONS = { + "trtllm": ["1.0.0", "0.20.0", "0.19.0", "0.18.0"], + "vllm": ["0.10.0"], + "sglang": ["0.4.5"], +} + +# Supported GPU systems +GPU_SYSTEMS = [ + "H100_SXM", + "H200_SXM", + "A100_SXM", + "A100_PCIE", +] + +# Supported inference backends +INFERENCE_BACKENDS = ["vllm", "sglang", "trtllm"] + +# GPU count options +MIN_GPU_OPTIONS = [1, 2, 4, 8] +MAX_GPU_OPTIONS = [1, 2, 4, 8, 16] + +# Default decode interpolation granularity +DEFAULT_DECODE_INTERPOLATION_GRANULARITY = 6 + +# CSS styles for custom table rendering +TABLE_CSS = """ + +""" + +# Default configuration YAML placeholder +DEFAULT_CONFIG_YAML = """apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: vllm-disagg +spec: + services: + Frontend: + dynamoNamespace: vllm-disagg + componentType: frontend + replicas: 1 + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag""" + +# Plot interaction instructions +PLOT_INTERACTION_INSTRUCTIONS = """ +**How to interact with plots:** +- **Hover** over points to see detailed information +- **Click** points to select them (click again to deselect) +- **Multiple selection**: Click multiple points with shift key or select tools from the top right corner to compare specific configurations +- The table below each plot will filter to show only selected points, or all points if none are selected +""" + +# Tab descriptions +PREFILL_TAB_DESCRIPTION = """ +**Prefill Performance**: Interactive plot showing the relationship between Time to First Token (TTFT) +and throughput per GPU for different GPU 
counts. **Click points to select/deselect** (multi-select enabled). +Table shows selected points, or all points if none selected. +""" + +DECODE_TAB_DESCRIPTION = """ +**Decode Performance**: Interactive plot showing the relationship between Inter Token Latency (ITL) +and throughput per GPU for different GPU counts. **Click points to select/deselect** (multi-select enabled). +Table shows selected points, or all points if none selected. +""" + +COST_TAB_DESCRIPTION = """ +**Cost Analysis**: Interactive plot showing the cost per 1000 requests under different SLA configurations. +Lower curves represent better cost efficiency for the same throughput. **Click points to select/deselect** (multi-select enabled). +Table shows selected points, or all points if none selected. +""" + +# Plotly color palette +PLOTLY_COLORS = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b"] + +# Plotly dark theme configuration +PLOTLY_DARK_THEME = { + "template": "plotly_dark", + "plot_bgcolor": "rgba(0, 0, 0, 0)", + "paper_bgcolor": "rgba(0, 0, 0, 0)", + "modebar": dict( + bgcolor="rgba(0, 0, 0, 0)", + color="rgba(255, 255, 255, 0.5)", + activecolor="rgba(255, 255, 255, 0.9)", + ), + "legend": dict( + yanchor="top", + y=0.99, + xanchor="left", + x=0.01, + bgcolor="rgba(0, 0, 0, 0.5)", + font=dict(color="white"), + ), +} diff --git a/benchmarks/profiler/webapp/core/orchestrator.py b/benchmarks/profiler/webapp/core/orchestrator.py new file mode 100644 index 0000000000..c857cf8c07 --- /dev/null +++ b/benchmarks/profiler/webapp/core/orchestrator.py @@ -0,0 +1,185 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Orchestration logic for generating performance plots. + +This module contains the main pipeline that coordinates profiling, +plot generation, and table building. 
from benchmarks.profiler.webapp.core.profiling import (
    format_status_message,
    generate_gpu_configurations,
    initialize_ai_configurator,
    profile_decode_performance,
    profile_prefill_performance,
    validate_inputs,
)
from benchmarks.profiler.webapp.ui.plots import (
    plot_cost_sla_interactive,
    plot_decode_performance_interactive,
    plot_prefill_performance_interactive,
)
from benchmarks.profiler.webapp.ui.tables import build_all_tables, get_empty_tables


def generate_plots(
    aic_model_name: str,
    backend: str,
    config_yaml: str,
    use_aic: bool,
    aic_backend: str,
    aic_backend_version: str,
    aic_system: str,
    min_num_gpus_per_engine: int,
    max_num_gpus_per_engine: int,
    num_gpus_per_node: int,
    gpu_cost_per_hour: float,
    isl: int,
    osl: int,
    max_context_length: int,
    ttft: float,
    itl: float,
):
    """
    Run the estimation pipeline and return everything the results pane displays.

    The pipeline runs in a fixed order: validate the inputs, build the
    estimator, choose the GPU counts, profile prefill, profile decode, then
    render the three plots, the status text and the three HTML tables.
    Each stage that yields nothing ends the run early with a status message.

    Args:
        aic_model_name: Model name handed to the estimator
        backend: Accepted for signature compatibility; not read here
        config_yaml: Accepted for signature compatibility; not read here
        use_aic: Forwarded to validate_inputs
        aic_backend: Backend handed to the estimator
        aic_backend_version: Backend version handed to the estimator
        aic_system: GPU system name handed to the estimator
        min_num_gpus_per_engine: Lower bound for the GPU counts to profile
        max_num_gpus_per_engine: Upper bound for the GPU counts to profile
        num_gpus_per_node: Accepted for signature compatibility; not read here
        gpu_cost_per_hour: Forwarded to the cost plot, status text and tables
        isl: Input sequence length
        osl: Output sequence length
        max_context_length: Accepted for signature compatibility; not read here
        ttft: Target TTFT passed to the prefill plot
        itl: Target ITL passed to the decode plot

    Returns:
        Tuple of (prefill_plot, decode_plot, cost_plot, status_message,
        prefill_table_html, decode_table_html, cost_table_html). On any
        failure the three plots are None and the tables are the empty
        placeholders from get_empty_tables().
    """
    blank_prefill, blank_decode, blank_cost = get_empty_tables()

    def _failure(message):
        # Every failure path has the same shape: no plots, a status message,
        # and the placeholder tables.
        return (
            None,
            None,
            None,
            message,
            blank_prefill,
            blank_decode,
            blank_cost,
        )

    try:
        inputs_ok, problem = validate_inputs(
            use_aic, aic_model_name, aic_system, aic_backend_version
        )
        if not inputs_ok:
            return _failure(problem)

        estimator = initialize_ai_configurator(
            aic_model_name, aic_system, aic_backend, aic_backend_version
        )

        gpu_counts = generate_gpu_configurations(
            min_num_gpus_per_engine, max_num_gpus_per_engine
        )
        if not gpu_counts:
            return _failure("❌ No valid GPU configurations to profile")

        prefill = profile_prefill_performance(estimator, gpu_counts, isl)
        # Element 0 is the list of GPU counts; empty means nothing was profiled.
        if not prefill[0]:
            return _failure("❌ Failed to generate prefill results")

        decode = profile_decode_performance(estimator, gpu_counts, isl, osl)
        if not decode:
            return _failure("❌ Failed to generate decode results")

        prefill_fig = plot_prefill_performance_interactive(prefill, ttft)
        decode_fig = plot_decode_performance_interactive(decode, itl)
        cost_fig = plot_cost_sla_interactive(
            isl, osl, prefill, decode, gpu_cost_per_hour
        )

        summary = format_status_message(gpu_counts, prefill, gpu_cost_per_hour)

        prefill_table, decode_table, cost_table = build_all_tables(
            prefill, decode, isl, osl, gpu_cost_per_hour
        )
    except Exception as exc:
        # UI boundary: surface the failure in the status box instead of
        # letting the callback raise.
        import traceback

        return _failure(
            f"❌ Error generating plots:\n{exc!s}\n\n{traceback.format_exc()}"
        )

    return (
        prefill_fig,
        decode_fig,
        cost_fig,
        summary,
        prefill_table,
        decode_table,
        cost_table,
    )
import math

from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
from benchmarks.profiler.utils.profile_decode import get_num_request_range
from benchmarks.profiler.webapp.core.constants import (
    DEFAULT_DECODE_INTERPOLATION_GRANULARITY,
)


def validate_inputs(use_aic, aic_model_name, aic_system, aic_backend_version):
    """
    Check that the AI Configurator inputs are usable.

    Args:
        use_aic: Whether AI Configurator is enabled
        aic_model_name: Model name for AI Configurator
        aic_system: GPU system name
        aic_backend_version: Backend version

    Returns:
        Tuple of (is_valid, error_message); error_message is None when valid
    """
    if not use_aic:
        return False, "❌ Web UI requires AI Configurator mode"

    # Any falsy value (None, empty string) counts as missing.
    if not all((aic_model_name, aic_system, aic_backend_version)):
        return False, "❌ Missing required AI Configurator parameters"

    return True, None


def initialize_ai_configurator(
    aic_model_name, aic_system, aic_backend, aic_backend_version
):
    """
    Build the AI Configurator performance estimator.

    Args:
        aic_model_name: Model name for AI Configurator
        aic_system: GPU system (e.g., "H200_SXM"); lower-cased before use
        aic_backend: Backend for AI Configurator estimation
        aic_backend_version: Version of the backend

    Returns:
        AIConfiguratorPerfEstimator instance
    """
    return AIConfiguratorPerfEstimator(
        aic_model_name,
        aic_system.lower(),
        aic_backend,
        aic_backend_version,
    )


def generate_gpu_configurations(min_num_gpus, max_num_gpus):
    """
    List the powers of two that lie within [min_num_gpus, max_num_gpus].

    Args:
        min_num_gpus: Minimum number of GPUs
        max_num_gpus: Maximum number of GPUs

    Returns:
        Ascending list of GPU counts to profile. The list is empty when
        the range contains no power of two, including when max_num_gpus
        is below 1 or smaller than min_num_gpus.
    """
    # math.log2 raises ValueError for values <= 0; no power of two fits
    # below 1 anyway, so report "nothing to profile" instead of crashing.
    if max_num_gpus < 1:
        return []

    highest_exponent = int(math.log2(max_num_gpus))
    # Keep the explicit upper-bound test: it guards against log2 rounding up
    # for values just below a large power of two.
    return [
        2**exponent
        for exponent in range(highest_exponent + 1)
        if min_num_gpus <= 2**exponent <= max_num_gpus
    ]


def profile_prefill_performance(ai_configurator, profile_num_gpus, isl):
    """
    Estimate prefill latency and per-GPU throughput for each GPU count.

    Args:
        ai_configurator: AIConfiguratorPerfEstimator instance
        profile_num_gpus: List of GPU counts to profile
        isl: Input sequence length

    Returns:
        Tuple of (num_gpus_list, ttft_list, thpt_per_gpu_list), index-aligned
    """
    prefill_num_gpus = []
    prefill_ttft = []
    prefill_thpt_per_gpu = []

    for num_gpus in profile_num_gpus:
        perf_dict = ai_configurator.estimate_prefill_perf(
            isl,
            tp_size=num_gpus,
        )
        ttft_val = perf_dict["context_latency"]
        # Tokens/second/GPU. The factor of 1000 treats the latency as
        # milliseconds, matching the "TTFT (ms)" labels used by the UI.
        thpt_val = isl / ttft_val * 1000 / num_gpus

        prefill_num_gpus.append(num_gpus)
        prefill_ttft.append(ttft_val)
        prefill_thpt_per_gpu.append(thpt_val)

    return (prefill_num_gpus, prefill_ttft, prefill_thpt_per_gpu)


def profile_decode_performance(
    ai_configurator,
    profile_num_gpus,
    isl,
    osl,
    decode_interpolation_granularity=DEFAULT_DECODE_INTERPOLATION_GRANULARITY,
):
    """
    Estimate decode latency and throughput over a sweep of request counts.

    Args:
        ai_configurator: AIConfiguratorPerfEstimator instance
        profile_num_gpus: List of GPU counts to profile
        isl: Input sequence length
        osl: Output sequence length
        decode_interpolation_granularity: Granularity passed to
            get_num_request_range when building the sweep

    Returns:
        List of tuples (num_gpus, itl_list, thpt_per_gpu_list); GPU counts
        whose sweep produced no points are omitted
    """
    decode_results = []
    # For dense models (not MoE), attention_dp_size = 1
    attention_dp_size = 1

    for num_gpus in profile_num_gpus:
        # The largest batch the estimator reports for this configuration
        # bounds the request sweep.
        max_concurrency = ai_configurator.get_max_batch_size(isl, osl, tp_size=num_gpus)

        sweep_num_request = get_num_request_range(
            attention_dp_size,
            max_concurrency,
            decode_interpolation_granularity,
        )

        engine_decode_itl = []
        engine_decode_thpt_per_gpu = []

        for num_request in sweep_num_request:
            perf_dict = ai_configurator.estimate_perf(
                isl,
                osl,
                num_request,
                mode="decode",
                tp_size=num_gpus,
            )

            engine_decode_itl.append(perf_dict["tpot"])
            engine_decode_thpt_per_gpu.append(perf_dict["tokens/s/gpu"])

        # Skip configurations with an empty sweep so callers never see
        # a GPU count with no data points.
        if engine_decode_itl:
            decode_results.append(
                (num_gpus, engine_decode_itl, engine_decode_thpt_per_gpu)
            )

    return decode_results


def format_status_message(profile_num_gpus, prefill_results, gpu_cost_per_hour):
    """
    Build the success status text summarising a profiling run.

    Args:
        profile_num_gpus: List of GPU counts profiled
        prefill_results: Tuple of (num_gpus_list, ttft_list, thpt_per_gpu_list)
        gpu_cost_per_hour: Cost per GPU per hour

    Returns:
        Formatted multi-line status message string

    Raises:
        ValueError: If the TTFT list in prefill_results is empty
    """
    prefill_num_gpus, prefill_ttft, _ = prefill_results

    # Find the lowest TTFT once and reuse it for both the value and its index.
    best_ttft = min(prefill_ttft)
    best_num_gpus = prefill_num_gpus[prefill_ttft.index(best_ttft)]

    return (
        f"✅ Plots generated successfully!\n"
        f"📊 Profiled {len(profile_num_gpus)} GPU configurations: {profile_num_gpus}\n"
        f"⚡ Best prefill: {best_ttft:.1f}ms TTFT at {best_num_gpus} GPUs\n"
        f"💰 GPU Cost: ${gpu_cost_per_hour:.2f}/hour"
    )
+""" + +from benchmarks.profiler.webapp.ui.app import build_interface + + +def main(): + """Launch the Dynamo SLA Profiler webapp.""" + # Load custom JavaScript for enhanced interactivity + with open("benchmarks/profiler/webapp/static/utils.js", "r") as f: + custom_js = f"()=>{{{f.read()}}}" + + # Build and launch the interface + demo = build_interface(custom_js) + demo.launch(server_name="0.0.0.0") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/profiler/webapp/static/utils.js b/benchmarks/profiler/webapp/static/utils.js new file mode 100644 index 0000000000..4390769ffa --- /dev/null +++ b/benchmarks/profiler/webapp/static/utils.js @@ -0,0 +1,308 @@ +// Storage for selected points (multi-selection) +const selectedPointKeys = { + prefill: [], + decode: [], + cost: [] +}; + +// Storage for all data points +const allDataPoints = { + prefill: [], + decode: [], + cost: [] +}; + +// Lookup from point key to row values +const pointDataLookup = { + prefill: {}, + decode: {}, + cost: {} +}; + +const tableHeaders = { + prefill: ["GPUs", "TTFT (ms)", "Throughput (tokens/s/GPU)"], + decode: ["GPUs", "ITL (ms)", "Throughput (tokens/s/GPU)"], + cost: [ + "TTFT (ms)", + "Prefill Thpt (tokens/s/GPU)", + "ITL (ms)", + "Decode Thpt (tokens/s/GPU)", + "Tokens/User", + "Cost ($)" + ] +}; + +function getTraceUid(trace, fallbackIndex) { + if (!trace) { + return `trace-${fallbackIndex}`; + } + return trace.uid || `trace-${fallbackIndex}`; +} + +function makePointKey(traceUid, pointIndex) { + return `${traceUid}:${pointIndex}`; +} + +function getDisplayRows(plotType) { + if (!selectedPointKeys[plotType] || selectedPointKeys[plotType].length === 0) { + return allDataPoints[plotType].map((row) => row.values); + } + + const lookup = pointDataLookup[plotType] || {}; + return selectedPointKeys[plotType] + .map((key) => lookup[key]) + .filter(Boolean) + .map((row) => row.values); +} + +function computeSelectedKeys(plotDiv, lookup) { + const keys = []; + if (!plotDiv || 
!plotDiv.data) { + return keys; + } + + plotDiv.data.forEach((trace, traceIdx) => { + if (!trace) { + return; + } + + const traceUid = getTraceUid(trace, traceIdx); + const selectedPoints = trace.selectedpoints; + + if (!Array.isArray(selectedPoints) || selectedPoints.length === 0) { + return; + } + + selectedPoints.forEach((pointIndex) => { + const key = makePointKey(traceUid, pointIndex); + if (!lookup || lookup[key]) { + keys.push(key); + } + }); + }); + + return keys; +} + +function normalizeRow(row) { + if (row == null) { + return []; + } + if (Array.isArray(row)) { + return row.slice(); + } + if (typeof row === "object") { + if (typeof row[Symbol.iterator] === "function") { + return Array.from(row); + } + return Object.values(row); + } + return [row]; +} + +function formatCell(value) { + if (value == null) { + return ""; + } + if (typeof value === "number" && Number.isFinite(value)) { + if (Number.isInteger(value)) { + return value.toString(); + } + return value.toFixed(3); + } + return `${value}`; +} + +function renderTableHTML(headers, rows) { + const safeHeaders = headers || []; + const headerCells = safeHeaders.map((header) => `${header}`).join(""); + + let bodyHtml = ""; + if (!rows || rows.length === 0) { + bodyHtml = `No data selected yet. Click points on the plot to populate this table.`; + } else { + bodyHtml = rows + .map((row) => { + const normalized = normalizeRow(row); + const length = safeHeaders.length > 0 ? safeHeaders.length : normalized.length; + const cells = Array.from({ length }, (_, idx) => { + const value = normalized[idx]; + return `${formatCell(value)}`; + }); + return `${cells.join("")}`; + }) + .join(""); + } + + return ` +
+ + ${headerCells} + ${bodyHtml} +
+
+ `; +} + +function updateDataTable(tableId, data, plotType) { + const container = document.getElementById(tableId); + if (!container) { + console.log(`Table container ${tableId} not found`); + return; + } + + const headers = tableHeaders[plotType] || []; + container.innerHTML = renderTableHTML(headers, data); + console.log(`Updated table ${tableId} with ${data ? data.length : 0} rows`); +} + +function resizePlotlyGraphs() { + const plots = document.querySelectorAll('.js-plotly-plot'); + console.log(`Found ${plots.length} Plotly graphs`); + for (let i = 0; i < plots.length; i++) { + if (window.Plotly && plots[i]) { + window.Plotly.relayout(plots[i], {autosize: true}); + console.log(`Resized plot ${i}`); + } + } +} + +function setupPlotClickHandler(plotId, tableId, plotType) { + const attemptSetup = () => { + const plotContainer = document.querySelector(`#${plotId}`); + if (!plotContainer) { + console.log(`Plot ${plotId} not found, retrying...`); + setTimeout(attemptSetup, 500); + return; + } + + const plotDiv = plotContainer.querySelector('.js-plotly-plot'); + if (!plotDiv) { + console.log(`Plotly div not found in ${plotId}, retrying...`); + setTimeout(attemptSetup, 500); + return; + } + + console.log(`Setting up handlers for ${plotId}`); + + const headers = tableHeaders[plotType] || []; + + const syncSelection = (source) => { + const lookup = pointDataLookup[plotType] || {}; + const keys = computeSelectedKeys(plotDiv, lookup); + selectedPointKeys[plotType] = keys; + updateDataTable(tableId, getDisplayRows(plotType), plotType); + console.log(`Selection synced for ${plotType} (${source || 'update'}): ${keys.length} point(s)`); + }; + + const refreshAllDataPoints = () => { + if (!plotDiv || !plotDiv.data) { + return; + } + + const rows = []; + const lookup = {}; + plotDiv.data.forEach((trace, traceIdx) => { + if (!trace || !trace.customdata) { + return; + } + + const traceUid = getTraceUid(trace, traceIdx); + + trace.customdata.forEach((item, pointIndex) => { + const 
normalized = normalizeRow(item); + if (normalized.length === 0) { + return; + } + + const alignedRow = headers.length + ? headers.map((_, idx) => normalized[idx]) + : normalized; + + const key = makePointKey(traceUid, pointIndex); + const rowObj = { key, values: alignedRow }; + rows.push(rowObj); + lookup[key] = rowObj; + }); + }); + + const newHash = JSON.stringify(rows.map((row) => [row.key, row.values])); + if (plotDiv.__dynamo_data_hash !== newHash) { + plotDiv.__dynamo_data_hash = newHash; + allDataPoints[plotType] = rows; + pointDataLookup[plotType] = lookup; + syncSelection('data-refresh'); + console.log(`Stored ${rows.length} data points for ${plotType}`); + } + }; + + refreshAllDataPoints(); + + if (plotDiv.on) { + plotDiv.on('plotly_afterplot', refreshAllDataPoints); + plotDiv.on('plotly_restyle', refreshAllDataPoints); + plotDiv.on('plotly_relayout', refreshAllDataPoints); + } + + plotDiv.on('plotly_click', function(data) { + console.log(`Click detected on ${plotId}`, data); + if (data.points && data.points.length > 0) { + setTimeout(() => syncSelection('click'), 0); + } + }); + + if (plotDiv.on) { + plotDiv.on('plotly_selected', function(eventData) { + if (!eventData || !eventData.points) { + return; + } + + syncSelection('selection-tool'); + }); + + plotDiv.on('plotly_deselect', function() { + syncSelection('deselect'); + }); + } + + console.log(`Handlers configured for ${plotId}`); + }; + + setTimeout(attemptSetup, 500); +} + +// Wait for DOM to be ready and set up observers +setTimeout(() => { + // Find all tab buttons and add click listeners + const tabButtons = document.querySelectorAll('button[role="tab"]'); + tabButtons.forEach(button => { + button.addEventListener('click', () => { + setTimeout(resizePlotlyGraphs, 150); + }); + }); + + // Use MutationObserver to detect tab visibility changes + const observer = new MutationObserver(() => { + resizePlotlyGraphs(); + }); + + // Observe changes to elements with tab content + const tabPanels = 
document.querySelectorAll('[role="tabpanel"]'); + tabPanels.forEach(panel => { + observer.observe(panel, { + attributes: true, + attributeFilter: ['style', 'class', 'hidden'] + }); + }); + + // Initial resize + resizePlotlyGraphs(); + + // Setup click handlers for all plots + setupPlotClickHandler('prefill_plot', 'prefill_table', 'prefill'); + setupPlotClickHandler('decode_plot', 'decode_table', 'decode'); + setupPlotClickHandler('cost_plot', 'cost_table', 'cost'); +}, 1000); + +// Also resize on window resize +window.addEventListener('resize', resizePlotlyGraphs); \ No newline at end of file diff --git a/benchmarks/profiler/webapp/ui/__init__.py b/benchmarks/profiler/webapp/ui/__init__.py new file mode 100644 index 0000000000..02040289f5 --- /dev/null +++ b/benchmarks/profiler/webapp/ui/__init__.py @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +UI components for the Dynamo SLA Profiler webapp. + +This package contains: +- settings: Model, hardware, and SLA configuration UI components +- results: Results tabs with plots and tables +- handlers: Event handlers for UI interactions +- plots: Interactive Plotly plotting functions +- tables: Table building and data preparation utilities +""" diff --git a/benchmarks/profiler/webapp/ui/app.py b/benchmarks/profiler/webapp/ui/app.py new file mode 100644 index 0000000000..b76db0bd20 --- /dev/null +++ b/benchmarks/profiler/webapp/ui/app.py @@ -0,0 +1,99 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Gradio application interface builder for the Dynamo SLA Profiler. + +This module builds the complete Gradio interface by assembling +all UI components and setting up event handlers. 
import gradio as gr

from benchmarks.profiler.webapp.core.constants import (
    PLOT_INTERACTION_INSTRUCTIONS,
    TABLE_CSS,
)
from benchmarks.profiler.webapp.core.orchestrator import generate_plots
from benchmarks.profiler.webapp.ui.handlers import setup_event_handlers
from benchmarks.profiler.webapp.ui.results import create_results_tabs
from benchmarks.profiler.webapp.ui.settings import (
    create_hardware_settings,
    create_model_settings,
    create_sla_settings,
)
from benchmarks.profiler.webapp.ui.tables import get_empty_tables


def build_interface(custom_js: str = None) -> gr.Blocks:
    """
    Assemble the SLA Profiler page and wire up its event handlers.

    The page has a header, a left column of settings with the generate
    button and status box, and a right column holding the results tabs.
    Every component is collected in one dictionary, which is then passed to
    setup_event_handlers together with generate_plots.

    Args:
        custom_js: Optional JavaScript passed to gr.Blocks as its js argument

    Returns:
        Configured Gradio Blocks interface
    """
    # Components are placed in creation order, so the settings sections are
    # listed in the order they appear on the page.
    settings_sections = (
        ("### Dynamo Settings", create_model_settings),
        ("### Hardware Settings", create_hardware_settings),
        ("### SLA Settings", create_sla_settings),
    )

    with gr.Blocks(title="Dynamo SLA Profiler", js=custom_js) as demo:
        gr.Markdown("# Dynamo SLA Profiler")
        gr.Markdown(
            "Generate performance plots using AI Configurator to estimate profiling results. "
            "Configure the parameters below and click 'Generate Plots' to see the results."
        )
        gr.HTML(TABLE_CSS)

        # Placeholder tables shown until the first run completes.
        blank_prefill, blank_decode, blank_cost = get_empty_tables()

        components = {}

        with gr.Row():
            # Left panel: settings, generate button and status.
            with gr.Column(scale=1):
                for heading, make_section in settings_sections:
                    gr.Markdown(heading)
                    components.update(make_section())

                generate_button = gr.Button(
                    "Generate Performance Plots", variant="primary", size="lg"
                )
                components["generate_btn"] = generate_button

                status_box = gr.Textbox(
                    label="Status",
                    value="Ready to generate plots",
                    interactive=False,
                    show_label=False,
                    lines=5,
                )
                components["status"] = status_box

            # Right panel: results.
            with gr.Column(min_width=700):
                gr.Markdown("### Performance Results")
                gr.Markdown(PLOT_INTERACTION_INSTRUCTIONS)
                components.update(
                    create_results_tabs(blank_prefill, blank_decode, blank_cost)
                )

        # The handlers need the Blocks object itself for the on-load event.
        components["demo"] = demo

        # Listeners are registered while the Blocks context is still open.
        setup_event_handlers(components, generate_plots)

    return demo
# --- benchmarks/profiler/webapp/ui/handlers.py ---

import gradio as gr

from benchmarks.profiler.webapp.core.constants import BACKEND_VERSIONS


def setup_event_handlers(components, generate_plots_fn):
    """
    Register every UI event listener for the profiler page.

    Args:
        components: Dictionary of all UI components, keyed by name
        generate_plots_fn: Callback run by the generate button and on page load

    Returns:
        None (listeners are attached to the components in place)
    """
    # Order matters: it must match the positional parameters of the callback.
    input_names = (
        "aic_model_name",
        "backend",
        "config_yaml",
        "use_aic",
        "aic_backend",
        "aic_backend_version",
        "aic_system",
        "min_num_gpus_per_engine",
        "max_num_gpus_per_engine",
        "num_gpus_per_node",
        "gpu_cost_per_hour",
        "isl",
        "osl",
        "max_context_length",
        "ttft",
        "itl",
    )
    # Order matters: it must match the tuple returned by the callback.
    output_names = (
        "prefill_plot",
        "decode_plot",
        "cost_plot",
        "status",
        "prefill_table",
        "decode_table",
        "cost_table",
    )
    inputs = [components[name] for name in input_names]
    outputs = [components[name] for name in output_names]

    # The button click and the initial page load run the same callback with
    # the same wiring, so plots appear with default values on first visit.
    wiring = dict(fn=generate_plots_fn, inputs=inputs, outputs=outputs)
    components["generate_btn"].click(**wiring)
    components["demo"].load(**wiring)

    def _toggle_aic_fields(enabled):
        # Show or hide both AI Configurator dropdowns together.
        return gr.update(visible=enabled), gr.update(visible=enabled)

    components["use_aic"].change(
        fn=_toggle_aic_fields,
        inputs=[components["use_aic"]],
        outputs=[components["aic_backend"], components["aic_backend_version"]],
    )

    def update_backend_versions(backend):
        # Unknown backends fall back to a single "1.0.0" choice.
        versions = BACKEND_VERSIONS.get(backend, ["1.0.0"])
        return gr.update(choices=versions, value=versions[0])

    components["aic_backend"].change(
        fn=update_backend_versions,
        inputs=[components["aic_backend"]],
        outputs=[components["aic_backend_version"]],
    )


# --- benchmarks/profiler/webapp/ui/plots.py ---

import numpy as np
import plotly.graph_objects as go

from benchmarks.profiler.utils.parato import compute_parato
from benchmarks.profiler.webapp.core.constants import PLOTLY_COLORS, PLOTLY_DARK_THEME


def _configure_dark_theme(fig, title, xaxis_title, yaxis_title):
    """
    Apply the shared dark-theme layout and grid styling to a figure.

    Args:
        fig: Plotly Figure object (modified in place)
        title: Plot title
        xaxis_title: X-axis title
        yaxis_title: Y-axis title
    """
    # Long titles get a slightly smaller font.
    title_font_size = 18 if len(title) < 60 else 16

    fig.update_layout(
        title={
            "text": title,
            "x": 0.5,
            "xanchor": "center",
            "font": {"size": title_font_size},
        },
        xaxis_title=xaxis_title,
        yaxis_title=yaxis_title,
        hovermode="closest",
        showlegend=True,
        autosize=True,
        clickmode="event+select",  # clicking a point also selects it
        **PLOTLY_DARK_THEME,
    )

    # Both axes share the same faint grey grid.
    grid_style = dict(showgrid=True, gridwidth=1, gridcolor="rgba(128, 128, 128, 0.3)")
    fig.update_xaxes(**grid_style)
    fig.update_yaxes(**grid_style)


def _add_target_line(fig, target_value, label, max_y):
    """
    Draw a dashed red vertical reference line on a plot.

    Args:
        fig: Plotly Figure object (modified in place)
        target_value: X-coordinate of the vertical line
        label: Legend name and hover text for the line
        max_y: Largest Y value in the data; the line extends 10% above it
    """
    # NOTE(review): the hover template is only the label as shown here —
    # confirm no trailing markup is expected after it.
    reference_line = go.Scatter(
        x=[target_value, target_value],
        y=[0, max_y * 1.1],
        mode="lines",
        line=dict(color="red", width=2, dash="dash"),
        name=label,
        hovertemplate=f"{label}",
    )
    fig.add_trace(reference_line)
+ + Args: + fig: Plotly Figure object + target_value: X-coordinate of the vertical line + label: Label for the target line + max_y: Maximum Y value for the line + """ + fig.add_trace( + go.Scatter( + x=[target_value, target_value], + y=[0, max_y * 1.1], + mode="lines", + line=dict(color="red", width=2, dash="dash"), + name=label, + hovertemplate=f"{label}", + ) + ) + + +def _configure_selection_style(fig, mode, selected_color="red", selected_size=16): + """ + Configure selection appearance for interactive plots. + + Args: + fig: Plotly Figure object + mode: Trace mode (e.g., "markers+text", "lines+markers") + selected_color: Color for selected markers + selected_size: Size for selected markers + """ + fig.update_traces( + selected=dict(marker=dict(color=selected_color, size=selected_size)), + unselected=dict(marker=dict(opacity=0.4 if "text" in mode else 0.5)), + selector=dict(mode=mode), + ) + + +def plot_prefill_performance_interactive( + prefill_results: tuple, target_ttft: float +) -> go.Figure: + """ + Create interactive Plotly plot for prefill performance. + + Args: + prefill_results: Tuple of (num_gpus_list, ttft_list, thpt_per_gpu_list) + target_ttft: Target TTFT in milliseconds (for reference line) + + Returns: + Plotly Figure object for Gradio gr.Plot + """ + num_gpus_list, ttft_list, thpt_per_gpu_list = prefill_results + + fig = go.Figure() + + # Add scatter plot for data points with custom data + fig.add_trace( + go.Scatter( + x=ttft_list, + y=thpt_per_gpu_list, + mode="markers+text", + marker=dict(size=12, color="blue", line=dict(width=2, color="darkblue")), + text=[f"{n} GPU(s)" for n in num_gpus_list], + textposition="top center", + textfont=dict(size=10), + name="GPU Configurations", + hovertemplate="%{text}
" + + "TTFT: %{x:.2f} ms
" + + "Throughput: %{y:.2f} tokens/s/GPU
" + + "", + customdata=list(zip(num_gpus_list, ttft_list, thpt_per_gpu_list)), + ) + ) + + # Add target TTFT line + max_thpt = max(thpt_per_gpu_list) if thpt_per_gpu_list else 1000 + _add_target_line(fig, target_ttft, f"Target TTFT: {target_ttft} ms", max_thpt) + + # Apply dark theme and configure layout + _configure_dark_theme( + fig, + "Prefill Performance", + "Time to First Token (ms)", + "Prefill Throughput per GPU (tokens/s/GPU)", + ) + + # Configure selection appearance + _configure_selection_style( + fig, "markers+text", selected_color="red", selected_size=16 + ) + + return fig + + +def plot_decode_performance_interactive( + decode_results: list, target_itl: float +) -> go.Figure: + """ + Create interactive Plotly plot for decode performance. + + Args: + decode_results: List of tuples (num_gpus, itl_list, thpt_per_gpu_list) + target_itl: Target ITL in milliseconds (for reference line) + + Returns: + Plotly Figure object for Gradio gr.Plot + """ + fig = go.Figure() + + # Plot each GPU configuration + for idx, (num_gpus, itl_list, thpt_per_gpu_list) in enumerate(decode_results): + color = PLOTLY_COLORS[idx % len(PLOTLY_COLORS)] + # Prepare custom data for each point + customdata = [ + [num_gpus, itl, thpt] for itl, thpt in zip(itl_list, thpt_per_gpu_list) + ] + + fig.add_trace( + go.Scatter( + x=itl_list, + y=thpt_per_gpu_list, + mode="lines+markers", + marker=dict(size=8, color=color), + line=dict(color=color, width=2), + name=f"{num_gpus} GPU(s)", + hovertemplate=f"{num_gpus} GPU(s)
" + + "ITL: %{x:.2f} ms
" + + "Throughput: %{y:.2f} tokens/s/GPU
" + + "", + customdata=customdata, + ) + ) + + # Add target ITL line + all_thpt = [ + thpt for _, _, thpt_list in decode_results for thpt in thpt_list if thpt_list + ] + max_thpt = max(all_thpt) if all_thpt else 1000 + _add_target_line(fig, target_itl, f"Target ITL: {target_itl} ms", max_thpt) + + # Apply dark theme and configure layout + _configure_dark_theme( + fig, + "Decode Performance", + "Inter Token Latency (ms)", + "Decode Throughput per GPU (tokens/s/GPU)", + ) + + # Configure selection appearance for markers + _configure_selection_style( + fig, "lines+markers", selected_color="yellow", selected_size=12 + ) + + return fig + + +def plot_cost_sla_interactive( + isl: int, + osl: int, + prefill_results: tuple, + decode_results: list, + gpu_cost_per_hour: float = 3.0, +) -> go.Figure: + """ + Create interactive Plotly plot for cost vs SLA analysis. + + Args: + isl: Input sequence length + osl: Output sequence length + prefill_results: Tuple of (num_gpus, ttft, thpt_per_gpu) for prefill + decode_results: List of tuples (num_gpus, itl_list, thpt_per_gpu_list) for decode + gpu_cost_per_hour: Cost per GPU per hour in dollars (default: 3.0) + + Returns: + Plotly Figure object for Gradio gr.Plot + """ + # Compute Pareto fronts + p_ttft, p_thpt = compute_parato(prefill_results[1], prefill_results[2]) + + _d_itl, _d_thpt = [], [] + for _d_result in decode_results: + _d_itl.extend(_d_result[1]) + _d_thpt.extend(_d_result[2]) + d_itl, d_thpt = compute_parato(_d_itl, _d_thpt) + + # Convert to numpy arrays for element-wise operations + p_ttft = np.array(p_ttft) + p_thpt = np.array(p_thpt) + d_itl = np.array(d_itl) + d_thpt = np.array(d_thpt) + + # Calculate cost metrics + fig = go.Figure() + + for idx, (_p_ttft, _p_thpt) in enumerate(zip(p_ttft, p_thpt)): + # Calculate costs for this TTFT curve + prefill_cost = isl * 1000 / _p_thpt * gpu_cost_per_hour / 3600 + + # Calculate tokens per user and cost arrays (element-wise operations) + tokens_per_user_array = 1000 / d_itl # 
Element-wise division with numpy array + cost_array = osl * 1000 / d_thpt * gpu_cost_per_hour / 3600 + prefill_cost + + color = PLOTLY_COLORS[idx % len(PLOTLY_COLORS)] + + # Prepare custom data for each point + customdata = [ + [ + _p_ttft, + _p_thpt, + float(d_itl[i]), + float(d_thpt[i]), + float(tokens_per_user_array[i]), + float(cost_array[i]), + ] + for i in range(len(d_itl)) + ] + + # Add line plot for this TTFT curve + fig.add_trace( + go.Scatter( + x=tokens_per_user_array, + y=cost_array, + mode="lines+markers", + marker=dict(size=10, symbol="x", color=color, line=dict(width=2)), + line=dict(color=color, width=2), + name=f"TTFT: {_p_ttft:.2f}ms", + hovertemplate=f"TTFT: {_p_ttft:.2f}ms
" + + "Tokens/User: %{x:.2f}
" + + "Cost: $%{y:.3f}
" + + "", + customdata=customdata, + ) + ) + + # Apply dark theme and configure layout + title = f"Cost Per 1000 i{isl}o{osl} requests (GPU/hour = ${gpu_cost_per_hour:.2f}) Under Different SLA" + _configure_dark_theme(fig, title, "Tokens per User", "Cost ($)") + + # Configure selection appearance for markers + _configure_selection_style( + fig, "lines+markers", selected_color="yellow", selected_size=14 + ) + + return fig diff --git a/benchmarks/profiler/webapp/ui/results.py b/benchmarks/profiler/webapp/ui/results.py new file mode 100644 index 0000000000..2496f5cf1d --- /dev/null +++ b/benchmarks/profiler/webapp/ui/results.py @@ -0,0 +1,77 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +UI components for results display in the Dynamo SLA Profiler webapp. + +This module provides functions to build the results tabs with plots and tables. +""" + +import gradio as gr + +from benchmarks.profiler.webapp.core.constants import ( + COST_TAB_DESCRIPTION, + DECODE_TAB_DESCRIPTION, + PREFILL_TAB_DESCRIPTION, +) + + +def create_results_tabs(empty_prefill_html, empty_decode_html, empty_cost_html): + """ + Create the results tabs with plots and tables. 
+ + Args: + empty_prefill_html: Empty prefill table HTML + empty_decode_html: Empty decode table HTML + empty_cost_html: Empty cost table HTML + + Returns: + Dictionary of Gradio components + """ + with gr.Tab("Prefill Performance"): + prefill_plot = gr.Plot( + label="Prefill Performance", + show_label=False, + elem_id="prefill_plot", + ) + gr.Markdown(PREFILL_TAB_DESCRIPTION) + gr.Markdown("#### Data Points") + prefill_table = gr.HTML( + value=empty_prefill_html, + elem_id="prefill_table", + ) + + with gr.Tab("Decode Performance"): + decode_plot = gr.Plot( + label="Decode Performance", + show_label=False, + elem_id="decode_plot", + ) + gr.Markdown(DECODE_TAB_DESCRIPTION) + gr.Markdown("#### Data Points") + decode_table = gr.HTML( + value=empty_decode_html, + elem_id="decode_table", + ) + + with gr.Tab("Cost vs SLA"): + cost_plot = gr.Plot( + label="Cost vs SLA", + show_label=False, + elem_id="cost_plot", + ) + gr.Markdown(COST_TAB_DESCRIPTION) + gr.Markdown("#### Data Points") + cost_table = gr.HTML( + value=empty_cost_html, + elem_id="cost_table", + ) + + return { + "prefill_plot": prefill_plot, + "decode_plot": decode_plot, + "cost_plot": cost_plot, + "prefill_table": prefill_table, + "decode_table": decode_table, + "cost_table": cost_table, + } diff --git a/benchmarks/profiler/webapp/ui/settings.py b/benchmarks/profiler/webapp/ui/settings.py new file mode 100644 index 0000000000..2ea129ee00 --- /dev/null +++ b/benchmarks/profiler/webapp/ui/settings.py @@ -0,0 +1,192 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +UI components for settings panels in the Dynamo SLA Profiler webapp. 
+ +This module provides functions to build the settings UI sections: +- Model and backend configuration +- Hardware configuration (GPUs, cost) +- SLA parameters (ISL, OSL, TTFT, ITL) +""" + +import gradio as gr +from aiconfigurator.sdk.common import SupportedModels + +from benchmarks.profiler.webapp.core.constants import ( + BACKEND_VERSIONS, + DEFAULT_CONFIG_YAML, + GPU_SYSTEMS, + INFERENCE_BACKENDS, + MAX_GPU_OPTIONS, + MIN_GPU_OPTIONS, +) + + +def create_model_settings(): + """ + Create the model and backend settings UI. + + Returns: + Dictionary of Gradio components + """ + with gr.Group(): + with gr.Row(): + supported_models = list(SupportedModels.keys()) + aic_model_name = gr.Dropdown( + label="Model", + choices=supported_models, + value=supported_models[0], + info="Model to profile", + ) + + backend = gr.Dropdown( + label="Backend", + choices=INFERENCE_BACKENDS, + value="trtllm", + info="Inference backend", + ) + + config_yaml = gr.Textbox( + label="Config (YAML)", + placeholder=DEFAULT_CONFIG_YAML, + lines=5, + info="DynamoGraphDeployment YAML configuration", + ) + + use_aic = gr.Checkbox( + label="Use AI Configurator", + value=True, + info="Use AI Configurator to estimate performance", + ) + + with gr.Row(): + aic_backend = gr.Dropdown( + label="AI Configurator Backend", + choices=INFERENCE_BACKENDS, + value="trtllm", + info="Backend for AI Configurator estimation", + visible=True, + ) + + aic_backend_version = gr.Dropdown( + label="AI Configurator Backend Version", + choices=BACKEND_VERSIONS["trtllm"], + value="0.20.0", + info="Backend version for AI Configurator", + allow_custom_value=True, + visible=True, + ) + + return { + "aic_model_name": aic_model_name, + "backend": backend, + "config_yaml": config_yaml, + "use_aic": use_aic, + "aic_backend": aic_backend, + "aic_backend_version": aic_backend_version, + } + + +def create_hardware_settings(): + """ + Create the hardware configuration UI. 
+ + Returns: + Dictionary of Gradio components + """ + with gr.Group(): + with gr.Row(): + aic_system = gr.Dropdown( + label="System", + choices=GPU_SYSTEMS, + value="H200_SXM", + info="Target GPU system", + ) + + gpu_cost_per_hour = gr.Number( + label="Cost per GPU Hour ($)", + value=3.0, + info="Cost per GPU per hour in dollars", + ) + + with gr.Row(): + min_num_gpus_per_engine = gr.Dropdown( + label="Min GPUs per Engine", + choices=MIN_GPU_OPTIONS, + value=1, + info="Minimum number of GPUs (TP size)", + ) + + max_num_gpus_per_engine = gr.Dropdown( + label="Max GPUs per Engine", + choices=MAX_GPU_OPTIONS, + value=4, + info="Maximum number of GPUs (TP size)", + ) + + num_gpus_per_node = gr.Number( + label="GPUs per Node", + value=8, + info="Number of GPUs per node (for MoE models)", + ) + + return { + "aic_system": aic_system, + "gpu_cost_per_hour": gpu_cost_per_hour, + "min_num_gpus_per_engine": min_num_gpus_per_engine, + "max_num_gpus_per_engine": max_num_gpus_per_engine, + "num_gpus_per_node": num_gpus_per_node, + } + + +def create_sla_settings(): + """ + Create the SLA configuration UI. 
+ + Returns: + Dictionary of Gradio components + """ + with gr.Group(): + with gr.Row(): + isl = gr.Number( + label="Input Sequence Length (ISL)", + value=5000, + precision=0, + info="Target input sequence length", + ) + + osl = gr.Number( + label="Output Sequence Length (OSL)", + value=50, + precision=0, + info="Target output sequence length", + ) + + with gr.Row(): + max_context_length = gr.Number( + label="Max Context Length", + value=8192, + precision=0, + info="Maximum context length supported by the model", + ) + + ttft = gr.Number( + label="Target TTFT (ms)", + value=50.0, + info="Target Time To First Token in milliseconds", + ) + + itl = gr.Number( + label="Target ITL (ms)", + value=10.0, + info="Target Inter Token Latency in milliseconds", + ) + + return { + "isl": isl, + "osl": osl, + "max_context_length": max_context_length, + "ttft": ttft, + "itl": itl, + } diff --git a/benchmarks/profiler/webapp/ui/tables.py b/benchmarks/profiler/webapp/ui/tables.py new file mode 100644 index 0000000000..247cf5f7c4 --- /dev/null +++ b/benchmarks/profiler/webapp/ui/tables.py @@ -0,0 +1,187 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Table building and data preparation utilities for the Dynamo SLA Profiler webapp. +""" + +from numbers import Real + +import numpy as np + +from benchmarks.profiler.utils.parato import compute_parato +from benchmarks.profiler.webapp.core.constants import ( + COST_TABLE_HEADERS, + DECODE_TABLE_HEADERS, + PREFILL_TABLE_HEADERS, +) + + +def _format_cell(value): + """Format a cell value for display in HTML table.""" + if isinstance(value, bool): + return "✅" if value else "❌" + if isinstance(value, Real) and not isinstance(value, bool): + if isinstance(value, int): + return f"{value}" + return f"{value:.3f}" + return str(value) + + +def build_table_html(headers, rows): + """ + Build an HTML table from headers and rows. 
+ + Args: + headers: List of header strings + rows: List of row data (each row is a list of values) + + Returns: + HTML string containing the table + """ + header_html = "".join(f"{header}" for header in headers) + + if not rows: + empty_row = ( + f"" + "No data selected yet. Click points on the plot to populate this table." + "" + ) + body_html = empty_row + else: + body_html = "".join( + "" + "".join(f"{_format_cell(cell)}" for cell in row) + "" + for row in rows + ) + + return ( + "
" + "" + f"{header_html}" + f"{body_html}" + "
" + "
" + ) + + +def get_empty_tables(): + """Get empty table HTML for all three table types.""" + return ( + build_table_html(PREFILL_TABLE_HEADERS, []), + build_table_html(DECODE_TABLE_HEADERS, []), + build_table_html(COST_TABLE_HEADERS, []), + ) + + +def prepare_prefill_table_data(prefill_results): + """ + Prepare table data for prefill performance. + + Args: + prefill_results: Tuple of (num_gpus_list, ttft_list, thpt_per_gpu_list) + + Returns: + List of rows for the table + """ + num_gpus_list, ttft_list, thpt_per_gpu_list = prefill_results + return [ + [num_gpus, round(ttft, 3), round(thpt, 3)] + for num_gpus, ttft, thpt in zip(num_gpus_list, ttft_list, thpt_per_gpu_list) + ] + + +def prepare_decode_table_data(decode_results): + """ + Prepare table data for decode performance. + + Args: + decode_results: List of tuples (num_gpus, itl_list, thpt_list) + + Returns: + List of rows for the table + """ + table_data = [] + for num_gpus, itl_list, thpt_list in decode_results: + for itl, thpt in zip(itl_list, thpt_list): + table_data.append([num_gpus, round(itl, 3), round(thpt, 3)]) + return table_data + + +def prepare_cost_table_data( + isl, osl, prefill_results, decode_results, gpu_cost_per_hour +): + """ + Prepare table data for cost analysis. 
+ + Args: + isl: Input sequence length + osl: Output sequence length + prefill_results: Tuple of (num_gpus, ttft, thpt_per_gpu) for prefill + decode_results: List of tuples (num_gpus, itl_list, thpt_per_gpu_list) for decode + gpu_cost_per_hour: Cost per GPU per hour in dollars + + Returns: + List of rows for the table + """ + # Compute Pareto fronts + p_ttft, p_thpt = compute_parato(prefill_results[1], prefill_results[2]) + + _d_itl, _d_thpt = [], [] + for _d_result in decode_results: + _d_itl.extend(_d_result[1]) + _d_thpt.extend(_d_result[2]) + d_itl, d_thpt = compute_parato(_d_itl, _d_thpt) + + # Convert to numpy arrays + p_ttft = np.array(p_ttft) + p_thpt = np.array(p_thpt) + d_itl = np.array(d_itl) + d_thpt = np.array(d_thpt) + + # Calculate cost data + table_data = [] + for _p_ttft, _p_thpt in zip(p_ttft, p_thpt): + prefill_cost = isl * 1000 / _p_thpt * gpu_cost_per_hour / 3600 + tokens_per_user_array = 1000 / d_itl + cost_array = osl * 1000 / d_thpt * gpu_cost_per_hour / 3600 + prefill_cost + + for i in range(len(d_itl)): + table_data.append( + [ + round(float(_p_ttft), 3), + round(float(_p_thpt), 3), + round(float(d_itl[i]), 3), + round(float(d_thpt[i]), 3), + round(float(tokens_per_user_array[i]), 3), + round(float(cost_array[i]), 3), + ] + ) + + return table_data + + +def build_all_tables(prefill_results, decode_results, isl, osl, gpu_cost_per_hour): + """ + Build all three table HTMLs from profiling results. 
+ + Args: + prefill_results: Prefill profiling results + decode_results: Decode profiling results + isl: Input sequence length + osl: Output sequence length + gpu_cost_per_hour: Cost per GPU per hour + + Returns: + Tuple of (prefill_table_html, decode_table_html, cost_table_html) + """ + prefill_data = prepare_prefill_table_data(prefill_results) + decode_data = prepare_decode_table_data(decode_results) + cost_data = prepare_cost_table_data( + isl, osl, prefill_results, decode_results, gpu_cost_per_hour + ) + + return ( + build_table_html(PREFILL_TABLE_HEADERS, prefill_data), + build_table_html(DECODE_TABLE_HEADERS, decode_data), + build_table_html(COST_TABLE_HEADERS, cost_data), + ) diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml index d99b7c611c..f1aeb8fba8 100644 --- a/benchmarks/pyproject.toml +++ b/benchmarks/pyproject.toml @@ -48,6 +48,8 @@ dependencies = [ "types-tabulate", "transformers", "pytest-mypy", + "gradio>=5.49.1", + "plotly>=6.4.0", ] [project.scripts]