Skip to content

Commit 6c484fc

Browse files
committed
[llm-d] visualizations: llmd_inference: add basic Prom metrics plotting
1 parent c240cc1 commit 6c484fc

File tree

6 files changed

+618
-1
lines changed

6 files changed

+618
-1
lines changed

projects/llm-d/visualizations/llmd_inference/data/plots.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ visualize:
77
- "report: Prometheus Resource Usage"
88
- "report: Prometheus GPU Performance"
99
- "report: Prometheus System Health"
10+
- "report: VLLM Metrics Analysis"

projects/llm-d/visualizations/llmd_inference/plotting/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,11 @@ def register():
77
from . import error_report
88
from . import throughput_analysis
99
from . import prometheus
10+
from . import vllm_metrics
1011

1112
report.register()
1213
error_report.register()
1314
throughput_analysis.register()
1415
prometheus.register()
16+
vllm_metrics.register()
17+
vllm_metrics.register_report()
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
import projects.matrix_benchmarking.visualizations.helpers.store.prom as helpers_store_prom
2+
import projects
3+
# "llm-d" contains a hyphen, so it is not a valid Python identifier and cannot
# be written as `projects.llm-d`; getattr() fetches it by its string name.
# NOTE(review): presumably the "llm-d" subpackage is already loaded as an
# attribute of `projects` when this module is imported — confirm.
llmd = getattr(projects, "llm-d")
4+
5+
def register():
    """Trigger registration of the LLM-D main Prometheus metrics.

    Looks up ``get_llmd_main_metrics`` in the llmd_inference store parsers
    and calls it with ``register=True``.
    """
    parsers = llmd.visualizations.llmd_inference.store.parsers
    parsers.get_llmd_main_metrics(register=True)
Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
from dash import html
2+
3+
import projects.matrix_benchmarking.visualizations.helpers.plotting.report as report
4+
5+
import matrix_benchmarking.plotting.table_stats as table_stats
6+
import matrix_benchmarking.common as common
7+
8+
9+
def register():
    """Instantiate every Prometheus report class.

    Each constructor registers its own instance, so the created objects are
    not kept here.
    """
    for report_class in (
        PrometheusResourceUsageReport,
        PrometheusGPUPerformanceReport,
        PrometheusSystemHealthReport,
    ):
        report_class()
14+
15+
16+
class PrometheusResourceUsageReport:
    """Report page grouping the CPU and memory usage Prometheus plots.

    Instantiating the class registers the instance with
    ``table_stats.TableStats`` and adds its name to
    ``common.Matrix.settings["stats"]``.
    """

    def __init__(self):
        self.name = "report: Prometheus Resource Usage"
        # Lower-case the name, then map both spaces and dashes to underscores.
        self.id_name = self.name.lower().translate(str.maketrans(" -", "__"))
        self.is_report = True
        self.no_graph = True

        table_stats.TableStats._register_stat(self)
        common.Matrix.settings["stats"].add(self.name)

    def do_plot(self, *args):
        """Build the resource usage report.

        ``args`` must hold exactly five items (ordered_vars, settings,
        setting_lists, variables, cfg); the whole tuple is forwarded as-is to
        every ``report.Plot_and_Text`` call.

        Returns a ``(None, elements)`` tuple, where ``elements`` is the list
        of page elements: Dash HTML components plus whatever
        ``report.Plot_and_Text`` contributes for each plot.
        """
        _ordered_vars, settings, setting_lists, _variables, _cfg = args
        records = list(common.Matrix.all_records(settings, setting_lists))

        page = [
            html.H2("📊 Resource Usage Analytics"),
            html.P("Comprehensive CPU and memory usage analysis for LLM-D inference workloads"),
            html.Br(),
        ]

        if not records:
            page.append(html.P("No test entries found."))
            return None, page

        # (section title, section description, plot names), in display order.
        sections = (
            (
                "🚀 LLM-D Application Resources",
                "CPU and memory usage for LLM-D inference services",
                (
                    "Prom: LLM Inference Service: CPU usage",
                    "Prom: LLM Inference Service: Mem usage",
                    "Prom: LLM Inference Gateway: CPU usage",
                    "Prom: LLM Inference Gateway: Mem usage",
                ),
            ),
            (
                "🏗️ Cluster Resource Overview",
                "Overall cluster CPU and memory utilization",
                (
                    "Prom: sutest cluster CPU usage",
                    "Prom: sutest cluster memory usage",
                ),
            ),
            (
                "🖥️ Node-level Resource Usage",
                "CPU usage and idle time breakdown by node type",
                (
                    "Prom: Sutest Control Plane Node CPU usage",
                    "Prom: Sutest Worker Node CPU usage",
                    "Prom: Sutest Control Plane Node CPU idle",
                    "Prom: Sutest Worker Node CPU idle",
                ),
            ),
        )

        last_position = len(sections) - 1
        for position, (title, description, plot_names) in enumerate(sections):
            page.append(html.H3(title))
            page.append(html.P(description))

            for plot_name in plot_names:
                page.extend(report.Plot_and_Text(plot_name, args))

            # A line break separates sections; none follows the final one.
            if position != last_position:
                page.append(html.Br())

        return None, page
72+
73+
74+
class PrometheusGPUPerformanceReport:
    """Report page grouping the GPU-related Prometheus plots.

    Instantiating the class registers the instance with
    ``table_stats.TableStats`` and adds its name to
    ``common.Matrix.settings["stats"]``.
    """

    def __init__(self):
        self.name = "report: Prometheus GPU Performance"
        # Lower-case the name, then map both spaces and dashes to underscores.
        self.id_name = self.name.lower().translate(str.maketrans(" -", "__"))
        self.is_report = True
        self.no_graph = True

        table_stats.TableStats._register_stat(self)
        common.Matrix.settings["stats"].add(self.name)

    def do_plot(self, *args):
        """Build the GPU performance report.

        ``args`` must hold exactly five items (ordered_vars, settings,
        setting_lists, variables, cfg); the whole tuple is forwarded as-is to
        every ``report.Plot_and_Text`` call.

        Returns a ``(None, elements)`` tuple, where ``elements`` is the list
        of page elements: Dash HTML components plus whatever
        ``report.Plot_and_Text`` contributes for each plot.
        """
        _ordered_vars, settings, setting_lists, _variables, _cfg = args
        records = list(common.Matrix.all_records(settings, setting_lists))

        page = [
            html.H2("🎮 GPU Performance Analytics"),
            html.P("Comprehensive GPU utilization, memory, and throughput analysis using DCGM metrics"),
            html.Br(),
        ]

        if not records:
            page.append(html.P("No test entries found."))
            return None, page

        # (section title, section description, plot names), in display order.
        sections = (
            (
                "💾 GPU Memory Utilization",
                "GPU memory consumption and allocation patterns",
                (
                    "Prom: Sutest GPU memory used",
                    "Prom: Sutest GPU memory used (all GPUs)",
                    "Prom: Sutest GPU memory unallocated",
                    "Prom: Sutest GPU memory transfer utilization",
                ),
            ),
            (
                "⚡ GPU Compute Performance",
                "GPU compute utilization and active processing units",
                (
                    "Prom: Sutest GPU compute utilization (not 100% accurate)",
                    "Prom: Sutest GPU engine usage (not 100% accurate)",
                    "Prom: Sutest GPU active computes",
                    "Prom: Sutest GPU computes occupancy",
                ),
            ),
            (
                "🔧 GPU Pipeline Utilization",
                "GPU floating-point pipeline usage by precision",
                (
                    "Prom: Sutest GPU active fp16 pipe",
                    "Prom: Sutest GPU active fp32 pipe",
                    "Prom: Sutest GPU active fp64 pipe",
                ),
            ),
            (
                "🔗 GPU Interconnect Performance",
                "NVLink and PCIe transfer rates and throughput",
                (
                    "Prom: Sutest GPU NVLink transfer (rx)",
                    "Prom: Sutest GPU NVLink transfer (tx)",
                    "Prom: Sutest GPU PCIe transfer (rx)",
                    "Prom: Sutest GPU PCIe transfer (tx)",
                ),
            ),
        )

        last_position = len(sections) - 1
        for position, (title, description, plot_names) in enumerate(sections):
            page.append(html.H3(title))
            page.append(html.P(description))

            for plot_name in plot_names:
                page.extend(report.Plot_and_Text(plot_name, args))

            # A line break separates sections; none follows the final one.
            if position != last_position:
                page.append(html.Br())

        return None, page
142+
143+
144+
class PrometheusSystemHealthReport:
    """Report page grouping the API server and ETCD Prometheus plots.

    Instantiating the class registers the instance with
    ``table_stats.TableStats`` and adds its name to
    ``common.Matrix.settings["stats"]``.
    """

    def __init__(self):
        self.name = "report: Prometheus System Health"
        # Lower-case the name, then map both spaces and dashes to underscores.
        self.id_name = self.name.lower().translate(str.maketrans(" -", "__"))
        self.is_report = True
        self.no_graph = True

        table_stats.TableStats._register_stat(self)
        common.Matrix.settings["stats"].add(self.name)

    def do_plot(self, *args):
        """Build the system health report.

        ``args`` must hold exactly five items (ordered_vars, settings,
        setting_lists, variables, cfg); the whole tuple is forwarded as-is to
        every ``report.Plot_and_Text`` call.

        Returns a ``(None, elements)`` tuple, where ``elements`` is the list
        of page elements: Dash HTML components plus whatever
        ``report.Plot_and_Text`` contributes for each plot.
        """
        _ordered_vars, settings, setting_lists, _variables, _cfg = args
        records = list(common.Matrix.all_records(settings, setting_lists))

        page = [
            html.H2("🏥 System Health Analytics"),
            html.P("Kubernetes cluster health monitoring including API server and ETCD performance"),
            html.Br(),
        ]

        if not records:
            page.append(html.P("No test entries found."))
            return None, page

        # (enabled, section title, section description, plot names), in
        # display order. Sections flagged False are never rendered.
        sections = (
            (
                True,
                "🔌 API Server Performance",
                "Kubernetes API server resource usage and request handling",
                (
                    "Prom: Sutest ApiServer: CPU usage",
                    "Prom: Sutest ApiServer: Mem usage",
                ),
            ),
            (
                True,
                "📈 API Server Request Analysis",
                "API request patterns, success rates, and error analysis",
                (
                    "Prom: Sutest API Server Requests (successes)",
                    "Prom: Sutest API Server Requests (client errors)",
                    "Prom: Sutest API Server Requests (server errors)",
                ),
            ),
            (
                False,  # request latency section is currently switched off
                "⏱️ API Server Request Latency",
                "API request duration analysis by operation type",
                (
                    "Prom: Sutest GET Requests duration",
                    "Prom: Sutest PUT Requests duration",
                    "Prom: Sutest LIST Requests duration",
                    "Prom: Sutest PATCH Requests duration",
                ),
            ),
            (
                True,
                "🗄️ ETCD Performance",
                "ETCD cluster health and resource utilization",
                (
                    "Prom: Sutest ETCD: CPU usage",
                    "Prom: Sutest ETCD: Mem usage",
                ),
            ),
        )
        visible_sections = [section[1:] for section in sections if section[0]]

        last_position = len(visible_sections) - 1
        for position, (title, description, plot_names) in enumerate(visible_sections):
            page.append(html.H3(title))
            page.append(html.P(description))

            for plot_name in plot_names:
                page.extend(report.Plot_and_Text(plot_name, args))

            # A line break separates sections; none follows the final one.
            if position != last_position:
                page.append(html.Br())

        return None, page

0 commit comments

Comments
 (0)