|
| 1 | +from dash import html |
| 2 | + |
| 3 | +import projects.matrix_benchmarking.visualizations.helpers.plotting.report as report |
| 4 | + |
| 5 | +import matrix_benchmarking.plotting.table_stats as table_stats |
| 6 | +import matrix_benchmarking.common as common |
| 7 | + |
| 8 | + |
def register():
    """Register all Prometheus report classes.

    One instance of each report class is created and then discarded: the
    constructors perform the registration themselves.
    """
    report_classes = (
        PrometheusResourceUsageReport,
        PrometheusGPUPerformanceReport,
        PrometheusSystemHealthReport,
    )
    for report_cls in report_classes:
        report_cls()
| 14 | + |
| 15 | + |
class PrometheusResourceUsageReport():
    """
    Report grouping the Prometheus CPU and memory usage plots.

    Creating an instance registers it through
    `table_stats.TableStats._register_stat` and adds its name to
    `common.Matrix.settings["stats"]`.
    """

    def __init__(self):
        self.name = "report: Prometheus Resource Usage"
        # Lower-cased name with spaces and dashes turned into underscores.
        self.id_name = self.name.lower().translate(str.maketrans(" -", "__"))
        self.no_graph = True
        self.is_report = True

        table_stats.TableStats._register_stat(self)
        common.Matrix.settings["stats"].add(self.name)

    def do_plot(self, *args):
        """
        Generate comprehensive resource usage report from Prometheus metrics

        `args` must unpack into exactly five values:
        (ordered_vars, settings, setting_lists, variables, cfg).
        The untouched `args` tuple is forwarded to `report.Plot_and_Text`.

        Returns a `(None, content)` pair where `content` is the list of
        report elements, in display order.
        """
        # Only two of the five values are used here; the full unpacking is
        # kept so that a wrong number of arguments still fails immediately.
        _ordered_vars, settings, setting_lists, _variables, _cfg = args
        records = list(common.Matrix.all_records(settings, setting_lists))

        content = [
            html.H2("📊 Resource Usage Analytics"),
            html.P("Comprehensive CPU and memory usage analysis for LLM-D inference workloads"),
            html.Br(),
        ]

        if not records:
            content.append(html.P("No test entries found."))
            return None, content

        # (section title, section description, plots embedded in the section)
        sections = (
            (
                "🚀 LLM-D Application Resources",
                "CPU and memory usage for LLM-D inference services",
                (
                    "Prom: LLM Inference Service: CPU usage",
                    "Prom: LLM Inference Service: Mem usage",
                    "Prom: LLM Inference Gateway: CPU usage",
                    "Prom: LLM Inference Gateway: Mem usage",
                ),
            ),
            (
                "🏗️ Cluster Resource Overview",
                "Overall cluster CPU and memory utilization",
                (
                    "Prom: sutest cluster CPU usage",
                    "Prom: sutest cluster memory usage",
                ),
            ),
            (
                "🖥️ Node-level Resource Usage",
                "CPU usage and idle time breakdown by node type",
                (
                    "Prom: Sutest Control Plane Node CPU usage",
                    "Prom: Sutest Worker Node CPU usage",
                    "Prom: Sutest Control Plane Node CPU idle",
                    "Prom: Sutest Worker Node CPU idle",
                ),
            ),
        )

        for position, (title, description, plot_names) in enumerate(sections):
            if position > 0:
                # A line break separates consecutive sections; none follows
                # the last one.
                content.append(html.Br())

            content.append(html.H3(title))
            content.append(html.P(description))
            for plot_name in plot_names:
                content.extend(report.Plot_and_Text(plot_name, args))

        return None, content
| 72 | + |
| 73 | + |
class PrometheusGPUPerformanceReport():
    """
    Report grouping the Prometheus GPU plots (memory, compute, pipelines
    and interconnect).

    Creating an instance registers it through
    `table_stats.TableStats._register_stat` and adds its name to
    `common.Matrix.settings["stats"]`.
    """

    def __init__(self):
        self.name = "report: Prometheus GPU Performance"
        # Lower-cased name with spaces and dashes turned into underscores.
        self.id_name = self.name.lower().translate(str.maketrans(" -", "__"))
        self.no_graph = True
        self.is_report = True

        table_stats.TableStats._register_stat(self)
        common.Matrix.settings["stats"].add(self.name)

    def do_plot(self, *args):
        """
        Generate comprehensive GPU performance report from Prometheus DCGM metrics

        `args` must unpack into exactly five values:
        (ordered_vars, settings, setting_lists, variables, cfg).
        The untouched `args` tuple is forwarded to `report.Plot_and_Text`.

        Returns a `(None, content)` pair where `content` is the list of
        report elements, in display order.
        """
        # Only two of the five values are used here; the full unpacking is
        # kept so that a wrong number of arguments still fails immediately.
        _ordered_vars, settings, setting_lists, _variables, _cfg = args
        records = list(common.Matrix.all_records(settings, setting_lists))

        content = [
            html.H2("🎮 GPU Performance Analytics"),
            html.P("Comprehensive GPU utilization, memory, and throughput analysis using DCGM metrics"),
            html.Br(),
        ]

        if not records:
            content.append(html.P("No test entries found."))
            return None, content

        # (section title, section description, plots embedded in the section)
        sections = (
            (
                "💾 GPU Memory Utilization",
                "GPU memory consumption and allocation patterns",
                (
                    "Prom: Sutest GPU memory used",
                    "Prom: Sutest GPU memory used (all GPUs)",
                    "Prom: Sutest GPU memory unallocated",
                    "Prom: Sutest GPU memory transfer utilization",
                ),
            ),
            (
                "⚡ GPU Compute Performance",
                "GPU compute utilization and active processing units",
                (
                    "Prom: Sutest GPU compute utilization (not 100% accurate)",
                    "Prom: Sutest GPU engine usage (not 100% accurate)",
                    "Prom: Sutest GPU active computes",
                    "Prom: Sutest GPU computes occupancy",
                ),
            ),
            (
                "🔧 GPU Pipeline Utilization",
                "GPU floating-point pipeline usage by precision",
                (
                    "Prom: Sutest GPU active fp16 pipe",
                    "Prom: Sutest GPU active fp32 pipe",
                    "Prom: Sutest GPU active fp64 pipe",
                ),
            ),
            (
                "🔗 GPU Interconnect Performance",
                "NVLink and PCIe transfer rates and throughput",
                (
                    "Prom: Sutest GPU NVLink transfer (rx)",
                    "Prom: Sutest GPU NVLink transfer (tx)",
                    "Prom: Sutest GPU PCIe transfer (rx)",
                    "Prom: Sutest GPU PCIe transfer (tx)",
                ),
            ),
        )

        for position, (title, description, plot_names) in enumerate(sections):
            if position > 0:
                # A line break separates consecutive sections; none follows
                # the last one.
                content.append(html.Br())

            content.append(html.H3(title))
            content.append(html.P(description))
            for plot_name in plot_names:
                content.extend(report.Plot_and_Text(plot_name, args))

        return None, content
| 142 | + |
| 143 | + |
class PrometheusSystemHealthReport():
    """
    Report grouping the Prometheus API server and ETCD plots.

    Creating an instance registers it through
    `table_stats.TableStats._register_stat` and adds its name to
    `common.Matrix.settings["stats"]`.
    """

    def __init__(self):
        self.name = "report: Prometheus System Health"
        # Lower-cased name with spaces and dashes turned into underscores.
        self.id_name = self.name.lower().translate(str.maketrans(" -", "__"))
        self.no_graph = True
        self.is_report = True

        table_stats.TableStats._register_stat(self)
        common.Matrix.settings["stats"].add(self.name)

    def do_plot(self, *args):
        """
        Generate comprehensive system health report from Prometheus cluster metrics

        `args` must unpack into exactly five values:
        (ordered_vars, settings, setting_lists, variables, cfg).
        The untouched `args` tuple is forwarded to `report.Plot_and_Text`.

        Returns a `(None, content)` pair where `content` is the list of
        report elements, in display order.
        """
        # Only two of the five values are used here; the full unpacking is
        # kept so that a wrong number of arguments still fails immediately.
        _ordered_vars, settings, setting_lists, _variables, _cfg = args
        records = list(common.Matrix.all_records(settings, setting_lists))

        content = [
            html.H2("🏥 System Health Analytics"),
            html.P("Kubernetes cluster health monitoring including API server and ETCD performance"),
            html.Br(),
        ]

        if not records:
            content.append(html.P("No test entries found."))
            return None, content

        # The request latency section is switched off, as it was behind an
        # `if False:` guard.
        # NOTE(review): the extent of the disabled block (the latency section
        # together with its trailing line break) was inferred -- confirm.
        show_request_latency = False

        # (enabled, section title, section description, plots in the section)
        sections = (
            (
                True,
                "🔌 API Server Performance",
                "Kubernetes API server resource usage and request handling",
                (
                    "Prom: Sutest ApiServer: CPU usage",
                    "Prom: Sutest ApiServer: Mem usage",
                ),
            ),
            (
                True,
                "📈 API Server Request Analysis",
                "API request patterns, success rates, and error analysis",
                (
                    "Prom: Sutest API Server Requests (successes)",
                    "Prom: Sutest API Server Requests (client errors)",
                    "Prom: Sutest API Server Requests (server errors)",
                ),
            ),
            (
                show_request_latency,
                "⏱️ API Server Request Latency",
                "API request duration analysis by operation type",
                (
                    "Prom: Sutest GET Requests duration",
                    "Prom: Sutest PUT Requests duration",
                    "Prom: Sutest LIST Requests duration",
                    "Prom: Sutest PATCH Requests duration",
                ),
            ),
            (
                True,
                "🗄️ ETCD Performance",
                "ETCD cluster health and resource utilization",
                (
                    "Prom: Sutest ETCD: CPU usage",
                    "Prom: Sutest ETCD: Mem usage",
                ),
            ),
        )

        first_section = True
        for enabled, title, description, plot_names in sections:
            if not enabled:
                continue

            if not first_section:
                # A line break separates consecutive sections; none follows
                # the last one.
                content.append(html.Br())
            first_section = False

            content.append(html.H3(title))
            content.append(html.P(description))
            for plot_name in plot_names:
                content.extend(report.Plot_and_Text(plot_name, args))

        return None, content
0 commit comments