Commit 167f671

Merge pull request #13 from Finoptimize/gpu-metrics
GPU metrics
2 parents: 03cc25d + 1d19726

File tree: 8 files changed (+3343, -4 lines changed)

README.md

Lines changed: 72 additions & 4 deletions
@@ -116,6 +116,74 @@ monitor.RecordCost(observability.CostEntry{
 summary := monitor.GetCostSummary(startTime, endTime)
 ```
 
+### Real-time GPU Metrics Collection
+
+```go
+import "github.com/Finoptimize/agentaflow-sro-community/pkg/gpu"
+import "github.com/Finoptimize/agentaflow-sro-community/pkg/observability"
+
+// Create GPU metrics collector (collects every 5 seconds)
+metricsCollector := gpu.NewMetricsCollector(5 * time.Second)
+
+// Create monitoring service integration
+monitoringService := observability.NewMonitoringService(10000)
+integration := observability.NewGPUMetricsIntegration(monitoringService, metricsCollector)
+
+// Start real-time collection
+metricsCollector.Start()
+
+// Register callback for real-time monitoring
+metricsCollector.RegisterCallback(func(metrics gpu.GPUMetrics) {
+    fmt.Printf("GPU %s: %.1f%% util, %.1f°C, %dMB used\n",
+        metrics.GPUID, metrics.UtilizationGPU, metrics.Temperature, metrics.MemoryUsed)
+})
+
+// Get system overview
+overview := metricsCollector.GetSystemOverview()
+fmt.Printf("Total GPUs: %v, Active: %v, Avg Util: %.1f%%\n",
+    overview["total_gpus"], overview["active_gpus"], overview["avg_utilization"])
+
+// Get efficiency metrics
+efficiency := metricsCollector.GetGPUEfficiencyMetrics("gpu-0", time.Hour)
+fmt.Printf("GPU efficiency: %.1f%% idle time, %.3f power efficiency\n",
+    efficiency["idle_time_percent"], efficiency["avg_power_efficiency"])
+```
+
+### Advanced GPU Analytics
+
+```go
+// Create metrics aggregation service
+aggregationService := gpu.NewMetricsAggregationService(
+    metricsCollector,
+    1*time.Minute, // Aggregation interval
+    24*time.Hour,  // Retention period
+)
+aggregationService.Start()
+
+// Get comprehensive GPU statistics
+stats, _ := aggregationService.GetGPUStats("gpu-0")
+fmt.Printf("Average utilization: %.1f%%, Peak: %.1f%%\n",
+    stats.AverageUtilization, stats.PeakUtilization)
+
+// Get efficiency report
+report := aggregationService.GetEfficiencyReport()
+clusterEff := report["cluster_efficiency"].(map[string]interface{})
+fmt.Printf("Cluster idle time: %.1f%%, Efficiency potential: %.1f%%\n",
+    clusterEff["average_idle_time_percent"], clusterEff["utilization_potential"])
+
+// Analyze performance trends
+trends := aggregationService.GetPerformanceTrends("gpu-0", 4*time.Hour)
+utilTrend := trends["utilization_trend"].(map[string]float64)
+fmt.Printf("Utilization trend: slope=%.3f (r²=%.3f)\n",
+    utilTrend["slope"], utilTrend["r_squared"])
+
+// Get cost analysis
+costAnalysis := aggregationService.GetCostAnalysis()
+fmt.Printf("Estimated cost: $%.2f, Potential savings: $%.2f (%.1f%%)\n",
+    costAnalysis["total_estimated_cost"], costAnalysis["total_potential_savings"],
+    costAnalysis["savings_percentage"])
+```
+
 ### Kubernetes GPU Scheduling
 
 ```bash
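
Taken together, the new README snippets in this hunk can be combined into one runnable program. The sketch below is an illustration rather than part of the commit: it uses only the constructors and methods that appear in the diff (`NewMetricsCollector`, `NewMonitoringService`, `NewGPUMetricsIntegration`, `RegisterCallback`, `Start`, `NewMetricsAggregationService`, `GetSystemOverview`), adds the `fmt`, `time`, and `os/signal` imports the fragments omit, and assumes nothing beyond that about the package APIs.

```go
package main

import (
	"fmt"
	"os"
	"os/signal"
	"time"

	"github.com/Finoptimize/agentaflow-sro-community/pkg/gpu"
	"github.com/Finoptimize/agentaflow-sro-community/pkg/observability"
)

func main() {
	// Poll GPU metrics every 5 seconds, as in the README snippet above.
	metricsCollector := gpu.NewMetricsCollector(5 * time.Second)

	// Wire the collector into the monitoring service; how the integration
	// forwards samples is not visible in this diff, so it is only created here.
	monitoringService := observability.NewMonitoringService(10000)
	_ = observability.NewGPUMetricsIntegration(monitoringService, metricsCollector)

	// Print one line per sample; field names follow the callback in the diff.
	metricsCollector.RegisterCallback(func(m gpu.GPUMetrics) {
		fmt.Printf("GPU %s: %.1f%% util, %.1f°C, %dMB used\n",
			m.GPUID, m.UtilizationGPU, m.Temperature, m.MemoryUsed)
	})
	metricsCollector.Start()

	// Aggregate once a minute and keep 24h of history (values from the diff).
	aggregationService := gpu.NewMetricsAggregationService(
		metricsCollector, 1*time.Minute, 24*time.Hour)
	aggregationService.Start()

	// The diff documents Start() but no Stop/Close, so run until Ctrl-C.
	interrupt := make(chan os.Signal, 1)
	signal.Notify(interrupt, os.Interrupt)
	<-interrupt

	// Final snapshot before exiting, using keys shown in the diff.
	overview := metricsCollector.GetSystemOverview()
	fmt.Printf("Total GPUs: %v, Active: %v, Avg Util: %.1f%%\n",
		overview["total_gpus"], overview["active_gpus"], overview["avg_utilization"])
}
```

The shutdown path is the main gap: only `Start()` appears in the diff, so the sketch exits on interrupt rather than stopping the services explicitly.
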
@@ -158,13 +226,13 @@ workload := &k8s.GPUWorkload{
 },
 }
 scheduler.SubmitGPUWorkload(workload)
-```
-
-## 📊 Key Benefits
+```## 📊 Key Benefits
 
 | Component | Benefit | Impact |
 |-----------|---------|--------|
 | GPU Scheduling | Optimized utilization | Up to 40% reduction in GPU idle time |
+| Real-time Metrics | Live GPU monitoring | Real-time utilization, temperature, power tracking |
+| GPU Analytics | Performance insights | Efficiency scoring, trend analysis, cost optimization |
 | Kubernetes Integration | Native K8s scheduling | Seamless integration with existing clusters |
 | Request Batching | Improved throughput | 3-5x increase in requests/second |
 | Response Caching | Reduced latency | Up to 50% faster responses |
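
The performance-trend output in the first hunk reports a slope and an r² for GPU utilization over time, which is the vocabulary of an ordinary least-squares fit. The standalone sketch below shows the standard formulas behind those two numbers so the output is easier to read; it uses made-up sample data and is not the package's implementation.

```go
package main

import (
	"fmt"
	"math"
)

// linearTrend returns the least-squares slope and r² of y against x.
// Textbook formulas only, shown to explain what "slope" and "r_squared"
// mean in the trend output above; not the package's code.
func linearTrend(x, y []float64) (slope, rSquared float64) {
	n := float64(len(x))
	var sumX, sumY, sumXY, sumXX, sumYY float64
	for i := range x {
		sumX += x[i]
		sumY += y[i]
		sumXY += x[i] * y[i]
		sumXX += x[i] * x[i]
		sumYY += y[i] * y[i]
	}
	slope = (n*sumXY - sumX*sumY) / (n*sumXX - sumX*sumX)
	r := (n*sumXY - sumX*sumY) /
		math.Sqrt((n*sumXX-sumX*sumX)*(n*sumYY-sumY*sumY))
	return slope, r * r
}

func main() {
	// Made-up utilization samples (percent) taken one hour apart.
	hours := []float64{0, 1, 2, 3}
	util := []float64{40, 45, 52, 58}
	slope, r2 := linearTrend(hours, util)
	fmt.Printf("Utilization trend: slope=%.3f (r²=%.3f)\n", slope, r2)
}
```
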
@@ -225,7 +293,7 @@ Contributions are welcome! This is a community edition focused on providing acce
 ## 🗺️ Roadmap
 
 - ✅ Kubernetes integration for GPU scheduling
-- Real-time GPU metrics collection
+- Real-time GPU metrics collection
 - Prometheus/Grafana integration
 - Web dashboard for monitoring
 - OpenTelemetry support for tracing
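
A note on the analytics snippets in the first hunk: bare type assertions such as `report["cluster_efficiency"].(map[string]interface{})` panic if the key is missing or holds a different type. A defensive variant using only Go's comma-ok form is sketched below; the report shape and the float64 value type are inferred from the diff, not confirmed by the package.

```go
package main

import "fmt"

// printClusterEfficiency reads the "cluster_efficiency" section of the map
// returned by GetEfficiencyReport. The map shape and float64 value type are
// assumptions inferred from the README snippet; comma-ok assertions keep a
// missing key or unexpected type from panicking.
func printClusterEfficiency(report map[string]interface{}) {
	clusterEff, ok := report["cluster_efficiency"].(map[string]interface{})
	if !ok {
		fmt.Println("cluster_efficiency missing or of unexpected type")
		return
	}
	idle, _ := clusterEff["average_idle_time_percent"].(float64)
	potential, _ := clusterEff["utilization_potential"].(float64)
	fmt.Printf("Cluster idle time: %.1f%%, Efficiency potential: %.1f%%\n", idle, potential)
}

func main() {
	// Made-up sample values, only so the helper can be run without a GPU
	// or the aggregation service.
	printClusterEfficiency(map[string]interface{}{
		"cluster_efficiency": map[string]interface{}{
			"average_idle_time_percent": 37.5,
			"utilization_potential":     62.5,
		},
	})
}
```

The same comma-ok pattern applies to the other map-valued results shown in the diff, such as the trend and cost-analysis maps.
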
