@@ -116,6 +116,74 @@ monitor.RecordCost(observability.CostEntry{
116116summary := monitor.GetCostSummary (startTime, endTime)
117117```
118118
119+ ### Real-time GPU Metrics Collection
120+
121+ ``` go
122+ import "github.com/Finoptimize/agentaflow-sro-community/pkg/gpu"
123+ import "github.com/Finoptimize/agentaflow-sro-community/pkg/observability"
124+
125+ // Create GPU metrics collector (collects every 5 seconds)
126+ metricsCollector := gpu.NewMetricsCollector (5 * time.Second )
127+
128+ // Create monitoring service integration
129+ monitoringService := observability.NewMonitoringService (10000 )
130+ integration := observability.NewGPUMetricsIntegration (monitoringService, metricsCollector)
131+
132+ // Start real-time collection
133+ metricsCollector.Start ()
134+
135+ // Register callback for real-time monitoring
136+ metricsCollector.RegisterCallback (func (metrics gpu.GPUMetrics ) {
137+ fmt.Printf (" GPU %s : %.1f%% util, %.1f °C, %d MB used\n " ,
138+ metrics.GPUID , metrics.UtilizationGPU , metrics.Temperature , metrics.MemoryUsed )
139+ })
140+
141+ // Get system overview
142+ overview := metricsCollector.GetSystemOverview ()
143+ fmt.Printf (" Total GPUs: %v , Active: %v , Avg Util: %.1f%% \n " ,
144+ overview[" total_gpus" ], overview[" active_gpus" ], overview[" avg_utilization" ])
145+
146+ // Get efficiency metrics
147+ efficiency := metricsCollector.GetGPUEfficiencyMetrics (" gpu-0" , time.Hour )
148+ fmt.Printf (" GPU efficiency: %.1f%% idle time, %.3f power efficiency\n " ,
149+ efficiency[" idle_time_percent" ], efficiency[" avg_power_efficiency" ])
150+ ```
151+
152+ ### Advanced GPU Analytics
153+
154+ ``` go
155+ // Create metrics aggregation service
156+ aggregationService := gpu.NewMetricsAggregationService (
157+ metricsCollector,
158+ 1 *time.Minute , // Aggregation interval
159+ 24 *time.Hour , // Retention period
160+ )
161+ aggregationService.Start ()
162+
163+ // Get comprehensive GPU statistics
164+ stats , _ := aggregationService.GetGPUStats (" gpu-0" )
165+ fmt.Printf (" Average utilization: %.1f%% , Peak: %.1f%% \n " ,
166+ stats.AverageUtilization , stats.PeakUtilization )
167+
168+ // Get efficiency report
169+ report := aggregationService.GetEfficiencyReport ()
170+ clusterEff := report[" cluster_efficiency" ].(map [string ]interface {})
171+ fmt.Printf (" Cluster idle time: %.1f%% , Efficiency potential: %.1f%% \n " ,
172+ clusterEff[" average_idle_time_percent" ], clusterEff[" utilization_potential" ])
173+
174+ // Analyze performance trends
175+ trends := aggregationService.GetPerformanceTrends (" gpu-0" , 4 *time.Hour )
176+ utilTrend := trends[" utilization_trend" ].(map [string ]float64 )
177+ fmt.Printf (" Utilization trend: slope=%.3f (r²=%.3f )\n " ,
178+ utilTrend[" slope" ], utilTrend[" r_squared" ])
179+
180+ // Get cost analysis
181+ costAnalysis := aggregationService.GetCostAnalysis ()
182+ fmt.Printf (" Estimated cost: $%.2f , Potential savings: $%.2f (%.1f%% )\n " ,
183+ costAnalysis[" total_estimated_cost" ], costAnalysis[" total_potential_savings" ],
184+ costAnalysis[" savings_percentage" ])
185+ ```
186+
119187### Kubernetes GPU Scheduling
120188
121189``` bash
@@ -158,13 +226,13 @@ workload := &k8s.GPUWorkload{
158226 },
159227}
160228scheduler.SubmitGPUWorkload (workload)
161- ```
162-
163- ## 📊 Key Benefits
229+ ```
+
+ ## 📊 Key Benefits
164230
165231| Component | Benefit | Impact |
166232|-----------|---------|--------|
167233| GPU Scheduling | Optimized utilization | Up to 40% reduction in GPU idle time |
234+ | Real-time Metrics | Live GPU monitoring | Real-time utilization, temperature, power tracking |
235+ | GPU Analytics | Performance insights | Efficiency scoring, trend analysis, cost optimization |
168236| Kubernetes Integration | Native K8s scheduling | Seamless integration with existing clusters |
169237| Request Batching | Improved throughput | 3-5x increase in requests/second |
170238| Response Caching | Reduced latency | Up to 50% faster responses |
@@ -225,7 +293,7 @@ Contributions are welcome! This is a community edition focused on providing acce
225293## 🗺️ Roadmap
226294
227295- ✅ Kubernetes integration for GPU scheduling
228- - Real-time GPU metrics collection
296+ - ✅ Real-time GPU metrics collection
229297- Prometheus/Grafana integration
230298- Web dashboard for monitoring
231299- OpenTelemetry support for tracing
0 commit comments