@@ -36,9 +36,11 @@ import (
 	tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion/api/v1"
 	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
 	"github.com/NexusGPU/tensor-fusion/internal/constants"
+	"github.com/NexusGPU/tensor-fusion/internal/metrics"
 	scheduler "github.com/NexusGPU/tensor-fusion/internal/scheduler"
 	"github.com/NexusGPU/tensor-fusion/internal/utils"
 	"github.com/NexusGPU/tensor-fusion/internal/worker"
+	"github.com/prometheus/client_golang/prometheus"
 )
 
 // TensorFusionWorkloadReconciler reconciles a TensorFusionWorkload object
@@ -141,7 +143,7 @@ func (r *TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl
 
 		// Calculate how many pods need to be removed
 		podsToRemove := int(currentReplicas - desiredReplicas)
-		if err := r.scaleDownWorkers(ctx, podList.Items[:podsToRemove]); err != nil {
+		if err := r.scaleDownWorkers(ctx, workload, podList.Items[:podsToRemove]); err != nil {
 			return ctrl.Result{}, err
 		}
 	}
@@ -185,7 +187,7 @@ func (r *TensorFusionWorkloadReconciler) tryStartWorker(
 }
 
 // scaleDownWorkers handles the scaling down of worker pods
-func (r *TensorFusionWorkloadReconciler) scaleDownWorkers(ctx context.Context, pods []corev1.Pod) error {
+func (r *TensorFusionWorkloadReconciler) scaleDownWorkers(ctx context.Context, workload *tfv1.TensorFusionWorkload, pods []corev1.Pod) error {
 	log := log.FromContext(ctx)
 
 	for i := range pods {
@@ -196,6 +198,16 @@ func (r *TensorFusionWorkloadReconciler) scaleDownWorkers(ctx context.Context, p
 		if err := r.deletePod(ctx, podToDelete); err != nil {
 			return err
 		}
+
+		labels := prometheus.Labels{
+			"worker":    podToDelete.Name,
+			"namespace": podToDelete.Namespace,
+			"pool":      workload.Spec.PoolName,
+		}
+		metrics.GpuTflopsRequest.Delete(labels)
+		metrics.GpuTflopsLimit.Delete(labels)
+		metrics.VramBytesRequest.Delete(labels)
+		metrics.VramBytesLimit.Delete(labels)
 	}
 	return nil
 }
@@ -271,7 +283,7 @@ func (r *TensorFusionWorkloadReconciler) scaleUpWorkers(ctx context.Context, wor
 			return fmt.Errorf("schedule GPU: %w", err)
 		}
 
-		_, err = r.tryStartWorker(ctx, workerGenerator, gpu, workload)
+		pod, err := r.tryStartWorker(ctx, workerGenerator, gpu, workload)
 		if err != nil {
 			// Try to release the GPU resource if pod creation fails
 			releaseErr := r.Scheduler.Release(ctx, workload.Spec.Resources.Requests, gpu)
@@ -280,6 +292,16 @@ func (r *TensorFusionWorkloadReconciler) scaleUpWorkers(ctx context.Context, wor
 			}
 			return fmt.Errorf("create worker pod: %w", err)
 		}
+
+		labels := prometheus.Labels{
+			"worker":    pod.Name,
+			"namespace": pod.Namespace,
+			"pool":      workload.Spec.PoolName,
+		}
+		metrics.GpuTflopsRequest.With(labels).Set(workload.Spec.Resources.Requests.Tflops.AsApproximateFloat64())
+		metrics.GpuTflopsLimit.With(labels).Set(workload.Spec.Resources.Limits.Tflops.AsApproximateFloat64())
+		metrics.VramBytesRequest.With(labels).Set(workload.Spec.Resources.Requests.Vram.AsApproximateFloat64())
+		metrics.VramBytesLimit.With(labels).Set(workload.Spec.Resources.Limits.Vram.AsApproximateFloat64())
 	}
 
 	return nil
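The four collectors touched by this change are used like prometheus GaugeVec values keyed by the same worker/namespace/pool labels in both hunks: scale-up sets one gauge series per new worker pod, and scale-down deletes that pod's series so a removed worker stops being reported. Below is a minimal sketch of how the internal/metrics package could declare and register such gauges; the metric names, help strings, and registration via controller-runtime's global registry are assumptions for illustration, not this repository's actual definitions.

// Package metrics — a hypothetical sketch; names and registration are assumed.
package metrics

import (
	"github.com/prometheus/client_golang/prometheus"
	ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)

// workerLabels must match the prometheus.Labels maps built in the controller:
// With panics on a mismatched label set, and Delete would silently miss.
var workerLabels = []string{"worker", "namespace", "pool"}

var (
	GpuTflopsRequest = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: "gpu_tflops_request", Help: "Requested GPU compute per worker, in TFLOPS.",
	}, workerLabels)
	GpuTflopsLimit = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: "gpu_tflops_limit", Help: "GPU compute limit per worker, in TFLOPS.",
	}, workerLabels)
	VramBytesRequest = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: "vram_bytes_request", Help: "Requested GPU memory per worker, in bytes.",
	}, workerLabels)
	VramBytesLimit = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: "vram_bytes_limit", Help: "GPU memory limit per worker, in bytes.",
	}, workerLabels)
)

func init() {
	// Register with controller-runtime's registry so the gauges are exposed
	// on the manager's /metrics endpoint alongside the built-in metrics.
	ctrlmetrics.Registry.MustRegister(
		GpuTflopsRequest, GpuTflopsLimit, VramBytesRequest, VramBytesLimit,
	)
}

The Delete calls on scale-down are the important half of the symmetry: GaugeVec series are not garbage-collected, so without them a removed worker's last values would keep being scraped indefinitely.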