Skip to content

Commit a0c9341

Browse files
committed
[Feature] Add cleanup for terminated RayJob/RayCluster metrics
1 parent d1e750d commit a0c9341

File tree

4 files changed

+15
-1
lines changed

4 files changed

+15
-1
lines changed

ray-operator/controllers/ray/metrics/ray_cluster_metrics.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,12 @@ func (r *RayClusterMetricsManager) ObserveRayClusterProvisionedDuration(name, na
8989
r.rayClusterProvisionedDurationSeconds.WithLabelValues(name, namespace).Set(duration)
9090
}
9191

92+
// DeleteRayClusterMetrics removes metrics that belong to the specified RayCluster.
93+
func (r *RayClusterMetricsManager) DeleteRayClusterMetrics(name, namespace string) {
94+
numCleanedUpMetrics := r.rayClusterProvisionedDurationSeconds.DeletePartialMatch(prometheus.Labels{"name": name, "namespace": namespace})
95+
r.log.Info("Cleaned up expired RayCluster metric", "name", name, "namespace", namespace, "numCleanedUpMetrics", numCleanedUpMetrics)
96+
}
97+
9298
func (r *RayClusterMetricsManager) collectRayClusterInfo(cluster *rayv1.RayCluster, ch chan<- prometheus.Metric) {
9399
ownerKind := "None"
94100
if v, ok := cluster.Labels[utils.RayOriginatedFromCRDLabelKey]; ok {

ray-operator/controllers/ray/metrics/ray_job_metrics.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,12 @@ func (r *RayJobMetricsManager) ObserveRayJobExecutionDuration(name, namespace st
8484
r.rayJobExecutionDurationSeconds.WithLabelValues(name, namespace, string(jobDeploymentStatus), strconv.Itoa(retryCount)).Set(duration)
8585
}
8686

87+
// DeleteRayJobMetrics removes metrics that belong to the specified RayJob.
88+
func (r *RayJobMetricsManager) DeleteRayJobMetrics(name, namespace string) {
89+
numCleanedUpMetrics := r.rayJobExecutionDurationSeconds.DeletePartialMatch(prometheus.Labels{"name": name, "namespace": namespace})
90+
r.log.Info("Cleaned up expired rayJob metric", "name", name, "namespace", namespace, "numCleanedUpMetrics", numCleanedUpMetrics)
91+
}
92+
8793
func (r *RayJobMetricsManager) collectRayJobInfo(rayJob *rayv1.RayJob, ch chan<- prometheus.Metric) {
8894
ch <- prometheus.MustNewConstMetric(
8995
r.rayJobInfo,

ray-operator/controllers/ray/raycluster_controller.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ func (r *RayClusterReconciler) Reconcile(ctx context.Context, request ctrl.Reque
135135
if errors.IsNotFound(err) {
136136
// Clear all related expectations
137137
r.rayClusterScaleExpectation.Delete(instance.Name, instance.Namespace)
138+
r.options.RayClusterMetricsManager.DeleteRayClusterMetrics(request.Name, request.Namespace)
138139
} else {
139140
logger.Error(err, "Read request instance error!")
140141
}

ray-operator/controllers/ray/rayjob_controller.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,8 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
9191
if err := r.Get(ctx, request.NamespacedName, rayJobInstance); err != nil {
9292
if errors.IsNotFound(err) {
9393
// Request object not found, could have been deleted after reconcile request. Stop reconciliation.
94-
logger.Info("RayJob resource not found. Ignoring since object must be deleted")
94+
logger.Info("RayJob resource not found.")
95+
r.options.RayJobMetricsManager.DeleteRayJobMetrics(request.Name, request.Namespace)
9596
return ctrl.Result{}, nil
9697
}
9798
// Error reading the object - requeue the request.

0 commit comments

Comments
 (0)