Skip to content

Commit 551de65

Browse files
[Feature] Include CR UID in kuberay metrics (#4003)
* [Feature] Include CR UID in kuberay metrics * fix test bug * fix test bug * fix test bug
1 parent b898828 commit 551de65

12 files changed

+115
-79
lines changed

ray-operator/controllers/ray/metrics/mocks/ray_cluster_metrics_mock.go

Lines changed: 6 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ray-operator/controllers/ray/metrics/mocks/ray_job_metrics_mock.go

Lines changed: 6 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ray-operator/controllers/ray/metrics/ray_cluster_metrics.go

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"github.com/go-logr/logr"
88
"github.com/prometheus/client_golang/prometheus"
99
"k8s.io/apimachinery/pkg/api/meta"
10+
"k8s.io/apimachinery/pkg/types"
1011
ctrl "sigs.k8s.io/controller-runtime"
1112
"sigs.k8s.io/controller-runtime/pkg/client"
1213

@@ -16,7 +17,7 @@ import (
1617

1718
//go:generate mockgen -destination=mocks/ray_cluster_metrics_mock.go -package=mocks github.com/ray-project/kuberay/ray-operator/controllers/ray/metrics RayClusterMetricsObserver
1819
type RayClusterMetricsObserver interface {
19-
ObserveRayClusterProvisionedDuration(name, namespace string, duration float64)
20+
ObserveRayClusterProvisionedDuration(name, namespace string, uid types.UID, duration float64)
2021
}
2122

2223
// RayClusterMetricsManager implements the prometheus.Collector and RayClusterMetricsObserver interfaces to collect ray cluster metrics.
@@ -36,7 +37,7 @@ func NewRayClusterMetricsManager(ctx context.Context, client client.Client) *Ray
3637
Name: "kuberay_cluster_provisioned_duration_seconds",
3738
Help: "The time, in seconds, when a RayCluster's `RayClusterProvisioned` status transitions from false (or unset) to true",
3839
},
39-
[]string{"name", "namespace"},
40+
[]string{"name", "namespace", "uid"},
4041
),
4142
// rayClusterInfo is a gauge metric that indicates the metadata information about RayCluster custom resources.
4243
// The `owner_kind` label indicates the CRD type that originated the RayCluster.
@@ -47,13 +48,13 @@ func NewRayClusterMetricsManager(ctx context.Context, client client.Client) *Ray
4748
rayClusterInfo: prometheus.NewDesc(
4849
"kuberay_cluster_info",
4950
"Metadata information about RayCluster custom resources",
50-
[]string{"name", "namespace", "owner_kind"},
51+
[]string{"name", "namespace", "uid", "owner_kind"},
5152
nil,
5253
),
5354
rayClusterConditionProvisioned: prometheus.NewDesc(
5455
"kuberay_cluster_condition_provisioned",
5556
"Indicates whether the RayCluster is provisioned",
56-
[]string{"name", "namespace", "condition"},
57+
[]string{"name", "namespace", "uid", "condition"},
5758
nil,
5859
),
5960
client: client,
@@ -85,8 +86,8 @@ func (r *RayClusterMetricsManager) Collect(ch chan<- prometheus.Metric) {
8586
}
8687
}
8788

88-
func (r *RayClusterMetricsManager) ObserveRayClusterProvisionedDuration(name, namespace string, duration float64) {
89-
r.rayClusterProvisionedDurationSeconds.WithLabelValues(name, namespace).Set(duration)
89+
func (r *RayClusterMetricsManager) ObserveRayClusterProvisionedDuration(name, namespace string, uid types.UID, duration float64) {
90+
r.rayClusterProvisionedDurationSeconds.WithLabelValues(name, namespace, string(uid)).Set(duration)
9091
}
9192

9293
// DeleteRayClusterMetrics removes metrics that belong to the specified RayCluster.
@@ -107,6 +108,7 @@ func (r *RayClusterMetricsManager) collectRayClusterInfo(cluster *rayv1.RayClust
107108
1,
108109
cluster.Name,
109110
cluster.Namespace,
111+
string(cluster.UID),
110112
ownerKind,
111113
)
112114
}
@@ -118,6 +120,7 @@ func (r *RayClusterMetricsManager) collectRayClusterConditionProvisioned(cluster
118120
1,
119121
cluster.Name,
120122
cluster.Namespace,
123+
string(cluster.UID),
121124
strconv.FormatBool(meta.IsStatusConditionTrue(cluster.Status.Conditions, string(rayv1.RayClusterProvisioned))),
122125
)
123126
}

ray-operator/controllers/ray/metrics/ray_cluster_metrics_test.go

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"github.com/stretchr/testify/require"
1111
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1212
"k8s.io/apimachinery/pkg/runtime"
13+
"k8s.io/apimachinery/pkg/types"
1314
"sigs.k8s.io/controller-runtime/pkg/client"
1415
"sigs.k8s.io/controller-runtime/pkg/client/fake"
1516

@@ -30,6 +31,7 @@ func TestRayClusterInfo(t *testing.T) {
3031
ObjectMeta: metav1.ObjectMeta{
3132
Name: "test-ray-cluster",
3233
Namespace: "default",
34+
UID: types.UID("test-ray-cluster-uid"),
3335
Labels: map[string]string{
3436
"ray.io/originated-from-crd": "RayJob",
3537
},
@@ -39,13 +41,14 @@ func TestRayClusterInfo(t *testing.T) {
3941
ObjectMeta: metav1.ObjectMeta{
4042
Name: "test-ray-cluster-2",
4143
Namespace: "default",
44+
UID: types.UID("test-ray-cluster-2-uid"),
4245
Labels: map[string]string{},
4346
},
4447
},
4548
},
4649
expectedMetrics: []string{
47-
`kuberay_cluster_info{name="test-ray-cluster",namespace="default",owner_kind="RayJob"} 1`,
48-
`kuberay_cluster_info{name="test-ray-cluster-2",namespace="default",owner_kind="None"} 1`,
50+
`kuberay_cluster_info{name="test-ray-cluster",namespace="default",owner_kind="RayJob",uid="test-ray-cluster-uid"} 1`,
51+
`kuberay_cluster_info{name="test-ray-cluster-2",namespace="default",owner_kind="None",uid="test-ray-cluster-2-uid"} 1`,
4952
},
5053
},
5154
}
@@ -98,9 +101,9 @@ func TestDeleteRayClusterMetrics(t *testing.T) {
98101

99102
// Test case 1: Delete specific cluster metrics
100103
// Manually add some metrics
101-
manager.rayClusterProvisionedDurationSeconds.With(prometheus.Labels{"name": "cluster1", "namespace": "ns1"}).Set(10.5)
102-
manager.rayClusterProvisionedDurationSeconds.With(prometheus.Labels{"name": "cluster2", "namespace": "ns2"}).Set(20.3)
103-
manager.rayClusterProvisionedDurationSeconds.With(prometheus.Labels{"name": "cluster3", "namespace": "ns1"}).Set(5.7)
104+
manager.rayClusterProvisionedDurationSeconds.With(prometheus.Labels{"name": "cluster1", "namespace": "ns1", "uid": "uid1"}).Set(10.5)
105+
manager.rayClusterProvisionedDurationSeconds.With(prometheus.Labels{"name": "cluster2", "namespace": "ns2", "uid": "uid2"}).Set(20.3)
106+
manager.rayClusterProvisionedDurationSeconds.With(prometheus.Labels{"name": "cluster3", "namespace": "ns1", "uid": "uid3"}).Set(5.7)
104107

105108
// Test deleting metrics for cluster1 in ns1
106109
manager.DeleteRayClusterMetrics("cluster1", "ns1")
@@ -109,9 +112,9 @@ func TestDeleteRayClusterMetrics(t *testing.T) {
109112
body, statusCode := support.GetMetricsResponseAndCode(t, reg)
110113

111114
assert.Equal(t, http.StatusOK, statusCode)
112-
assert.NotContains(t, body, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1"}`)
113-
assert.Contains(t, body, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2"}`)
114-
assert.Contains(t, body, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1"}`)
115+
assert.NotContains(t, body, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1",uid="uid1"}`)
116+
assert.Contains(t, body, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2",uid="uid2"}`)
117+
assert.Contains(t, body, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1",uid="uid3"}`)
115118

116119
// Test case 2: Delete with empty name
117120
manager.DeleteRayClusterMetrics("", "ns1")
@@ -120,9 +123,9 @@ func TestDeleteRayClusterMetrics(t *testing.T) {
120123
body2, statusCode := support.GetMetricsResponseAndCode(t, reg)
121124

122125
assert.Equal(t, http.StatusOK, statusCode)
123-
assert.NotContains(t, body2, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1"}`)
124-
assert.Contains(t, body2, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1"}`)
125-
assert.Contains(t, body2, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2"}`)
126+
assert.NotContains(t, body2, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1",uid="uid1"}`)
127+
assert.Contains(t, body2, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1",uid="uid3"}`)
128+
assert.Contains(t, body2, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2",uid="uid2"}`)
126129

127130
// Test case 3: Delete with empty name and namespace
128131
manager.DeleteRayClusterMetrics("", "")
@@ -131,9 +134,9 @@ func TestDeleteRayClusterMetrics(t *testing.T) {
131134
body3, statusCode := support.GetMetricsResponseAndCode(t, reg)
132135

133136
assert.Equal(t, http.StatusOK, statusCode)
134-
assert.NotContains(t, body3, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1"}`)
135-
assert.Contains(t, body3, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1"}`)
136-
assert.Contains(t, body3, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2"}`)
137+
assert.NotContains(t, body3, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1",uid="uid1"}`)
138+
assert.Contains(t, body3, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1",uid="uid3"}`)
139+
assert.Contains(t, body3, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2",uid="uid2"}`)
137140

138141
// Test case 4: Delete with name and namespace swapped, which should not match any metric
139142
manager.DeleteRayClusterMetrics("ns2", "cluster2")
@@ -142,9 +145,9 @@ func TestDeleteRayClusterMetrics(t *testing.T) {
142145
body4, statusCode := support.GetMetricsResponseAndCode(t, reg)
143146

144147
assert.Equal(t, http.StatusOK, statusCode)
145-
assert.NotContains(t, body4, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1"}`)
146-
assert.Contains(t, body4, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1"}`)
147-
assert.Contains(t, body4, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2"}`)
148+
assert.NotContains(t, body4, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1",uid="uid1"}`)
149+
assert.Contains(t, body4, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1",uid="uid3"}`)
150+
assert.Contains(t, body4, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2",uid="uid2"}`)
148151
}
149152

150153
func TestRayClusterConditionProvisioned(t *testing.T) {
@@ -160,6 +163,7 @@ func TestRayClusterConditionProvisioned(t *testing.T) {
160163
ObjectMeta: metav1.ObjectMeta{
161164
Name: "provisioned-cluster",
162165
Namespace: "default",
166+
UID: types.UID("provisioned-cluster-uid"),
163167
},
164168
Status: rayv1.RayClusterStatus{
165169
Conditions: []metav1.Condition{
@@ -174,6 +178,7 @@ func TestRayClusterConditionProvisioned(t *testing.T) {
174178
ObjectMeta: metav1.ObjectMeta{
175179
Name: "unprovisioned-cluster",
176180
Namespace: "default",
181+
UID: types.UID("unprovisioned-cluster-uid"),
177182
},
178183
Status: rayv1.RayClusterStatus{
179184
Conditions: []metav1.Condition{
@@ -186,8 +191,8 @@ func TestRayClusterConditionProvisioned(t *testing.T) {
186191
},
187192
},
188193
expectedMetrics: []string{
189-
`kuberay_cluster_condition_provisioned{condition="true",name="provisioned-cluster",namespace="default"} 1`,
190-
`kuberay_cluster_condition_provisioned{condition="false",name="unprovisioned-cluster",namespace="default"} 1`,
194+
`kuberay_cluster_condition_provisioned{condition="true",name="provisioned-cluster",namespace="default",uid="provisioned-cluster-uid"} 1`,
195+
`kuberay_cluster_condition_provisioned{condition="false",name="unprovisioned-cluster",namespace="default",uid="unprovisioned-cluster-uid"} 1`,
191196
},
192197
},
193198
}

ray-operator/controllers/ray/metrics/ray_job_metrics.go

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66

77
"github.com/go-logr/logr"
88
"github.com/prometheus/client_golang/prometheus"
9+
"k8s.io/apimachinery/pkg/types"
910
ctrl "sigs.k8s.io/controller-runtime"
1011
"sigs.k8s.io/controller-runtime/pkg/client"
1112

@@ -14,7 +15,7 @@ import (
1415

1516
//go:generate mockgen -destination=mocks/ray_job_metrics_mock.go -package=mocks github.com/ray-project/kuberay/ray-operator/controllers/ray/metrics RayJobMetricsObserver
1617
type RayJobMetricsObserver interface {
17-
ObserveRayJobExecutionDuration(name, namespace string, jobDeploymentStatus rayv1.JobDeploymentStatus, retryCount int, duration float64)
18+
ObserveRayJobExecutionDuration(name, namespace string, uid types.UID, jobDeploymentStatus rayv1.JobDeploymentStatus, retryCount int, duration float64)
1819
}
1920

2021
// RayJobMetricsManager implements the prometheus.Collector and RayJobMetricsObserver interfaces to collect ray job metrics.
@@ -34,20 +35,20 @@ func NewRayJobMetricsManager(ctx context.Context, client client.Client) *RayJobM
3435
Name: "kuberay_job_execution_duration_seconds",
3536
Help: "Duration from when the RayJob CR’s JobDeploymentStatus transitions from Initializing to either the Retrying state or a terminal state, such as Complete or Failed. The Retrying state indicates that the CR previously failed and that spec.backoffLimit is enabled.",
3637
},
37-
[]string{"name", "namespace", "job_deployment_status", "retry_count"},
38+
[]string{"name", "namespace", "uid", "job_deployment_status", "retry_count"},
3839
),
3940
// rayJobInfo is a gauge metric that indicates the metadata information about RayJob custom resources.
4041
rayJobInfo: prometheus.NewDesc(
4142
"kuberay_job_info",
4243
"Metadata information about RayJob custom resources",
43-
[]string{"name", "namespace"},
44+
[]string{"name", "namespace", "uid"},
4445
nil,
4546
),
4647
// rayJobDeploymentStatus is a gauge metric that indicates the current deployment status of the RayJob custom resources.
4748
rayJobDeploymentStatus: prometheus.NewDesc(
4849
"kuberay_job_deployment_status",
4950
"The RayJob's current deployment status",
50-
[]string{"name", "namespace", "deployment_status"},
51+
[]string{"name", "namespace", "uid", "deployment_status"},
5152
nil,
5253
),
5354
client: client,
@@ -80,8 +81,8 @@ func (r *RayJobMetricsManager) Collect(ch chan<- prometheus.Metric) {
8081
}
8182
}
8283

83-
func (r *RayJobMetricsManager) ObserveRayJobExecutionDuration(name, namespace string, jobDeploymentStatus rayv1.JobDeploymentStatus, retryCount int, duration float64) {
84-
r.rayJobExecutionDurationSeconds.WithLabelValues(name, namespace, string(jobDeploymentStatus), strconv.Itoa(retryCount)).Set(duration)
84+
func (r *RayJobMetricsManager) ObserveRayJobExecutionDuration(name, namespace string, uid types.UID, jobDeploymentStatus rayv1.JobDeploymentStatus, retryCount int, duration float64) {
85+
r.rayJobExecutionDurationSeconds.WithLabelValues(name, namespace, string(uid), string(jobDeploymentStatus), strconv.Itoa(retryCount)).Set(duration)
8586
}
8687

8788
// DeleteRayJobMetrics removes metrics that belong to the specified RayJob.
@@ -97,6 +98,7 @@ func (r *RayJobMetricsManager) collectRayJobInfo(rayJob *rayv1.RayJob, ch chan<-
9798
1,
9899
rayJob.Name,
99100
rayJob.Namespace,
101+
string(rayJob.UID),
100102
)
101103
}
102104

@@ -107,6 +109,7 @@ func (r *RayJobMetricsManager) collectRayJobDeploymentStatus(rayJob *rayv1.RayJo
107109
1,
108110
rayJob.Name,
109111
rayJob.Namespace,
112+
string(rayJob.UID),
110113
string(rayJob.Status.JobDeploymentStatus),
111114
)
112115
}

0 commit comments

Comments
 (0)