diff --git a/ray-operator/controllers/ray/metrics/mocks/ray_cluster_metrics_mock.go b/ray-operator/controllers/ray/metrics/mocks/ray_cluster_metrics_mock.go index 4de05cf1231..d0401c36961 100644 --- a/ray-operator/controllers/ray/metrics/mocks/ray_cluster_metrics_mock.go +++ b/ray-operator/controllers/ray/metrics/mocks/ray_cluster_metrics_mock.go @@ -3,7 +3,7 @@ // // Generated by this command: // -// mockgen -destination=mocks/ray_cluster_metrics_mock.go -package=mocks github.com/ray-project/kuberay/ray-operator/controllers/ray/metrics RayClusterMetricsObserver +// mockgen -destination=controllers/ray/metrics/mocks/ray_cluster_metrics_mock.go -package=mocks github.com/ray-project/kuberay/ray-operator/controllers/ray/metrics RayClusterMetricsObserver // // Package mocks is a generated GoMock package. @@ -13,6 +13,7 @@ import ( reflect "reflect" gomock "go.uber.org/mock/gomock" + types "k8s.io/apimachinery/pkg/types" ) // MockRayClusterMetricsObserver is a mock of RayClusterMetricsObserver interface. @@ -40,13 +41,13 @@ func (m *MockRayClusterMetricsObserver) EXPECT() *MockRayClusterMetricsObserverM } // ObserveRayClusterProvisionedDuration mocks base method. -func (m *MockRayClusterMetricsObserver) ObserveRayClusterProvisionedDuration(name, namespace string, duration float64) { +func (m *MockRayClusterMetricsObserver) ObserveRayClusterProvisionedDuration(name, namespace string, uid types.UID, duration float64) { m.ctrl.T.Helper() - m.ctrl.Call(m, "ObserveRayClusterProvisionedDuration", name, namespace, duration) + m.ctrl.Call(m, "ObserveRayClusterProvisionedDuration", name, namespace, uid, duration) } // ObserveRayClusterProvisionedDuration indicates an expected call of ObserveRayClusterProvisionedDuration. -func (mr *MockRayClusterMetricsObserverMockRecorder) ObserveRayClusterProvisionedDuration(name, namespace, duration any) *gomock.Call { +func (mr *MockRayClusterMetricsObserverMockRecorder) ObserveRayClusterProvisionedDuration(name, namespace, uid, duration any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ObserveRayClusterProvisionedDuration", reflect.TypeOf((*MockRayClusterMetricsObserver)(nil).ObserveRayClusterProvisionedDuration), name, namespace, duration) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ObserveRayClusterProvisionedDuration", reflect.TypeOf((*MockRayClusterMetricsObserver)(nil).ObserveRayClusterProvisionedDuration), name, namespace, uid, duration) } diff --git a/ray-operator/controllers/ray/metrics/mocks/ray_job_metrics_mock.go b/ray-operator/controllers/ray/metrics/mocks/ray_job_metrics_mock.go index 0248670a571..394f0ef2aa8 100644 --- a/ray-operator/controllers/ray/metrics/mocks/ray_job_metrics_mock.go +++ b/ray-operator/controllers/ray/metrics/mocks/ray_job_metrics_mock.go @@ -3,7 +3,7 @@ // // Generated by this command: // -// mockgen -destination=mocks/ray_job_metrics_mock.go -package=mocks github.com/ray-project/kuberay/ray-operator/controllers/ray/metrics RayJobMetricsObserver +// mockgen -destination=controllers/ray/metrics/mocks/ray_job_metrics_mock.go -package=mocks github.com/ray-project/kuberay/ray-operator/controllers/ray/metrics RayJobMetricsObserver // // Package mocks is a generated GoMock package. @@ -14,6 +14,7 @@ import ( v1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" gomock "go.uber.org/mock/gomock" + types "k8s.io/apimachinery/pkg/types" ) // MockRayJobMetricsObserver is a mock of RayJobMetricsObserver interface. @@ -41,13 +42,13 @@ func (m *MockRayJobMetricsObserver) EXPECT() *MockRayJobMetricsObserverMockRecor } // ObserveRayJobExecutionDuration mocks base method. -func (m *MockRayJobMetricsObserver) ObserveRayJobExecutionDuration(name, namespace string, jobDeploymentStatus v1.JobDeploymentStatus, retryCount int, duration float64) { +func (m *MockRayJobMetricsObserver) ObserveRayJobExecutionDuration(name, namespace string, uid types.UID, jobDeploymentStatus v1.JobDeploymentStatus, retryCount int, duration float64) { m.ctrl.T.Helper() - m.ctrl.Call(m, "ObserveRayJobExecutionDuration", name, namespace, jobDeploymentStatus, retryCount, duration) + m.ctrl.Call(m, "ObserveRayJobExecutionDuration", name, namespace, uid, jobDeploymentStatus, retryCount, duration) } // ObserveRayJobExecutionDuration indicates an expected call of ObserveRayJobExecutionDuration. -func (mr *MockRayJobMetricsObserverMockRecorder) ObserveRayJobExecutionDuration(name, namespace, jobDeploymentStatus, retryCount, duration any) *gomock.Call { +func (mr *MockRayJobMetricsObserverMockRecorder) ObserveRayJobExecutionDuration(name, namespace, uid, jobDeploymentStatus, retryCount, duration any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ObserveRayJobExecutionDuration", reflect.TypeOf((*MockRayJobMetricsObserver)(nil).ObserveRayJobExecutionDuration), name, namespace, jobDeploymentStatus, retryCount, duration) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ObserveRayJobExecutionDuration", reflect.TypeOf((*MockRayJobMetricsObserver)(nil).ObserveRayJobExecutionDuration), name, namespace, uid, jobDeploymentStatus, retryCount, duration) } diff --git a/ray-operator/controllers/ray/metrics/ray_cluster_metrics.go b/ray-operator/controllers/ray/metrics/ray_cluster_metrics.go index 5ac87269727..0a6b8405d6b 100644 --- a/ray-operator/controllers/ray/metrics/ray_cluster_metrics.go +++ b/ray-operator/controllers/ray/metrics/ray_cluster_metrics.go @@ -7,6 +7,7 @@ import ( "github.com/go-logr/logr" "github.com/prometheus/client_golang/prometheus" "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -16,7 +17,7 @@ import ( //go:generate mockgen -destination=mocks/ray_cluster_metrics_mock.go -package=mocks github.com/ray-project/kuberay/ray-operator/controllers/ray/metrics RayClusterMetricsObserver type RayClusterMetricsObserver interface { - ObserveRayClusterProvisionedDuration(name, namespace string, duration float64) + ObserveRayClusterProvisionedDuration(name, namespace string, uid types.UID, duration float64) } // RayClusterMetricsManager implements the prometheus.Collector and RayClusterMetricsObserver interface to collect ray cluster metrics. @@ -36,7 +37,7 @@ func NewRayClusterMetricsManager(ctx context.Context, client client.Client) *Ray Name: "kuberay_cluster_provisioned_duration_seconds", Help: "The time, in seconds, when a RayCluster's `RayClusterProvisioned` status transitions from false (or unset) to true", }, - []string{"name", "namespace"}, + []string{"name", "namespace", "uid"}, ), // rayClusterInfo is a gauge metric that indicates the metadata information about RayCluster custom resources. // The `owner_kind` label indicates the CRD type that originated the RayCluster. @@ -47,13 +48,13 @@ func NewRayClusterMetricsManager(ctx context.Context, client client.Client) *Ray rayClusterInfo: prometheus.NewDesc( "kuberay_cluster_info", "Metadata information about RayCluster custom resources", - []string{"name", "namespace", "owner_kind"}, + []string{"name", "namespace", "uid", "owner_kind"}, nil, ), rayClusterConditionProvisioned: prometheus.NewDesc( "kuberay_cluster_condition_provisioned", "Indicates whether the RayCluster is provisioned", - []string{"name", "namespace", "condition"}, + []string{"name", "namespace", "uid", "condition"}, nil, ), client: client, @@ -85,8 +86,8 @@ func (r *RayClusterMetricsManager) Collect(ch chan<- prometheus.Metric) { } } -func (r *RayClusterMetricsManager) ObserveRayClusterProvisionedDuration(name, namespace string, duration float64) { - r.rayClusterProvisionedDurationSeconds.WithLabelValues(name, namespace).Set(duration) +func (r *RayClusterMetricsManager) ObserveRayClusterProvisionedDuration(name, namespace string, uid types.UID, duration float64) { + r.rayClusterProvisionedDurationSeconds.WithLabelValues(name, namespace, string(uid)).Set(duration) } // DeleteRayClusterMetrics removes metrics that belongs to the specified RayCluster. @@ -107,6 +108,7 @@ func (r *RayClusterMetricsManager) collectRayClusterInfo(cluster *rayv1.RayClust 1, cluster.Name, cluster.Namespace, + string(cluster.UID), ownerKind, ) } @@ -118,6 +120,7 @@ func (r *RayClusterMetricsManager) collectRayClusterConditionProvisioned(cluster 1, cluster.Name, cluster.Namespace, + string(cluster.UID), strconv.FormatBool(meta.IsStatusConditionTrue(cluster.Status.Conditions, string(rayv1.RayClusterProvisioned))), ) } diff --git a/ray-operator/controllers/ray/metrics/ray_cluster_metrics_test.go b/ray-operator/controllers/ray/metrics/ray_cluster_metrics_test.go index 67b620728b8..abd8def68d9 100644 --- a/ray-operator/controllers/ray/metrics/ray_cluster_metrics_test.go +++ b/ray-operator/controllers/ray/metrics/ray_cluster_metrics_test.go @@ -10,6 +10,7 @@ import ( "github.com/stretchr/testify/require" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -30,6 +31,7 @@ func TestRayClusterInfo(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "test-ray-cluster", Namespace: "default", + UID: types.UID("test-ray-cluster-uid"), Labels: map[string]string{ "ray.io/originated-from-crd": "RayJob", }, @@ -39,13 +41,14 @@ func TestRayClusterInfo(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "test-ray-cluster-2", Namespace: "default", + UID: types.UID("test-ray-cluster-2-uid"), Labels: map[string]string{}, }, }, }, expectedMetrics: []string{ - `kuberay_cluster_info{name="test-ray-cluster",namespace="default",owner_kind="RayJob"} 1`, - `kuberay_cluster_info{name="test-ray-cluster-2",namespace="default",owner_kind="None"} 1`, + `kuberay_cluster_info{name="test-ray-cluster",namespace="default",owner_kind="RayJob",uid="test-ray-cluster-uid"} 1`, + `kuberay_cluster_info{name="test-ray-cluster-2",namespace="default",owner_kind="None",uid="test-ray-cluster-2-uid"} 1`, }, }, } @@ -98,9 +101,9 @@ func TestDeleteRayClusterMetrics(t *testing.T) { // Test case 1: Delete specific cluster metrics // Manually add some metrics - manager.rayClusterProvisionedDurationSeconds.With(prometheus.Labels{"name": "cluster1", "namespace": "ns1"}).Set(10.5) - manager.rayClusterProvisionedDurationSeconds.With(prometheus.Labels{"name": "cluster2", "namespace": "ns2"}).Set(20.3) - manager.rayClusterProvisionedDurationSeconds.With(prometheus.Labels{"name": "cluster3", "namespace": "ns1"}).Set(5.7) + manager.rayClusterProvisionedDurationSeconds.With(prometheus.Labels{"name": "cluster1", "namespace": "ns1", "uid": "uid1"}).Set(10.5) + manager.rayClusterProvisionedDurationSeconds.With(prometheus.Labels{"name": "cluster2", "namespace": "ns2", "uid": "uid2"}).Set(20.3) + manager.rayClusterProvisionedDurationSeconds.With(prometheus.Labels{"name": "cluster3", "namespace": "ns1", "uid": "uid3"}).Set(5.7) // Test deleting metrics for cluster1 in ns1 manager.DeleteRayClusterMetrics("cluster1", "ns1") @@ -109,9 +112,9 @@ func TestDeleteRayClusterMetrics(t *testing.T) { body, statusCode := support.GetMetricsResponseAndCode(t, reg) assert.Equal(t, http.StatusOK, statusCode) - assert.NotContains(t, body, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1"}`) - assert.Contains(t, body, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2"}`) - assert.Contains(t, body, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1"}`) + assert.NotContains(t, body, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1",uid="uid1"}`) + assert.Contains(t, body, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2",uid="uid2"}`) + assert.Contains(t, body, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1",uid="uid3"}`) // Test case 2: Delete with empty name manager.DeleteRayClusterMetrics("", "ns1") @@ -120,9 +123,9 @@ func TestDeleteRayClusterMetrics(t *testing.T) { body2, statusCode := support.GetMetricsResponseAndCode(t, reg) assert.Equal(t, http.StatusOK, statusCode) - assert.NotContains(t, body2, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1"}`) - assert.Contains(t, body2, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1"}`) - assert.Contains(t, body2, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2"}`) + assert.NotContains(t, body2, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1",uid="uid1"}`) + assert.Contains(t, body2, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1",uid="uid3"}`) + assert.Contains(t, body2, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2",uid="uid2"}`) // Test case 3: Delete with empty name and namespace manager.DeleteRayClusterMetrics("", "") @@ -131,9 +134,9 @@ func TestDeleteRayClusterMetrics(t *testing.T) { body3, statusCode := support.GetMetricsResponseAndCode(t, reg) assert.Equal(t, http.StatusOK, statusCode) - assert.NotContains(t, body3, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1"}`) - assert.Contains(t, body3, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1"}`) - assert.Contains(t, body3, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2"}`) + assert.NotContains(t, body3, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1",uid="uid1"}`) + assert.Contains(t, body3, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1",uid="uid3"}`) + assert.Contains(t, body3, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2",uid="uid2"}`) // Test case 4: Delete with false name and namespace manager.DeleteRayClusterMetrics("ns2", "cluster2") @@ -142,9 +145,9 @@ func TestDeleteRayClusterMetrics(t *testing.T) { body4, statusCode := support.GetMetricsResponseAndCode(t, reg) assert.Equal(t, http.StatusOK, statusCode) - assert.NotContains(t, body4, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1"}`) - assert.Contains(t, body4, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1"}`) - assert.Contains(t, body4, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2"}`) + assert.NotContains(t, body4, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1",uid="uid1"}`) + assert.Contains(t, body4, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1",uid="uid3"}`) + assert.Contains(t, body4, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2",uid="uid2"}`) } func TestRayClusterConditionProvisioned(t *testing.T) { @@ -160,6 +163,7 @@ func TestRayClusterConditionProvisioned(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "provisioned-cluster", Namespace: "default", + UID: types.UID("provisioned-cluster-uid"), }, Status: rayv1.RayClusterStatus{ Conditions: []metav1.Condition{ @@ -174,6 +178,7 @@ func TestRayClusterConditionProvisioned(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "unprovisioned-cluster", Namespace: "default", + UID: types.UID("unprovisioned-cluster-uid"), }, Status: rayv1.RayClusterStatus{ Conditions: []metav1.Condition{ @@ -186,8 +191,8 @@ func TestRayClusterConditionProvisioned(t *testing.T) { }, }, expectedMetrics: []string{ - `kuberay_cluster_condition_provisioned{condition="true",name="provisioned-cluster",namespace="default"} 1`, - `kuberay_cluster_condition_provisioned{condition="false",name="unprovisioned-cluster",namespace="default"} 1`, + `kuberay_cluster_condition_provisioned{condition="true",name="provisioned-cluster",namespace="default",uid="provisioned-cluster-uid"} 1`, + `kuberay_cluster_condition_provisioned{condition="false",name="unprovisioned-cluster",namespace="default",uid="unprovisioned-cluster-uid"} 1`, }, }, } diff --git a/ray-operator/controllers/ray/metrics/ray_job_metrics.go b/ray-operator/controllers/ray/metrics/ray_job_metrics.go index 984ec78d7e8..bbad5ee2aa5 100644 --- a/ray-operator/controllers/ray/metrics/ray_job_metrics.go +++ b/ray-operator/controllers/ray/metrics/ray_job_metrics.go @@ -6,6 +6,7 @@ import ( "github.com/go-logr/logr" "github.com/prometheus/client_golang/prometheus" + "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -14,7 +15,7 @@ import ( //go:generate mockgen -destination=mocks/ray_job_metrics_mock.go -package=mocks github.com/ray-project/kuberay/ray-operator/controllers/ray/metrics RayJobMetricsObserver type RayJobMetricsObserver interface { - ObserveRayJobExecutionDuration(name, namespace string, jobDeploymentStatus rayv1.JobDeploymentStatus, retryCount int, duration float64) + ObserveRayJobExecutionDuration(name, namespace string, uid types.UID, jobDeploymentStatus rayv1.JobDeploymentStatus, retryCount int, duration float64) } // RayJobMetricsManager implements the prometheus.Collector and RayJobMetricsObserver interface to collect ray job metrics. @@ -34,20 +35,20 @@ func NewRayJobMetricsManager(ctx context.Context, client client.Client) *RayJobM Name: "kuberay_job_execution_duration_seconds", Help: "Duration from when the RayJob CR’s JobDeploymentStatus transitions from Initializing to either the Retrying state or a terminal state, such as Complete or Failed. The Retrying state indicates that the CR previously failed and that spec.backoffLimit is enabled.", }, - []string{"name", "namespace", "job_deployment_status", "retry_count"}, + []string{"name", "namespace", "uid", "job_deployment_status", "retry_count"}, ), // rayJobInfo is a gauge metric that indicates the metadata information about RayJob custom resources. rayJobInfo: prometheus.NewDesc( "kuberay_job_info", "Metadata information about RayJob custom resources", - []string{"name", "namespace"}, + []string{"name", "namespace", "uid"}, nil, ), // rayJobDeploymentStatus is a gauge metric that indicates the current deployment status of the RayJob custom resources. rayJobDeploymentStatus: prometheus.NewDesc( "kuberay_job_deployment_status", "The RayJob's current deployment status", - []string{"name", "namespace", "deployment_status"}, + []string{"name", "namespace", "uid", "deployment_status"}, nil, ), client: client, @@ -80,8 +81,8 @@ func (r *RayJobMetricsManager) Collect(ch chan<- prometheus.Metric) { } } -func (r *RayJobMetricsManager) ObserveRayJobExecutionDuration(name, namespace string, jobDeploymentStatus rayv1.JobDeploymentStatus, retryCount int, duration float64) { - r.rayJobExecutionDurationSeconds.WithLabelValues(name, namespace, string(jobDeploymentStatus), strconv.Itoa(retryCount)).Set(duration) +func (r *RayJobMetricsManager) ObserveRayJobExecutionDuration(name, namespace string, uid types.UID, jobDeploymentStatus rayv1.JobDeploymentStatus, retryCount int, duration float64) { + r.rayJobExecutionDurationSeconds.WithLabelValues(name, namespace, string(uid), string(jobDeploymentStatus), strconv.Itoa(retryCount)).Set(duration) } // DeleteRayJobMetrics removes metrics that belongs to the specified RayJob. @@ -97,6 +98,7 @@ func (r *RayJobMetricsManager) collectRayJobInfo(rayJob *rayv1.RayJob, ch chan<- 1, rayJob.Name, rayJob.Namespace, + string(rayJob.UID), ) } @@ -107,6 +109,7 @@ func (r *RayJobMetricsManager) collectRayJobDeploymentStatus(rayJob *rayv1.RayJo 1, rayJob.Name, rayJob.Namespace, + string(rayJob.UID), string(rayJob.Status.JobDeploymentStatus), ) } diff --git a/ray-operator/controllers/ray/metrics/ray_job_metrics_test.go b/ray-operator/controllers/ray/metrics/ray_job_metrics_test.go index 717fb01fb6c..f60eb2764d2 100644 --- a/ray-operator/controllers/ray/metrics/ray_job_metrics_test.go +++ b/ray-operator/controllers/ray/metrics/ray_job_metrics_test.go @@ -10,6 +10,7 @@ import ( "github.com/stretchr/testify/require" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -30,18 +31,20 @@ func TestMetricRayJobInfo(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "ray-job-1", Namespace: "ns1", + UID: types.UID("ray-job-1-uid"), }, }, { ObjectMeta: metav1.ObjectMeta{ Name: "ray-job-2", Namespace: "ns2", + UID: types.UID("ray-job-2-uid"), }, }, }, expectedMetrics: []string{ - `kuberay_job_info{name="ray-job-1",namespace="ns1"} 1`, - `kuberay_job_info{name="ray-job-2",namespace="ns2"} 1`, + `kuberay_job_info{name="ray-job-1",namespace="ns1",uid="ray-job-1-uid"} 1`, + `kuberay_job_info{name="ray-job-2",namespace="ns2",uid="ray-job-2-uid"} 1`, }, }, } @@ -93,9 +96,9 @@ func TestDeleteRayJobMetrics(t *testing.T) { // Test case 1: Delete specific job metrics // Manually add some metrics - manager.ObserveRayJobExecutionDuration("job1", "ns1", rayv1.JobDeploymentStatusComplete, 0, 10.5) - manager.ObserveRayJobExecutionDuration("job2", "ns2", rayv1.JobDeploymentStatusFailed, 1, 20.3) - manager.ObserveRayJobExecutionDuration("job3", "ns1", rayv1.JobDeploymentStatusRunning, 0, 5.7) + manager.ObserveRayJobExecutionDuration("job1", "ns1", "uid1", rayv1.JobDeploymentStatusComplete, 0, 10.5) + manager.ObserveRayJobExecutionDuration("job2", "ns2", "uid2", rayv1.JobDeploymentStatusFailed, 1, 20.3) + manager.ObserveRayJobExecutionDuration("job3", "ns1", "uid3", rayv1.JobDeploymentStatusRunning, 0, 5.7) // Test deleting metrics for job1 in ns1 manager.DeleteRayJobMetrics("job1", "ns1") @@ -104,9 +107,9 @@ func TestDeleteRayJobMetrics(t *testing.T) { body, statusCode := support.GetMetricsResponseAndCode(t, reg) assert.Equal(t, http.StatusOK, statusCode) - assert.NotContains(t, body, `kuberay_job_execution_duration_seconds{job_deployment_status="Complete",name="job1",namespace="ns1",retry_count="0"}`) - assert.Contains(t, body, `kuberay_job_execution_duration_seconds{job_deployment_status="Failed",name="job2",namespace="ns2",retry_count="1"}`) - assert.Contains(t, body, `kuberay_job_execution_duration_seconds{job_deployment_status="Running",name="job3",namespace="ns1",retry_count="0"}`) + assert.NotContains(t, body, `kuberay_job_execution_duration_seconds{job_deployment_status="Complete",name="job1",namespace="ns1",retry_count="0",uid="uid1"}`) + assert.Contains(t, body, `kuberay_job_execution_duration_seconds{job_deployment_status="Failed",name="job2",namespace="ns2",retry_count="1",uid="uid2"}`) + assert.Contains(t, body, `kuberay_job_execution_duration_seconds{job_deployment_status="Running",name="job3",namespace="ns1",retry_count="0",uid="uid3"}`) // Test case 2: Delete with empty name manager.DeleteRayJobMetrics("", "ns1") @@ -115,9 +118,9 @@ func TestDeleteRayJobMetrics(t *testing.T) { body2, statusCode := support.GetMetricsResponseAndCode(t, reg) assert.Equal(t, http.StatusOK, statusCode) - assert.NotContains(t, body2, `kuberay_job_execution_duration_seconds{job_deployment_status="Complete",name="job1",namespace="ns1",retry_count="0"}`) - assert.Contains(t, body2, `kuberay_job_execution_duration_seconds{job_deployment_status="Failed",name="job2",namespace="ns2",retry_count="1"}`) - assert.Contains(t, body2, `kuberay_job_execution_duration_seconds{job_deployment_status="Running",name="job3",namespace="ns1",retry_count="0"}`) + assert.NotContains(t, body2, `kuberay_job_execution_duration_seconds{job_deployment_status="Complete",name="job1",namespace="ns1",retry_count="0",uid="uid1"}`) + assert.Contains(t, body2, `kuberay_job_execution_duration_seconds{job_deployment_status="Failed",name="job2",namespace="ns2",retry_count="1",uid="uid2"}`) + assert.Contains(t, body2, `kuberay_job_execution_duration_seconds{job_deployment_status="Running",name="job3",namespace="ns1",retry_count="0",uid="uid3"}`) // Test case 3: Delete with empty name and namespace manager.DeleteRayJobMetrics("", "") @@ -126,9 +129,9 @@ func TestDeleteRayJobMetrics(t *testing.T) { body3, statusCode := support.GetMetricsResponseAndCode(t, reg) assert.Equal(t, http.StatusOK, statusCode) - assert.NotContains(t, body3, `kuberay_job_execution_duration_seconds{job_deployment_status="Complete",name="job1",namespace="ns1",retry_count="0"}`) - assert.Contains(t, body3, `kuberay_job_execution_duration_seconds{job_deployment_status="Failed",name="job2",namespace="ns2",retry_count="1"}`) - assert.Contains(t, body3, `kuberay_job_execution_duration_seconds{job_deployment_status="Running",name="job3",namespace="ns1",retry_count="0"}`) + assert.NotContains(t, body3, `kuberay_job_execution_duration_seconds{job_deployment_status="Complete",name="job1",namespace="ns1",retry_count="0",uid="uid1"}`) + assert.Contains(t, body3, `kuberay_job_execution_duration_seconds{job_deployment_status="Failed",name="job2",namespace="ns2",retry_count="1",uid="uid2"}`) + assert.Contains(t, body3, `kuberay_job_execution_duration_seconds{job_deployment_status="Running",name="job3",namespace="ns1",retry_count="0",uid="uid3"}`) // Test case 4: Delete with false name and namespace manager.DeleteRayJobMetrics("ns2", "job2") @@ -137,9 +140,9 @@ func TestDeleteRayJobMetrics(t *testing.T) { body4, statusCode := support.GetMetricsResponseAndCode(t, reg) assert.Equal(t, http.StatusOK, statusCode) - assert.NotContains(t, body4, `kuberay_job_execution_duration_seconds{job_deployment_status="Complete",name="job1",namespace="ns1",retry_count="0"}`) - assert.Contains(t, body4, `kuberay_job_execution_duration_seconds{job_deployment_status="Failed",name="job2",namespace="ns2",retry_count="1"}`) - assert.Contains(t, body4, `kuberay_job_execution_duration_seconds{job_deployment_status="Running",name="job3",namespace="ns1",retry_count="0"}`) + assert.NotContains(t, body4, `kuberay_job_execution_duration_seconds{job_deployment_status="Complete",name="job1",namespace="ns1",retry_count="0",uid="uid1"}`) + assert.Contains(t, body4, `kuberay_job_execution_duration_seconds{job_deployment_status="Failed",name="job2",namespace="ns2",retry_count="1",uid="uid2"}`) + assert.Contains(t, body4, `kuberay_job_execution_duration_seconds{job_deployment_status="Running",name="job3",namespace="ns1",retry_count="0",uid="uid3"}`) } func TestMetricRayJobDeploymentStatus(t *testing.T) { @@ -155,6 +158,7 @@ func TestMetricRayJobDeploymentStatus(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "ray-job-1", Namespace: "ns1", + UID: types.UID("ray-job-1-uid"), }, Status: rayv1.RayJobStatus{ JobDeploymentStatus: rayv1.JobDeploymentStatusRunning, @@ -164,6 +168,7 @@ func TestMetricRayJobDeploymentStatus(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "ray-job-2", Namespace: "ns2", + UID: types.UID("ray-job-2-uid"), }, Status: rayv1.RayJobStatus{ JobDeploymentStatus: rayv1.JobDeploymentStatusFailed, @@ -171,8 +176,8 @@ func TestMetricRayJobDeploymentStatus(t *testing.T) { }, }, expectedMetrics: []string{ - `kuberay_job_deployment_status{deployment_status="Running",name="ray-job-1",namespace="ns1"} 1`, - `kuberay_job_deployment_status{deployment_status="Failed",name="ray-job-2",namespace="ns2"} 1`, + `kuberay_job_deployment_status{deployment_status="Running",name="ray-job-1",namespace="ns1",uid="ray-job-1-uid"} 1`, + `kuberay_job_deployment_status{deployment_status="Failed",name="ray-job-2",namespace="ns2",uid="ray-job-2-uid"} 1`, }, }, } diff --git a/ray-operator/controllers/ray/metrics/ray_service_metrics.go b/ray-operator/controllers/ray/metrics/ray_service_metrics.go index 2f3e4b04cdd..97285f92cf8 100644 --- a/ray-operator/controllers/ray/metrics/ray_service_metrics.go +++ b/ray-operator/controllers/ray/metrics/ray_service_metrics.go @@ -28,19 +28,19 @@ func NewRayServiceMetricsManager(ctx context.Context, client client.Client) *Ray rayServiceInfo: prometheus.NewDesc( "kuberay_service_info", "Metadata information about RayService custom resources", - []string{"name", "namespace"}, + []string{"name", "namespace", "uid"}, nil, ), rayServiceConditionReady: prometheus.NewDesc( "kuberay_service_condition_ready", "Describes whether the RayService is ready. Ready means users can send requests to the underlying cluster and the number of serve endpoints is greater than 0.", - []string{"name", "namespace", "condition"}, + []string{"name", "namespace", "uid", "condition"}, nil, ), rayServiceConditionUpgradeInProgress: prometheus.NewDesc( "kuberay_service_condition_upgrade_in_progress", "Describes whether the RayService is performing a zero-downtime upgrade.", - []string{"name", "namespace", "condition"}, + []string{"name", "namespace", "uid", "condition"}, nil, ), client: client, @@ -76,6 +76,7 @@ func (c *RayServiceMetricsManager) collectRayServiceInfo(service *rayv1.RayServi 1, service.Name, service.Namespace, + string(service.UID), ) } @@ -87,6 +88,7 @@ func (c *RayServiceMetricsManager) collectRayServiceConditionMetrics(service *ra 1, service.Name, service.Namespace, + string(service.UID), strconv.FormatBool(ready), ) upgradeInProgress := meta.IsStatusConditionTrue(service.Status.Conditions, string(rayv1.UpgradeInProgress)) @@ -96,6 +98,7 @@ func (c *RayServiceMetricsManager) collectRayServiceConditionMetrics(service *ra 1, service.Name, service.Namespace, + string(service.UID), strconv.FormatBool(upgradeInProgress), ) } diff --git a/ray-operator/controllers/ray/metrics/ray_service_metrics_test.go b/ray-operator/controllers/ray/metrics/ray_service_metrics_test.go index a502d1f9d56..0078832a62a 100644 --- a/ray-operator/controllers/ray/metrics/ray_service_metrics_test.go +++ b/ray-operator/controllers/ray/metrics/ray_service_metrics_test.go @@ -10,6 +10,7 @@ import ( "github.com/stretchr/testify/require" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -30,18 +31,20 @@ func TestRayServiceInfo(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "ray-service-1", Namespace: "default", + UID: types.UID("ray-service-1-uid"), }, }, { ObjectMeta: metav1.ObjectMeta{ Name: "ray-service-2", Namespace: "default", + UID: types.UID("ray-service-2-uid"), }, }, }, expectedInfo: []string{ - `kuberay_service_info{name="ray-service-1",namespace="default"} 1`, - `kuberay_service_info{name="ray-service-2",namespace="default"} 1`, + `kuberay_service_info{name="ray-service-1",namespace="default",uid="ray-service-1-uid"} 1`, + `kuberay_service_info{name="ray-service-2",namespace="default",uid="ray-service-2-uid"} 1`, }, }, } @@ -96,6 +99,7 @@ func TestRayServiceCondition(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "ray-service-1", Namespace: "default", + UID: types.UID("ray-service-1-uid"), }, Status: rayv1.RayServiceStatuses{ Conditions: []metav1.Condition{ @@ -110,6 +114,7 @@ func TestRayServiceCondition(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "ray-service-2", Namespace: "default", + UID: types.UID("ray-service-2-uid"), }, Status: rayv1.RayServiceStatuses{ Conditions: []metav1.Condition{ @@ -122,8 +127,8 @@ func TestRayServiceCondition(t *testing.T) { }, }, expectedInfo: []string{ - `kuberay_service_condition_ready{condition="true",name="ray-service-1",namespace="default"} 1`, - `kuberay_service_condition_ready{condition="false",name="ray-service-2",namespace="default"} 1`, + `kuberay_service_condition_ready{condition="true",name="ray-service-1",namespace="default",uid="ray-service-1-uid"} 1`, + `kuberay_service_condition_ready{condition="false",name="ray-service-2",namespace="default",uid="ray-service-2-uid"} 1`, }, }, { @@ -133,6 +138,7 @@ func TestRayServiceCondition(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "ray-service-1", Namespace: "default", + UID: types.UID("ray-service-1-uid"), }, Status: rayv1.RayServiceStatuses{ Conditions: []metav1.Condition{ @@ -147,6 +153,7 @@ func TestRayServiceCondition(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "ray-service-2", Namespace: "default", + UID: types.UID("ray-service-2-uid"), }, Status: rayv1.RayServiceStatuses{ Conditions: []metav1.Condition{ @@ -159,8 +166,8 @@ func TestRayServiceCondition(t *testing.T) { }, }, expectedInfo: []string{ - `kuberay_service_condition_upgrade_in_progress{condition="true",name="ray-service-1",namespace="default"} 1`, - `kuberay_service_condition_upgrade_in_progress{condition="false",name="ray-service-2",namespace="default"} 1`, + `kuberay_service_condition_upgrade_in_progress{condition="true",name="ray-service-1",namespace="default",uid="ray-service-1-uid"} 1`, + `kuberay_service_condition_upgrade_in_progress{condition="false",name="ray-service-2",namespace="default",uid="ray-service-2-uid"} 1`, }, }, } diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index 51eb06bcca8..814f758a237 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -22,6 +22,7 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" k8sruntime "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" @@ -1540,24 +1541,24 @@ func (r *RayClusterReconciler) updateRayClusterStatus(ctx context.Context, origi if err != nil { logger.Info("Error updating status", "name", originalRayClusterInstance.Name, "error", err, "RayCluster", newInstance) } else { - emitRayClusterMetrics(r.options.RayClusterMetricsManager, newInstance.Name, newInstance.Namespace, originalRayClusterInstance.Status, newInstance.Status, newInstance.CreationTimestamp.Time) + emitRayClusterMetrics(r.options.RayClusterMetricsManager, newInstance.Name, newInstance.Namespace, newInstance.UID, originalRayClusterInstance.Status, newInstance.Status, newInstance.CreationTimestamp.Time) } return inconsistent, err } -func emitRayClusterMetrics(rayClusterMetricsManager *metrics.RayClusterMetricsManager, clusterName, namespace string, oldStatus, newStatus rayv1.RayClusterStatus, creationTimestamp time.Time) { +func emitRayClusterMetrics(rayClusterMetricsManager *metrics.RayClusterMetricsManager, clusterName, namespace string, clusterUID types.UID, oldStatus, newStatus rayv1.RayClusterStatus, creationTimestamp time.Time) { if rayClusterMetricsManager == nil { return } - emitRayClusterProvisionedDuration(rayClusterMetricsManager, clusterName, namespace, oldStatus, newStatus, creationTimestamp) + emitRayClusterProvisionedDuration(rayClusterMetricsManager, clusterName, namespace, clusterUID, oldStatus, newStatus, creationTimestamp) } -func emitRayClusterProvisionedDuration(RayClusterMetricsObserver metrics.RayClusterMetricsObserver, clusterName, namespace string, oldStatus, newStatus rayv1.RayClusterStatus, creationTimestamp time.Time) { +func emitRayClusterProvisionedDuration(RayClusterMetricsObserver metrics.RayClusterMetricsObserver, clusterName, namespace string, clusterUID types.UID, oldStatus, newStatus rayv1.RayClusterStatus, creationTimestamp time.Time) { // Emit kuberay_cluster_provisioned_duration_seconds when a RayCluster's RayClusterProvisioned status transitions from false (or unset) to true if !meta.IsStatusConditionTrue(oldStatus.Conditions, string(rayv1.RayClusterProvisioned)) && meta.IsStatusConditionTrue(newStatus.Conditions, string(rayv1.RayClusterProvisioned)) { - RayClusterMetricsObserver.ObserveRayClusterProvisionedDuration(clusterName, namespace, time.Since(creationTimestamp).Seconds()) + RayClusterMetricsObserver.ObserveRayClusterProvisionedDuration(clusterName, namespace, clusterUID, time.Since(creationTimestamp).Seconds()) } } diff --git a/ray-operator/controllers/ray/raycluster_controller_unit_test.go b/ray-operator/controllers/ray/raycluster_controller_unit_test.go index ec86603d0e8..6749b80dea3 100644 --- a/ray-operator/controllers/ray/raycluster_controller_unit_test.go +++ b/ray-operator/controllers/ray/raycluster_controller_unit_test.go @@ -3437,6 +3437,7 @@ func Test_ReconcileManagedBy(t *testing.T) { func TestEmitRayClusterProvisionedDuration(t *testing.T) { clusterName := "test-ray-cluster" clusterNamespace := "default" + clusterUID := types.UID("test-cluster-uid") // Creation time 5 minutes ago to simulate cluster runtime creationTime := time.Now().Add(-5 * time.Minute) @@ -3515,6 +3516,7 @@ func TestEmitRayClusterProvisionedDuration(t *testing.T) { ObserveRayClusterProvisionedDuration( clusterName, clusterNamespace, + clusterUID, mock.MatchedBy(func(d float64) bool { // Allow some wiggle room in timing return math.Abs(d-tc.expectedDuration) < 1.0 @@ -3526,6 +3528,7 @@ func TestEmitRayClusterProvisionedDuration(t *testing.T) { mockCollector, clusterName, clusterNamespace, + clusterUID, tc.oldStatus, tc.newStatus, creationTime, diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index 4f44657b90a..e822c0f999a 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -14,6 +14,7 @@ import ( "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" @@ -456,18 +457,18 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request) logger.Info("Failed to update RayJob status", "error", err) return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err } - emitRayJobMetrics(r.options.RayJobMetricsManager, rayJobInstance.Name, rayJobInstance.Namespace, originalRayJobInstance.Status, rayJobInstance.Status) + emitRayJobMetrics(r.options.RayJobMetricsManager, rayJobInstance.Name, rayJobInstance.Namespace, rayJobInstance.UID, originalRayJobInstance.Status, rayJobInstance.Status) return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil } -func emitRayJobMetrics(rayJobMetricsManager *metrics.RayJobMetricsManager, rayJobName, rayJobNamespace string, originalRayJobStatus, rayJobStatus rayv1.RayJobStatus) { +func emitRayJobMetrics(rayJobMetricsManager *metrics.RayJobMetricsManager, rayJobName, rayJobNamespace string, rayJobUID types.UID, originalRayJobStatus, rayJobStatus rayv1.RayJobStatus) { if rayJobMetricsManager == nil { return } - emitRayJobExecutionDuration(rayJobMetricsManager, rayJobName, rayJobNamespace, originalRayJobStatus, rayJobStatus) + emitRayJobExecutionDuration(rayJobMetricsManager, rayJobName, rayJobNamespace, rayJobUID, originalRayJobStatus, rayJobStatus) } -func emitRayJobExecutionDuration(rayJobMetricsObserver metrics.RayJobMetricsObserver, rayJobName, rayJobNamespace string, originalRayJobStatus, rayJobStatus rayv1.RayJobStatus) { +func emitRayJobExecutionDuration(rayJobMetricsObserver metrics.RayJobMetricsObserver, rayJobName, rayJobNamespace string, rayJobUID types.UID, originalRayJobStatus, rayJobStatus rayv1.RayJobStatus) { // Emit kuberay_job_execution_duration_seconds when a job transitions from a non-terminal state to either a terminal state or a retrying state (following a failure). if !rayv1.IsJobDeploymentTerminal(originalRayJobStatus.JobDeploymentStatus) && (rayv1.IsJobDeploymentTerminal(rayJobStatus.JobDeploymentStatus) || rayJobStatus.JobDeploymentStatus == rayv1.JobDeploymentStatusRetrying) { retryCount := 0 @@ -477,6 +478,7 @@ func emitRayJobExecutionDuration(rayJobMetricsObserver metrics.RayJobMetricsObse rayJobMetricsObserver.ObserveRayJobExecutionDuration( rayJobName, rayJobNamespace, + rayJobUID, rayJobStatus.JobDeploymentStatus, retryCount, time.Since(rayJobStatus.StartTime.Time).Seconds(), diff --git a/ray-operator/controllers/ray/rayjob_controller_unit_test.go b/ray-operator/controllers/ray/rayjob_controller_unit_test.go index 4e0eda26cb5..c7d2acac80a 100644 --- a/ray-operator/controllers/ray/rayjob_controller_unit_test.go +++ b/ray-operator/controllers/ray/rayjob_controller_unit_test.go @@ -535,6 +535,7 @@ func TestFailedDeleteRayClusterEvent(t *testing.T) { func TestEmitRayJobExecutionDuration(t *testing.T) { rayJobName := "test-job" rayJobNamespace := "default" + rayJobUID := types.UID("test-job-uid") mockTime := time.Now().Add(-60 * time.Second) //nolint:govet // disable govet to keep the order of the struct fields @@ -611,6 +612,7 @@ func TestEmitRayJobExecutionDuration(t *testing.T) { ObserveRayJobExecutionDuration( rayJobName, rayJobNamespace, + rayJobUID, tt.expectedJobDeploymentStatus, tt.expectedRetryCount, mock.MatchedBy(func(d float64) bool { @@ -620,7 +622,7 @@ func TestEmitRayJobExecutionDuration(t *testing.T) { ).Times(1) } - emitRayJobExecutionDuration(mockObserver, rayJobName, rayJobNamespace, tt.originalRayJobStatus, tt.rayJobStatus) + emitRayJobExecutionDuration(mockObserver, rayJobName, rayJobNamespace, rayJobUID, tt.originalRayJobStatus, tt.rayJobStatus) }) } }