Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 9 additions & 6 deletions ray-operator/controllers/ray/metrics/ray_cluster_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"github.com/go-logr/logr"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

Expand All @@ -16,7 +17,7 @@ import (

//go:generate mockgen -destination=mocks/ray_cluster_metrics_mock.go -package=mocks github.com/ray-project/kuberay/ray-operator/controllers/ray/metrics RayClusterMetricsObserver
type RayClusterMetricsObserver interface {
ObserveRayClusterProvisionedDuration(name, namespace string, duration float64)
ObserveRayClusterProvisionedDuration(name, namespace string, uid types.UID, duration float64)
}

// RayClusterMetricsManager implements the prometheus.Collector and RayClusterMetricsObserver interface to collect ray cluster metrics.
Expand All @@ -36,7 +37,7 @@ func NewRayClusterMetricsManager(ctx context.Context, client client.Client) *Ray
Name: "kuberay_cluster_provisioned_duration_seconds",
Help: "The time, in seconds, when a RayCluster's `RayClusterProvisioned` status transitions from false (or unset) to true",
},
[]string{"name", "namespace"},
[]string{"name", "namespace", "uid"},
),
// rayClusterInfo is a gauge metric that indicates the metadata information about RayCluster custom resources.
// The `owner_kind` label indicates the CRD type that originated the RayCluster.
Expand All @@ -47,13 +48,13 @@ func NewRayClusterMetricsManager(ctx context.Context, client client.Client) *Ray
rayClusterInfo: prometheus.NewDesc(
"kuberay_cluster_info",
"Metadata information about RayCluster custom resources",
[]string{"name", "namespace", "owner_kind"},
[]string{"name", "namespace", "uid", "owner_kind"},
nil,
),
rayClusterConditionProvisioned: prometheus.NewDesc(
"kuberay_cluster_condition_provisioned",
"Indicates whether the RayCluster is provisioned",
[]string{"name", "namespace", "condition"},
[]string{"name", "namespace", "uid", "condition"},
nil,
),
client: client,
Expand Down Expand Up @@ -85,8 +86,8 @@ func (r *RayClusterMetricsManager) Collect(ch chan<- prometheus.Metric) {
}
}

func (r *RayClusterMetricsManager) ObserveRayClusterProvisionedDuration(name, namespace string, duration float64) {
r.rayClusterProvisionedDurationSeconds.WithLabelValues(name, namespace).Set(duration)
func (r *RayClusterMetricsManager) ObserveRayClusterProvisionedDuration(name, namespace string, uid types.UID, duration float64) {
r.rayClusterProvisionedDurationSeconds.WithLabelValues(name, namespace, string(uid)).Set(duration)
}

// DeleteRayClusterMetrics removes metrics that belongs to the specified RayCluster.
Expand All @@ -107,6 +108,7 @@ func (r *RayClusterMetricsManager) collectRayClusterInfo(cluster *rayv1.RayClust
1,
cluster.Name,
cluster.Namespace,
string(cluster.UID),
ownerKind,
)
}
Expand All @@ -118,6 +120,7 @@ func (r *RayClusterMetricsManager) collectRayClusterConditionProvisioned(cluster
1,
cluster.Name,
cluster.Namespace,
string(cluster.UID),
strconv.FormatBool(meta.IsStatusConditionTrue(cluster.Status.Conditions, string(rayv1.RayClusterProvisioned))),
)
}
43 changes: 24 additions & 19 deletions ray-operator/controllers/ray/metrics/ray_cluster_metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"github.com/stretchr/testify/require"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"

Expand All @@ -30,6 +31,7 @@ func TestRayClusterInfo(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Name: "test-ray-cluster",
Namespace: "default",
UID: types.UID("test-ray-cluster-uid"),
Labels: map[string]string{
"ray.io/originated-from-crd": "RayJob",
},
Expand All @@ -39,13 +41,14 @@ func TestRayClusterInfo(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Name: "test-ray-cluster-2",
Namespace: "default",
UID: types.UID("test-ray-cluster-2-uid"),
Labels: map[string]string{},
},
},
},
expectedMetrics: []string{
`kuberay_cluster_info{name="test-ray-cluster",namespace="default",owner_kind="RayJob"} 1`,
`kuberay_cluster_info{name="test-ray-cluster-2",namespace="default",owner_kind="None"} 1`,
`kuberay_cluster_info{name="test-ray-cluster",namespace="default",owner_kind="RayJob",uid="test-ray-cluster-uid"} 1`,
`kuberay_cluster_info{name="test-ray-cluster-2",namespace="default",owner_kind="None",uid="test-ray-cluster-2-uid"} 1`,
},
},
}
Expand Down Expand Up @@ -98,9 +101,9 @@ func TestDeleteRayClusterMetrics(t *testing.T) {

// Test case 1: Delete specific cluster metrics
// Manually add some metrics
manager.rayClusterProvisionedDurationSeconds.With(prometheus.Labels{"name": "cluster1", "namespace": "ns1"}).Set(10.5)
manager.rayClusterProvisionedDurationSeconds.With(prometheus.Labels{"name": "cluster2", "namespace": "ns2"}).Set(20.3)
manager.rayClusterProvisionedDurationSeconds.With(prometheus.Labels{"name": "cluster3", "namespace": "ns1"}).Set(5.7)
manager.rayClusterProvisionedDurationSeconds.With(prometheus.Labels{"name": "cluster1", "namespace": "ns1", "uid": "uid1"}).Set(10.5)
manager.rayClusterProvisionedDurationSeconds.With(prometheus.Labels{"name": "cluster2", "namespace": "ns2", "uid": "uid2"}).Set(20.3)
manager.rayClusterProvisionedDurationSeconds.With(prometheus.Labels{"name": "cluster3", "namespace": "ns1", "uid": "uid3"}).Set(5.7)

// Test deleting metrics for cluster1 in ns1
manager.DeleteRayClusterMetrics("cluster1", "ns1")
Expand All @@ -109,9 +112,9 @@ func TestDeleteRayClusterMetrics(t *testing.T) {
body, statusCode := support.GetMetricsResponseAndCode(t, reg)

assert.Equal(t, http.StatusOK, statusCode)
assert.NotContains(t, body, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1"}`)
assert.Contains(t, body, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2"}`)
assert.Contains(t, body, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1"}`)
assert.NotContains(t, body, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1",uid="uid1"}`)
assert.Contains(t, body, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2",uid="uid2"}`)
assert.Contains(t, body, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1",uid="uid3"}`)

// Test case 2: Delete with empty name
manager.DeleteRayClusterMetrics("", "ns1")
Expand All @@ -120,9 +123,9 @@ func TestDeleteRayClusterMetrics(t *testing.T) {
body2, statusCode := support.GetMetricsResponseAndCode(t, reg)

assert.Equal(t, http.StatusOK, statusCode)
assert.NotContains(t, body2, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1"}`)
assert.Contains(t, body2, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1"}`)
assert.Contains(t, body2, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2"}`)
assert.NotContains(t, body2, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1",uid="uid1"}`)
assert.Contains(t, body2, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1",uid="uid3"}`)
assert.Contains(t, body2, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2",uid="uid2"}`)

// Test case 3: Delete with empty name and namespace
manager.DeleteRayClusterMetrics("", "")
Expand All @@ -131,9 +134,9 @@ func TestDeleteRayClusterMetrics(t *testing.T) {
body3, statusCode := support.GetMetricsResponseAndCode(t, reg)

assert.Equal(t, http.StatusOK, statusCode)
assert.NotContains(t, body3, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1"}`)
assert.Contains(t, body3, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1"}`)
assert.Contains(t, body3, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2"}`)
assert.NotContains(t, body3, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1",uid="uid1"}`)
assert.Contains(t, body3, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1",uid="uid3"}`)
assert.Contains(t, body3, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2",uid="uid2"}`)

// Test case 4: Delete with false name and namespace
manager.DeleteRayClusterMetrics("ns2", "cluster2")
Expand All @@ -142,9 +145,9 @@ func TestDeleteRayClusterMetrics(t *testing.T) {
body4, statusCode := support.GetMetricsResponseAndCode(t, reg)

assert.Equal(t, http.StatusOK, statusCode)
assert.NotContains(t, body4, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1"}`)
assert.Contains(t, body4, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1"}`)
assert.Contains(t, body4, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2"}`)
assert.NotContains(t, body4, `kuberay_cluster_provisioned_duration_seconds{name="cluster1",namespace="ns1",uid="uid1"}`)
assert.Contains(t, body4, `kuberay_cluster_provisioned_duration_seconds{name="cluster3",namespace="ns1",uid="uid3"}`)
assert.Contains(t, body4, `kuberay_cluster_provisioned_duration_seconds{name="cluster2",namespace="ns2",uid="uid2"}`)
}

func TestRayClusterConditionProvisioned(t *testing.T) {
Expand All @@ -160,6 +163,7 @@ func TestRayClusterConditionProvisioned(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Name: "provisioned-cluster",
Namespace: "default",
UID: types.UID("provisioned-cluster-uid"),
},
Status: rayv1.RayClusterStatus{
Conditions: []metav1.Condition{
Expand All @@ -174,6 +178,7 @@ func TestRayClusterConditionProvisioned(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Name: "unprovisioned-cluster",
Namespace: "default",
UID: types.UID("unprovisioned-cluster-uid"),
},
Status: rayv1.RayClusterStatus{
Conditions: []metav1.Condition{
Expand All @@ -186,8 +191,8 @@ func TestRayClusterConditionProvisioned(t *testing.T) {
},
},
expectedMetrics: []string{
`kuberay_cluster_condition_provisioned{condition="true",name="provisioned-cluster",namespace="default"} 1`,
`kuberay_cluster_condition_provisioned{condition="false",name="unprovisioned-cluster",namespace="default"} 1`,
`kuberay_cluster_condition_provisioned{condition="true",name="provisioned-cluster",namespace="default",uid="provisioned-cluster-uid"} 1`,
`kuberay_cluster_condition_provisioned{condition="false",name="unprovisioned-cluster",namespace="default",uid="unprovisioned-cluster-uid"} 1`,
},
},
}
Expand Down
15 changes: 9 additions & 6 deletions ray-operator/controllers/ray/metrics/ray_job_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (

"github.com/go-logr/logr"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

Expand All @@ -14,7 +15,7 @@ import (

//go:generate mockgen -destination=mocks/ray_job_metrics_mock.go -package=mocks github.com/ray-project/kuberay/ray-operator/controllers/ray/metrics RayJobMetricsObserver
type RayJobMetricsObserver interface {
ObserveRayJobExecutionDuration(name, namespace string, jobDeploymentStatus rayv1.JobDeploymentStatus, retryCount int, duration float64)
ObserveRayJobExecutionDuration(name, namespace string, uid types.UID, jobDeploymentStatus rayv1.JobDeploymentStatus, retryCount int, duration float64)
}

// RayJobMetricsManager implements the prometheus.Collector and RayJobMetricsObserver interface to collect ray job metrics.
Expand All @@ -34,20 +35,20 @@ func NewRayJobMetricsManager(ctx context.Context, client client.Client) *RayJobM
Name: "kuberay_job_execution_duration_seconds",
Help: "Duration from when the RayJob CR’s JobDeploymentStatus transitions from Initializing to either the Retrying state or a terminal state, such as Complete or Failed. The Retrying state indicates that the CR previously failed and that spec.backoffLimit is enabled.",
},
[]string{"name", "namespace", "job_deployment_status", "retry_count"},
[]string{"name", "namespace", "uid", "job_deployment_status", "retry_count"},
),
// rayJobInfo is a gauge metric that indicates the metadata information about RayJob custom resources.
rayJobInfo: prometheus.NewDesc(
"kuberay_job_info",
"Metadata information about RayJob custom resources",
[]string{"name", "namespace"},
[]string{"name", "namespace", "uid"},
nil,
),
// rayJobDeploymentStatus is a gauge metric that indicates the current deployment status of the RayJob custom resources.
rayJobDeploymentStatus: prometheus.NewDesc(
"kuberay_job_deployment_status",
"The RayJob's current deployment status",
[]string{"name", "namespace", "deployment_status"},
[]string{"name", "namespace", "uid", "deployment_status"},
nil,
),
client: client,
Expand Down Expand Up @@ -80,8 +81,8 @@ func (r *RayJobMetricsManager) Collect(ch chan<- prometheus.Metric) {
}
}

func (r *RayJobMetricsManager) ObserveRayJobExecutionDuration(name, namespace string, jobDeploymentStatus rayv1.JobDeploymentStatus, retryCount int, duration float64) {
r.rayJobExecutionDurationSeconds.WithLabelValues(name, namespace, string(jobDeploymentStatus), strconv.Itoa(retryCount)).Set(duration)
func (r *RayJobMetricsManager) ObserveRayJobExecutionDuration(name, namespace string, uid types.UID, jobDeploymentStatus rayv1.JobDeploymentStatus, retryCount int, duration float64) {
r.rayJobExecutionDurationSeconds.WithLabelValues(name, namespace, string(uid), string(jobDeploymentStatus), strconv.Itoa(retryCount)).Set(duration)
}

// DeleteRayJobMetrics removes metrics that belongs to the specified RayJob.
Expand All @@ -97,6 +98,7 @@ func (r *RayJobMetricsManager) collectRayJobInfo(rayJob *rayv1.RayJob, ch chan<-
1,
rayJob.Name,
rayJob.Namespace,
string(rayJob.UID),
)
}

Expand All @@ -107,6 +109,7 @@ func (r *RayJobMetricsManager) collectRayJobDeploymentStatus(rayJob *rayv1.RayJo
1,
rayJob.Name,
rayJob.Namespace,
string(rayJob.UID),
string(rayJob.Status.JobDeploymentStatus),
)
}
Loading
Loading