Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 9 additions & 6 deletions ray-operator/controllers/ray/metrics/ray_cluster_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"github.com/go-logr/logr"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

Expand All @@ -16,7 +17,7 @@ import (

//go:generate mockgen -destination=mocks/ray_cluster_metrics_mock.go -package=mocks github.com/ray-project/kuberay/ray-operator/controllers/ray/metrics RayClusterMetricsObserver
type RayClusterMetricsObserver interface {
ObserveRayClusterProvisionedDuration(name, namespace string, duration float64)
ObserveRayClusterProvisionedDuration(name, namespace string, uid types.UID, duration float64)
}

// RayClusterMetricsManager implements the prometheus.Collector and RayClusterMetricsObserver interfaces to collect ray cluster metrics.
Expand All @@ -36,7 +37,7 @@ func NewRayClusterMetricsManager(ctx context.Context, client client.Client) *Ray
Name: "kuberay_cluster_provisioned_duration_seconds",
Help: "The time, in seconds, when a RayCluster's `RayClusterProvisioned` status transitions from false (or unset) to true",
},
[]string{"name", "namespace"},
[]string{"name", "namespace", "uid"},
),
// rayClusterInfo is a gauge metric that indicates the metadata information about RayCluster custom resources.
// The `owner_kind` label indicates the CRD type that originated the RayCluster.
Expand All @@ -47,13 +48,13 @@ func NewRayClusterMetricsManager(ctx context.Context, client client.Client) *Ray
rayClusterInfo: prometheus.NewDesc(
"kuberay_cluster_info",
"Metadata information about RayCluster custom resources",
[]string{"name", "namespace", "owner_kind"},
[]string{"name", "namespace", "uid", "owner_kind"},
nil,
),
rayClusterConditionProvisioned: prometheus.NewDesc(
"kuberay_cluster_condition_provisioned",
"Indicates whether the RayCluster is provisioned",
[]string{"name", "namespace", "condition"},
[]string{"name", "namespace", "uid", "condition"},
nil,
),
client: client,
Expand Down Expand Up @@ -85,8 +86,8 @@ func (r *RayClusterMetricsManager) Collect(ch chan<- prometheus.Metric) {
}
}

func (r *RayClusterMetricsManager) ObserveRayClusterProvisionedDuration(name, namespace string, duration float64) {
r.rayClusterProvisionedDurationSeconds.WithLabelValues(name, namespace).Set(duration)
func (r *RayClusterMetricsManager) ObserveRayClusterProvisionedDuration(name, namespace string, uid types.UID, duration float64) {
r.rayClusterProvisionedDurationSeconds.WithLabelValues(name, namespace, string(uid)).Set(duration)
}

func (r *RayClusterMetricsManager) collectRayClusterInfo(cluster *rayv1.RayCluster, ch chan<- prometheus.Metric) {
Expand All @@ -101,6 +102,7 @@ func (r *RayClusterMetricsManager) collectRayClusterInfo(cluster *rayv1.RayClust
1,
cluster.Name,
cluster.Namespace,
string(cluster.UID),
ownerKind,
)
}
Expand All @@ -112,6 +114,7 @@ func (r *RayClusterMetricsManager) collectRayClusterConditionProvisioned(cluster
1,
cluster.Name,
cluster.Namespace,
string(cluster.UID),
strconv.FormatBool(meta.IsStatusConditionTrue(cluster.Status.Conditions, string(rayv1.RayClusterProvisioned))),
)
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/stretchr/testify/require"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"

Expand All @@ -31,6 +32,7 @@ func TestRayClusterInfo(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Name: "test-ray-cluster",
Namespace: "default",
UID: types.UID("test-ray-cluster-uid"),
Labels: map[string]string{
"ray.io/originated-from-crd": "RayJob",
},
Expand All @@ -40,13 +42,14 @@ func TestRayClusterInfo(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Name: "test-ray-cluster-2",
Namespace: "default",
UID: types.UID("test-ray-cluster-2-uid"),
Labels: map[string]string{},
},
},
},
expectedMetrics: []string{
`kuberay_cluster_info{name="test-ray-cluster",namespace="default",owner_kind="RayJob"} 1`,
`kuberay_cluster_info{name="test-ray-cluster-2",namespace="default",owner_kind="None"} 1`,
`kuberay_cluster_info{name="test-ray-cluster",namespace="default",owner_kind="RayJob",uid="test-ray-cluster-uid"} 1`,
`kuberay_cluster_info{name="test-ray-cluster-2",namespace="default",owner_kind="None",uid="test-ray-cluster-2-uid"} 1`,
},
},
}
Expand Down Expand Up @@ -109,6 +112,7 @@ func TestRayClusterConditionProvisioned(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Name: "provisioned-cluster",
Namespace: "default",
UID: types.UID("provisioned-cluster-uid"),
},
Status: rayv1.RayClusterStatus{
Conditions: []metav1.Condition{
Expand All @@ -123,6 +127,7 @@ func TestRayClusterConditionProvisioned(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Name: "unprovisioned-cluster",
Namespace: "default",
UID: types.UID("unprovisioned-cluster-uid"),
},
Status: rayv1.RayClusterStatus{
Conditions: []metav1.Condition{
Expand All @@ -135,8 +140,8 @@ func TestRayClusterConditionProvisioned(t *testing.T) {
},
},
expectedMetrics: []string{
`kuberay_cluster_condition_provisioned{condition="true",name="provisioned-cluster",namespace="default"} 1`,
`kuberay_cluster_condition_provisioned{condition="false",name="unprovisioned-cluster",namespace="default"} 1`,
`kuberay_cluster_condition_provisioned{condition="true",name="provisioned-cluster",namespace="default",uid="provisioned-cluster-uid"} 1`,
`kuberay_cluster_condition_provisioned{condition="false",name="unprovisioned-cluster",namespace="default",uid="unprovisioned-cluster-uid"} 1`,
},
},
}
Expand Down
15 changes: 9 additions & 6 deletions ray-operator/controllers/ray/metrics/ray_job_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (

"github.com/go-logr/logr"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

Expand All @@ -14,7 +15,7 @@ import (

//go:generate mockgen -destination=mocks/ray_job_metrics_mock.go -package=mocks github.com/ray-project/kuberay/ray-operator/controllers/ray/metrics RayJobMetricsObserver
type RayJobMetricsObserver interface {
ObserveRayJobExecutionDuration(name, namespace string, jobDeploymentStatus rayv1.JobDeploymentStatus, retryCount int, duration float64)
ObserveRayJobExecutionDuration(name, namespace string, uid types.UID, jobDeploymentStatus rayv1.JobDeploymentStatus, retryCount int, duration float64)
}

// RayJobMetricsManager implements the prometheus.Collector and RayJobMetricsObserver interfaces to collect ray job metrics.
Expand All @@ -34,20 +35,20 @@ func NewRayJobMetricsManager(ctx context.Context, client client.Client) *RayJobM
Name: "kuberay_job_execution_duration_seconds",
Help: "Duration from when the RayJob CR’s JobDeploymentStatus transitions from Initializing to either the Retrying state or a terminal state, such as Complete or Failed. The Retrying state indicates that the CR previously failed and that spec.backoffLimit is enabled.",
},
[]string{"name", "namespace", "job_deployment_status", "retry_count"},
[]string{"name", "namespace", "uid", "job_deployment_status", "retry_count"},
),
// rayJobInfo is a gauge metric that indicates the metadata information about RayJob custom resources.
rayJobInfo: prometheus.NewDesc(
"kuberay_job_info",
"Metadata information about RayJob custom resources",
[]string{"name", "namespace"},
[]string{"name", "namespace", "uid"},
nil,
),
// rayJobDeploymentStatus is a gauge metric that indicates the current deployment status of the RayJob custom resources.
rayJobDeploymentStatus: prometheus.NewDesc(
"kuberay_job_deployment_status",
"The RayJob's current deployment status",
[]string{"name", "namespace", "deployment_status"},
[]string{"name", "namespace", "uid", "deployment_status"},
nil,
),
client: client,
Expand Down Expand Up @@ -80,8 +81,8 @@ func (r *RayJobMetricsManager) Collect(ch chan<- prometheus.Metric) {
}
}

func (r *RayJobMetricsManager) ObserveRayJobExecutionDuration(name, namespace string, jobDeploymentStatus rayv1.JobDeploymentStatus, retryCount int, duration float64) {
r.rayJobExecutionDurationSeconds.WithLabelValues(name, namespace, string(jobDeploymentStatus), strconv.Itoa(retryCount)).Set(duration)
func (r *RayJobMetricsManager) ObserveRayJobExecutionDuration(name, namespace string, uid types.UID, jobDeploymentStatus rayv1.JobDeploymentStatus, retryCount int, duration float64) {
r.rayJobExecutionDurationSeconds.WithLabelValues(name, namespace, string(uid), string(jobDeploymentStatus), strconv.Itoa(retryCount)).Set(duration)
}

func (r *RayJobMetricsManager) collectRayJobInfo(rayJob *rayv1.RayJob, ch chan<- prometheus.Metric) {
Expand All @@ -91,6 +92,7 @@ func (r *RayJobMetricsManager) collectRayJobInfo(rayJob *rayv1.RayJob, ch chan<-
1,
rayJob.Name,
rayJob.Namespace,
string(rayJob.UID),
)
}

Expand All @@ -101,6 +103,7 @@ func (r *RayJobMetricsManager) collectRayJobDeploymentStatus(rayJob *rayv1.RayJo
1,
rayJob.Name,
rayJob.Namespace,
string(rayJob.UID),
string(rayJob.Status.JobDeploymentStatus),
)
}
13 changes: 9 additions & 4 deletions ray-operator/controllers/ray/metrics/ray_job_metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/stretchr/testify/require"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"

Expand All @@ -31,18 +32,20 @@ func TestMetricRayJobInfo(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Name: "ray-job-1",
Namespace: "ns1",
UID: types.UID("ray-job-1-uid"),
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: "ray-job-2",
Namespace: "ns2",
UID: types.UID("ray-job-2-uid"),
},
},
},
expectedMetrics: []string{
`kuberay_job_info{name="ray-job-1",namespace="ns1"} 1`,
`kuberay_job_info{name="ray-job-2",namespace="ns2"} 1`,
`kuberay_job_info{name="ray-job-1",namespace="ns1",uid="ray-job-1-uid"} 1`,
`kuberay_job_info{name="ray-job-2",namespace="ns2",uid="ray-job-2-uid"} 1`,
},
},
}
Expand Down Expand Up @@ -105,6 +108,7 @@ func TestMetricRayJobDeploymentStatus(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Name: "ray-job-1",
Namespace: "ns1",
UID: types.UID("ray-job-1-uid"),
},
Status: rayv1.RayJobStatus{
JobDeploymentStatus: rayv1.JobDeploymentStatusRunning,
Expand All @@ -114,15 +118,16 @@ func TestMetricRayJobDeploymentStatus(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Name: "ray-job-2",
Namespace: "ns2",
UID: types.UID("ray-job-2-uid"),
},
Status: rayv1.RayJobStatus{
JobDeploymentStatus: rayv1.JobDeploymentStatusFailed,
},
},
},
expectedMetrics: []string{
`kuberay_job_deployment_status{deployment_status="Running",name="ray-job-1",namespace="ns1"} 1`,
`kuberay_job_deployment_status{deployment_status="Failed",name="ray-job-2",namespace="ns2"} 1`,
`kuberay_job_deployment_status{deployment_status="Running",name="ray-job-1",namespace="ns1",uid="ray-job-1-uid"} 1`,
`kuberay_job_deployment_status{deployment_status="Failed",name="ray-job-2",namespace="ns2",uid="ray-job-2-uid"} 1`,
},
},
}
Expand Down
9 changes: 6 additions & 3 deletions ray-operator/controllers/ray/metrics/ray_service_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,19 +28,19 @@ func NewRayServiceMetricsManager(ctx context.Context, client client.Client) *Ray
rayServiceInfo: prometheus.NewDesc(
"kuberay_service_info",
"Metadata information about RayService custom resources",
[]string{"name", "namespace"},
[]string{"name", "namespace", "uid"},
nil,
),
rayServiceConditionReady: prometheus.NewDesc(
"kuberay_service_condition_ready",
"Describes whether the RayService is ready. Ready means users can send requests to the underlying cluster and the number of serve endpoints is greater than 0.",
[]string{"name", "namespace", "condition"},
[]string{"name", "namespace", "uid", "condition"},
nil,
),
rayServiceConditionUpgradeInProgress: prometheus.NewDesc(
"kuberay_service_condition_upgrade_in_progress",
"Describes whether the RayService is performing a zero-downtime upgrade.",
[]string{"name", "namespace", "condition"},
[]string{"name", "namespace", "uid", "condition"},
nil,
),
client: client,
Expand Down Expand Up @@ -76,6 +76,7 @@ func (c *RayServiceMetricsManager) collectRayServiceInfo(service *rayv1.RayServi
1,
service.Name,
service.Namespace,
string(service.UID),
)
}

Expand All @@ -87,6 +88,7 @@ func (c *RayServiceMetricsManager) collectRayServiceConditionMetrics(service *ra
1,
service.Name,
service.Namespace,
string(service.UID),
strconv.FormatBool(ready),
)
upgradeInProgress := meta.IsStatusConditionTrue(service.Status.Conditions, string(rayv1.UpgradeInProgress))
Expand All @@ -96,6 +98,7 @@ func (c *RayServiceMetricsManager) collectRayServiceConditionMetrics(service *ra
1,
service.Name,
service.Namespace,
string(service.UID),
strconv.FormatBool(upgradeInProgress),
)
}
Loading
Loading