
Commit 8007e3a

kevin85421, KunWuLuan, and yueming.wk authored

support scheduler plugins (#3612) (#3766)
* support scheduler plugins
* add unit test; update ValidateBatchSchedulerConfig(); update the helm chart; rename the function
* Update the role in the helm chart, and ensure the CRD is installed before starting the operator
* Fix CI lint
* Modify the Ray version in the RayCluster sample
* Update the scheduler name
* Update the test
* Update and add TODOs

---------

Signed-off-by: kunwuluan <[email protected]>
Signed-off-by: KunWuLuan <[email protected]>
Signed-off-by: kaihsun <[email protected]>
Co-authored-by: GreenHand <[email protected]>
Co-authored-by: yueming.wk <[email protected]>
1 parent 983927d commit 8007e3a

9 files changed: +317 −6 lines changed


helm-chart/kuberay-operator/templates/_helpers.tpl

Lines changed: 11 additions & 0 deletions

@@ -308,4 +308,15 @@ rules:
   verbs:
   - get
 {{- end -}}
+{{- if or .batchSchedulerEnabled (eq .batchSchedulerName "scheduler-plugins") }}
+- apiGroups:
+  - scheduling.x-k8s.io
+  resources:
+  - podgroups
+  verbs:
+  - create
+  - get
+  - list
+  - watch
+{{- end -}}
 {{- end -}}
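For reference, when batchScheduler.name is set to "scheduler-plugins" (or the legacy batchSchedulerEnabled flag is on), the block above should render roughly the following additional ClusterRole rule. This is a sketch of the expected output, not a captured render of the chart:

- apiGroups:
  - scheduling.x-k8s.io
  resources:
  - podgroups
  verbs:
  - create
  - get
  - list
  - watch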

helm-chart/kuberay-operator/values.yaml

Lines changed: 5 additions & 1 deletion

@@ -50,7 +50,7 @@ logging:
 #   by the customized scheduler.
 # * "enabled" is the legacy option and will be deprecated soon.
 # * "name" is the standard option, expecting a scheduler name, supported values are
-#   "default", "volcano", and "yunikorn".
+#   "default", "volcano", "yunikorn", and "scheduler-plugins".
 #
 # Note: "enabled" and "name" should not be set at the same time. If both are set, an error will be thrown.
 #
@@ -67,6 +67,10 @@ logging:
 # batchScheduler:
 #   name: yunikorn
 #
+# 4. Use PodGroup
+# batchScheduler:
+#   name: scheduler-plugins
+#
 batchScheduler:
   # Deprecated. This option will be removed in the future.
   # Note, for backwards compatibility. When it sets to true, it enables volcano scheduler integration.
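A minimal values override that enables the new integration, following option 4 in the comments above (a sketch; all other chart defaults are assumed unchanged):

batchScheduler:
  name: scheduler-plugins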

ray-operator/apis/config/v1alpha1/config_utils.go

Lines changed: 2 additions & 1 deletion

@@ -5,6 +5,7 @@ import (
 
 	"github.com/go-logr/logr"
 
+	schedulerplugins "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/scheduler-plugins"
 	"github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/volcano"
 	"github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/yunikorn"
 )
@@ -22,7 +23,7 @@ func ValidateBatchSchedulerConfig(logger logr.Logger, config Configuration) error {
 
 	if len(config.BatchScheduler) > 0 {
 		// if a customized scheduler is configured, check it is supported
-		if config.BatchScheduler == volcano.GetPluginName() || config.BatchScheduler == yunikorn.GetPluginName() {
+		if config.BatchScheduler == volcano.GetPluginName() || config.BatchScheduler == yunikorn.GetPluginName() || config.BatchScheduler == schedulerplugins.GetPluginName() {
 			logger.Info("Feature flag batch-scheduler is enabled",
 				"scheduler name", config.BatchScheduler)
 		} else {
Lines changed: 40 additions & 0 deletions

@@ -0,0 +1,40 @@
+apiVersion: ray.io/v1
+kind: RayCluster
+metadata:
+  name: test-podgroup-0
+  labels:
+    ray.io/gang-scheduling-enabled: "true"
+spec:
+  rayVersion: "2.46.0"
+  headGroupSpec:
+    rayStartParams: {}
+    template:
+      spec:
+        containers:
+          - name: ray-head
+            image: rayproject/ray:2.46.0
+            resources:
+              limits:
+                cpu: "1"
+                memory: "2Gi"
+              requests:
+                cpu: "1"
+                memory: "2Gi"
+  workerGroupSpecs:
+    - groupName: worker
+      rayStartParams: {}
+      replicas: 2
+      minReplicas: 2
+      maxReplicas: 2
+      template:
+        spec:
+          containers:
+            - name: ray-head
+              image: rayproject/ray:2.46.0
+              resources:
+                limits:
+                  cpu: "1"
+                  memory: "1Gi"
+                requests:
+                  cpu: "1"
+                  memory: "1Gi"
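Given the createPodGroup logic added in this commit (see the scheduler-plugins package below), applying this sample with the ray.io/gang-scheduling-enabled label should produce a PodGroup named after the cluster, with MinMember covering the head pod plus both workers and MinResources summed from the pod requests. A hedged sketch of the expected object, assuming the sample is applied to the default namespace:

apiVersion: scheduling.x-k8s.io/v1alpha1
kind: PodGroup
metadata:
  name: test-podgroup-0
  namespace: default   # assumption: sample applied to the default namespace
spec:
  minMember: 3          # 1 head pod + 2 worker replicas
  minResources:
    cpu: "3"            # 1 (head) + 1 + 1 (workers), summed from requests
    memory: 4Gi         # 2Gi (head) + 1Gi + 1Gi (workers), summed from requests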
Lines changed: 150 additions & 0 deletions

@@ -0,0 +1,150 @@
+package schedulerplugins
+
+import (
+	"context"
+	"fmt"
+
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	ktypes "k8s.io/apimachinery/pkg/types"
+	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
+	"k8s.io/client-go/rest"
+	"sigs.k8s.io/controller-runtime/pkg/builder"
+	"sigs.k8s.io/controller-runtime/pkg/cache"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1"
+
+	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
+	schedulerinterface "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/interface"
+	"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
+)
+
+const (
+	schedulerName                 string = "scheduler-plugins"
+	kubeSchedulerPodGroupLabelKey string = "scheduling.x-k8s.io/pod-group"
+)
+
+type KubeScheduler struct {
+	cli client.Client
+}
+
+type KubeSchedulerFactory struct{}
+
+func GetPluginName() string {
+	return schedulerName
+}
+
+func (k *KubeScheduler) Name() string {
+	return GetPluginName()
+}
+
+func createPodGroup(_ context.Context, app *rayv1.RayCluster) *v1alpha1.PodGroup {
+	// we set replica as 1 for the head pod
+	replica := int32(1)
+	for _, workerGroup := range app.Spec.WorkerGroupSpecs {
+		if workerGroup.Replicas == nil {
+			continue
+		}
+		// TODO(kevin85421): We should consider the case of `numOfHosts` is not 1.
+		replica += *workerGroup.Replicas
+	}
+
+	podGroup := &v1alpha1.PodGroup{
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace: app.Namespace,
+			Name:      app.Name,
+			OwnerReferences: []metav1.OwnerReference{
+				{
+					Name:       app.Name,
+					UID:        app.UID,
+					APIVersion: app.APIVersion,
+					Kind:       app.Kind,
+				},
+			},
+		},
+		Spec: v1alpha1.PodGroupSpec{
+			MinMember:    replica,
+			MinResources: utils.CalculateDesiredResources(app),
+		},
+	}
+	return podGroup
+}
+
+func (k *KubeScheduler) DoBatchSchedulingOnSubmission(ctx context.Context, app *rayv1.RayCluster) error {
+	if !k.isGangSchedulingEnabled(app) {
+		return nil
+	}
+	podGroup := &v1alpha1.PodGroup{}
+	if err := k.cli.Get(ctx, ktypes.NamespacedName{Namespace: app.Namespace, Name: app.Name}, podGroup); err != nil {
+		if !errors.IsNotFound(err) {
+			return err
+		}
+		podGroup = createPodGroup(ctx, app)
+		if err := k.cli.Create(ctx, podGroup); err != nil {
+			if errors.IsAlreadyExists(err) {
+				return nil
+			}
+			return fmt.Errorf("failed to create PodGroup: %w", err)
+		}
+	}
+	return nil
+}
+
+// AddMetadataToPod adds essential labels and annotations to the Ray pods
+// the scheduler needs these labels and annotations in order to do the scheduling properly
+func (k *KubeScheduler) AddMetadataToPod(_ context.Context, app *rayv1.RayCluster, _ string, pod *corev1.Pod) {
+	// when gang scheduling is enabled, extra labels need to be added to all pods
+	if k.isGangSchedulingEnabled(app) {
+		pod.Labels[kubeSchedulerPodGroupLabelKey] = app.Name
+	}
+	// TODO(kevin85421): Currently, we only support "single scheduler" mode. If we want to support
+	// "second scheduler" mode, we need to add `schedulerName` to the pod spec.
+}
+
+func (k *KubeScheduler) isGangSchedulingEnabled(app *rayv1.RayCluster) bool {
+	_, exist := app.Labels[utils.RayClusterGangSchedulingEnabled]
+	return exist
+}
+
+func (kf *KubeSchedulerFactory) New(ctx context.Context, c *rest.Config) (schedulerinterface.BatchScheduler, error) {
+	// TODO(kevin85421): We should not initialize the informer cache here. We should reuse
+	// the reconciler's cache instead.
+	scheme := runtime.NewScheme()
+	utilruntime.Must(v1alpha1.AddToScheme(scheme))
+	ccache, err := cache.New(c, cache.Options{
+		Scheme: scheme,
+	})
+	if err != nil {
+		return nil, err
+	}
+	go func() {
+		if err := ccache.Start(ctx); err != nil {
+			panic(err)
+		}
+	}()
+	if synced := ccache.WaitForCacheSync(ctx); !synced {
+		return nil, fmt.Errorf("failed to sync cache")
+	}
+	cli, err := client.New(c, client.Options{
+		Scheme: scheme,
+		Cache: &client.CacheOptions{
+			Reader: ccache,
+		},
+	})
+	if err != nil {
+		return nil, err
+	}
+	return &KubeScheduler{
+		cli: cli,
+	}, nil
+}
+
+func (kf *KubeSchedulerFactory) AddToScheme(sche *runtime.Scheme) {
+	utilruntime.Must(v1alpha1.AddToScheme(sche))
+}
+
+func (kf *KubeSchedulerFactory) ConfigureReconciler(b *builder.Builder) *builder.Builder {
+	return b
+}
Lines changed: 98 additions & 0 deletions

@@ -0,0 +1,98 @@
+package schedulerplugins
+
+import (
+	"context"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/utils/ptr"
+
+	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
+)
+
+func createTestRayCluster(numOfHosts int32) rayv1.RayCluster {
+	headSpec := corev1.PodSpec{
+		Containers: []corev1.Container{
+			{
+				Name: "ray-head",
+				Resources: corev1.ResourceRequirements{
+					Limits: corev1.ResourceList{
+						corev1.ResourceCPU:    resource.MustParse("500m"),
+						corev1.ResourceMemory: resource.MustParse("512Mi"),
+					},
+					Requests: corev1.ResourceList{
+						corev1.ResourceCPU:    resource.MustParse("256m"),
+						corev1.ResourceMemory: resource.MustParse("256Mi"),
+					},
+				},
+			},
+		},
+	}
+
+	workerSpec := corev1.PodSpec{
+		Containers: []corev1.Container{
+			{
+				Name: "ray-worker",
+				Resources: corev1.ResourceRequirements{
+					Limits: corev1.ResourceList{
+						corev1.ResourceCPU:    resource.MustParse("500m"),
+						corev1.ResourceMemory: resource.MustParse("512Mi"),
+						"nvidia.com/gpu":      resource.MustParse("1"),
+					},
+					Requests: corev1.ResourceList{
+						corev1.ResourceCPU:    resource.MustParse("256m"),
+						corev1.ResourceMemory: resource.MustParse("256Mi"),
+					},
+				},
+			},
+		},
+	}
+
+	return rayv1.RayCluster{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "raycluster-sample",
+			Namespace: "default",
+		},
+		Spec: rayv1.RayClusterSpec{
+			HeadGroupSpec: rayv1.HeadGroupSpec{
+				Template: corev1.PodTemplateSpec{
+					Spec: headSpec,
+				},
+			},
+			WorkerGroupSpecs: []rayv1.WorkerGroupSpec{
+				{
+					Template: corev1.PodTemplateSpec{
+						Spec: workerSpec,
+					},
+					Replicas:    ptr.To[int32](2),
+					NumOfHosts:  numOfHosts,
+					MinReplicas: ptr.To[int32](1),
+					MaxReplicas: ptr.To[int32](4),
+				},
+			},
+		},
+	}
+}
+
+func TestCreatePodGroup(t *testing.T) {
+	a := assert.New(t)
+
+	cluster := createTestRayCluster(1)
+
+	podGroup := createPodGroup(context.TODO(), &cluster)
+
+	// 256m * 3 (requests, not limits)
+	a.Equal("768m", podGroup.Spec.MinResources.Cpu().String())
+
+	// 256Mi * 3 (requests, not limits)
+	a.Equal("768Mi", podGroup.Spec.MinResources.Memory().String())
+
+	// 2 GPUs total
+	a.Equal("2", podGroup.Spec.MinResources.Name("nvidia.com/gpu", resource.BinarySI).String())
+
+	// 1 head and 2 workers
+	a.Equal(int32(3), podGroup.Spec.MinMember)
+}

ray-operator/controllers/ray/batchscheduler/schedulermanager.go

Lines changed: 3 additions & 0 deletions

@@ -11,6 +11,7 @@ import (
 
 	configapi "github.com/ray-project/kuberay/ray-operator/apis/config/v1alpha1"
 	schedulerinterface "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/interface"
+	schedulerplugins "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/scheduler-plugins"
 	"github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/volcano"
 	"github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/yunikorn"
 )
@@ -58,6 +59,8 @@ func getSchedulerFactory(rayConfigs configapi.Configuration) (schedulerinterface
 		factory = &volcano.VolcanoBatchSchedulerFactory{}
 	case yunikorn.GetPluginName():
 		factory = &yunikorn.YuniKornSchedulerFactory{}
+	case schedulerplugins.GetPluginName():
+		factory = &schedulerplugins.KubeSchedulerFactory{}
 	default:
 		return nil, fmt.Errorf("the scheduler is not supported, name=%s", rayConfigs.BatchScheduler)
 	}

ray-operator/go.mod

Lines changed: 2 additions & 1 deletion

@@ -27,6 +27,7 @@ require (
 	k8s.io/klog/v2 v2.130.1
 	k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979
 	sigs.k8s.io/controller-runtime v0.21.0
+	sigs.k8s.io/scheduler-plugins v0.31.8
 	sigs.k8s.io/structured-merge-diff/v4 v4.7.0
 	sigs.k8s.io/yaml v1.4.0
 	volcano.sh/apis v1.12.1
@@ -60,7 +61,7 @@ require (
 	github.com/modern-go/reflect2 v1.0.2 // indirect
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 	github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect
-	github.com/pmezard/go-difflib v1.0.0 // indirect
+	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
 	github.com/prometheus/client_model v0.6.1 // indirect
 	github.com/prometheus/common v0.62.0 // indirect
 	github.com/prometheus/procfs v0.15.1 // indirect
