Skip to content

Commit 02d8560

Browse files
committed
[KAI integration] Adding integration and example yamls
1 parent 8ad2c1b commit 02d8560

File tree

10 files changed

+247
-5
lines changed

10 files changed

+247
-5
lines changed

helm-chart/kuberay-operator/values.yaml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,12 +70,16 @@ logging:
7070
# 4. Use PodGroup
7171
# batchScheduler:
7272
# name: scheduler-plugins
73-
#
73+
74+
# 5. Use Kai Scheduler
75+
# batchScheduler:
76+
# name: kai-scheduler
77+
7478
batchScheduler:
7579
# Deprecated. This option will be removed in the future.
7680
# Note, for backwards compatibility. When it sets to true, it enables volcano scheduler integration.
7781
enabled: false
78-
# Set the customized scheduler name, supported values are "volcano" or "yunikorn", do not set
82+
# Set the customized scheduler name, supported values are "volcano", "yunikorn", "kai-scheduler", do not set
7983
# "batchScheduler.enabled=true" at the same time as it will override this option.
8084
name: ""
8185

ray-operator/apis/config/v1alpha1/config_utils.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55

66
"github.com/go-logr/logr"
77

8+
"github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/kai-scheduler"
89
schedulerplugins "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/scheduler-plugins"
910
"github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/volcano"
1011
"github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/yunikorn"
@@ -23,7 +24,7 @@ func ValidateBatchSchedulerConfig(logger logr.Logger, config Configuration) erro
2324

2425
if len(config.BatchScheduler) > 0 {
2526
// if a customized scheduler is configured, check it is supported
26-
if config.BatchScheduler == volcano.GetPluginName() || config.BatchScheduler == yunikorn.GetPluginName() || config.BatchScheduler == schedulerplugins.GetPluginName() {
27+
if config.BatchScheduler == volcano.GetPluginName() || config.BatchScheduler == yunikorn.GetPluginName() || config.BatchScheduler == schedulerplugins.GetPluginName() || config.BatchScheduler == kaischeduler.GetPluginName() {
2728
logger.Info("Feature flag batch-scheduler is enabled",
2829
"scheduler name", config.BatchScheduler)
2930
} else {

ray-operator/apis/config/v1alpha1/config_utils_test.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"github.com/go-logr/logr"
77
"github.com/go-logr/logr/testr"
88

9+
"github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/kai-scheduler"
910
"github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/volcano"
1011
"github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/yunikorn"
1112
)
@@ -60,6 +61,16 @@ func TestValidateBatchSchedulerConfig(t *testing.T) {
6061
},
6162
wantErr: false,
6263
},
64+
{
65+
name: "valid option, batch-scheduler=kai-scheduler",
66+
args: args{
67+
logger: testr.New(t),
68+
config: Configuration{
69+
BatchScheduler: kaischeduler.GetPluginName(),
70+
},
71+
},
72+
wantErr: false,
73+
},
6374
{
6475
name: "invalid option, invalid scheduler name",
6576
args: args{

ray-operator/apis/config/v1alpha1/configuration_types.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ type Configuration struct {
4444
LogStdoutEncoder string `json:"logStdoutEncoder,omitempty"`
4545

4646
// BatchScheduler enables the batch scheduler integration with a specific scheduler
47-
// based on the given name, currently, supported values are volcano and yunikorn.
47+
// based on the given name, currently, supported values are volcano, yunikorn, kai-scheduler.
4848
BatchScheduler string `json:"batchScheduler,omitempty"`
4949

5050
// HeadSidecarContainers includes specification for a sidecar container
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
apiVersion: scheduling.run.ai/v2
2+
kind: Queue
3+
metadata:
4+
name: default
5+
spec:
6+
resources:
7+
cpu:
8+
quota: -1
9+
limit: -1
10+
overQuotaWeight: 1
11+
gpu:
12+
quota: -1
13+
limit: -1
14+
overQuotaWeight: 1
15+
memory:
16+
quota: -1
17+
limit: -1
18+
overQuotaWeight: 1
19+
---
20+
apiVersion: scheduling.run.ai/v2
21+
kind: Queue
22+
metadata:
23+
name: test
24+
spec:
25+
parentQueue: default
26+
resources:
27+
cpu:
28+
quota: -1
29+
limit: -1
30+
overQuotaWeight: 1
31+
gpu:
32+
quota: -1
33+
limit: -1
34+
overQuotaWeight: 1
35+
memory:
36+
quota: -1
37+
limit: -1
38+
overQuotaWeight: 1
39+
---
40+
41+
apiVersion: ray.io/v1
42+
kind: RayCluster
43+
metadata:
44+
name: rc-half-gpu
45+
labels:
46+
kai.scheduler/queue: test
47+
spec:
48+
headGroupSpec:
49+
template:
50+
spec:
51+
containers:
52+
- name: head
53+
image: rayproject/ray:2.46.0
54+
resources:
55+
limits:
56+
cpu: "1"
57+
memory: "2Gi"
58+
59+
# ---- Two workers share one GPU (0.5 each) ----
60+
workerGroupSpecs:
61+
- groupName: shared-gpu
62+
replicas: 2
63+
minReplicas: 2
64+
template:
65+
metadata:
66+
annotations:
67+
gpu-fraction: "0.5"
68+
spec:
69+
containers:
70+
- name: worker
71+
image: rayproject/ray:2.46.0
72+
resources:
73+
limits:
74+
cpu: "1"
75+
memory: "2Gi"
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#A simple example raycluster with KAI
2+
apiVersion: scheduling.run.ai/v2
3+
kind: Queue
4+
metadata:
5+
name: default
6+
spec:
7+
resources:
8+
cpu:
9+
quota: -1
10+
limit: -1
11+
overQuotaWeight: 1
12+
gpu:
13+
quota: -1
14+
limit: -1
15+
overQuotaWeight: 1
16+
memory:
17+
quota: -1
18+
limit: -1
19+
overQuotaWeight: 1
20+
---
21+
apiVersion: scheduling.run.ai/v2
22+
kind: Queue
23+
metadata:
24+
name: test
25+
spec:
26+
parentQueue: default
27+
resources:
28+
cpu:
29+
quota: -1
30+
limit: -1
31+
overQuotaWeight: 1
32+
gpu:
33+
quota: -1
34+
limit: -1
35+
overQuotaWeight: 1
36+
memory:
37+
quota: -1
38+
limit: -1
39+
overQuotaWeight: 1
40+
---
41+
apiVersion: ray.io/v1
42+
kind: RayCluster
43+
metadata:
44+
name: ray-sample
45+
labels:
46+
kai.scheduler/queue: test
47+
spec:
48+
headGroupSpec:
49+
template:
50+
spec:
51+
containers:
52+
- name: ray-head
53+
image: rayproject/ray:2.41.0
54+
resources:
55+
requests:
56+
cpu: "1"
57+
memory: "2Gi"
58+
workerGroupSpecs:
59+
- groupName: worker
60+
replicas: 2
61+
minReplicas: 2
62+
template:
63+
spec:
64+
containers:
65+
- name: ray-worker
66+
image: rayproject/ray:2.41.0
67+
resources:
68+
requests:
69+
cpu: "1"
70+
memory: "1Gi"
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
package kaischeduler
2+
3+
// This KAI plugin relies on KAI-Scheduler's
4+
// built-in PodGrouper to create PodGroups at
5+
// runtime, so the plugin itself only needs to:
6+
// 1. expose the scheduler name,
7+
// 2. stamp pods with schedulerName + queue label.
8+
// No PodGroup create/patch logic is included.
9+
10+
import (
11+
"context"
12+
13+
corev1 "k8s.io/api/core/v1"
14+
"k8s.io/apimachinery/pkg/runtime"
15+
"k8s.io/client-go/rest"
16+
"sigs.k8s.io/controller-runtime/pkg/builder"
17+
18+
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
19+
schedulerinterface "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/interface"
20+
)
21+
22+
const (
23+
QueueLabelName = "kai.scheduler/queue"
24+
)
25+
26+
type KaiScheduler struct{}
27+
28+
type KaiSchedulerFactory struct{}
29+
30+
func GetPluginName() string { return "kai-scheduler" }
31+
32+
func (k *KaiScheduler) Name() string { return GetPluginName() }
33+
34+
func (k *KaiScheduler) DoBatchSchedulingOnSubmission(_ context.Context, _ *rayv1.RayCluster) error {
35+
return nil
36+
}
37+
38+
func (k *KaiScheduler) AddMetadataToPod(_ context.Context, app *rayv1.RayCluster, _ string, pod *corev1.Pod) {
39+
if queue, ok := app.Labels[QueueLabelName]; ok {
40+
if pod.Labels == nil {
41+
pod.Labels = map[string]string{}
42+
}
43+
pod.Labels[QueueLabelName] = queue
44+
}
45+
pod.Spec.SchedulerName = k.Name()
46+
}
47+
48+
func (kf *KaiSchedulerFactory) New(_ context.Context, _ *rest.Config) (schedulerinterface.BatchScheduler, error) {
49+
return &KaiScheduler{}, nil
50+
}
51+
52+
func (kf *KaiSchedulerFactory) AddToScheme(_ *runtime.Scheme) {
53+
}
54+
55+
func (kf *KaiSchedulerFactory) ConfigureReconciler(b *builder.Builder) *builder.Builder {
56+
return b
57+
}

ray-operator/controllers/ray/batchscheduler/schedulermanager.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212

1313
configapi "github.com/ray-project/kuberay/ray-operator/apis/config/v1alpha1"
1414
schedulerinterface "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/interface"
15+
"github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/kai-scheduler"
1516
schedulerplugins "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/scheduler-plugins"
1617
"github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/volcano"
1718
"github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/yunikorn"
@@ -60,6 +61,8 @@ func getSchedulerFactory(rayConfigs configapi.Configuration) (schedulerinterface
6061
factory = &volcano.VolcanoBatchSchedulerFactory{}
6162
case yunikorn.GetPluginName():
6263
factory = &yunikorn.YuniKornSchedulerFactory{}
64+
case kaischeduler.GetPluginName():
65+
factory = &kaischeduler.KaiSchedulerFactory{}
6366
case schedulerplugins.GetPluginName():
6467
factory = &schedulerplugins.KubeSchedulerFactory{}
6568
default:

ray-operator/controllers/ray/batchscheduler/schedulermanager_test.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88

99
"github.com/ray-project/kuberay/ray-operator/apis/config/v1alpha1"
1010
schedulerinterface "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/interface"
11+
"github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/kai-scheduler"
1112
"github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/volcano"
1213
"github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/yunikorn"
1314
)
@@ -16,6 +17,7 @@ func TestGetSchedulerFactory(t *testing.T) {
1617
DefaultFactory := &schedulerinterface.DefaultBatchSchedulerFactory{}
1718
VolcanoFactory := &volcano.VolcanoBatchSchedulerFactory{}
1819
YuniKornFactory := &yunikorn.YuniKornSchedulerFactory{}
20+
KaiFactory := &kaischeduler.KaiSchedulerFactory{}
1921

2022
type args struct {
2123
rayConfigs v1alpha1.Configuration
@@ -65,6 +67,16 @@ func TestGetSchedulerFactory(t *testing.T) {
6567
},
6668
want: reflect.TypeOf(VolcanoFactory),
6769
},
70+
{
71+
name: "enableBatchScheduler=false, batchScheduler set to kai-scheduler",
72+
args: args{
73+
rayConfigs: v1alpha1.Configuration{
74+
EnableBatchScheduler: false,
75+
BatchScheduler: kaischeduler.GetPluginName(),
76+
},
77+
},
78+
want: reflect.TypeOf(KaiFactory),
79+
},
6880
{
6981
name: "enableBatchScheduler not set, batchScheduler set to yunikorn",
7082
args: args{
@@ -83,6 +95,15 @@ func TestGetSchedulerFactory(t *testing.T) {
8395
},
8496
want: reflect.TypeOf(VolcanoFactory),
8597
},
98+
{
99+
name: "enableBatchScheduler not set, batchScheduler set to kai-scheduler",
100+
args: args{
101+
rayConfigs: v1alpha1.Configuration{
102+
BatchScheduler: kaischeduler.GetPluginName(),
103+
},
104+
},
105+
want: reflect.TypeOf(KaiFactory),
106+
},
86107
{
87108
name: "enableBatchScheduler not set, batchScheduler set to unknown value",
88109
args: args{

ray-operator/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ func main() {
9494
flag.BoolVar(&enableBatchScheduler, "enable-batch-scheduler", false,
9595
"(Deprecated) Enable batch scheduler. Currently is volcano, which supports gang scheduler policy. Please use --batch-scheduler instead.")
9696
flag.StringVar(&batchScheduler, "batch-scheduler", "",
97-
"Batch scheduler name, supported values are volcano and yunikorn.")
97+
"Batch scheduler name, supported values are volcano, yunikorn, kai-scheduler.")
9898
flag.StringVar(&configFile, "config", "", "Path to structured config file. Flags are ignored if config file is set.")
9999
flag.BoolVar(&useKubernetesProxy, "use-kubernetes-proxy", false,
100100
"Use Kubernetes proxy subresource when connecting to the Ray Head node.")

0 commit comments

Comments
 (0)