diff --git a/docs/api-reference/operator-api.md b/docs/api-reference/operator-api.md index ed22aab08..a46d3f98d 100644 --- a/docs/api-reference/operator-api.md +++ b/docs/api-reference/operator-api.md @@ -742,6 +742,10 @@ _Appears in:_ | `enableProfiling` _boolean_ | EnableProfiling enables profiling via host:port/debug/pprof/ endpoints. | | | + + + + #### LeaderElectionConfiguration @@ -865,6 +869,57 @@ _Appears in:_ | `concurrentSyncs` _integer_ | ConcurrentSyncs is the number of workers used for the controller to concurrently work on events. | | | +#### SchedulerConfiguration + + + +SchedulerConfiguration configures scheduler profiles and which is the default. + + + +_Appears in:_ +- [OperatorConfiguration](#operatorconfiguration) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `profiles` _[SchedulerProfile](#schedulerprofile) array_ | Profiles is the list of scheduler profiles. Each profile has a backend name and optional config.
The kube-scheduler backend is always enabled; use profile name "kube-scheduler" to configure or set it as default.
Valid profile names: "kube-scheduler", "kai-scheduler". Use defaultProfileName to designate the default backend. If not set, defaulting sets it to "kube-scheduler". | | | +| `defaultProfileName` _string_ | DefaultProfileName is the name of the default scheduler profile. If unset, defaulting sets it to "kube-scheduler". | | | + + +#### SchedulerName + +_Underlying type:_ _string_ + +SchedulerName defines the name of the scheduler backend (used in OperatorConfiguration scheduler.profiles[].name). + + + +_Appears in:_ +- [SchedulerProfile](#schedulerprofile) + +| Field | Description | +| --- | --- | +| `kai-scheduler` | SchedulerNameKai is the KAI scheduler backend.
| +| `kube-scheduler` | SchedulerNameKube is the profile name for the Kubernetes default scheduler in OperatorConfiguration.
| + + +#### SchedulerProfile + + + +SchedulerProfile defines a scheduler backend profile with optional backend-specific config. + + + +_Appears in:_ +- [SchedulerConfiguration](#schedulerconfiguration) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `name` _[SchedulerName](#schedulername)_ | Name is the scheduler profile name. Valid values: "kube-scheduler", "kai-scheduler".
For the Kubernetes default scheduler use "kube-scheduler"; Pod.Spec.SchedulerName will be set to "default-scheduler". | | Enum: [kai-scheduler kube-scheduler]
Required: \{\}
| +| `config` _[RawExtension](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.33/#rawextension-runtime-pkg)_ | Config holds backend-specific options. The operator unmarshals it into the config type for this backend (see backend config types). | | | + + #### Server diff --git a/docs/proposals/375-scheduler-backend-framework/README.md b/docs/proposals/375-scheduler-backend-framework/README.md index 1a6576f27..95ac8e42a 100644 --- a/docs/proposals/375-scheduler-backend-framework/README.md +++ b/docs/proposals/375-scheduler-backend-framework/README.md @@ -215,7 +215,7 @@ func Initialize(client client.Client, scheme *runtime.Scheme, eventRecorder reco // Get returns the backend for the given name. kube-scheduler is always available; other backends return nil if not enabled via a profile. func Get(name string) SchedulerBackend -// GetDefault returns the backend designated as default in OperatorConfiguration (the profile with default: true; if none, kube-scheduler). The manager does not define the default; it exposes the one from config. +// GetDefault returns the backend designated as default in OperatorConfiguration (scheduler.defaultProfileName). func GetDefault() SchedulerBackend ``` @@ -241,12 +241,15 @@ type OperatorConfiguration struct { // SchedulerConfiguration configures scheduler profiles and which is the default. type SchedulerConfiguration struct { - // Profiles is the list of scheduler profiles. Each profile has a backend name, optional config, and whether it is the default. + // Profiles is the list of scheduler profiles. Each profile has a backend name and optional config. // The kube-scheduler backend is always enabled and active even if not listed here. Listing "kube-scheduler" in profiles - // only adds a profile (e.g. with config like GangScheduling: false) and allows marking it as default. - // Valid backend names: "kube-scheduler", "kai-scheduler". Exactly one profile should have default: true; if none, kube-scheduler is the default. 
+ // only adds a profile (e.g. with config like GangScheduling: false). Use defaultProfileName to designate the default backend. + // Valid backend names: "kube-scheduler", "kai-scheduler". If defaultProfileName is unset, defaulting sets it to "kube-scheduler". // +optional Profiles []SchedulerProfile `json:"profiles,omitempty"` + // DefaultProfileName is the name of the default scheduler profile. + // +optional + DefaultProfileName string `json:"defaultProfileName,omitempty"` } // SchedulerName is the name for a supported scheduler backend. @@ -270,7 +273,7 @@ var SupportedSchedulerNames = []SchedulerName { // } -// SchedulerProfile defines a scheduler backend profile with optional backend-specific config and default flag. +// SchedulerProfile defines a scheduler backend profile with optional backend-specific config. type SchedulerProfile struct { // Name is the scheduler backend name. Valid values: "kube-scheduler", "kai-scheduler". // +kubebuilder:validation:Enum=kai-scheduler;kube-scheduler @@ -279,10 +282,6 @@ type SchedulerProfile struct { // Config holds backend-specific options. The operator unmarshals it into the config type for this backend (see backend config types below). // +optional Config *runtime.RawExtension `json:"config,omitempty"` - - // Default indicates this profile is the default backend when a workload does not specify one. Exactly one profile should have default: true. - // +optional - Default bool `json:"default,omitempty"` } ``` @@ -290,7 +289,8 @@ The `OperatorConfiguration` provides a way to enable and configure one or more s - **Name:** This is the name of the scheduler backend. This must be one of the supported schedulers. - **Config:** Optional scheduler-specific configuration as `runtime.RawExtension`. It is the responsibility of the scheduler backend implementation to interpret and possibly deserialize it to type. -- **Default:** Indicates if this scheduler should be the default. 
In case no scheduler name is set in any `PodSpec` across all `PodCliqueTemplateSpec` then the default scheduler as indicated via this field will be set. + +`SchedulerConfiguration.defaultProfileName` designates which profile is the default. When no scheduler name is set in any `PodSpec` across all `PodCliqueTemplateSpec`, the default scheduler indicated by `defaultProfileName` will be used. **Backend Enabling Behavior:** @@ -300,22 +300,20 @@ The kube-scheduler backend has special behavior compared to other scheduler back 2. **Explicit Configuration Optional**: You only need to add kube-scheduler to `profiles` if you want to: - Configure it with specific options (e.g., `gangScheduling: true`) - - Explicitly mark it as the default (though it's already the default if no other profile sets `default: true`) + - Set it as the default via `defaultProfileName` (defaulting sets kube-scheduler as default when `defaultProfileName` is unset) 3. **Other Schedulers Require Explicit Enablement**: All non-kube-scheduler backends (kai-scheduler, third-party schedulers) must be explicitly listed in `profiles` to be enabled. If a workload references a scheduler that is not in the profiles list, the validating webhook will reject the PodCliqueSet. 4. 
**Default Selection Logic**: - - If `profiles` is empty → kube-scheduler is the default - - If exactly one profile has `default: true` → that backend is the default - - If multiple profiles have `default: true` → operator startup fails with validation error - - If no profile has `default: true` → kube-scheduler is the default (even if not in the list) + - If `profiles` is empty → defaulting adds kube-scheduler and sets `defaultProfileName: "kube-scheduler"` + - `defaultProfileName` must be one of the configured profile names; validation rejects invalid or missing default profile name If no `SchedulerProfile` has been set, then Grove operator behaves as if you specified: ```yaml scheduler: + defaultProfileName: kube-scheduler profiles: - - name: "kube-scheduler" - default: true + - name: kube-scheduler ``` > NOTE: If you as a workload operator wish to use a specific scheduler, please ensure that it has been enabled and properly configured as part of `OperatorConfiguration`. If PodCliqueSet uses a scheduler which has not been enabled, then the validating webhook will reject any creation request for this PodCliqueSet. 
@@ -336,46 +334,46 @@ type KubeSchedulerConfig struct { ```yaml # --- Omit scheduler profiles completely --- -# Same as profiles: [{ name: "kube-scheduler", default: true }] +# Same as defaultProfileName: kube-scheduler, profiles: [{ name: "kube-scheduler" }] ``` ```yaml # --- Single scheduler profile, no specific configuration --- scheduler: + defaultProfileName: kube-scheduler profiles: - - name: "kube-scheduler" - default: true + - name: kube-scheduler # In this configuration Gang Scheduling will not be enabled ``` ```yaml # --- Single scheduler profile with configuration --- scheduler: + defaultProfileName: kube-scheduler profiles: - - name: "kube-scheduler" + - name: kube-scheduler config: gangScheduling: true - default: true ``` ```yaml # --- Multiple scheduler profiles; default is kube-scheduler --- scheduler: + defaultProfileName: kube-scheduler profiles: - - name: "kube-scheduler" + - name: kube-scheduler config: gangScheduling: true - default: true - - name: "kai-scheduler" # no scheduler-specific configuration is defined + - name: kai-scheduler # no scheduler-specific configuration is defined ``` ```yaml # --- Only kai-scheduler profile; kube-scheduler is still implicitly available but kai-scheduler is the default --- scheduler: + defaultProfileName: kai-scheduler profiles: - - name: "kai-scheduler" + - name: kai-scheduler config: {} - default: true ``` diff --git a/operator/api/common/labels.go b/operator/api/common/labels.go index b453fea33..5aa81a194 100644 --- a/operator/api/common/labels.go +++ b/operator/api/common/labels.go @@ -43,6 +43,8 @@ const ( LabelPodCliqueScalingGroupReplicaIndex = "grove.io/podcliquescalinggroup-replica-index" // LabelPodTemplateHash is a key for a label that sets the hash of the PodSpec. This label will be set on a PodClique and will be shared by all pods in the PodClique. 
LabelPodTemplateHash = "grove.io/pod-template-hash" + // LabelSchedulerName is a label on PodGang that indicates which scheduler backend should sync this PodGang. + LabelSchedulerName = "grove.io/scheduler-name" ) // Labels for setting component names for all managed resources whose lifecycle diff --git a/operator/api/config/v1alpha1/defaults.go b/operator/api/config/v1alpha1/defaults.go index ec91bc3aa..7652e1032 100644 --- a/operator/api/config/v1alpha1/defaults.go +++ b/operator/api/config/v1alpha1/defaults.go @@ -69,6 +69,37 @@ func SetDefaults_OperatorConfiguration(operatorConfig *OperatorConfiguration) { } } +// SetDefaults_SchedulerConfiguration sets defaults for scheduler configuration. +// Principle: respect all user-explicit values first. +// +// 1. If user did not include kube in profiles, add kube. +// 2. If defaultProfileName is unset, set it to "kube-scheduler". Validation will reject invalid cases. +func SetDefaults_SchedulerConfiguration(cfg *SchedulerConfiguration) { + if len(cfg.Profiles) == 0 { + cfg.Profiles = []SchedulerProfile{ + {Name: SchedulerNameKube}, + } + cfg.DefaultProfileName = string(SchedulerNameKube) + return + } + // 1. If user didn't add kube, add it. + hasKube := false + for i := range cfg.Profiles { + if cfg.Profiles[i].Name == SchedulerNameKube { + hasKube = true + break + } + } + if !hasKube { + cfg.Profiles = append(cfg.Profiles, SchedulerProfile{Name: SchedulerNameKube}) + } + + // 2. No default profile name → set kube as default. + if cfg.DefaultProfileName == "" { + cfg.DefaultProfileName = string(SchedulerNameKube) + } +} + // SetDefaults_ServerConfiguration sets defaults for the server configuration. 
func SetDefaults_ServerConfiguration(serverConfig *ServerConfiguration) { if serverConfig.Webhooks.Port == 0 { diff --git a/operator/api/config/v1alpha1/defaults_test.go b/operator/api/config/v1alpha1/defaults_test.go new file mode 100644 index 000000000..b8fa6cc91 --- /dev/null +++ b/operator/api/config/v1alpha1/defaults_test.go @@ -0,0 +1,128 @@ +// /* +// Copyright 2026 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package v1alpha1 + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestSetDefaults_SchedulerConfiguration(t *testing.T) { + tests := []struct { + name string + cfg *SchedulerConfiguration + wantProfiles []SchedulerProfile + wantDefaultProfile string + }{ + { + name: "empty profiles: add kube and set defaultProfileName", + cfg: &SchedulerConfiguration{}, + wantProfiles: []SchedulerProfile{{Name: SchedulerNameKube}}, + wantDefaultProfile: string(SchedulerNameKube), + }, + { + name: "nil profiles (len 0): add kube and set defaultProfileName", + cfg: &SchedulerConfiguration{ + Profiles: nil, + DefaultProfileName: "", + }, + wantProfiles: []SchedulerProfile{{Name: SchedulerNameKube}}, + wantDefaultProfile: string(SchedulerNameKube), + }, + { + name: "only kai in profiles: append kube and set defaultProfileName", + cfg: &SchedulerConfiguration{ + Profiles: []SchedulerProfile{{Name: SchedulerNameKai}}, + DefaultProfileName: "", + }, + wantProfiles: []SchedulerProfile{{Name: SchedulerNameKai}, 
{Name: SchedulerNameKube}}, + wantDefaultProfile: string(SchedulerNameKube), + }, + { + name: "only kube in profiles, defaultProfileName unset: set defaultProfileName", + cfg: &SchedulerConfiguration{ + Profiles: []SchedulerProfile{{Name: SchedulerNameKube}}, + DefaultProfileName: "", + }, + wantProfiles: []SchedulerProfile{{Name: SchedulerNameKube}}, + wantDefaultProfile: string(SchedulerNameKube), + }, + { + name: "kube and kai in profiles, defaultProfileName unset: set defaultProfileName to kube", + cfg: &SchedulerConfiguration{ + Profiles: []SchedulerProfile{ + {Name: SchedulerNameKube}, + {Name: SchedulerNameKai}, + }, + DefaultProfileName: "", + }, + wantProfiles: []SchedulerProfile{ + {Name: SchedulerNameKube}, + {Name: SchedulerNameKai}, + }, + wantDefaultProfile: string(SchedulerNameKube), + }, + { + name: "kube and kai in profiles, defaultProfileName already set to kube: no change", + cfg: &SchedulerConfiguration{ + Profiles: []SchedulerProfile{ + {Name: SchedulerNameKube}, + {Name: SchedulerNameKai}, + }, + DefaultProfileName: string(SchedulerNameKube), + }, + wantProfiles: []SchedulerProfile{ + {Name: SchedulerNameKube}, + {Name: SchedulerNameKai}, + }, + wantDefaultProfile: string(SchedulerNameKube), + }, + { + name: "kube and kai in profiles, defaultProfileName already set to kai: no change", + cfg: &SchedulerConfiguration{ + Profiles: []SchedulerProfile{ + {Name: SchedulerNameKube}, + {Name: SchedulerNameKai}, + }, + DefaultProfileName: string(SchedulerNameKai), + }, + wantProfiles: []SchedulerProfile{ + {Name: SchedulerNameKube}, + {Name: SchedulerNameKai}, + }, + wantDefaultProfile: string(SchedulerNameKai), + }, + { + name: "only kai in profiles, defaultProfileName already kai: append kube only", + cfg: &SchedulerConfiguration{ + Profiles: []SchedulerProfile{{Name: SchedulerNameKai}}, + DefaultProfileName: string(SchedulerNameKai), + }, + wantProfiles: []SchedulerProfile{{Name: SchedulerNameKai}, {Name: SchedulerNameKube}}, + wantDefaultProfile: 
string(SchedulerNameKai), + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + SetDefaults_SchedulerConfiguration(tt.cfg) + assert.Equal(t, tt.wantProfiles, tt.cfg.Profiles, "Profiles after defaulting") + assert.Equal(t, tt.wantDefaultProfile, tt.cfg.DefaultProfileName, "DefaultProfileName after defaulting") + }) + } +} diff --git a/operator/api/config/v1alpha1/types.go b/operator/api/config/v1alpha1/types.go index 9420783df..fea869cff 100644 --- a/operator/api/config/v1alpha1/types.go +++ b/operator/api/config/v1alpha1/types.go @@ -20,6 +20,7 @@ import ( corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" ) // LogFormat defines the format of the log. @@ -51,6 +52,59 @@ var ( AllLogFormats = []LogFormat{LogFormatJSON, LogFormatText} ) +// SchedulerName defines the name of the scheduler backend (used in OperatorConfiguration scheduler.profiles[].name). +type SchedulerName string + +const ( + // SchedulerNameKai is the KAI scheduler backend. + SchedulerNameKai SchedulerName = "kai-scheduler" + // SchedulerNameKube is the profile name for the Kubernetes default scheduler in OperatorConfiguration. + SchedulerNameKube SchedulerName = "kube-scheduler" +) + +var ( + // SupportedSchedulerNames is the list of profile names allowed in scheduler.profiles[].name. + SupportedSchedulerNames = []SchedulerName{SchedulerNameKai, SchedulerNameKube} +) + +// SchedulerConfiguration configures scheduler profiles and which is the default. +type SchedulerConfiguration struct { + // Profiles is the list of scheduler profiles. Each profile has a backend name and optional config. + // The kube-scheduler backend is always enabled; use profile name "kube-scheduler" to configure or set it as default. + // Valid profile names: "kube-scheduler", "kai-scheduler". Use defaultProfileName to designate the default backend. If not set, defaulting sets it to "kube-scheduler". 
+ // +optional + Profiles []SchedulerProfile `json:"profiles,omitempty"` + // DefaultProfileName is the name of the default scheduler profile. If unset, defaulting sets it to "kube-scheduler". + // +optional + DefaultProfileName string `json:"defaultProfileName,omitempty"` +} + +// SchedulerProfile defines a scheduler backend profile with optional backend-specific config. +type SchedulerProfile struct { + // Name is the scheduler profile name. Valid values: "kube-scheduler", "kai-scheduler". + // For the Kubernetes default scheduler use "kube-scheduler"; Pod.Spec.SchedulerName will be set to "default-scheduler". + // +kubebuilder:validation:Required + // +kubebuilder:validation:Enum=kai-scheduler;kube-scheduler + Name SchedulerName `json:"name"` + + // Config holds backend-specific options. The operator unmarshals it into the config type for this backend (see backend config types). + // +optional + Config *runtime.RawExtension `json:"config,omitempty"` +} + +// KaiSchedulerConfiguration defines the configuration for the kai-scheduler backend. +type KaiSchedulerConfiguration struct { + // Reserved for future kai-scheduler-specific options. +} + +// KubeSchedulerConfig holds the configuration for the default scheduler. +// Used when unmarshalling SchedulerProfile.Config for default-scheduler. +type KubeSchedulerConfig struct { + // GangScheduling indicates if Gang scheduling capability is enabled. + // +optional + GangScheduling bool `json:"gangScheduling,omitempty"` +} + // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object // OperatorConfiguration defines the configuration for the Grove operator. @@ -67,6 +121,8 @@ type OperatorConfiguration struct { TopologyAwareScheduling TopologyAwareSchedulingConfiguration `json:"topologyAwareScheduling"` // +optional Network NetworkAcceleration `json:"network,omitempty"` // Network is the configuration for network acceleration features like MNNVL. 
+ // Scheduler configures which scheduler backends are active and their per-backend options. + Scheduler SchedulerConfiguration `json:"scheduler"` } // LeaderElectionConfiguration defines the configuration for the leader election. diff --git a/operator/api/config/v1alpha1/zz_generated.deepcopy.go b/operator/api/config/v1alpha1/zz_generated.deepcopy.go index fa34b9f1e..6a4804b23 100644 --- a/operator/api/config/v1alpha1/zz_generated.deepcopy.go +++ b/operator/api/config/v1alpha1/zz_generated.deepcopy.go @@ -103,6 +103,38 @@ func (in *DebuggingConfiguration) DeepCopy() *DebuggingConfiguration { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *KaiSchedulerConfiguration) DeepCopyInto(out *KaiSchedulerConfiguration) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KaiSchedulerConfiguration. +func (in *KaiSchedulerConfiguration) DeepCopy() *KaiSchedulerConfiguration { + if in == nil { + return nil + } + out := new(KaiSchedulerConfiguration) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *KubeSchedulerConfig) DeepCopyInto(out *KubeSchedulerConfig) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KubeSchedulerConfig. +func (in *KubeSchedulerConfig) DeepCopy() *KubeSchedulerConfig { + if in == nil { + return nil + } + out := new(KubeSchedulerConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *LeaderElectionConfiguration) DeepCopyInto(out *LeaderElectionConfiguration) { *out = *in @@ -154,6 +186,7 @@ func (in *OperatorConfiguration) DeepCopyInto(out *OperatorConfiguration) { in.Authorizer.DeepCopyInto(&out.Authorizer) in.TopologyAwareScheduling.DeepCopyInto(&out.TopologyAwareScheduling) out.Network = in.Network + in.Scheduler.DeepCopyInto(&out.Scheduler) return } @@ -238,6 +271,50 @@ func (in *PodCliqueSetControllerConfiguration) DeepCopy() *PodCliqueSetControlle return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *SchedulerConfiguration) DeepCopyInto(out *SchedulerConfiguration) { + *out = *in + if in.Profiles != nil { + in, out := &in.Profiles, &out.Profiles + *out = make([]SchedulerProfile, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SchedulerConfiguration. +func (in *SchedulerConfiguration) DeepCopy() *SchedulerConfiguration { + if in == nil { + return nil + } + out := new(SchedulerConfiguration) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *SchedulerProfile) DeepCopyInto(out *SchedulerProfile) { + *out = *in + if in.Config != nil { + in, out := &in.Config, &out.Config + *out = new(runtime.RawExtension) + (*in).DeepCopyInto(*out) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SchedulerProfile. +func (in *SchedulerProfile) DeepCopy() *SchedulerProfile { + if in == nil { + return nil + } + out := new(SchedulerProfile) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *Server) DeepCopyInto(out *Server) { *out = *in diff --git a/operator/api/config/v1alpha1/zz_generated.defaults.go b/operator/api/config/v1alpha1/zz_generated.defaults.go index 25590c97a..45365fa50 100644 --- a/operator/api/config/v1alpha1/zz_generated.defaults.go +++ b/operator/api/config/v1alpha1/zz_generated.defaults.go @@ -41,4 +41,5 @@ func SetObjectDefaults_OperatorConfiguration(in *OperatorConfiguration) { SetDefaults_PodCliqueSetControllerConfiguration(&in.Controllers.PodCliqueSet) SetDefaults_PodCliqueControllerConfiguration(&in.Controllers.PodClique) SetDefaults_PodCliqueScalingGroupControllerConfiguration(&in.Controllers.PodCliqueScalingGroup) + SetDefaults_SchedulerConfiguration(&in.Scheduler) } diff --git a/operator/api/config/validation/validation.go b/operator/api/config/validation/validation.go index 32f46758e..057780579 100644 --- a/operator/api/config/validation/validation.go +++ b/operator/api/config/validation/validation.go @@ -34,6 +34,7 @@ import ( func ValidateOperatorConfiguration(config *configv1alpha1.OperatorConfiguration) field.ErrorList { allErrs := field.ErrorList{} allErrs = append(allErrs, validateLogConfiguration(config)...) + allErrs = append(allErrs, validateSchedulerConfiguration(&config.Scheduler, field.NewPath("scheduler"))...) allErrs = append(allErrs, validateLeaderElectionConfiguration(config.LeaderElection, field.NewPath("leaderElection"))...) allErrs = append(allErrs, validateClientConnectionConfiguration(config.ClientConnection, field.NewPath("clientConnection"))...) allErrs = append(allErrs, validateControllerConfiguration(config.Controllers, field.NewPath("controllers"))...) 
@@ -52,6 +53,32 @@ func validateLogConfiguration(config *configv1alpha1.OperatorConfiguration) fiel return allErrs } +func validateSchedulerConfiguration(scheduler *configv1alpha1.SchedulerConfiguration, fldPath *field.Path) field.ErrorList { + allErrs := field.ErrorList{} + profilesPath := fldPath.Child("profiles") + defaultProfileNamePath := fldPath.Child("defaultProfileName") + seenNames := sets.New[configv1alpha1.SchedulerName]() + for i, p := range scheduler.Profiles { + idxPath := profilesPath.Index(i) + if len(strings.TrimSpace(string(p.Name))) == 0 { + allErrs = append(allErrs, field.Required(idxPath.Child("name"), "scheduler profile name is required")) + } else if !slices.Contains(configv1alpha1.SupportedSchedulerNames, p.Name) { + allErrs = append(allErrs, field.NotSupported(idxPath.Child("name"), p.Name, configv1alpha1.SupportedSchedulerNames)) + } else { + if seenNames.Has(p.Name) { + allErrs = append(allErrs, field.Duplicate(idxPath.Child("name"), p.Name)) + } + seenNames.Insert(p.Name) + } + } + if strings.TrimSpace(scheduler.DefaultProfileName) == "" { + allErrs = append(allErrs, field.Required(defaultProfileNamePath, "default scheduler profile name is required")) + } else if !seenNames.Has(configv1alpha1.SchedulerName(scheduler.DefaultProfileName)) { + allErrs = append(allErrs, field.Invalid(defaultProfileNamePath, scheduler.DefaultProfileName, "default profile must be one of the configured profiles")) + } + return allErrs +} + func validateLeaderElectionConfiguration(cfg configv1alpha1.LeaderElectionConfiguration, fldPath *field.Path) field.ErrorList { allErrs := field.ErrorList{} if !cfg.Enabled { diff --git a/operator/api/config/validation/validation_test.go b/operator/api/config/validation/validation_test.go index d802ead67..116dbcd0d 100644 --- a/operator/api/config/validation/validation_test.go +++ b/operator/api/config/validation/validation_test.go @@ -208,3 +208,164 @@ func TestValidateTopologyAwareSchedulingConfiguration(t *testing.T) { }) 
} } + +func TestValidateSchedulerConfiguration(t *testing.T) { + fldPath := field.NewPath("scheduler") + tests := []struct { + name string + scheduler *configv1alpha1.SchedulerConfiguration + expectErrors int + expectedFields []string + expectedTypes []field.ErrorType + }{ + // Here we test pre-defaulting: empty profiles + empty defaultProfileName → Required for defaultProfileName + { + name: "invalid: empty profiles and empty defaultProfileName", + scheduler: &configv1alpha1.SchedulerConfiguration{ + Profiles: []configv1alpha1.SchedulerProfile{}, + DefaultProfileName: "", + }, + expectErrors: 1, + expectedFields: []string{"scheduler.defaultProfileName"}, + expectedTypes: []field.ErrorType{field.ErrorTypeRequired}, + }, + // single kube + { + name: "valid: single kube default", + scheduler: &configv1alpha1.SchedulerConfiguration{ + Profiles: []configv1alpha1.SchedulerProfile{{Name: configv1alpha1.SchedulerNameKube}}, + DefaultProfileName: string(configv1alpha1.SchedulerNameKube), + }, + expectErrors: 0, + }, + // single kai + { + name: "valid: single kai default", + scheduler: &configv1alpha1.SchedulerConfiguration{ + Profiles: []configv1alpha1.SchedulerProfile{{Name: configv1alpha1.SchedulerNameKai}}, + DefaultProfileName: string(configv1alpha1.SchedulerNameKai), + }, + expectErrors: 0, + }, + // multiple schedulers, kube default + { + name: "valid: multiple schedulers kube default", + scheduler: &configv1alpha1.SchedulerConfiguration{ + Profiles: []configv1alpha1.SchedulerProfile{ + {Name: configv1alpha1.SchedulerNameKube}, + {Name: configv1alpha1.SchedulerNameKai}, + }, + DefaultProfileName: string(configv1alpha1.SchedulerNameKube), + }, + expectErrors: 0, + }, + // multiple schedulers, kai default + { + name: "valid: multiple schedulers kai default", + scheduler: &configv1alpha1.SchedulerConfiguration{ + Profiles: []configv1alpha1.SchedulerProfile{ + {Name: configv1alpha1.SchedulerNameKube}, + {Name: configv1alpha1.SchedulerNameKai}, + }, + DefaultProfileName: 
string(configv1alpha1.SchedulerNameKai), + }, + expectErrors: 0, + }, + // defaultProfileName omitted (pre-defaulting → Required) + { + name: "invalid: defaultProfileName omitted", + scheduler: &configv1alpha1.SchedulerConfiguration{ + Profiles: []configv1alpha1.SchedulerProfile{ + {Name: configv1alpha1.SchedulerNameKube}, + {Name: configv1alpha1.SchedulerNameKai}, + }, + DefaultProfileName: "", + }, + expectErrors: 1, + expectedFields: []string{"scheduler.defaultProfileName"}, + expectedTypes: []field.ErrorType{field.ErrorTypeRequired}, + }, + // invalid defaultProfileName (not in supported list; not in profiles → Invalid) + { + name: "invalid: defaultProfileName not in profiles (e.g. invalid-scheduler)", + scheduler: &configv1alpha1.SchedulerConfiguration{ + Profiles: []configv1alpha1.SchedulerProfile{ + {Name: configv1alpha1.SchedulerNameKube}, + {Name: configv1alpha1.SchedulerNameKai}, + }, + DefaultProfileName: "invalid-scheduler", + }, + expectErrors: 1, + expectedFields: []string{"scheduler.defaultProfileName"}, + expectedTypes: []field.ErrorType{field.ErrorTypeInvalid}, + }, + // defaultProfileName is kube but kube not in profiles + { + name: "invalid: defaultProfileName not in profiles (kube-scheduler but only kai in profiles)", + scheduler: &configv1alpha1.SchedulerConfiguration{ + Profiles: []configv1alpha1.SchedulerProfile{{Name: configv1alpha1.SchedulerNameKai}}, + DefaultProfileName: string(configv1alpha1.SchedulerNameKube), + }, + expectErrors: 1, + expectedFields: []string{"scheduler.defaultProfileName"}, + expectedTypes: []field.ErrorType{field.ErrorTypeInvalid}, + }, + // empty name in profile + { + name: "invalid: profile with empty name", + scheduler: &configv1alpha1.SchedulerConfiguration{ + Profiles: []configv1alpha1.SchedulerProfile{ + {Name: ""}, + }, + DefaultProfileName: "kube-scheduler", + }, + expectErrors: 2, + expectedFields: []string{"scheduler.profiles[0].name", "scheduler.defaultProfileName"}, + expectedTypes: 
[]field.ErrorType{field.ErrorTypeRequired, field.ErrorTypeInvalid}, + }, + // unsupported profile name + { + name: "invalid: unsupported profile name", + scheduler: &configv1alpha1.SchedulerConfiguration{ + Profiles: []configv1alpha1.SchedulerProfile{ + {Name: configv1alpha1.SchedulerName("volcano")}, + }, + DefaultProfileName: "volcano", + }, + expectErrors: 2, + expectedFields: []string{"scheduler.profiles[0].name", "scheduler.defaultProfileName"}, + expectedTypes: []field.ErrorType{field.ErrorTypeNotSupported, field.ErrorTypeInvalid}, + }, + // duplicate profile names + { + name: "invalid: duplicate profile names", + scheduler: &configv1alpha1.SchedulerConfiguration{ + Profiles: []configv1alpha1.SchedulerProfile{ + {Name: configv1alpha1.SchedulerNameKube}, + {Name: configv1alpha1.SchedulerNameKube}, + }, + DefaultProfileName: string(configv1alpha1.SchedulerNameKube), + }, + expectErrors: 1, + expectedFields: []string{"scheduler.profiles[1].name"}, + expectedTypes: []field.ErrorType{field.ErrorTypeDuplicate}, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + errs := validateSchedulerConfiguration(test.scheduler, fldPath) + + assert.Len(t, errs, test.expectErrors, "expected %d validation errors but got %d: %v", test.expectErrors, len(errs), errs) + + if test.expectErrors > 0 { + for i, expectedField := range test.expectedFields { + assert.Equal(t, expectedField, errs[i].Field, "error %d: expected field %s but got %s", i, expectedField, errs[i].Field) + if i < len(test.expectedTypes) { + assert.Equal(t, test.expectedTypes[i], errs[i].Type, "error %d: expected type %s but got %s", i, test.expectedTypes[i], errs[i].Type) + } + } + } + }) + } +} diff --git a/operator/charts/templates/_helpers.tpl b/operator/charts/templates/_helpers.tpl index 2337e3e39..7d5298d16 100644 --- a/operator/charts/templates/_helpers.tpl +++ b/operator/charts/templates/_helpers.tpl @@ -30,6 +30,19 @@ config.yaml: | concurrentSyncs: {{ 
.Values.config.controllers.podClique.concurrentSyncs }} podCliqueScalingGroup: concurrentSyncs: {{ .Values.config.controllers.podCliqueScalingGroup.concurrentSyncs }} + {{- if and .Values.config.scheduler .Values.config.scheduler.profiles }} + scheduler: + {{- if .Values.config.scheduler.defaultProfileName }} + defaultProfileName: {{ .Values.config.scheduler.defaultProfileName }} + {{- end }} + profiles: + {{- range .Values.config.scheduler.profiles }} + - name: {{ .name }} + {{- if hasKey . "config" }} + config: {{ toYaml .config | nindent 4 }} + {{- end }} + {{- end }} + {{- end }} {{- if .Values.config.debugging }} debugging: enableProfiling: {{ .Values.config.debugging.enableProfiling }} diff --git a/operator/charts/templates/clusterrole.yaml b/operator/charts/templates/clusterrole.yaml index af0661fc0..d6da405f6 100644 --- a/operator/charts/templates/clusterrole.yaml +++ b/operator/charts/templates/clusterrole.yaml @@ -9,6 +9,7 @@ rules: - scheduler.grove.io resources: - podgangs + - podgangs/status verbs: - create - get diff --git a/operator/charts/values.yaml b/operator/charts/values.yaml index 1cbb88993..5c2c59492 100644 --- a/operator/charts/values.yaml +++ b/operator/charts/values.yaml @@ -83,6 +83,12 @@ config: concurrentSyncs: 3 podCliqueScalingGroup: concurrentSyncs: 3 + # Scheduler configures which scheduler backends are active. default-scheduler is always available. + # List profiles to enable backends; set defaultProfileName to the profile that is the default backend. + scheduler: + profiles: + - name: kube-scheduler + - name: kai-scheduler logLevel: info logFormat: json topologyAwareScheduling: diff --git a/operator/cmd/cli/cli.go b/operator/cmd/cli/cli.go index e5f24f1fd..56aa4730f 100644 --- a/operator/cmd/cli/cli.go +++ b/operator/cmd/cli/cli.go @@ -41,6 +41,8 @@ const ( // ExitErrInitializeManager indicates that the application exited due to an error initializing the manager. 
// This includes registration of controllers and webhooks and setting up probes. ExitErrInitializeManager + // ExitErrInitializeSchedulerBackend indicates that the application exited due to an error initializing the scheduler backend. + ExitErrInitializeSchedulerBackend // ExitErrStart indicates that the application exited due to an error when starting the application. ExitErrStart // ExitErrMNNVLPrerequisites indicates that the application exited because MNNVL prerequisites are not met. diff --git a/operator/cmd/cli/testdata/valid-config-mnnvl-enabled.yaml b/operator/cmd/cli/testdata/valid-config-mnnvl-enabled.yaml index b7c06f7a1..a9af417e7 100644 --- a/operator/cmd/cli/testdata/valid-config-mnnvl-enabled.yaml +++ b/operator/cmd/cli/testdata/valid-config-mnnvl-enabled.yaml @@ -29,6 +29,9 @@ controllers: concurrentSyncs: 3 podCliqueScalingGroup: concurrentSyncs: 2 +scheduler: + profiles: + - name: kai-scheduler logLevel: info logFormat: json authorizer: diff --git a/operator/cmd/cli/testdata/valid-config.yaml b/operator/cmd/cli/testdata/valid-config.yaml index 2fe57fc4c..d928a9e9a 100644 --- a/operator/cmd/cli/testdata/valid-config.yaml +++ b/operator/cmd/cli/testdata/valid-config.yaml @@ -29,6 +29,9 @@ controllers: concurrentSyncs: 3 podCliqueScalingGroup: concurrentSyncs: 2 +scheduler: + profiles: + - name: kai-scheduler logLevel: info logFormat: json authorizer: diff --git a/operator/cmd/main.go b/operator/cmd/main.go index 309e0abcf..25702e725 100644 --- a/operator/cmd/main.go +++ b/operator/cmd/main.go @@ -31,6 +31,7 @@ import ( "github.com/ai-dynamo/grove/operator/internal/controller/cert" grovelogger "github.com/ai-dynamo/grove/operator/internal/logger" "github.com/ai-dynamo/grove/operator/internal/mnnvl" + "github.com/ai-dynamo/grove/operator/internal/schedulerbackend" groveversion "github.com/ai-dynamo/grove/operator/internal/version" "github.com/spf13/pflag" @@ -87,6 +88,18 @@ func main() { handleErrorAndExit(err, cli.ExitErrInitializeManager) } + // 
Initialize scheduler backends with the configured schedulers. + if err := schedulerbackend.Initialize( + mgr.GetClient(), + mgr.GetScheme(), + mgr.GetEventRecorderFor("scheduler-backend"), + operatorConfig.Scheduler, + ); err != nil { + logger.Error(err, "failed to initialize scheduler backend") + handleErrorAndExit(err, cli.ExitErrInitializeSchedulerBackend) + } + + // TODO: Move this to the proper scheduler backend. // Initialize or clean up ClusterTopology based on operator configuration. // This must be done before starting the controllers that may depend on the ClusterTopology resource. // NOTE: In this version of the operator the synchronization will additionally ensure that the KAI Topology resource diff --git a/operator/go.mod b/operator/go.mod index f138b4eb7..6358dd004 100644 --- a/operator/go.mod +++ b/operator/go.mod @@ -131,7 +131,7 @@ require ( go.opentelemetry.io/otel/trace v1.35.0 // indirect go.uber.org/atomic v1.11.0 // indirect go.uber.org/multierr v1.11.0 // indirect - go.yaml.in/yaml/v2 v2.4.2 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/crypto v0.45.0 // indirect golang.org/x/net v0.47.0 // indirect diff --git a/operator/go.sum b/operator/go.sum index 8fc19746b..a2a83fcb7 100644 --- a/operator/go.sum +++ b/operator/go.sum @@ -365,8 +365,8 @@ go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= -go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= -go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= go.yaml.in/yaml/v3 v3.0.4 
h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= diff --git a/operator/internal/controller/manager.go b/operator/internal/controller/manager.go index 8191c32c1..e0cc00fbc 100644 --- a/operator/internal/controller/manager.go +++ b/operator/internal/controller/manager.go @@ -59,10 +59,10 @@ func RegisterControllersAndWebhooks(mgr ctrl.Manager, logger logr.Logger, operat // Controllers will not work unless the webhooks are fully configured and operational. // For webhooks to work cert-controller should finish its work of generating and injecting certificates. waitTillWebhookCertsReady(logger, certsReady) - if err := registerControllersWithMgr(mgr, operatorCfg.Controllers, operatorCfg.TopologyAwareScheduling, operatorCfg.Network); err != nil { + if err := registerControllersWithMgr(mgr, operatorCfg); err != nil { return err } - if err := registerWebhooksWithMgr(mgr, operatorCfg.Authorizer, operatorCfg.TopologyAwareScheduling, operatorCfg.Network); err != nil { + if err := registerWebhooksWithMgr(mgr, operatorCfg); err != nil { return err } return nil diff --git a/operator/internal/controller/manager_test.go b/operator/internal/controller/manager_test.go index 6c55daa18..b17ef6467 100644 --- a/operator/internal/controller/manager_test.go +++ b/operator/internal/controller/manager_test.go @@ -552,11 +552,12 @@ func TestRegisterControllersAndWebhooks(t *testing.T) { tc.waitFn(logger, ch) } } - registerControllersWithMgr = func(_ ctrl.Manager, _ configv1alpha1.ControllerConfiguration, _ configv1alpha1.TopologyAwareSchedulingConfiguration, _ configv1alpha1.NetworkAcceleration) error { + + registerControllersWithMgr = func(_ ctrl.Manager, _ *configv1alpha1.OperatorConfiguration) error { controllersCalled = true return tc.controllerErr } - registerWebhooksWithMgr = func(_ ctrl.Manager, _ 
configv1alpha1.AuthorizerConfig, _ configv1alpha1.TopologyAwareSchedulingConfiguration, _ configv1alpha1.NetworkAcceleration) error { + registerWebhooksWithMgr = func(_ ctrl.Manager, _ *configv1alpha1.OperatorConfiguration) error { webhooksCalled = true return tc.webhookErr } diff --git a/operator/internal/controller/podclique/components/pod/pod.go b/operator/internal/controller/podclique/components/pod/pod.go index 61d434bee..fff56f9da 100644 --- a/operator/internal/controller/podclique/components/pod/pod.go +++ b/operator/internal/controller/podclique/components/pod/pod.go @@ -28,6 +28,7 @@ import ( componentutils "github.com/ai-dynamo/grove/operator/internal/controller/common/component/utils" groveerr "github.com/ai-dynamo/grove/operator/internal/errors" "github.com/ai-dynamo/grove/operator/internal/expect" + "github.com/ai-dynamo/grove/operator/internal/schedulerbackend" "github.com/ai-dynamo/grove/operator/internal/utils" k8sutils "github.com/ai-dynamo/grove/operator/internal/utils/kubernetes" @@ -160,6 +161,20 @@ func (r _resource) buildResource(pcs *grovecorev1alpha1.PodCliqueSet, pclq *grov } pod.Spec = *pclq.Spec.PodSpec.DeepCopy() pod.Spec.SchedulingGates = []corev1.PodSchedulingGate{{Name: podGangSchedulingGate}} + + // Resolve scheduler: from template or default backend; then prepare pod (schedulerName, annotations, etc.) 
+ schedulerName := pclq.Spec.PodSpec.SchedulerName + backend := schedulerbackend.Get(schedulerName) + if backend == nil { + return groveerr.WrapError( + fmt.Errorf("scheduler backend not found or not initialized: %q", schedulerName), + errCodeBuildPodResource, + component.OperationSync, + "failed to prepare pod spec with scheduler backend", + ) + } + backend.PreparePod(pod) + // Add GROVE specific Pod environment variables addEnvironmentVariables(pod, pclq, pcsName, pcsReplicaIndex, podIndex) // Configure hostname and subdomain for service discovery diff --git a/operator/internal/controller/podclique/components/pod/syncflow.go b/operator/internal/controller/podclique/components/pod/syncflow.go index 575bcc642..cc540cb6d 100644 --- a/operator/internal/controller/podclique/components/pod/syncflow.go +++ b/operator/internal/controller/podclique/components/pod/syncflow.go @@ -238,7 +238,7 @@ func selectExcessPodsToDelete(sc *syncContext, logger logr.Logger) []*corev1.Pod return candidatePodsToDelete } -// checkAndRemovePodSchedulingGates removes scheduling gates from pods when their dependencies are satisfied +// checkAndRemovePodSchedulingGates removes scheduling gates from pods when PodGang is initialized func (r _resource) checkAndRemovePodSchedulingGates(sc *syncContext, logger logr.Logger) ([]string, error) { tasks := make([]utils.Task, 0, len(sc.existingPCLQPods)) skippedScheduleGatedPods := make([]string, 0, len(sc.existingPCLQPods)) diff --git a/operator/internal/controller/podclique/register.go b/operator/internal/controller/podclique/register.go index e77fef883..c5839cd22 100644 --- a/operator/internal/controller/podclique/register.go +++ b/operator/internal/controller/podclique/register.go @@ -31,6 +31,7 @@ import ( groveschedulerv1alpha1 "github.com/ai-dynamo/grove/scheduler/api/core/v1alpha1" "github.com/samber/lo" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" 
ctrl "sigs.k8s.io/controller-runtime" @@ -288,16 +289,49 @@ func extractPCLQNameFromPodName(podName string) string { return podName[:endIndex] } -// podGangPredicate allows all PodGang create and update events to trigger PodClique reconciliation +// podGangPredicate filters PodGang events to trigger on initialization and spec updates func podGangPredicate() predicate.Predicate { return predicate.Funcs{ - CreateFunc: func(_ event.CreateEvent) bool { return true }, - DeleteFunc: func(_ event.DeleteEvent) bool { return false }, - UpdateFunc: func(_ event.UpdateEvent) bool { return true }, + CreateFunc: func(_ event.CreateEvent) bool { return false }, + DeleteFunc: func(_ event.DeleteEvent) bool { return false }, + UpdateFunc: func(e event.UpdateEvent) bool { + oldPG, okOld := e.ObjectOld.(*groveschedulerv1alpha1.PodGang) + newPG, okNew := e.ObjectNew.(*groveschedulerv1alpha1.PodGang) + if !okOld || !okNew { + return false + } + + // Trigger when PodGang transitions to Initialized=True + oldInitialized := isPodGangInitialized(e.ObjectOld) + newInitialized := isPodGangInitialized(e.ObjectNew) + if !oldInitialized && newInitialized { + return true + } + + // Also trigger when PodGang spec changes (e.g., scale out/in adds/removes pod references) + // This ensures scheduling gates are removed from newly added pods + // Check if metadata.generation changed (Kubernetes increments this on spec changes) + if newInitialized && oldPG.GetGeneration() != newPG.GetGeneration() { + return true + } + + return false + }, GenericFunc: func(_ event.GenericEvent) bool { return false }, } } +// isPodGangInitialized checks if a PodGang has Initialized condition set to True. 
+func isPodGangInitialized(obj client.Object) bool { + podGang, ok := obj.(*groveschedulerv1alpha1.PodGang) + if !ok { + return false + } + + // Check if Initialized condition is True + return meta.IsStatusConditionTrue(podGang.Status.Conditions, string(groveschedulerv1alpha1.PodGangConditionTypeInitialized)) +} + // isManagedPod checks if a Pod is managed by Grove and owned by a PodClique func isManagedPod(obj client.Object) bool { pod, ok := obj.(*corev1.Pod) diff --git a/operator/internal/controller/podcliqueset/components/podgang/podgang.go b/operator/internal/controller/podcliqueset/components/podgang/podgang.go index b5379223c..aad897f1c 100644 --- a/operator/internal/controller/podcliqueset/components/podgang/podgang.go +++ b/operator/internal/controller/podcliqueset/components/podgang/podgang.go @@ -27,6 +27,7 @@ import ( "github.com/ai-dynamo/grove/operator/internal/controller/common/component" componentutils "github.com/ai-dynamo/grove/operator/internal/controller/common/component/utils" groveerr "github.com/ai-dynamo/grove/operator/internal/errors" + "github.com/ai-dynamo/grove/operator/internal/schedulerbackend" k8sutils "github.com/ai-dynamo/grove/operator/internal/utils/kubernetes" groveschedulerv1alpha1 "github.com/ai-dynamo/grove/scheduler/api/core/v1alpha1" @@ -50,6 +51,7 @@ const ( errCodeSetControllerReference grovecorev1alpha1.ErrorCode = "ERR_SET_CONTROLLER_REFERENCE" errCodeCreateOrPatchPodGang grovecorev1alpha1.ErrorCode = "ERR_CREATE_OR_PATCH_PODGANG" errCodeGetClusterTopologyLevels grovecorev1alpha1.ErrorCode = "ERR_GET_CLUSTER_TOPOLOGY_LEVELS" + errCodeUpdatePodGang grovecorev1alpha1.ErrorCode = "ERR_UPDATE_PODGANG_WITH_POD_REFS" ) type _resource struct { @@ -89,6 +91,7 @@ func (r _resource) GetExistingResourceNames(ctx context.Context, logger logr.Log } // Sync creates, updates, or deletes PodGang resources to match the desired state. +// NEW FLOW: PodGangs are created with empty podReferences before Pods are created. 
func (r _resource) Sync(ctx context.Context, logger logr.Logger, pcs *grovecorev1alpha1.PodCliqueSet) error { logger.Info("Syncing PodGang resources") sc, err := r.prepareSyncFlow(ctx, logger, pcs) @@ -99,12 +102,6 @@ func (r _resource) Sync(ctx context.Context, logger logr.Logger, pcs *grovecorev if result.hasErrors() { return result.getAggregatedError() } - if result.hasPodGangsPendingCreation() { - return groveerr.New(groveerr.ErrCodeRequeueAfter, - component.OperationSync, - fmt.Sprintf("PodGangs pending creation: %v", result.podsGangsPendingCreation), - ) - } return nil } @@ -128,6 +125,13 @@ func (r _resource) Delete(ctx context.Context, logger logr.Logger, pcsObjectMeta // buildResource configures a PodGang with pod groups and priority. func (r _resource) buildResource(pcs *grovecorev1alpha1.PodCliqueSet, pgi *podGangInfo, pg *groveschedulerv1alpha1.PodGang) error { pg.Labels = getLabels(pcs.Name) + // Set scheduler name so the podgang controller can resolve the correct backend + if schedName := getSchedulerNameForPCS(pcs); schedName != "" { + if pg.Labels == nil { + pg.Labels = make(map[string]string) + } + pg.Labels[apicommon.LabelSchedulerName] = schedName + } if r.tasConfig.Enabled { if pg.Annotations == nil { pg.Annotations = make(map[string]string) @@ -142,13 +146,58 @@ func (r _resource) buildResource(pcs *grovecorev1alpha1.PodCliqueSet, pgi *podGa fmt.Sprintf("failed to set the controller reference on PodGang %s to PodCliqueSet %v", pgi.fqn, client.ObjectKeyFromObject(pcs)), ) } - pg.Spec.PodGroups = createPodGroupsForPodGang(pg.Namespace, pgi) pg.Spec.PriorityClassName = pcs.Spec.Template.PriorityClassName pg.Spec.TopologyConstraint = pgi.topologyConstraint pg.Spec.TopologyConstraintGroupConfigs = pgi.pcsgTopologyConstraints + + // Only create PodGroups if they don't exist yet (initial creation) + // Once populated, we preserve existing podReferences to avoid clearing them on subsequent reconciles + if len(pg.Spec.PodGroups) == 0 { + // Create 
PodGroups with EMPTY podReferences initially + pg.Spec.PodGroups = createEmptyPodGroupsForPodGang(*pgi) + } else { + // PodGroups already exist - preserve them but update MinReplicas and TopologyConstraint if needed + expectedPodGroups := make(map[string]struct { + minAvailable int32 + topologyConstraint *groveschedulerv1alpha1.TopologyConstraint + }) + for _, pclq := range pgi.pclqs { + expectedPodGroups[pclq.fqn] = struct { + minAvailable int32 + topologyConstraint *groveschedulerv1alpha1.TopologyConstraint + }{ + minAvailable: pclq.minAvailable, + topologyConstraint: pclq.topologyConstraint, + } + } + + // Update MinReplicas and TopologyConstraint for existing PodGroups + for i := range pg.Spec.PodGroups { + podGroup := &pg.Spec.PodGroups[i] + if expectedPG, ok := expectedPodGroups[podGroup.Name]; ok { + podGroup.MinReplicas = expectedPG.minAvailable + podGroup.TopologyConstraint = expectedPG.topologyConstraint + } + } + } + return nil } +// createEmptyPodGroupsForPodGang creates PodGroups with empty podReferences. +// These will be populated later when pods are created. +func createEmptyPodGroupsForPodGang(pgInfo podGangInfo) []groveschedulerv1alpha1.PodGroup { + podGroups := lo.Map(pgInfo.pclqs, func(pclq pclqInfo, _ int) groveschedulerv1alpha1.PodGroup { + return groveschedulerv1alpha1.PodGroup{ + Name: pclq.fqn, + PodReferences: []groveschedulerv1alpha1.NamespacedName{}, + MinReplicas: pclq.minAvailable, + TopologyConstraint: pclq.topologyConstraint, + } + }) + return podGroups +} + // getPodGangSelectorLabels returns labels for selecting all PodGangs of a PodCliqueSet. 
func getPodGangSelectorLabels(pcsObjMeta metav1.ObjectMeta) map[string]string { return lo.Assign( @@ -176,3 +225,51 @@ func getLabels(pcsName string) map[string]string { apicommon.LabelComponentKey: apicommon.LabelComponentNamePodGang, }) } + +// getSchedulerNameForPCS returns the scheduler backend name for the PodCliqueSet: +// the template's schedulerName if set (same across all cliques per validation), else the default backend. +func getSchedulerNameForPCS(pcs *grovecorev1alpha1.PodCliqueSet) string { + for _, c := range pcs.Spec.Template.Cliques { + if c != nil && c.Spec.PodSpec.SchedulerName != "" { + return c.Spec.PodSpec.SchedulerName + } + } + if def := schedulerbackend.GetDefault(); def != nil { + return def.Name() + } + return "" +} + +// setInitializedCondition sets or updates the PodGangInitialized condition on the PodGang status. +func setInitializedCondition(pg *groveschedulerv1alpha1.PodGang, status metav1.ConditionStatus, reason, message string) { + condition := metav1.Condition{ + Type: string(groveschedulerv1alpha1.PodGangConditionTypeInitialized), + Status: status, + ObservedGeneration: pg.Generation, + LastTransitionTime: metav1.Now(), + Reason: reason, + Message: message, + } + + found := false + for i, cond := range pg.Status.Conditions { + if cond.Type == string(groveschedulerv1alpha1.PodGangConditionTypeInitialized) { + pg.Status.Conditions[i] = condition + found = true + break + } + } + if !found { + pg.Status.Conditions = append(pg.Status.Conditions, condition) + } +} + +// hasInitializedCondition returns true if the PodGang has an Initialized condition. 
+func hasInitializedCondition(pg *groveschedulerv1alpha1.PodGang) bool { + for _, cond := range pg.Status.Conditions { + if cond.Type == string(groveschedulerv1alpha1.PodGangConditionTypeInitialized) { + return true + } + } + return false +} diff --git a/operator/internal/controller/podcliqueset/components/podgang/podgang_test.go b/operator/internal/controller/podcliqueset/components/podgang/podgang_test.go new file mode 100644 index 000000000..23988408a --- /dev/null +++ b/operator/internal/controller/podcliqueset/components/podgang/podgang_test.go @@ -0,0 +1,54 @@ +// /* +// Copyright 2026 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// */ + +package podgang + +import ( + "testing" + + groveschedulerv1alpha1 "github.com/ai-dynamo/grove/scheduler/api/core/v1alpha1" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestSetInitializedCondition(t *testing.T) { + pg := &groveschedulerv1alpha1.PodGang{ + ObjectMeta: metav1.ObjectMeta{Name: "pg-1", Namespace: "default", Generation: 1}, + } + setInitializedCondition(pg, metav1.ConditionFalse, "PodsPending", "waiting") + require.Len(t, pg.Status.Conditions, 1) + assert.Equal(t, string(groveschedulerv1alpha1.PodGangConditionTypeInitialized), pg.Status.Conditions[0].Type) + assert.Equal(t, metav1.ConditionFalse, pg.Status.Conditions[0].Status) + assert.Equal(t, "PodsPending", pg.Status.Conditions[0].Reason) + assert.Equal(t, "waiting", pg.Status.Conditions[0].Message) + + // Update existing condition to ready + setInitializedCondition(pg, metav1.ConditionTrue, "Ready", "all ready") + require.Len(t, pg.Status.Conditions, 1) + assert.Equal(t, metav1.ConditionTrue, pg.Status.Conditions[0].Status) + assert.Equal(t, "Ready", pg.Status.Conditions[0].Reason) +} + +func TestHasInitializedCondition(t *testing.T) { + pg := &groveschedulerv1alpha1.PodGang{ + ObjectMeta: metav1.ObjectMeta{Name: "pg-1", Namespace: "default"}, + } + assert.False(t, hasInitializedCondition(pg)) + + setInitializedCondition(pg, metav1.ConditionFalse, "PodsPending", "waiting") + assert.True(t, hasInitializedCondition(pg)) +} diff --git a/operator/internal/controller/podcliqueset/components/podgang/syncflow.go b/operator/internal/controller/podcliqueset/components/podgang/syncflow.go index 85d25379d..b07decfd4 100644 --- a/operator/internal/controller/podcliqueset/components/podgang/syncflow.go +++ b/operator/internal/controller/podcliqueset/components/podgang/syncflow.go @@ -462,20 +462,15 @@ func (r _resource) deleteExcessPodGangs(sc *syncContext) error { return nil } -// createOrUpdatePodGangs 
creates or updates all expected PodGangs when ready. +// createOrUpdatePodGangs creates or updates all expected PodGangs. +// PodGangs are created with empty podReferences, Initialized=False. +// Once all pods are created, PodReferences are populated and the PodGang is marked as Initialized=True. func (r _resource) createOrUpdatePodGangs(sc *syncContext) syncFlowResult { result := syncFlowResult{} - pendingPodGangNames := sc.getPodGangNamesPendingCreation() + + // Step 1: Create or update all expected PodGangs with basic structure for _, podGang := range sc.expectedPodGangs { sc.logger.Info("[createOrUpdatePodGangs] processing PodGang", "fqn", podGang.fqn) - isPodGangPendingCreation := slices.Contains(pendingPodGangNames, podGang.fqn) - // check the health of each podclique - numPendingPods := r.getPodsPendingCreationOrAssociation(sc, podGang) - if isPodGangPendingCreation && numPendingPods > 0 { - sc.logger.Info("skipping creation of PodGang as all desired replicas have not yet been created or assigned", "fqn", podGang.fqn, "numPendingPodsToCreateOrAssociate", numPendingPods) - result.recordPodGangPendingCreation(podGang.fqn) - continue - } if err := r.createOrUpdatePodGang(sc, podGang); err != nil { sc.logger.Error(err, "failed to create PodGang", "PodGangName", podGang.fqn) result.recordError(err) @@ -483,9 +478,189 @@ func (r _resource) createOrUpdatePodGangs(sc *syncContext) syncFlowResult { } result.recordPodGangCreation(podGang.fqn) } + + // Step 2: For existing PodGangs, try to update PodReferences if all pods are created + // Skip newly created PodGangs as their pods won't be ready yet + for _, podGangName := range sc.existingPodGangNames { + if err := r.updatePodGangWithPodReferences(sc, podGangName); err != nil { + // Check if this is a "waiting for pods" error + var groveErr *groveerr.GroveError + if errors.As(err, &groveErr) && groveErr.Code == groveerr.ErrCodeRequeueAfter { + // Expected error: pods not ready yet, record but continue with other 
PodGangs + sc.logger.Info("PodGang waiting for pods to be created, will retry in next reconcile", + "podGang", podGangName) + result.recordError(err) + } else { + // Unexpected error: log and record, but continue with other PodGangs + sc.logger.Error(err, "Failed to update PodGang with pod references", + "podGang", podGangName) + result.recordError(err) + } + } + } + return result } +// updatePodGangWithPodReferences updates a PodGang with pod references and sets Initialized condition. +func (r _resource) updatePodGangWithPodReferences(sc *syncContext, podGangName string) error { + // Find the podGangInfo from expectedPodGangs + podGangInfo, found := r.findPodGangInfo(sc, podGangName) + if !found { + return nil + } + + // Verify all pods are created before proceeding + if err := r.verifyAllPodsCreated(sc, podGangName, podGangInfo); err != nil { + return err + } + + // Update pod references using Patch (no need to fetch from API server!) + if err := r.patchPodGangWithPodReferences(sc, podGangName, podGangInfo); err != nil { + return err + } + + // Update status to set Initialized=True (idempotent - no need to check current state) + if err := r.patchPodGangInitializedStatus(sc, podGangName, metav1.ConditionTrue, "Ready", "PodGang is fully initialized"); err != nil { + return err + } + return nil +} + +// patchPodGangInitializedStatus patches the Initialized condition with the given status. 
+func (r _resource) patchPodGangInitializedStatus(sc *syncContext, podGangName string, status metav1.ConditionStatus, reason, message string) error { + // Create a PodGang object with only the status we want to patch + statusPatch := &groveschedulerv1alpha1.PodGang{ + ObjectMeta: metav1.ObjectMeta{ + Name: podGangName, + Namespace: sc.pcs.Namespace, + }, + } + + setInitializedCondition(statusPatch, status, reason, message) + statusPatch.Status.Phase = groveschedulerv1alpha1.PodGangPhasePending + + if err := r.client.Status().Patch(sc.ctx, statusPatch, client.Merge); err != nil { + return err + } + + sc.logger.Info("Successfully patched PodGang Initialized condition", + "podGang", podGangName, "status", status) + return nil +} + +// patchPodGangWithPodReferences uses strategic merge patch to update pod references +func (r _resource) patchPodGangWithPodReferences(sc *syncContext, podGangName string, podGangInfo *podGangInfo) error { + // Build PodGroups with pod references from syncContext + podGroups := r.buildPodGroupsFromContext(sc, podGangInfo) + + // Create patch object + patchPodGang := &groveschedulerv1alpha1.PodGang{ + ObjectMeta: metav1.ObjectMeta{ + Name: podGangName, + Namespace: sc.pcs.Namespace, + }, + Spec: groveschedulerv1alpha1.PodGangSpec{ + PodGroups: podGroups, + }, + } + + // Apply patch + if err := r.client.Patch(sc.ctx, patchPodGang, client.Merge); err != nil { + return groveerr.WrapError(err, + errCodeCreateOrPatchPodGang, + component.OperationSync, + fmt.Sprintf("Failed to patch PodGang %s with pod references", podGangName), + ) + } + + sc.logger.Info("Successfully patched PodGang with pod references", + "podGang", podGangName, + "numPodGroups", len(podGroups)) + return nil +} + +// buildPodGroupsFromContext constructs PodGroups with pod references from syncContext data +func (r _resource) buildPodGroupsFromContext(sc *syncContext, podGangInfo *podGangInfo) []groveschedulerv1alpha1.PodGroup { + podsByGroup := r.groupPodsByPodClique(sc, 
podGangInfo) + + podGroups := make([]groveschedulerv1alpha1.PodGroup, 0, len(podGangInfo.pclqs)) + for _, pclqInfo := range podGangInfo.pclqs { + pods := podsByGroup[pclqInfo.fqn] + + // Build podReferences list + podReferences := make([]groveschedulerv1alpha1.NamespacedName, 0, len(pods)) + for _, pod := range pods { + podReferences = append(podReferences, groveschedulerv1alpha1.NamespacedName{ + Namespace: pod.Namespace, + Name: pod.Name, + }) + } + + // Sort for consistency + // TODO: Consider not trying to sort the podReferences here + sort.Slice(podReferences, func(i, j int) bool { + return podReferences[i].Name < podReferences[j].Name + }) + + podGroups = append(podGroups, groveschedulerv1alpha1.PodGroup{ + Name: pclqInfo.fqn, + PodReferences: podReferences, + MinReplicas: pclqInfo.minAvailable, + TopologyConstraint: pclqInfo.topologyConstraint, // Preserve PodClique-level topology constraint + }) + } + + return podGroups +} + +// findPodGangInfo locates the podGangInfo from expectedPodGangs +func (r _resource) findPodGangInfo(sc *syncContext, podGangName string) (*podGangInfo, bool) { + podGangInfo, found := lo.Find(sc.expectedPodGangs, func(pg *podGangInfo) bool { + return pg.fqn == podGangName + }) + if !found { + sc.logger.Info("PodGang not found in expectedPodGangs, skipping update", + "podGang", podGangName) + return nil, false + } + return podGangInfo, true +} + +// verifyAllPodsCreated checks if all required pods exist before updating PodGang +func (r _resource) verifyAllPodsCreated(sc *syncContext, podGangName string, podGangInfo *podGangInfo) error { + pclqs := sc.getPodCliques(podGangInfo) + if len(pclqs) != len(podGangInfo.pclqs) { + // Not all constituent PCLQs exist yet + sc.logger.Info("Not all constituent PCLQs exist yet", "podGang", podGangName, "expected", len(podGangInfo.pclqs), "actual", len(pclqs)) + return groveerr.New(groveerr.ErrCodeRequeueAfter, + component.OperationSync, + fmt.Sprintf("Waiting for all pods to be created for PodGang 
%s", podGangName), + ) + } + // check the health of each podclique + numPendingPods := r.getPodsPendingCreationOrAssociation(sc, podGangInfo) + if numPendingPods > 0 { + sc.logger.Info("skipping creation of PodGang as all desired replicas have not yet been created or assigned", "podGang", podGangName, "numPendingPodsToCreateOrAssociate", numPendingPods) + return groveerr.New(groveerr.ErrCodeRequeueAfter, + component.OperationSync, + fmt.Sprintf("Waiting for all pods to be created or assigned for PodGang %s", podGangName), + ) + } + return nil +} + +// groupPodsByPodClique organizes pods by their PodClique names +func (r _resource) groupPodsByPodClique(sc *syncContext, podGangInfo *podGangInfo) map[string][]corev1.Pod { + podsByGroup := make(map[string][]corev1.Pod) + for _, pclqInfo := range podGangInfo.pclqs { + if pods, ok := sc.existingPCLQPods[pclqInfo.fqn]; ok { + podsByGroup[pclqInfo.fqn] = pods + } + } + return podsByGroup +} + // getPodsForPodCliquesPendingCreation counts expected pods from non-existent PodCliques. 
func (r _resource) getPodsForPodCliquesPendingCreation(sc *syncContext, podGang *podGangInfo) int { existingPCLQNames := lo.Map(sc.existingPCLQs, func(pclq grovecorev1alpha1.PodClique, _ int) string { @@ -551,35 +726,20 @@ func (r _resource) createOrUpdatePodGang(sc *syncContext, pgInfo *podGangInfo) e fmt.Sprintf("Failed to CreateOrPatch PodGang %v", pgObjectKey), ) } + + // Update status with Initialized=False condition and Phase if not already set + // This needs to be done separately since CreateOrPatch doesn't handle status subresource + if !hasInitializedCondition(pg) { + if err := r.patchPodGangInitializedStatus(sc, pg.Name, metav1.ConditionFalse, "PodsPending", "Not all constituent pods have been created yet"); err != nil { + return err + } + } + r.eventRecorder.Eventf(sc.pcs, corev1.EventTypeNormal, constants.ReasonPodGangCreateOrUpdateSuccessful, "Created/Updated PodGang %v", pgObjectKey) sc.logger.Info("Triggered CreateOrPatch of PodGang", "objectKey", pgObjectKey) return nil } -// createPodGroupsForPodGang constructs PodGroups from constituent PodCliques. -func createPodGroupsForPodGang(namespace string, pgInfo *podGangInfo) []groveschedulerv1alpha1.PodGroup { - podGroups := lo.Map(pgInfo.pclqs, func(pi pclqInfo, _ int) groveschedulerv1alpha1.PodGroup { - namespacedNames := lo.Map(pi.associatedPodNames, func(associatedPodName string, _ int) groveschedulerv1alpha1.NamespacedName { - return groveschedulerv1alpha1.NamespacedName{ - Namespace: namespace, - Name: associatedPodName, - } - }) - // sorting the slice of NamespaceName. This prevents unnecessary updates to the PodGang resource if the only thing - // that is difference is the order of NamespaceNames. 
- sort.Slice(namespacedNames, func(i, j int) bool { - return namespacedNames[i].Name < namespacedNames[j].Name - }) - return groveschedulerv1alpha1.PodGroup{ - Name: pi.fqn, - PodReferences: namespacedNames, - MinReplicas: pi.minAvailable, - TopologyConstraint: pi.topologyConstraint, - } - }) - return podGroups -} - // Convenience types and methods on these types that are used during sync flow run. // ------------------------------------------------------------------------------------------------ @@ -656,9 +816,6 @@ func (sc *syncContext) determinePCSGReplicas(pcsgFQN string, pcsgConfig grovecor // syncFlowResult captures the result of a sync flow run. type syncFlowResult struct { - // podsGangsPendingCreation are the names of PodGangs that could not be created in this sync run. - // It could be due to all PCLQs not present, or it could be due to presence of at least one PCLQ that is not ready. - podsGangsPendingCreation []string // createdPodGangNames are the names of the PodGangs that got created during the sync flow run. createdPodGangNames []string // errs are the list of errors during the sync flow run. @@ -675,21 +832,11 @@ func (sfr *syncFlowResult) recordError(err error) { sfr.errs = append(sfr.errs, err) } -// hasPodGangsPendingCreation returns true if any PodGangs are waiting to be created. -func (sfr *syncFlowResult) hasPodGangsPendingCreation() bool { - return len(sfr.podsGangsPendingCreation) > 0 -} - // recordPodGangCreation adds a PodGang to the created list. func (sfr *syncFlowResult) recordPodGangCreation(podGangName string) { sfr.createdPodGangNames = append(sfr.createdPodGangNames, podGangName) } -// recordPodGangPendingCreation adds a PodGang to the pending creation list. -func (sfr *syncFlowResult) recordPodGangPendingCreation(podGangName string) { - sfr.podsGangsPendingCreation = append(sfr.podsGangsPendingCreation, podGangName) -} - // getAggregatedError combines all errors into a single error. 
func (sfr *syncFlowResult) getAggregatedError() error { return errors.Join(sfr.errs...) diff --git a/operator/internal/controller/podcliqueset/components/podgang/syncflow_test.go b/operator/internal/controller/podcliqueset/components/podgang/syncflow_test.go index 8275c2c61..d95c1dc44 100644 --- a/operator/internal/controller/podcliqueset/components/podgang/syncflow_test.go +++ b/operator/internal/controller/podcliqueset/components/podgang/syncflow_test.go @@ -18,19 +18,26 @@ package podgang import ( "context" + "errors" "slices" "testing" apicommon "github.com/ai-dynamo/grove/operator/api/common" grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + groveclientscheme "github.com/ai-dynamo/grove/operator/internal/client" + groveerr "github.com/ai-dynamo/grove/operator/internal/errors" testutils "github.com/ai-dynamo/grove/operator/test/utils" groveschedulerv1alpha1 "github.com/ai-dynamo/grove/scheduler/api/core/v1alpha1" "github.com/samber/lo" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/client" ctrllogger "sigs.k8s.io/controller-runtime/pkg/log" ) @@ -200,6 +207,95 @@ func TestMinAvailableWithHPAScaling(t *testing.T) { } } +// TestVerifyAllPodsCreated tests verifyAllPodsCreated with minimal sc + podGangInfo (no PCS/prepareSyncFlow). +// It covers both the PCLQ existence check and getPodsPendingCreationOrAssociation logic (Replicas and podgang label). 
+func TestVerifyAllPodsCreated(t *testing.T) { + makePod := func(name string, podGangLabel string) v1.Pod { + pod := v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: "default"}} + if podGangLabel != "" { + pod.Labels = map[string]string{apicommon.LabelPodGang: podGangLabel} + } + return pod + } + makePCLQ := func(name string, replicas, minAvailable int32) grovecorev1alpha1.PodClique { + return grovecorev1alpha1.PodClique{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: "default"}, + Spec: grovecorev1alpha1.PodCliqueSpec{Replicas: replicas, MinAvailable: ptr.To(minAvailable)}, + } + } + + tests := []struct { + name string + existingPods map[string][]v1.Pod + existingPCLQs []grovecorev1alpha1.PodClique + podGang *podGangInfo + wantRequeue bool + }{ + { + name: "requeue when not all constituent PCLQs exist yet", + existingPods: map[string][]v1.Pod{"pclq-a": {makePod("a1", "pg-1")}}, + existingPCLQs: []grovecorev1alpha1.PodClique{makePCLQ("pclq-a", 1, 1)}, + podGang: &podGangInfo{fqn: "pg-1", pclqs: []pclqInfo{{fqn: "pclq-a", replicas: 1, minAvailable: 1}, {fqn: "pclq-b", replicas: 1, minAvailable: 1}}}, + wantRequeue: true, + }, + { + name: "requeue when PCLQ has fewer pods than Replicas (even if >= MinAvailable)", + existingPods: map[string][]v1.Pod{ + "pclq-a": {makePod("a1", "pg-1"), makePod("a2", "pg-1")}, // 2 pods, Replicas=5, MinAvailable=2 + }, + existingPCLQs: []grovecorev1alpha1.PodClique{makePCLQ("pclq-a", 5, 2)}, + podGang: &podGangInfo{fqn: "pg-1", pclqs: []pclqInfo{{fqn: "pclq-a", replicas: 5, minAvailable: 2}}}, + wantRequeue: true, // Still pending: 5-2=3 pods to create + }, + { + name: "requeue when Pod missing podgang label", + existingPods: map[string][]v1.Pod{ + "pclq-a": {makePod("a1", ""), makePod("a2", "pg-1")}, // a1 missing label + }, + existingPCLQs: []grovecorev1alpha1.PodClique{makePCLQ("pclq-a", 2, 1)}, + podGang: &podGangInfo{fqn: "pg-1", pclqs: []pclqInfo{{fqn: "pclq-a", replicas: 2, minAvailable: 1}}}, + wantRequeue: 
true, // a1 needs association + }, + { + name: "requeue when Pod has wrong podgang label", + existingPods: map[string][]v1.Pod{ + "pclq-a": {makePod("a1", "pg-wrong"), makePod("a2", "pg-1")}, + }, + existingPCLQs: []grovecorev1alpha1.PodClique{makePCLQ("pclq-a", 2, 1)}, + podGang: &podGangInfo{fqn: "pg-1", pclqs: []pclqInfo{{fqn: "pclq-a", replicas: 2, minAvailable: 1}}}, + wantRequeue: true, // a1 has wrong label + }, + { + name: "success when all Replicas created and all pods have correct podgang label", + existingPods: map[string][]v1.Pod{ + "pclq-a": {makePod("a1", "pg-1"), makePod("a2", "pg-1"), makePod("a3", "pg-1"), makePod("a4", "pg-1"), makePod("a5", "pg-1")}, + }, + existingPCLQs: []grovecorev1alpha1.PodClique{makePCLQ("pclq-a", 5, 2)}, + podGang: &podGangInfo{fqn: "pg-1", pclqs: []pclqInfo{{fqn: "pclq-a", replicas: 5, minAvailable: 2}}}, + wantRequeue: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + sc := &syncContext{ + logger: ctrllogger.FromContext(context.Background()).WithName("test"), + existingPCLQPods: tt.existingPods, + existingPCLQs: tt.existingPCLQs, + } + r := &_resource{} + err := r.verifyAllPodsCreated(sc, tt.podGang.fqn, tt.podGang) + if tt.wantRequeue { + require.Error(t, err) + var groveErr *groveerr.GroveError + require.True(t, errors.As(err, &groveErr)) + assert.Equal(t, groveerr.ErrCodeRequeueAfter, groveErr.Code) + } else { + require.NoError(t, err) + } + }) + } +} + // This test checks the accounting of the number of pending pods before creating a PodGang func TestGetPodsPendingCreation(t *testing.T) { tests := []struct { @@ -311,6 +407,183 @@ func TestGetPodsPendingCreation(t *testing.T) { } } +// TestUpdatePodGangWithPodReferences unit tests updatePodGangWithPodReferences. 
+func TestUpdatePodGangWithPodReferences(t *testing.T) { + ctx := context.Background() + logger := ctrllogger.FromContext(ctx).WithName("test") + + t.Run("returns nil when PodGang not in expectedPodGangs", func(t *testing.T) { + sc := &syncContext{logger: logger, expectedPodGangs: []*podGangInfo{{fqn: "pg-a"}}} + r := &_resource{} + err := r.updatePodGangWithPodReferences(sc, "pg-other") + require.NoError(t, err) + }) + + t.Run("returns requeue error when verifyAllPodsCreated fails", func(t *testing.T) { + sc := &syncContext{ + logger: logger, + pcs: &grovecorev1alpha1.PodCliqueSet{ObjectMeta: metav1.ObjectMeta{Namespace: "default"}}, + expectedPodGangs: []*podGangInfo{{fqn: "pg-a", pclqs: []pclqInfo{{fqn: "pclq-a", minAvailable: 1}}}}, + existingPCLQPods: map[string][]v1.Pod{}, + existingPCLQs: []grovecorev1alpha1.PodClique{}, + } + r := &_resource{} + err := r.updatePodGangWithPodReferences(sc, "pg-a") + require.Error(t, err) + var groveErr *groveerr.GroveError + require.True(t, errors.As(err, &groveErr)) + assert.Equal(t, groveerr.ErrCodeRequeueAfter, groveErr.Code) + }) + + t.Run("patches PodReferences and Initialized when all pods ready", func(t *testing.T) { + ns := "default" + pcs := &grovecorev1alpha1.PodCliqueSet{ObjectMeta: metav1.ObjectMeta{Name: "pcs", Namespace: ns}} + pclq := &grovecorev1alpha1.PodClique{ + ObjectMeta: metav1.ObjectMeta{Name: "pclq-a", Namespace: ns}, + Spec: grovecorev1alpha1.PodCliqueSpec{Replicas: 1, MinAvailable: ptr.To(int32(1))}, + } + pgExisting := &groveschedulerv1alpha1.PodGang{ + ObjectMeta: metav1.ObjectMeta{Name: "pg-a", Namespace: ns}, + Spec: groveschedulerv1alpha1.PodGangSpec{}, + } + // Pod must have podgang label to pass verifyAllPodsCreated + pod := v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-1", + Namespace: ns, + Labels: map[string]string{apicommon.LabelPodGang: "pg-a"}, + }, + } + fakeClient := testutils.NewTestClientBuilder(). + WithObjects(pcs, pclq, pgExisting). 
+ WithStatusSubresource(&groveschedulerv1alpha1.PodGang{}). + Build() + sc := &syncContext{ + ctx: ctx, + logger: logger, + pcs: pcs, + expectedPodGangs: []*podGangInfo{{fqn: "pg-a", pclqs: []pclqInfo{{fqn: "pclq-a", replicas: 1, minAvailable: 1}}}}, + existingPodGangNames: []string{"pg-a"}, + existingPCLQPods: map[string][]v1.Pod{"pclq-a": {pod}}, + existingPCLQs: []grovecorev1alpha1.PodClique{*pclq}, + } + r := &_resource{client: fakeClient} + err := r.updatePodGangWithPodReferences(sc, "pg-a") + require.NoError(t, err) + pgAfter := &groveschedulerv1alpha1.PodGang{} + require.NoError(t, fakeClient.Get(ctx, client.ObjectKey{Namespace: ns, Name: "pg-a"}, pgAfter)) + require.Len(t, pgAfter.Spec.PodGroups, 1) + assert.Equal(t, "pclq-a", pgAfter.Spec.PodGroups[0].Name) + assert.Equal(t, []groveschedulerv1alpha1.NamespacedName{{Namespace: ns, Name: "pod-1"}}, pgAfter.Spec.PodGroups[0].PodReferences) + if len(pgAfter.Status.Conditions) > 0 { + assert.True(t, lo.ContainsBy(pgAfter.Status.Conditions, func(c metav1.Condition) bool { + return c.Type == string(groveschedulerv1alpha1.PodGangConditionTypeInitialized) && c.Status == metav1.ConditionTrue + })) + } + }) +} + +// TestCreateOrUpdatePodGangs tests the new flow: create PodGangs first, then update PodReferences when all pods are ready. 
+func TestCreateOrUpdatePodGangs(t *testing.T) { + ctx := context.Background() + ns := "default" + pcsName := "test-pcs" + pcsLabels := apicommon.GetDefaultLabelsForPodCliqueSetManagedResources(pcsName) + pcs := &grovecorev1alpha1.PodCliqueSet{ + ObjectMeta: metav1.ObjectMeta{Name: pcsName, Namespace: ns, UID: "pcs-uid"}, + Spec: grovecorev1alpha1.PodCliqueSetSpec{ + Replicas: 1, + Template: grovecorev1alpha1.PodCliqueSetTemplateSpec{ + Cliques: []*grovecorev1alpha1.PodCliqueTemplateSpec{ + {Name: "worker", Spec: grovecorev1alpha1.PodCliqueSpec{Replicas: 2, MinAvailable: ptr.To(int32(1))}}, + }, + }, + }, + } + pclqName := "test-pcs-0-worker" + pclq := &grovecorev1alpha1.PodClique{ + ObjectMeta: metav1.ObjectMeta{ + Name: pclqName, Namespace: ns, UID: types.UID("pclq-uid"), + Labels: pcsLabels, + OwnerReferences: []metav1.OwnerReference{{Name: pcsName, UID: pcs.UID, Controller: ptr.To(true)}}, + }, + Spec: grovecorev1alpha1.PodCliqueSpec{Replicas: 2, MinAvailable: ptr.To(int32(1))}, + } + pgLabels := lo.Assign(pcsLabels, map[string]string{apicommon.LabelComponentKey: apicommon.LabelComponentNamePodGang}) + pgCreated := &groveschedulerv1alpha1.PodGang{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pcs-0", Namespace: ns, + Labels: pgLabels, + OwnerReferences: []metav1.OwnerReference{{APIVersion: "grove.io/v1alpha1", Kind: "PodCliqueSet", Name: pcsName, UID: pcs.UID, Controller: ptr.To(true)}}, + }, + Spec: groveschedulerv1alpha1.PodGangSpec{}, + } + pod1 := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker-0", Namespace: ns, + Labels: lo.Assign(pcsLabels, map[string]string{apicommon.LabelPodGang: "test-pcs-0"}), + OwnerReferences: []metav1.OwnerReference{{Name: pclqName, UID: pclq.UID, Controller: ptr.To(true)}}, + }, + } + pod2 := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker-1", Namespace: ns, + Labels: lo.Assign(pcsLabels, map[string]string{apicommon.LabelPodGang: "test-pcs-0"}), + OwnerReferences: []metav1.OwnerReference{{Name: pclqName, UID: 
pclq.UID, Controller: ptr.To(true)}}, + }, + } + + t.Run("creates PodGang when not present (Step 1 loop)", func(t *testing.T) { + // No PodGang in cluster: Step 1 must create it via createOrUpdatePodGang. + fakeClient := testutils.NewTestClientBuilder(). + WithObjects(pcs, pclq). + WithStatusSubresource(&groveschedulerv1alpha1.PodGang{}). + Build() + r := &_resource{client: fakeClient, scheme: groveclientscheme.Scheme, eventRecorder: &record.FakeRecorder{}} + sc, err := r.prepareSyncFlow(ctx, ctrllogger.FromContext(ctx).WithName("test"), pcs) + require.NoError(t, err) + require.Len(t, sc.expectedPodGangs, 1, "expected one PodGang to create") + require.Empty(t, sc.existingPodGangNames, "PodGang should not exist yet") + + result := r.createOrUpdatePodGangs(sc) + require.False(t, result.hasErrors(), "createOrUpdatePodGangs should not fail: %v", result.errs) + require.Len(t, result.createdPodGangNames, 1, "Step 1 loop should have recorded one creation") + assert.Equal(t, "test-pcs-0", result.createdPodGangNames[0]) + + pgAfter := &groveschedulerv1alpha1.PodGang{} + require.NoError(t, fakeClient.Get(ctx, client.ObjectKey{Namespace: ns, Name: "test-pcs-0"}, pgAfter), + "PodGang should exist after Step 1 create") + assert.Equal(t, pcsName, pgAfter.OwnerReferences[0].Name) + }) + + t.Run("updates existing PodGang and fills PodReferences (Step 1 + Step 2)", func(t *testing.T) { + fakeClient := testutils.NewTestClientBuilder(). + WithObjects(pcs, pclq, pgCreated, pod1, pod2). + WithStatusSubresource(&groveschedulerv1alpha1.PodGang{}). 
+ Build() + r := &_resource{client: fakeClient, scheme: groveclientscheme.Scheme, eventRecorder: &record.FakeRecorder{}} + sc, err := r.prepareSyncFlow(ctx, ctrllogger.FromContext(ctx).WithName("test"), pcs) + require.NoError(t, err) + require.Contains(t, sc.existingPodGangNames, "test-pcs-0") + + result := r.createOrUpdatePodGangs(sc) + require.False(t, result.hasErrors(), "createOrUpdatePodGangs should not fail: %v", result.errs) + // Step 1 still runs and records (createOrUpdatePodGang does patch when exists) + require.Len(t, result.createdPodGangNames, 1) + + pgAfter := &groveschedulerv1alpha1.PodGang{} + require.NoError(t, fakeClient.Get(ctx, client.ObjectKey{Namespace: ns, Name: "test-pcs-0"}, pgAfter)) + require.Len(t, pgAfter.Spec.PodGroups, 1) + assert.Equal(t, "test-pcs-0-worker", pgAfter.Spec.PodGroups[0].Name) + assert.Len(t, pgAfter.Spec.PodGroups[0].PodReferences, 2) + if len(pgAfter.Status.Conditions) > 0 { + assert.True(t, lo.ContainsBy(pgAfter.Status.Conditions, func(c metav1.Condition) bool { + return c.Type == string(groveschedulerv1alpha1.PodGangConditionTypeInitialized) && c.Status == metav1.ConditionTrue + })) + } + }) +} + // TestComputeExpectedPodGangs tests the computeExpectedPodGangs function func TestComputeExpectedPodGangs(t *testing.T) { tests := []struct { diff --git a/operator/internal/controller/podgang/reconciler.go b/operator/internal/controller/podgang/reconciler.go new file mode 100644 index 000000000..bc67aaa81 --- /dev/null +++ b/operator/internal/controller/podgang/reconciler.go @@ -0,0 +1,87 @@ +// /* +// Copyright 2025 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package podgang + +import ( + "context" + + apicommon "github.com/ai-dynamo/grove/operator/api/common" + "github.com/ai-dynamo/grove/operator/internal/schedulerbackend" + + groveschedulerv1alpha1 "github.com/ai-dynamo/grove/scheduler/api/core/v1alpha1" + "k8s.io/apimachinery/pkg/runtime" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +// Reconciler reconciles PodGang objects and converts them to scheduler-specific CRs +type Reconciler struct { + client.Client + scheme *runtime.Scheme +} + +// NewReconciler creates a new Reconciler. Backend is resolved per PodGang from the grove.io/scheduler-name label or default. 
+func NewReconciler(mgr ctrl.Manager) *Reconciler { + return &Reconciler{ + Client: mgr.GetClient(), + scheme: mgr.GetScheme(), + } +} + +func resolveBackend(podGang *groveschedulerv1alpha1.PodGang) schedulerbackend.SchedBackend { + if name := podGang.Labels[apicommon.LabelSchedulerName]; name != "" { + if b := schedulerbackend.Get(name); b != nil { + return b + } + } + return schedulerbackend.GetDefault() +} + +// Reconcile processes PodGang changes and synchronizes to backend-specific CRs +func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + podGang := &groveschedulerv1alpha1.PodGang{} + if err := r.Get(ctx, req.NamespacedName, podGang); err != nil { + if client.IgnoreNotFound(err) != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, nil + } + + backend := resolveBackend(podGang) + if backend == nil { + log.FromContext(ctx).Error(nil, "No scheduler backend available for PodGang", "podgang", req.NamespacedName) + return ctrl.Result{}, nil + } + + logger := log.FromContext(ctx).WithValues("scheduler", backend.Name(), "podGang", req.NamespacedName) + if !podGang.DeletionTimestamp.IsZero() { + logger.Info("PodGang is being deleted") + if err := backend.OnPodGangDelete(ctx, podGang); err != nil { + logger.Error(err, "Failed to delete scheduler backend resources on-delete of PodGang") + return ctrl.Result{}, err + } + return ctrl.Result{}, nil + } + + if err := backend.SyncPodGang(ctx, podGang); err != nil { + logger.Error(err, "Failed to SyncPodGang on spec change") + return ctrl.Result{}, err + } + logger.Info("Successfully synced PodGang") + return ctrl.Result{}, nil +} diff --git a/operator/internal/controller/podgang/register.go b/operator/internal/controller/podgang/register.go new file mode 100644 index 000000000..6243f3d6b --- /dev/null +++ b/operator/internal/controller/podgang/register.go @@ -0,0 +1,54 @@ +// /* +// Copyright 2025 The Grove Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package podgang + +import ( + grovectrlutils "github.com/ai-dynamo/grove/operator/internal/controller/utils" + + groveschedulerv1alpha1 "github.com/ai-dynamo/grove/scheduler/api/core/v1alpha1" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/predicate" +) + +// RegisterWithManager registers the backend controller with the manager +func (r *Reconciler) RegisterWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&groveschedulerv1alpha1.PodGang{}, builder.WithPredicates(podGangSpecChangePredicate())). + Named("podgang"). 
+ Complete(r) +} + +// podGangSpecChangePredicate filters PodGang events to only process spec changes +// Status-only updates (like Initialized condition) are ignored +func podGangSpecChangePredicate() predicate.Predicate { + return predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { + return grovectrlutils.IsManagedPodGang(e.Object) + }, + DeleteFunc: func(e event.DeleteEvent) bool { + return grovectrlutils.IsManagedPodGang(e.Object) + }, + UpdateFunc: func(e event.UpdateEvent) bool { + return grovectrlutils.IsManagedPodGang(e.ObjectOld) && + grovectrlutils.IsManagedPodGang(e.ObjectNew) && + (e.ObjectOld.GetGeneration() != e.ObjectNew.GetGeneration()) + }, + GenericFunc: func(_ event.GenericEvent) bool { return false }, + } +} diff --git a/operator/internal/controller/podgang/register_test.go b/operator/internal/controller/podgang/register_test.go new file mode 100644 index 000000000..9d3fa8607 --- /dev/null +++ b/operator/internal/controller/podgang/register_test.go @@ -0,0 +1,135 @@ +// /* +// Copyright 2025 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package podgang + +import ( + "testing" + + testutils "github.com/ai-dynamo/grove/operator/test/utils" + + "github.com/stretchr/testify/assert" + "sigs.k8s.io/controller-runtime/pkg/event" +) + +// predicateTestCase describes a scenario and expected predicate result per event type. 
+type predicateTestCase struct { + name string + managedOld bool + managedNew bool + generationChanged bool + shouldAllowCreateEvent bool + shouldAllowDeleteEvent bool + shouldAllowGenericEvent bool + shouldAllowUpdateEvent bool +} + +func TestPodGangSpecChangePredicate(t *testing.T) { + pred := podGangSpecChangePredicate() + + tests := []predicateTestCase{ + { + name: "managed PodGang create", + managedOld: true, + managedNew: true, + shouldAllowCreateEvent: true, + shouldAllowDeleteEvent: true, + shouldAllowGenericEvent: false, + shouldAllowUpdateEvent: false, + }, + { + name: "unmanaged PodGang create", + managedOld: false, + managedNew: false, + shouldAllowCreateEvent: false, + shouldAllowDeleteEvent: false, + shouldAllowGenericEvent: false, + shouldAllowUpdateEvent: false, + }, + { + name: "managed PodGang update with spec change (generation changed)", + managedOld: true, + managedNew: true, + generationChanged: true, + shouldAllowCreateEvent: true, + shouldAllowDeleteEvent: true, + shouldAllowGenericEvent: false, + shouldAllowUpdateEvent: true, + }, + { + name: "managed PodGang update with status-only change (generation unchanged)", + managedOld: true, + managedNew: true, + generationChanged: false, + shouldAllowCreateEvent: true, + shouldAllowDeleteEvent: true, + shouldAllowGenericEvent: false, + shouldAllowUpdateEvent: false, + }, + { + name: "update with old managed and new unmanaged", + managedOld: true, + managedNew: false, + generationChanged: true, + shouldAllowCreateEvent: false, // Create/Delete use newPG which is unmanaged + shouldAllowDeleteEvent: false, + shouldAllowGenericEvent: false, + shouldAllowUpdateEvent: false, + }, + { + name: "update with old unmanaged and new managed", + managedOld: false, + managedNew: true, + generationChanged: true, + shouldAllowCreateEvent: true, // Create/Delete use newPG which is managed + shouldAllowDeleteEvent: true, + shouldAllowGenericEvent: false, + shouldAllowUpdateEvent: false, // old is unmanaged + }, + { 
+ name: "generic event always rejected", + managedOld: true, + managedNew: true, + shouldAllowCreateEvent: true, + shouldAllowDeleteEvent: true, + shouldAllowGenericEvent: false, + shouldAllowUpdateEvent: false, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + oldPG := testutils.NewPodGangBuilder("test-pg", "default"). + WithGeneration(1). + WithManaged(tc.managedOld). + WithPodGroup("pg0", 1). + Build() + newPG := testutils.NewPodGangBuilder("test-pg", "default"). + WithGeneration(1). + WithManaged(tc.managedNew). + WithPodGroup("pg0", 1). + Build() + if tc.generationChanged { + newPG.SetGeneration(oldPG.GetGeneration() + 1) + } + + assert.Equal(t, tc.shouldAllowCreateEvent, pred.Create(event.CreateEvent{Object: newPG}), "Create") + assert.Equal(t, tc.shouldAllowDeleteEvent, pred.Delete(event.DeleteEvent{Object: newPG}), "Delete") + assert.Equal(t, tc.shouldAllowGenericEvent, pred.Generic(event.GenericEvent{Object: newPG}), "Generic") + assert.Equal(t, tc.shouldAllowUpdateEvent, pred.Update(event.UpdateEvent{ObjectOld: oldPG, ObjectNew: newPG}), "Update") + }) + } +} diff --git a/operator/internal/controller/register.go b/operator/internal/controller/register.go index 5e613d80f..c8bcb817e 100644 --- a/operator/internal/controller/register.go +++ b/operator/internal/controller/register.go @@ -17,27 +17,39 @@ package controller import ( + "fmt" + configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" "github.com/ai-dynamo/grove/operator/internal/controller/podclique" "github.com/ai-dynamo/grove/operator/internal/controller/podcliquescalinggroup" "github.com/ai-dynamo/grove/operator/internal/controller/podcliqueset" + "github.com/ai-dynamo/grove/operator/internal/controller/podgang" ctrl "sigs.k8s.io/controller-runtime" ) // RegisterControllers registers all controllers with the manager. 
-func RegisterControllers(mgr ctrl.Manager, controllerConfig configv1alpha1.ControllerConfiguration, topologyAwareSchedulingConfig configv1alpha1.TopologyAwareSchedulingConfiguration, networkConfig configv1alpha1.NetworkAcceleration) error { - pcsReconciler := podcliqueset.NewReconciler(mgr, controllerConfig.PodCliqueSet, topologyAwareSchedulingConfig, networkConfig) +func RegisterControllers(mgr ctrl.Manager, config *configv1alpha1.OperatorConfiguration) error { + if config == nil { + return fmt.Errorf("operator configuration must not be nil") + } + pcsReconciler := podcliqueset.NewReconciler(mgr, config.Controllers.PodCliqueSet, config.TopologyAwareScheduling, config.Network) if err := pcsReconciler.RegisterWithManager(mgr); err != nil { return err } - pcReconciler := podclique.NewReconciler(mgr, controllerConfig.PodClique) + pcReconciler := podclique.NewReconciler(mgr, config.Controllers.PodClique) if err := pcReconciler.RegisterWithManager(mgr); err != nil { return err } - pcsgReconciler := podcliquescalinggroup.NewReconciler(mgr, controllerConfig.PodCliqueScalingGroup) + pcsgReconciler := podcliquescalinggroup.NewReconciler(mgr, config.Controllers.PodCliqueScalingGroup) if err := pcsgReconciler.RegisterWithManager(mgr); err != nil { return err } + + podgangReconciler := podgang.NewReconciler(mgr) + if err := podgangReconciler.RegisterWithManager(mgr); err != nil { + return err + } + return nil } diff --git a/operator/internal/controller/register_test.go b/operator/internal/controller/register_test.go index d647ba4c2..d42250471 100644 --- a/operator/internal/controller/register_test.go +++ b/operator/internal/controller/register_test.go @@ -49,40 +49,22 @@ func TestRegisterControllers(t *testing.T) { mgr, err := ctrl.NewManager(cfg, ctrl.Options{}) require.NoError(t, err) - controllerConfig := configv1alpha1.ControllerConfiguration{ - PodCliqueSet: configv1alpha1.PodCliqueSetControllerConfiguration{ - ConcurrentSyncs: ptr.To(1), - }, - PodClique: 
configv1alpha1.PodCliqueControllerConfiguration{ - ConcurrentSyncs: ptr.To(1), - }, - PodCliqueScalingGroup: configv1alpha1.PodCliqueScalingGroupControllerConfiguration{ - ConcurrentSyncs: ptr.To(1), - }, - } - - err = RegisterControllers(mgr, controllerConfig, configv1alpha1.TopologyAwareSchedulingConfiguration{}, configv1alpha1.NetworkAcceleration{}) - require.NoError(t, err) - }) - - // Test registration with different concurrency settings - t.Run("registration with higher concurrency", func(t *testing.T) { - mgr, err := ctrl.NewManager(cfg, ctrl.Options{}) - require.NoError(t, err) - - controllerConfig := configv1alpha1.ControllerConfiguration{ - PodCliqueSet: configv1alpha1.PodCliqueSetControllerConfiguration{ - ConcurrentSyncs: ptr.To(5), - }, - PodClique: configv1alpha1.PodCliqueControllerConfiguration{ - ConcurrentSyncs: ptr.To(10), - }, - PodCliqueScalingGroup: configv1alpha1.PodCliqueScalingGroupControllerConfiguration{ - ConcurrentSyncs: ptr.To(3), + operatorConfig := configv1alpha1.OperatorConfiguration{ + Scheduler: configv1alpha1.SchedulerConfiguration{Profiles: []configv1alpha1.SchedulerProfile{{Name: configv1alpha1.SchedulerNameKai}}, DefaultProfileName: string(configv1alpha1.SchedulerNameKai)}, + Controllers: configv1alpha1.ControllerConfiguration{ + PodCliqueSet: configv1alpha1.PodCliqueSetControllerConfiguration{ + ConcurrentSyncs: ptr.To(1), + }, + PodClique: configv1alpha1.PodCliqueControllerConfiguration{ + ConcurrentSyncs: ptr.To(1), + }, + PodCliqueScalingGroup: configv1alpha1.PodCliqueScalingGroupControllerConfiguration{ + ConcurrentSyncs: ptr.To(1), + }, }, } - err = RegisterControllers(mgr, controllerConfig, configv1alpha1.TopologyAwareSchedulingConfiguration{}, configv1alpha1.NetworkAcceleration{}) + err = RegisterControllers(mgr, &operatorConfig) require.NoError(t, err) }) } diff --git a/operator/internal/controller/utils/managedresource.go b/operator/internal/controller/utils/managedresource.go index ed93503d0..de9752342 100644 --- 
a/operator/internal/controller/utils/managedresource.go +++ b/operator/internal/controller/utils/managedresource.go @@ -20,6 +20,7 @@ import ( apicommon "github.com/ai-dynamo/grove/operator/api/common" grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + groveschedulerv1alpha1 "github.com/ai-dynamo/grove/scheduler/api/core/v1alpha1" "github.com/samber/lo" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" @@ -53,3 +54,12 @@ func IsManagedPodClique(obj client.Object, expectedOwnerKinds ...string) bool { }, false) return IsManagedByGrove(podClique.GetLabels()) && hasExpectedOwner } + +// IsManagedPodGang checks if the PodGang is managed by Grove. +func IsManagedPodGang(obj client.Object) bool { + podGang, ok := obj.(*groveschedulerv1alpha1.PodGang) + if !ok { + return false + } + return IsManagedByGrove(podGang.Labels) +} diff --git a/operator/internal/schedulerbackend/kaischeduler/backend.go b/operator/internal/schedulerbackend/kaischeduler/backend.go new file mode 100644 index 000000000..adfc2feb3 --- /dev/null +++ b/operator/internal/schedulerbackend/kaischeduler/backend.go @@ -0,0 +1,83 @@ +// /* +// Copyright 2025 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// */ + +package kaischeduler + +import ( + "context" + + configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" + grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + + groveschedulerv1alpha1 "github.com/ai-dynamo/grove/scheduler/api/core/v1alpha1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/record" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// Backend implements the scheduler backend interface (SchedBackend in schedulerbackend package) for KAI scheduler. +// TODO: Converts PodGang → PodGroup +type Backend struct { + client client.Client + scheme *runtime.Scheme + name string + eventRecorder record.EventRecorder + profile configv1alpha1.SchedulerProfile +} + +// New creates a new KAI backend instance. profile is the scheduler profile for kai-scheduler; +// Backend uses profile.Name and may unmarshal profile.Config for kai-specific options. +func New(cl client.Client, scheme *runtime.Scheme, eventRecorder record.EventRecorder, profile configv1alpha1.SchedulerProfile) *Backend { + return &Backend{ + client: cl, + scheme: scheme, + name: "kai-scheduler", + eventRecorder: eventRecorder, + profile: profile, + } +} + +// Name returns the pod-facing scheduler name (kai-scheduler), for lookup and logging. +func (b *Backend) Name() string { + return b.name +} + +// Init initializes the KAI backend +func (b *Backend) Init() error { + return nil +} + +// SyncPodGang converts PodGang to KAI PodGroup and synchronizes it +func (b *Backend) SyncPodGang(_ context.Context, _ *groveschedulerv1alpha1.PodGang) error { + return nil +} + +// OnPodGangDelete removes the PodGroup owned by this PodGang +func (b *Backend) OnPodGangDelete(_ context.Context, _ *groveschedulerv1alpha1.PodGang) error { + return nil +} + +// PreparePod adds KAI scheduler-specific configuration to the Pod. +// Sets Pod.Spec.SchedulerName so the pod is scheduled by KAI. 
+func (b *Backend) PreparePod(pod *corev1.Pod) { + pod.Spec.SchedulerName = b.Name() +} + +// ValidatePodCliqueSet runs KAI-specific validations on the PodCliqueSet. +func (b *Backend) ValidatePodCliqueSet(_ context.Context, _ *grovecorev1alpha1.PodCliqueSet) error { + return nil +} diff --git a/operator/internal/schedulerbackend/kaischeduler/backend_test.go b/operator/internal/schedulerbackend/kaischeduler/backend_test.go new file mode 100644 index 000000000..3fbd11ce1 --- /dev/null +++ b/operator/internal/schedulerbackend/kaischeduler/backend_test.go @@ -0,0 +1,42 @@ +// /* +// Copyright 2026 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package kaischeduler + +import ( + "testing" + + configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" + testutils "github.com/ai-dynamo/grove/operator/test/utils" + + "github.com/stretchr/testify/assert" + "k8s.io/client-go/tools/record" +) + +func TestBackend_PreparePod(t *testing.T) { + cl := testutils.CreateDefaultFakeClient(nil) + recorder := record.NewFakeRecorder(10) + profile := configv1alpha1.SchedulerProfile{Name: configv1alpha1.SchedulerNameKai} + b := New(cl, cl.Scheme(), recorder, profile) + + pod := testutils.NewPodBuilder("test-pod", "default"). + WithSchedulerName("default-scheduler"). 
+ Build() + + b.PreparePod(pod) + + assert.Equal(t, "kai-scheduler", pod.Spec.SchedulerName) +} diff --git a/operator/internal/schedulerbackend/kube/backend.go b/operator/internal/schedulerbackend/kube/backend.go new file mode 100644 index 000000000..2a678097d --- /dev/null +++ b/operator/internal/schedulerbackend/kube/backend.go @@ -0,0 +1,91 @@ +// /* +// Copyright 2025 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package kube + +import ( + "context" + + configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" + grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + + groveschedulerv1alpha1 "github.com/ai-dynamo/grove/scheduler/api/core/v1alpha1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/record" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// PodSchedulerName is the value set on Pod.Spec.SchedulerName for the Kubernetes default scheduler. +const PodSchedulerName = "default-scheduler" + +// Backend implements the scheduler backend interface (SchedBackend in schedulerbackend package) for Kubernetes default scheduler. +// This backend does minimal work - just sets the scheduler name on pods +type Backend struct { + client client.Client + scheme *runtime.Scheme + name string + eventRecorder record.EventRecorder + profile configv1alpha1.SchedulerProfile +} + +// New creates a new Kube backend instance. 
profile is the scheduler profile for default-scheduler; +// Backend uses profile.Name and may unmarshal profile.Config into KubeSchedulerConfig. +func New(cl client.Client, scheme *runtime.Scheme, eventRecorder record.EventRecorder, profile configv1alpha1.SchedulerProfile) *Backend { + return &Backend{ + client: cl, + scheme: scheme, + name: "default-scheduler", + eventRecorder: eventRecorder, + profile: profile, + } +} + +// Name returns the pod-facing scheduler name (default-scheduler), for lookup and logging. +func (b *Backend) Name() string { + return b.name +} + +// Init initializes the Kube backend +// For Kube backend, no special initialization is needed +func (b *Backend) Init() error { + return nil +} + +// SyncPodGang synchronizes PodGang resources +// For default kube scheduler, no additional resources are needed +func (b *Backend) SyncPodGang(_ context.Context, _ *groveschedulerv1alpha1.PodGang) error { + // No-op: default kube scheduler doesn't need any custom resources + return nil +} + +// OnPodGangDelete handles PodGang deletion +// For default kube scheduler, no cleanup is needed +func (b *Backend) OnPodGangDelete(_ context.Context, _ *groveschedulerv1alpha1.PodGang) error { + // No-op: default kube scheduler doesn't have any resources to clean up + return nil +} + +// PreparePod adds Kubernetes default scheduler-specific configuration to the Pod. +// Pod.Spec.SchedulerName is set to "default-scheduler" (the value expected by kube-apiserver / kube-scheduler). +func (b *Backend) PreparePod(pod *corev1.Pod) { + pod.Spec.SchedulerName = b.name +} + +// ValidatePodCliqueSet runs default-scheduler-specific validations on the PodCliqueSet. 
+func (b *Backend) ValidatePodCliqueSet(_ context.Context, _ *grovecorev1alpha1.PodCliqueSet) error { + return nil +} diff --git a/operator/internal/schedulerbackend/kube/backend_test.go b/operator/internal/schedulerbackend/kube/backend_test.go new file mode 100644 index 000000000..dbed8885b --- /dev/null +++ b/operator/internal/schedulerbackend/kube/backend_test.go @@ -0,0 +1,40 @@ +// /* +// Copyright 2026 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package kube + +import ( + "testing" + + configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" + testutils "github.com/ai-dynamo/grove/operator/test/utils" + + "github.com/stretchr/testify/assert" + "k8s.io/client-go/tools/record" +) + +func TestBackend_PreparePod(t *testing.T) { + cl := testutils.CreateDefaultFakeClient(nil) + recorder := record.NewFakeRecorder(10) + profile := configv1alpha1.SchedulerProfile{Name: configv1alpha1.SchedulerNameKube} + b := New(cl, cl.Scheme(), recorder, profile) + + pod := testutils.NewPodBuilder("test-pod", "default").Build() + + b.PreparePod(pod) + + assert.Equal(t, PodSchedulerName, pod.Spec.SchedulerName) +} diff --git a/operator/internal/schedulerbackend/manager.go b/operator/internal/schedulerbackend/manager.go new file mode 100644 index 000000000..a42218067 --- /dev/null +++ b/operator/internal/schedulerbackend/manager.go @@ -0,0 +1,95 @@ +// /* +// Copyright 2025 The Grove Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package schedulerbackend + +import ( + "fmt" + + configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" + "github.com/ai-dynamo/grove/operator/internal/schedulerbackend/kaischeduler" + "github.com/ai-dynamo/grove/operator/internal/schedulerbackend/kube" + + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/record" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// Compile-time checks that backend implementations satisfy SchedBackend. +var ( + _ SchedBackend = (*kaischeduler.Backend)(nil) + _ SchedBackend = (*kube.Backend)(nil) +) + +// newBackendForProfile creates and initializes a SchedBackend for the given profile. +// Add new scheduler backends by extending this switch (no global registry). 
+func newBackendForProfile(cl client.Client, scheme *runtime.Scheme, rec record.EventRecorder, p configv1alpha1.SchedulerProfile) (SchedBackend, error) { + switch p.Name { + case configv1alpha1.SchedulerNameKube: + b := kube.New(cl, scheme, rec, p) + if err := b.Init(); err != nil { + return nil, err + } + return b, nil + case configv1alpha1.SchedulerNameKai: + b := kaischeduler.New(cl, scheme, rec, p) + if err := b.Init(); err != nil { + return nil, err + } + return b, nil + default: + return nil, fmt.Errorf("scheduler profile %q is not supported", p.Name) + } +} + +var ( + backends map[string]SchedBackend + defaultBackend SchedBackend +) + +// Initialize creates and registers backend instances for each profile in config.Profiles. +// Defaults are applied to config so that kube-scheduler is always present; only backends +// named in config.Profiles are started. Called once during operator startup before controllers start. +func Initialize(client client.Client, scheme *runtime.Scheme, eventRecorder record.EventRecorder, cfg configv1alpha1.SchedulerConfiguration) error { + backends = make(map[string]SchedBackend) + + // New and init each backend from cfg.Profiles (order follows config; duplicate name overwrites). + for _, p := range cfg.Profiles { + backend, err := newBackendForProfile(client, scheme, eventRecorder, p) + if err != nil { + return fmt.Errorf("failed to initialize %s backend: %w", p.Name, err) + } + backends[backend.Name()] = backend + if cfg.DefaultProfileName != "" && string(p.Name) == cfg.DefaultProfileName { + defaultBackend = backend + } + } + return nil +} + +// Get returns the backend for the given name. Empty string is valid and returns the default backend (e.g. when Pod.Spec.SchedulerName is unset). +// default-scheduler is always available; other backends return nil if not enabled via a profile. 
// Get returns the backend registered under the given name. The empty string is valid
// and returns the default backend (e.g. when Pod.Spec.SchedulerName is unset).
// Returns nil for names that were not enabled via a profile passed to Initialize.
// NOTE(review): default-scheduler is only guaranteed present if configuration
// defaulting always injects a kube-scheduler profile — confirm against the
// defaulting code before relying on it.
func Get(name string) SchedBackend {
	if name == "" {
		return defaultBackend
	}
	return backends[name]
}

// GetDefault returns the backend designated as default in OperatorConfiguration
// (scheduler.defaultProfileName), or nil if Initialize has not resolved one.
func GetDefault() SchedBackend {
	return defaultBackend
}
// TestInitialize tests backend initialization with different schedulers: each case
// configures a single profile that is also the default, then checks either the
// resolved default backend's name or the returned error.
func TestInitialize(t *testing.T) {
	tests := []struct {
		name          string
		schedulerName configv1alpha1.SchedulerName
		wantErr       bool
		errContains   string
		expectedName  string
	}{
		{
			name:          "kai scheduler initialization",
			schedulerName: configv1alpha1.SchedulerNameKai,
			wantErr:       false,
			expectedName:  "kai-scheduler",
		},
		{
			name:          "default scheduler initialization",
			schedulerName: configv1alpha1.SchedulerNameKube,
			wantErr:       false,
			expectedName:  "default-scheduler", // kube backend's Name() is the pod-facing name
		},
		{
			name:          "unsupported scheduler",
			schedulerName: "volcano",
			wantErr:       true,
			errContains:   "not supported",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Reset global state before each test so cases do not leak registry
			// entries or a stale default backend into each other.
			backends = nil
			defaultBackend = nil

			cl := testutils.CreateDefaultFakeClient(nil)
			recorder := record.NewFakeRecorder(10)

			// Single-profile configuration whose default points at that same profile.
			cfg := configv1alpha1.SchedulerConfiguration{
				Profiles: []configv1alpha1.SchedulerProfile{
					{Name: tt.schedulerName},
				},
				DefaultProfileName: string(tt.schedulerName),
			}
			err := Initialize(cl, cl.Scheme(), recorder, cfg)

			if tt.wantErr {
				require.Error(t, err)
				assert.Contains(t, err.Error(), tt.errContains)
				assert.Nil(t, GetDefault())
			} else {
				require.NoError(t, err)
				require.NotNil(t, GetDefault())
				name := GetDefault().Name()
				assert.Equal(t, tt.expectedName, name)
				assert.Equal(t, GetDefault(), Get(name)) // backend is stored under its Name()
			}
		})
	}
}
// SchedBackend defines the interface that different scheduler backends must implement.
// It is defined in this package (consumer side) so that the kube and kaischeduler
// subpackages need not import schedulerbackend, avoiding circular dependencies
// (see "accept interfaces, return structs" and consumer-defined interfaces in Go / Kubernetes).
//
// Architecture: a SchedBackend validates PodCliqueSet at admission, converts PodGang to a
// scheduler-specific CR (PodGroup/Workload/etc), and prepares Pods with scheduler-specific
// configuration.
type SchedBackend interface {
	// Name is a unique name of the scheduler backend.
	Name() string

	// Init provides a hook to initialize/set up one-time scheduler resources,
	// called at the startup of the grove operator.
	Init() error

	// SyncPodGang synchronizes (creates/updates) scheduler-specific resources for a PodGang,
	// reacting to a creation or update of a PodGang resource.
	SyncPodGang(ctx context.Context, podGang *groveschedulerv1alpha1.PodGang) error

	// OnPodGangDelete cleans up scheduler-specific resources for the given PodGang.
	OnPodGangDelete(ctx context.Context, podGang *groveschedulerv1alpha1.PodGang) error

	// PreparePod adds scheduler-backend-specific configuration to the given Pod object
	// prior to its creation (schedulerName, annotations, etc.).
	PreparePod(pod *corev1.Pod)

	// ValidatePodCliqueSet runs scheduler-specific validations on the PodCliqueSet
	// (e.g. topology-aware scheduling required but not supported).
	ValidatePodCliqueSet(ctx context.Context, pcs *grovecorev1alpha1.PodCliqueSet) error
}
// NewHandler creates a new handler for the PodCliqueSet webhook.
// It reads the TopologyAwareScheduling, Network, and Scheduler sections from the
// operator configuration. operatorCfg must not be nil — it is dereferenced
// unconditionally below, so a nil value panics at startup.
func NewHandler(mgr manager.Manager, operatorCfg *configv1alpha1.OperatorConfiguration) *Handler {
	return &Handler{
		logger:          mgr.GetLogger().WithName("webhook").WithName(Name),
		tasConfig:       operatorCfg.TopologyAwareScheduling,
		networkConfig:   operatorCfg.Network,
		schedulerConfig: operatorCfg.Scheduler,
	}
}
+ // Scheduler-backend-specific validation + if err := validatePodCliqueSetWithBackend(ctx, newPCS); err != nil { + errs = append(errs, field.Invalid(field.NewPath("spec"), newPCS.Spec, err.Error())) + } + if len(errs) > 0 { return warnings, errs.ToAggregate() } @@ -105,6 +120,20 @@ func (h *Handler) ValidateDelete(_ context.Context, _ runtime.Object) (admission return nil, nil } +// validatePodCliqueSetWithBackend resolves the scheduler backend for the PCS and runs backend-specific validation. +// All cliques share the same (resolved) schedulerName after validateSchedulerNames, so we use the first clique; empty is resolved by Get(""). +func validatePodCliqueSetWithBackend(ctx context.Context, pcs *v1alpha1.PodCliqueSet) error { + schedulerName := "" + if len(pcs.Spec.Template.Cliques) > 0 && pcs.Spec.Template.Cliques[0] != nil { + schedulerName = pcs.Spec.Template.Cliques[0].Spec.PodSpec.SchedulerName + } + backend := schedulerbackend.Get(schedulerName) + if backend == nil { + return nil + } + return backend.ValidatePodCliqueSet(ctx, pcs) +} + // castToPodCliqueSet attempts to cast a runtime.Object to a PodCliqueSet. 
func castToPodCliqueSet(obj runtime.Object) (*v1alpha1.PodCliqueSet, error) { pcs, ok := obj.(*v1alpha1.PodCliqueSet) diff --git a/operator/internal/webhook/admission/pcs/validation/handler_mnnvl_test.go b/operator/internal/webhook/admission/pcs/validation/handler_mnnvl_test.go index 318497528..d45f1fd33 100644 --- a/operator/internal/webhook/admission/pcs/validation/handler_mnnvl_test.go +++ b/operator/internal/webhook/admission/pcs/validation/handler_mnnvl_test.go @@ -84,7 +84,12 @@ func TestValidateCreate_MNNVL(t *testing.T) { networkConfig := configv1alpha1.NetworkAcceleration{ AutoMNNVLEnabled: tt.autoMNNVLEnabled, } - handler := NewHandler(mgr, getDefaultTASConfig(), networkConfig) + cfg := configv1alpha1.OperatorConfiguration{ + TopologyAwareScheduling: getDefaultTASConfig(), + Network: networkConfig, + Scheduler: configv1alpha1.SchedulerConfiguration{Profiles: []configv1alpha1.SchedulerProfile{{Name: configv1alpha1.SchedulerNameKube}}, DefaultProfileName: string(configv1alpha1.SchedulerNameKube)}, + } + handler := NewHandler(mgr, &cfg) ctx := context.Background() warnings, err := handler.ValidateCreate(ctx, tt.pcs) @@ -163,7 +168,12 @@ func TestValidateUpdate_MNNVL(t *testing.T) { } // MNNVL validation on update doesn't depend on feature flag - handler := NewHandler(mgr, getDefaultTASConfig(), getDefaultNetworkConfig()) + cfg := configv1alpha1.OperatorConfiguration{ + TopologyAwareScheduling: getDefaultTASConfig(), + Network: getDefaultNetworkConfig(), + Scheduler: configv1alpha1.SchedulerConfiguration{Profiles: []configv1alpha1.SchedulerProfile{{Name: configv1alpha1.SchedulerNameKube}}, DefaultProfileName: string(configv1alpha1.SchedulerNameKube)}, + } + handler := NewHandler(mgr, &cfg) ctx := context.Background() warnings, err := handler.ValidateUpdate(ctx, tt.oldPCS, tt.newPCS) @@ -245,7 +255,12 @@ func TestMNNVL_WebhookPipeline_LegacyPCSUpdate(t *testing.T) { require.NoError(t, err, "defaulting webhook should not error on update") // Step 2: Simulate 
the validating webhook running with oldPCS vs (possibly mutated) newPCS. - validationHandler := NewHandler(mgr, getDefaultTASConfig(), networkConfig) + validationCfg := configv1alpha1.OperatorConfiguration{ + TopologyAwareScheduling: getDefaultTASConfig(), + Network: networkConfig, + Scheduler: configv1alpha1.SchedulerConfiguration{Profiles: []configv1alpha1.SchedulerProfile{{Name: configv1alpha1.SchedulerNameKube}}, DefaultProfileName: string(configv1alpha1.SchedulerNameKube)}, + } + validationHandler := NewHandler(mgr, &validationCfg) ctx := context.Background() warnings, err := validationHandler.ValidateUpdate(ctx, oldPCS, newPCS) diff --git a/operator/internal/webhook/admission/pcs/validation/handler_test.go b/operator/internal/webhook/admission/pcs/validation/handler_test.go index ccf15f11b..2b5b134e7 100644 --- a/operator/internal/webhook/admission/pcs/validation/handler_test.go +++ b/operator/internal/webhook/admission/pcs/validation/handler_test.go @@ -47,7 +47,12 @@ func TestNewHandler(t *testing.T) { Logger: logr.Discard(), } - handler := NewHandler(mgr, getDefaultTASConfig(), getDefaultNetworkConfig()) + cfg := groveconfigv1alpha1.OperatorConfiguration{ + TopologyAwareScheduling: getDefaultTASConfig(), + Network: getDefaultNetworkConfig(), + Scheduler: groveconfigv1alpha1.SchedulerConfiguration{Profiles: []groveconfigv1alpha1.SchedulerProfile{{Name: groveconfigv1alpha1.SchedulerNameKube}}, DefaultProfileName: string(groveconfigv1alpha1.SchedulerNameKube)}, + } + handler := NewHandler(mgr, &cfg) require.NotNil(t, handler) assert.NotNil(t, handler.logger) } @@ -113,7 +118,12 @@ func TestValidateCreate(t *testing.T) { Logger: logr.Discard(), } - handler := NewHandler(mgr, getDefaultTASConfig(), getDefaultNetworkConfig()) + cfg := groveconfigv1alpha1.OperatorConfiguration{ + TopologyAwareScheduling: getDefaultTASConfig(), + Network: getDefaultNetworkConfig(), + Scheduler: groveconfigv1alpha1.SchedulerConfiguration{Profiles: 
[]groveconfigv1alpha1.SchedulerProfile{{Name: groveconfigv1alpha1.SchedulerNameKube}}, DefaultProfileName: string(groveconfigv1alpha1.SchedulerNameKube)}, + } + handler := NewHandler(mgr, &cfg) ctx := context.Background() warnings, err := handler.ValidateCreate(ctx, tt.obj) @@ -244,7 +254,12 @@ func TestValidateUpdate(t *testing.T) { Logger: logr.Discard(), } - handler := NewHandler(mgr, getDefaultTASConfig(), getDefaultNetworkConfig()) + cfg := groveconfigv1alpha1.OperatorConfiguration{ + TopologyAwareScheduling: getDefaultTASConfig(), + Network: getDefaultNetworkConfig(), + Scheduler: groveconfigv1alpha1.SchedulerConfiguration{Profiles: []groveconfigv1alpha1.SchedulerProfile{{Name: groveconfigv1alpha1.SchedulerNameKube}}, DefaultProfileName: string(groveconfigv1alpha1.SchedulerNameKube)}, + } + handler := NewHandler(mgr, &cfg) ctx := context.Background() warnings, err := handler.ValidateUpdate(ctx, tt.newObj, tt.oldObj) @@ -271,7 +286,12 @@ func TestValidateDelete(t *testing.T) { Logger: logr.Discard(), } - handler := NewHandler(mgr, getDefaultTASConfig(), getDefaultNetworkConfig()) + cfg := groveconfigv1alpha1.OperatorConfiguration{ + TopologyAwareScheduling: getDefaultTASConfig(), + Network: getDefaultNetworkConfig(), + Scheduler: groveconfigv1alpha1.SchedulerConfiguration{Profiles: []groveconfigv1alpha1.SchedulerProfile{{Name: groveconfigv1alpha1.SchedulerNameKube}}, DefaultProfileName: string(groveconfigv1alpha1.SchedulerNameKube)}, + } + handler := NewHandler(mgr, &cfg) // Deletion validation always succeeds ctx := context.Background() @@ -382,7 +402,12 @@ func TestLogValidatorFunctionInvocation(t *testing.T) { Logger: logr.Discard(), } - handler := NewHandler(mgr, getDefaultTASConfig(), getDefaultNetworkConfig()) + cfg := groveconfigv1alpha1.OperatorConfiguration{ + TopologyAwareScheduling: getDefaultTASConfig(), + Network: getDefaultNetworkConfig(), + Scheduler: groveconfigv1alpha1.SchedulerConfiguration{Profiles: 
[]groveconfigv1alpha1.SchedulerProfile{{Name: groveconfigv1alpha1.SchedulerNameKube}}, DefaultProfileName: string(groveconfigv1alpha1.SchedulerNameKube)}, + } + handler := NewHandler(mgr, &cfg) // This function doesn't return an error, but we can verify it doesn't panic assert.NotPanics(t, func() { diff --git a/operator/internal/webhook/admission/pcs/validation/podcliqueset.go b/operator/internal/webhook/admission/pcs/validation/podcliqueset.go index 303dccac1..ac22e00f2 100644 --- a/operator/internal/webhook/admission/pcs/validation/podcliqueset.go +++ b/operator/internal/webhook/admission/pcs/validation/podcliqueset.go @@ -23,6 +23,7 @@ import ( groveconfigv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + "github.com/ai-dynamo/grove/operator/internal/schedulerbackend" "github.com/ai-dynamo/grove/operator/internal/utils" "github.com/samber/lo" @@ -46,10 +47,13 @@ type pcsValidator struct { pcs *grovecorev1alpha1.PodCliqueSet tasEnabled bool clusterTopologyDomains []string + schedulerConfig groveconfigv1alpha1.SchedulerConfiguration } // newPCSValidator creates a new PodCliqueSet validator for the given operation. -func newPCSValidator(pcs *grovecorev1alpha1.PodCliqueSet, operation admissionv1.Operation, tasConfig groveconfigv1alpha1.TopologyAwareSchedulingConfiguration) *pcsValidator { +// schedulerConfig is the full scheduler configuration; the validator uses it for +// scheduler-name matching and may use per-scheduler config for future validations. 
// validateSchedulerNames ensures all pod scheduler names resolve to the same scheduler
// and that the scheduler is enabled. An empty schedulerName is resolved to the default
// backend name from schedulerbackend.GetDefault().
// NOTE(review): this uses strings.Join — confirm this file imports "strings" (the
// change only visibly adds the schedulerbackend import).
func (v *pcsValidator) validateSchedulerNames(schedulerNames []string, fldPath *field.Path) field.ErrorList {
	allErrs := field.ErrorList{}
	specPath := fldPath.Child("spec").Child("podSpec").Child("schedulerName")

	// Fall back to the literal "default-scheduler" when no default backend has been
	// initialized (e.g. in unit tests that never call schedulerbackend.Initialize).
	defaultSchedulerName := "default-scheduler"
	if def := schedulerbackend.GetDefault(); def != nil {
		defaultSchedulerName = def.Name()
	}

	// Resolve empty to default backend name; then require all resolved names to be the same.
	uniqueSchedulerNames := lo.Uniq(lo.Map(schedulerNames, func(item string, _ int) string {
		if item == "" {
			return defaultSchedulerName
		}
		return item
	}))
	if len(uniqueSchedulerNames) > 1 {
		allErrs = append(allErrs, field.Invalid(specPath, strings.Join(uniqueSchedulerNames, ", "), "the schedulerName for all pods have to be the same"))
	}

	// Validate that the resolved scheduler is enabled.
	// If there are no cliques at all, pcsSchedulerName stays "" and Get("") resolves
	// to the default backend, so an empty PCS only errors when no default exists.
	pcsSchedulerName := ""
	if len(uniqueSchedulerNames) > 0 && uniqueSchedulerNames[0] != "" {
		pcsSchedulerName = uniqueSchedulerNames[0]
	}
	// The literal "default-scheduler" is exempted from the enabled-backend check;
	// NOTE(review): this assumes kube-scheduler is always present per config
	// defaulting — confirm, since only profile-named backends are registered.
	if pcsSchedulerName != "default-scheduler" && schedulerbackend.Get(pcsSchedulerName) == nil {
		allErrs = append(allErrs, field.Invalid(
			specPath,
			pcsSchedulerName,
			"schedulerName must be an enabled scheduler backend; this scheduler is not enabled in OperatorConfiguration",
		))
	}
	return allErrs
}
+ allErrs = append(allErrs, apivalidation.ValidateImmutableField(newClique.Spec.PodSpec.SchedulerName, oldIndexCliqueTuple.B.Spec.PodSpec.SchedulerName, cliqueFldPath.Child("podSpec", "schedulerName"))...) } return allErrs diff --git a/operator/internal/webhook/admission/pcs/validation/podcliqueset_test.go b/operator/internal/webhook/admission/pcs/validation/podcliqueset_test.go index d1e62fe2d..022e5bee9 100644 --- a/operator/internal/webhook/admission/pcs/validation/podcliqueset_test.go +++ b/operator/internal/webhook/admission/pcs/validation/podcliqueset_test.go @@ -18,17 +18,22 @@ package validation import ( "fmt" + "strings" "testing" "time" groveconfigv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + "github.com/ai-dynamo/grove/operator/internal/schedulerbackend" testutils "github.com/ai-dynamo/grove/operator/test/utils" + "github.com/samber/lo" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" admissionv1 "k8s.io/api/admission/v1" "k8s.io/apimachinery/pkg/util/uuid" "k8s.io/apimachinery/pkg/util/validation/field" + "k8s.io/client-go/tools/record" "k8s.io/utils/ptr" ) @@ -133,7 +138,7 @@ func TestResourceNamingValidation(t *testing.T) { pcs := pcsBuilder.Build() - validator := newPCSValidator(pcs, admissionv1.Create, defaultTASConfig()) + validator := newPCSValidator(pcs, admissionv1.Create, defaultTASConfig(), groveconfigv1alpha1.SchedulerConfiguration{Profiles: []groveconfigv1alpha1.SchedulerProfile{{Name: groveconfigv1alpha1.SchedulerNameKube}}, DefaultProfileName: string(groveconfigv1alpha1.SchedulerNameKube)}) warnings, errs := validator.validate() if tc.errorMatchers != nil { @@ -147,6 +152,178 @@ func TestResourceNamingValidation(t *testing.T) { } } +func TestValidateSchedulerNames(t *testing.T) { + specPath := field.NewPath("cliques").Child("spec").Child("podSpec").Child("schedulerName") + cl := testutils.CreateDefaultFakeClient(nil) 
+ recorder := record.NewFakeRecorder(10) + + tests := []struct { + name string + schedulerConfig groveconfigv1alpha1.SchedulerConfiguration + schedulerNames []string + expectErrors int + expectInvalidSame bool + expectInvalidEnabled bool + }{ + { + name: "all same default-scheduler (kube default)", + schedulerConfig: groveconfigv1alpha1.SchedulerConfiguration{ + Profiles: []groveconfigv1alpha1.SchedulerProfile{ + {Name: groveconfigv1alpha1.SchedulerNameKube}, + {Name: groveconfigv1alpha1.SchedulerNameKai}, + }, + DefaultProfileName: string(groveconfigv1alpha1.SchedulerNameKube), + }, + schedulerNames: []string{"default-scheduler", "default-scheduler"}, + expectErrors: 0, + }, + { + name: "all empty with default default-scheduler", + schedulerConfig: groveconfigv1alpha1.SchedulerConfiguration{ + Profiles: []groveconfigv1alpha1.SchedulerProfile{ + {Name: groveconfigv1alpha1.SchedulerNameKube}, + {Name: groveconfigv1alpha1.SchedulerNameKai}, + }, + DefaultProfileName: string(groveconfigv1alpha1.SchedulerNameKube), + }, + schedulerNames: []string{"", ""}, + expectErrors: 0, + }, + { + name: "all empty with default kai-scheduler (kai default)", + schedulerConfig: groveconfigv1alpha1.SchedulerConfiguration{ + Profiles: []groveconfigv1alpha1.SchedulerProfile{ + {Name: groveconfigv1alpha1.SchedulerNameKube}, + {Name: groveconfigv1alpha1.SchedulerNameKai}, + }, + DefaultProfileName: string(groveconfigv1alpha1.SchedulerNameKai), + }, + schedulerNames: []string{"", ""}, + expectErrors: 0, + }, + { + name: "mixed empty and default-scheduler", + schedulerConfig: groveconfigv1alpha1.SchedulerConfiguration{ + Profiles: []groveconfigv1alpha1.SchedulerProfile{ + {Name: groveconfigv1alpha1.SchedulerNameKube}, + {Name: groveconfigv1alpha1.SchedulerNameKai}, + }, + DefaultProfileName: string(groveconfigv1alpha1.SchedulerNameKube), + }, + schedulerNames: []string{"", "default-scheduler"}, + expectErrors: 0, + }, + { + name: "mixed default-scheduler and kai-scheduler", + 
schedulerConfig: groveconfigv1alpha1.SchedulerConfiguration{ + Profiles: []groveconfigv1alpha1.SchedulerProfile{ + {Name: groveconfigv1alpha1.SchedulerNameKube}, + {Name: groveconfigv1alpha1.SchedulerNameKai}, + }, + DefaultProfileName: string(groveconfigv1alpha1.SchedulerNameKube), + }, + schedulerNames: []string{"default-scheduler", "kai-scheduler"}, + expectErrors: 1, + expectInvalidSame: true, + expectInvalidEnabled: false, + }, + { + name: "single kai-scheduler when enabled (kube+kai)", + schedulerConfig: groveconfigv1alpha1.SchedulerConfiguration{ + Profiles: []groveconfigv1alpha1.SchedulerProfile{ + {Name: groveconfigv1alpha1.SchedulerNameKube}, + {Name: groveconfigv1alpha1.SchedulerNameKai}, + }, + DefaultProfileName: string(groveconfigv1alpha1.SchedulerNameKube), + }, + schedulerNames: []string{"kai-scheduler"}, + expectErrors: 0, + }, + { + name: "single kube-scheduler when enabled (kube only)", + schedulerConfig: groveconfigv1alpha1.SchedulerConfiguration{ + Profiles: []groveconfigv1alpha1.SchedulerProfile{{Name: groveconfigv1alpha1.SchedulerNameKube}}, + DefaultProfileName: string(groveconfigv1alpha1.SchedulerNameKube), + }, + schedulerNames: []string{"kai-scheduler"}, + expectErrors: 1, + expectInvalidSame: false, + expectInvalidEnabled: true, + }, + { + name: "unknown scheduler not enabled", + schedulerConfig: groveconfigv1alpha1.SchedulerConfiguration{ + Profiles: []groveconfigv1alpha1.SchedulerProfile{ + {Name: groveconfigv1alpha1.SchedulerNameKube}, + {Name: groveconfigv1alpha1.SchedulerNameKai}, + }, + DefaultProfileName: string(groveconfigv1alpha1.SchedulerNameKube), + }, + schedulerNames: []string{"volcano"}, + expectErrors: 1, + expectInvalidSame: false, + expectInvalidEnabled: true, + }, + { + name: "no cliques (empty list)", + schedulerConfig: groveconfigv1alpha1.SchedulerConfiguration{ + Profiles: []groveconfigv1alpha1.SchedulerProfile{{Name: groveconfigv1alpha1.SchedulerNameKube}}, + DefaultProfileName: 
string(groveconfigv1alpha1.SchedulerNameKube), + }, + schedulerNames: []string{}, + expectErrors: 0, + }, + { + name: "mixed empty and kai when default is default-scheduler", + schedulerConfig: groveconfigv1alpha1.SchedulerConfiguration{ + Profiles: []groveconfigv1alpha1.SchedulerProfile{ + {Name: groveconfigv1alpha1.SchedulerNameKube}, + {Name: groveconfigv1alpha1.SchedulerNameKai}, + }, + DefaultProfileName: string(groveconfigv1alpha1.SchedulerNameKube), + }, + schedulerNames: []string{"", "kai-scheduler"}, + expectErrors: 1, + expectInvalidSame: true, + expectInvalidEnabled: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := schedulerbackend.Initialize(cl, cl.Scheme(), recorder, tt.schedulerConfig) + require.NoError(t, err) + + pcsBuilder := testutils.NewPodCliqueSetBuilder("test", "default", uuid.NewUUID()). + WithReplicas(1). + WithTerminationDelay(4 * time.Hour). + WithCliqueStartupType(ptr.To(grovecorev1alpha1.CliqueStartupTypeAnyOrder)) + for i := 0; i < len(tt.schedulerNames); i++ { + clique := createDummyPodCliqueTemplate(fmt.Sprintf("c%d", i)) + clique.Spec.PodSpec.SchedulerName = tt.schedulerNames[i] + pcsBuilder = pcsBuilder.WithPodCliqueTemplateSpec(clique) + } + pcs := pcsBuilder.Build() + validator := newPCSValidator(pcs, admissionv1.Create, defaultTASConfig(), tt.schedulerConfig) + fldPath := field.NewPath("cliques") + errs := validator.validateSchedulerNames(tt.schedulerNames, fldPath) + + assert.Len(t, errs, tt.expectErrors, "validation errors: %v", errs) + if tt.expectErrors > 0 { + msgs := lo.Map(errs, func(e *field.Error, _ int) string { return e.ErrorBody() }) + if tt.expectInvalidSame { + assert.Contains(t, strings.Join(msgs, " "), "have to be the same") + } + if tt.expectInvalidEnabled { + assert.Contains(t, strings.Join(msgs, " "), "not enabled") + } + } + for _, e := range errs { + assert.Equal(t, specPath.String(), e.Field, "error field path") + } + }) + } +} + func 
TestPodCliqueScalingGroupConfigValidation(t *testing.T) { testCases := []struct { description string @@ -263,7 +440,7 @@ func TestPodCliqueScalingGroupConfigValidation(t *testing.T) { // Add scaling groups pcs.Spec.Template.PodCliqueScalingGroupConfigs = tc.scalingGroups - validator := newPCSValidator(pcs, admissionv1.Create, defaultTASConfig()) + validator := newPCSValidator(pcs, admissionv1.Create, defaultTASConfig(), groveconfigv1alpha1.SchedulerConfiguration{Profiles: []groveconfigv1alpha1.SchedulerProfile{{Name: groveconfigv1alpha1.SchedulerNameKube}}, DefaultProfileName: string(groveconfigv1alpha1.SchedulerNameKube)}) warnings, errs := validator.validate() if tc.errorMatchers != nil { @@ -386,7 +563,7 @@ func TestPodCliqueUpdateValidation(t *testing.T) { newPCS.Spec.Template.Cliques = tc.newCliques // Create validator and validate update - validator := newPCSValidator(newPCS, admissionv1.Update, defaultTASConfig()) + validator := newPCSValidator(newPCS, admissionv1.Update, defaultTASConfig(), groveconfigv1alpha1.SchedulerConfiguration{Profiles: []groveconfigv1alpha1.SchedulerProfile{{Name: groveconfigv1alpha1.SchedulerNameKube}}, DefaultProfileName: string(groveconfigv1alpha1.SchedulerNameKube)}) fldPath := field.NewPath("spec").Child("template").Child("cliques") validationErrors := validator.validatePodCliqueUpdate(oldPCS.Spec.Template.Cliques, fldPath) @@ -497,6 +674,21 @@ func TestImmutableFieldsValidation(t *testing.T) { expectError: true, expectedErrMsg: "field is immutable", }, + { + name: "Invalid: schedulerName is immutable", + setupOldPCS: func() *grovecorev1alpha1.PodCliqueSet { + pcs := createTestPodCliqueSet("test") + pcs.Spec.Template.Cliques[0].Spec.PodSpec.SchedulerName = "" + return pcs + }, + setupNewPCS: func() *grovecorev1alpha1.PodCliqueSet { + pcs := createTestPodCliqueSet("test") + pcs.Spec.Template.Cliques[0].Spec.PodSpec.SchedulerName = "default-scheduler" + return pcs + }, + expectError: true, + expectedErrMsg: "field is immutable", + 
}, } for _, tc := range testCases { @@ -504,7 +696,7 @@ func TestImmutableFieldsValidation(t *testing.T) { oldPCS := tc.setupOldPCS() newPCS := tc.setupNewPCS() - validator := newPCSValidator(newPCS, admissionv1.Update, defaultTASConfig()) + validator := newPCSValidator(newPCS, admissionv1.Update, defaultTASConfig(), groveconfigv1alpha1.SchedulerConfiguration{Profiles: []groveconfigv1alpha1.SchedulerProfile{{Name: groveconfigv1alpha1.SchedulerNameKube}}, DefaultProfileName: string(groveconfigv1alpha1.SchedulerNameKube)}) err := validator.validateUpdate(oldPCS) if tc.expectError { @@ -674,7 +866,7 @@ func TestPodCliqueScalingGroupConfigsUpdateValidation(t *testing.T) { newPCS.Spec.Template.PodCliqueScalingGroupConfigs = tc.newConfigs // Create validator and validate update - validator := newPCSValidator(newPCS, admissionv1.Update, defaultTASConfig()) + validator := newPCSValidator(newPCS, admissionv1.Update, defaultTASConfig(), groveconfigv1alpha1.SchedulerConfiguration{Profiles: []groveconfigv1alpha1.SchedulerProfile{{Name: groveconfigv1alpha1.SchedulerNameKube}}, DefaultProfileName: string(groveconfigv1alpha1.SchedulerNameKube)}) fldPath := field.NewPath("spec", "template", "podCliqueScalingGroupConfigs") validationErrors := validator.validatePodCliqueScalingGroupConfigsUpdate(tc.oldConfigs, fldPath) diff --git a/operator/internal/webhook/admission/pcs/validation/register_test.go b/operator/internal/webhook/admission/pcs/validation/register_test.go index 7926b8874..9a52dd063 100644 --- a/operator/internal/webhook/admission/pcs/validation/register_test.go +++ b/operator/internal/webhook/admission/pcs/validation/register_test.go @@ -42,7 +42,12 @@ func TestRegisterWithManager(t *testing.T) { }) mgr.WebhookServer = server - handler := NewHandler(mgr, configv1alpha1.TopologyAwareSchedulingConfiguration{}, configv1alpha1.NetworkAcceleration{}) + cfg := configv1alpha1.OperatorConfiguration{ + TopologyAwareScheduling: 
configv1alpha1.TopologyAwareSchedulingConfiguration{}, + Network: configv1alpha1.NetworkAcceleration{}, + Scheduler: configv1alpha1.SchedulerConfiguration{Profiles: []configv1alpha1.SchedulerProfile{{Name: configv1alpha1.SchedulerNameKube}}, DefaultProfileName: string(configv1alpha1.SchedulerNameKube)}, + } + handler := NewHandler(mgr, &cfg) err := handler.RegisterWithManager(mgr) require.NoError(t, err) } diff --git a/operator/internal/webhook/register.go b/operator/internal/webhook/register.go index 9cf378160..a01da791b 100644 --- a/operator/internal/webhook/register.go +++ b/operator/internal/webhook/register.go @@ -31,18 +31,21 @@ import ( ) // Register registers the webhooks with the controller manager. -func Register(mgr manager.Manager, authorizerConfig configv1alpha1.AuthorizerConfig, tasConfig configv1alpha1.TopologyAwareSchedulingConfiguration, networkConfig configv1alpha1.NetworkAcceleration) error { - defaultingWebhook := defaulting.NewHandler(mgr, networkConfig) +func Register(mgr manager.Manager, operatorCfg *configv1alpha1.OperatorConfiguration) error { + if operatorCfg == nil { + return fmt.Errorf("operator configuration must not be nil") + } + defaultingWebhook := defaulting.NewHandler(mgr, operatorCfg.Network) slog.Info("Registering webhook with manager", "handler", defaulting.Name) if err := defaultingWebhook.RegisterWithManager(mgr); err != nil { return fmt.Errorf("failed adding %s webhook handler: %v", defaulting.Name, err) } - pcsValidatingWebhook := pcsvalidation.NewHandler(mgr, tasConfig, networkConfig) + pcsValidatingWebhook := pcsvalidation.NewHandler(mgr, operatorCfg) slog.Info("Registering webhook with manager", "handler", pcsvalidation.Name) if err := pcsValidatingWebhook.RegisterWithManager(mgr); err != nil { return fmt.Errorf("failed adding %s webhook handler: %v", pcsvalidation.Name, err) } - if authorizerConfig.Enabled { + if operatorCfg.Authorizer.Enabled { serviceAccountName, ok := os.LookupEnv(constants.EnvVarServiceAccountName) 
if !ok { return fmt.Errorf("can not register authorizer webhook with no \"%s\" environment vairable", constants.EnvVarServiceAccountName) @@ -52,7 +55,7 @@ func Register(mgr manager.Manager, authorizerConfig configv1alpha1.AuthorizerCon return fmt.Errorf("error reading namespace file with error: %w", err) } reconcilerServiceAccountUserName := generateReconcilerServiceAccountUsername(string(namespace), serviceAccountName) - authorizerWebhook := authorization.NewHandler(mgr, authorizerConfig, reconcilerServiceAccountUserName) + authorizerWebhook := authorization.NewHandler(mgr, operatorCfg.Authorizer, reconcilerServiceAccountUserName) slog.Info("Registering webhook with manager", "handler", authorization.Name) if err := authorizerWebhook.RegisterWithManager(mgr); err != nil { return fmt.Errorf("failed adding %s webhook handler: %v", authorization.Name, err) diff --git a/operator/internal/webhook/register_test.go b/operator/internal/webhook/register_test.go index 9c560e507..c3b8f813b 100644 --- a/operator/internal/webhook/register_test.go +++ b/operator/internal/webhook/register_test.go @@ -91,7 +91,13 @@ func TestRegisterWebhooks_WithoutAuthorizer(t *testing.T) { Enabled: false, } - err := Register(mgr, authorizerConfig, configv1alpha1.TopologyAwareSchedulingConfiguration{}, configv1alpha1.NetworkAcceleration{}) + operatorCfg := configv1alpha1.OperatorConfiguration{ + Authorizer: authorizerConfig, + TopologyAwareScheduling: configv1alpha1.TopologyAwareSchedulingConfiguration{}, + Network: configv1alpha1.NetworkAcceleration{}, + Scheduler: configv1alpha1.SchedulerConfiguration{Profiles: []configv1alpha1.SchedulerProfile{{Name: configv1alpha1.SchedulerNameKube}}, DefaultProfileName: string(configv1alpha1.SchedulerNameKube)}, + } + err := Register(mgr, &operatorCfg) require.NoError(t, err) } @@ -120,7 +126,13 @@ func TestRegisterWebhooks_WithAuthorizerMissingEnvVar(t *testing.T) { Enabled: true, } - err = Register(mgr, authorizerConfig, 
configv1alpha1.TopologyAwareSchedulingConfiguration{}, configv1alpha1.NetworkAcceleration{}) + operatorCfg := configv1alpha1.OperatorConfiguration{ + Authorizer: authorizerConfig, + TopologyAwareScheduling: configv1alpha1.TopologyAwareSchedulingConfiguration{}, + Network: configv1alpha1.NetworkAcceleration{}, + Scheduler: configv1alpha1.SchedulerConfiguration{Profiles: []configv1alpha1.SchedulerProfile{{Name: configv1alpha1.SchedulerNameKube}}, DefaultProfileName: string(configv1alpha1.SchedulerNameKube)}, + } + err = Register(mgr, &operatorCfg) require.Error(t, err) assert.Contains(t, err.Error(), constants.EnvVarServiceAccountName) } @@ -149,7 +161,13 @@ func TestRegisterWebhooks_WithAuthorizerMissingNamespaceFile(t *testing.T) { Enabled: true, } - err := Register(mgr, authorizerConfig, configv1alpha1.TopologyAwareSchedulingConfiguration{}, configv1alpha1.NetworkAcceleration{}) + operatorCfg := configv1alpha1.OperatorConfiguration{ + Authorizer: authorizerConfig, + TopologyAwareScheduling: configv1alpha1.TopologyAwareSchedulingConfiguration{}, + Network: configv1alpha1.NetworkAcceleration{}, + Scheduler: configv1alpha1.SchedulerConfiguration{Profiles: []configv1alpha1.SchedulerProfile{{Name: configv1alpha1.SchedulerNameKube}}, DefaultProfileName: string(configv1alpha1.SchedulerNameKube)}, + } + err := Register(mgr, &operatorCfg) require.Error(t, err) assert.Contains(t, err.Error(), "error reading namespace file") } @@ -194,7 +212,13 @@ func TestRegisterWebhooks_WithAuthorizerSuccess(t *testing.T) { Enabled: true, } - err = Register(mgr, authorizerConfig, configv1alpha1.TopologyAwareSchedulingConfiguration{}, configv1alpha1.NetworkAcceleration{}) + operatorCfg := configv1alpha1.OperatorConfiguration{ + Authorizer: authorizerConfig, + TopologyAwareScheduling: configv1alpha1.TopologyAwareSchedulingConfiguration{}, + Network: configv1alpha1.NetworkAcceleration{}, + Scheduler: configv1alpha1.SchedulerConfiguration{Profiles: []configv1alpha1.SchedulerProfile{{Name: 
configv1alpha1.SchedulerNameKube}}, DefaultProfileName: string(configv1alpha1.SchedulerNameKube)}, + } + err = Register(mgr, &operatorCfg) // Will error because it tries to read the hardcoded namespace file path require.Error(t, err) } diff --git a/operator/skaffold.yaml b/operator/skaffold.yaml index 52f760410..c8c309a7f 100644 --- a/operator/skaffold.yaml +++ b/operator/skaffold.yaml @@ -65,6 +65,10 @@ profiles: value: replicaCount: 1 config: + scheduler: + defaultProfileName: kai-scheduler + profiles: + - name: kai-scheduler leaderElection: enabled: false topologyAwareScheduling: diff --git a/operator/test/utils/client.go b/operator/test/utils/client.go index b7f4d07d4..37afbb61c 100644 --- a/operator/test/utils/client.go +++ b/operator/test/utils/client.go @@ -138,6 +138,14 @@ func (b *TestClientBuilder) WithObjects(objects ...client.Object) *TestClientBui return b } +// WithStatusSubresource registers types that have status subresources so that Status().Patch() works with the fake client. +func (b *TestClientBuilder) WithStatusSubresource(objs ...client.Object) *TestClientBuilder { + if len(objs) > 0 { + b.delegatingClientBuilder.WithStatusSubresource(objs...) + } + return b +} + // RecordErrorForObjects records an error for a specific client.Client method and object keys. func (b *TestClientBuilder) RecordErrorForObjects(method ClientMethod, err *apierrors.StatusError, objectKeys ...client.ObjectKey) *TestClientBuilder { // this method records error, so if nil error is passed then there is no need to create any error record. diff --git a/operator/test/utils/pod.go b/operator/test/utils/pod.go index d94159fca..453607b43 100644 --- a/operator/test/utils/pod.go +++ b/operator/test/utils/pod.go @@ -81,6 +81,12 @@ func (b *PodBuilder) WithOwner(ownerName string) *PodBuilder { return b } +// WithSchedulerName sets the scheduler name on the Pod spec. 
+func (b *PodBuilder) WithSchedulerName(name string) *PodBuilder { + b.pod.Spec.SchedulerName = name + return b +} + // WithLabels adds labels to the Pod. func (b *PodBuilder) WithLabels(labels map[string]string) *PodBuilder { if b.pod.Labels == nil { diff --git a/operator/test/utils/podgang.go b/operator/test/utils/podgang.go new file mode 100644 index 000000000..075b25942 --- /dev/null +++ b/operator/test/utils/podgang.go @@ -0,0 +1,85 @@ +// /* +// Copyright 2026 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package utils + +import ( + apicommon "github.com/ai-dynamo/grove/operator/api/common" + + groveschedulerv1alpha1 "github.com/ai-dynamo/grove/scheduler/api/core/v1alpha1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// PodGangBuilder is a builder for PodGang objects (scheduler API). +type PodGangBuilder struct { + pg *groveschedulerv1alpha1.PodGang +} + +// NewPodGangBuilder creates a new PodGangBuilder. +func NewPodGangBuilder(name, namespace string) *PodGangBuilder { + return &PodGangBuilder{ + pg: createEmptyPodGang(name, namespace), + } +} + +// WithGeneration sets the Generation on the PodGang. +func (b *PodGangBuilder) WithGeneration(generation int64) *PodGangBuilder { + b.pg.SetGeneration(generation) + return b +} + +// WithManaged sets the managed-by label so the PodGang is considered operator-managed. 
+func (b *PodGangBuilder) WithManaged(managed bool) *PodGangBuilder { + if b.pg.Labels == nil { + b.pg.Labels = make(map[string]string) + } + if managed { + b.pg.Labels[apicommon.LabelManagedByKey] = apicommon.LabelManagedByValue + } else { + delete(b.pg.Labels, apicommon.LabelManagedByKey) + } + return b +} + +// WithPodGroups sets the Spec.PodGroups slice. +func (b *PodGangBuilder) WithPodGroups(groups []groveschedulerv1alpha1.PodGroup) *PodGangBuilder { + b.pg.Spec.PodGroups = groups + return b +} + +// WithPodGroup adds a single PodGroup (convenience for tests that need one group). +func (b *PodGangBuilder) WithPodGroup(name string, minReplicas int32) *PodGangBuilder { + b.pg.Spec.PodGroups = append(b.pg.Spec.PodGroups, groveschedulerv1alpha1.PodGroup{ + Name: name, + MinReplicas: minReplicas, + }) + return b +} + +// Build returns the PodGang. +func (b *PodGangBuilder) Build() *groveschedulerv1alpha1.PodGang { + return b.pg +} + +func createEmptyPodGang(name, namespace string) *groveschedulerv1alpha1.PodGang { + return &groveschedulerv1alpha1.PodGang{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + Spec: groveschedulerv1alpha1.PodGangSpec{}, + } +} diff --git a/scheduler/api/core/v1alpha1/podgang.go b/scheduler/api/core/v1alpha1/podgang.go index dae9804a8..914f17112 100644 --- a/scheduler/api/core/v1alpha1/podgang.go +++ b/scheduler/api/core/v1alpha1/podgang.go @@ -157,6 +157,9 @@ const ( PodGangConditionTypeScheduled PodGangConditionType = "Scheduled" // PodGangConditionTypeReady indicates that all the constituent PodGroups are Ready. PodGangConditionTypeReady PodGangConditionType = "Ready" + // PodGangConditionTypeInitialized indicates that all Pods have been created and PodGang has been populated with pod references. + // This condition is set to True after all pods are created, signaling that scheduling gates can be removed. 
+ PodGangConditionTypeInitialized PodGangConditionType = "Initialized" // PodGangConditionTypeUnhealthy indicates that the PodGang is unhealthy. It is now a candidate for gang termination. // If this condition is true for at least PodGangSpec.TerminationDelay duration, then the PodGang will be terminated. PodGangConditionTypeUnhealthy PodGangConditionType = "Unhealthy"