diff --git a/slice/api/v1alpha1/slice_types.go b/slice/api/v1alpha1/slice_types.go index f014664ca..52efd50d7 100644 --- a/slice/api/v1alpha1/slice_types.go +++ b/slice/api/v1alpha1/slice_types.go @@ -21,15 +21,25 @@ import ( ) // NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized. +type Type string + +const ( + TypeV6e Type = "v6e" + TypeTpu7x Type = "tpu-v7x" +) // SliceSpec defines the desired state of Slice. type SliceSpec struct { - // AcceleratorType specifies the type of accelerator used in this slice. + // Type specifies the type of accelerator used in this slice, e.g., "v6e", "tpu-v7x". // +kubebuilder:validation:Immutable - Type string `json:"type"` + // +kubebuilder:validation:Enum=v6e;tpu-v7x + Type Type `json:"type"` // Topology represents the network topology of the slice. + // It defines the physical arrangement of TPU chips in a 2D or 3D mesh. + // The topology must be specified in `NxM` or `NxMxK` format (e.g., "2x2" or "2x2x2"). // +kubebuilder:validation:Immutable + // +kubebuilder:validation:Pattern=^\d+x\d+(x\d+)?$ Topology string `json:"topology"` // Partition Ids denotes the set of partitions to use to form a slice @@ -55,7 +65,7 @@ type SliceStatus struct { // +kubebuilder:subresource:status // +kubebuilder:printcolumn:name="Type",type=string,JSONPath=`.spec.type` // +kubebuilder:printcolumn:name="Topology",type=string,JSONPath=`.spec.topology` -// +kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.conditions[0].type` +// +kubebuilder:printcolumn:name="State",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].reason` // +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp" // Slice is the Schema for the slices API. 
type Slice struct { @@ -75,22 +85,10 @@ type SliceList struct { Items []Slice `json:"items"` } -// SliceConditionType defines the type of condition -type SliceConditionType string - -const ( - // Forming means the slice is being created and configured. - Forming SliceConditionType = "Forming" - // Ready means the slice is fully operational. - Ready SliceConditionType = "Ready" - // Degraded means the slice is operational but with reduced capacity or performance. - Degraded SliceConditionType = "Degraded" - // Deformed means the slice is being torn down. - Deformed SliceConditionType = "Deformed" - // Error means the slice has encountered an error and is not operational. - Error SliceConditionType = "Error" -) - func init() { SchemeBuilder.Register(&Slice{}, &SliceList{}) } + +const ( + SliceStateConditionType = "Ready" +) diff --git a/slice/internal/controller/workload_controller.go b/slice/internal/controller/workload_controller.go index efb902aa0..dd7d977dc 100644 --- a/slice/internal/controller/workload_controller.go +++ b/slice/internal/controller/workload_controller.go @@ -162,7 +162,7 @@ func (r *WorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c return ctrl.Result{}, err } - deleted, _, errored, _ := r.groupSlices(slices) + deleted, toDelete, _ := r.groupSlices(slices) if len(deleted) > 0 { log.V(3).Info( "Waiting for deleted Slices to be cleaned up; skipping reconciliation for now", @@ -181,12 +181,12 @@ func (r *WorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c return ctrl.Result{}, client.IgnoreNotFound(err) } - if len(errored) > 0 { + if len(toDelete) > 0 { log.V(3).Info( - "Deleting errored Slices", - "erroredSlices", klog.KObjSlice(errored), + "Deleting Slices", + "slices", klog.KObjSlice(toDelete), ) - err = r.deleteSlices(ctx, errored) + err = r.deleteSlices(ctx, toDelete) if err != nil { return ctrl.Result{}, err } @@ -256,22 +256,13 @@ func (r *WorkloadReconciler) cleanupSlices(ctx context.Context, wl 
*kueue.Worklo return false, err } - deleted, deformed, errored, other := r.groupSlices(slices) + deleted, errored, other := r.groupSlices(slices) if len(deleted) == len(slices) { log.V(3).Info("All slices already deleted; finishing cleanup") return true, nil } - if len(deformed) > 0 { - log.V(3).Info("Found Slices in deformed state; cleaning them up", "deformedSlices", klog.KObjSlice(deformed)) - // We still need to delete deformed Slices because requeueing causes a conflict error during Slice creation. - err = r.deleteSlices(ctx, deformed) - if err != nil { - return false, err - } - } - if len(other)+len(errored) > 0 { terminated, err := r.ownerPodsFinished(ctx, wl) if err != nil || !terminated { @@ -313,21 +304,19 @@ func (r *WorkloadReconciler) findWorkloadSlices(ctx context.Context, wl *kueue.W // - A slice containing deformed Slice objects (being torn down). // - A slice containing errored Slice objects. // - A slice containing other Slice objects (active/valid slices). -func (r *WorkloadReconciler) groupSlices(slices []v1alpha1.Slice) ([]v1alpha1.Slice, []v1alpha1.Slice, []v1alpha1.Slice, []v1alpha1.Slice) { - var deleted, deformed, errored, other []v1alpha1.Slice +func (r *WorkloadReconciler) groupSlices(slices []v1alpha1.Slice) ([]v1alpha1.Slice, []v1alpha1.Slice, []v1alpha1.Slice) { + var deleted, toDelete, other []v1alpha1.Slice for _, slice := range slices { switch { case !slice.DeletionTimestamp.IsZero(): deleted = append(deleted, slice) - case core.Deformed(&slice): - deformed = append(deformed, slice) - case core.IsError(&slice): - errored = append(errored, slice) + case core.IsError(&slice) || core.IsStale(&slice): + toDelete = append(toDelete, slice) default: other = append(other, slice) } } - return deleted, deformed, errored, other + return deleted, toDelete, other } func (r *WorkloadReconciler) deleteSlices(ctx context.Context, slices []v1alpha1.Slice) error { @@ -507,7 +496,7 @@ func shouldCreateSliceForPodSetAssignment(wl *kueue.Workload, psa 
kueue.PodSetAs return false } -func parseTopologyAssignmentIntoNodeSelector(slice *v1alpha1.Slice, topologyAssignment *kueue.TopologyAssignment, nodes map[string]corev1.Node) { +func parseTopologyAssignmentIntoPartitionIDs(slice *v1alpha1.Slice, topologyAssignment *kueue.TopologyAssignment, nodes map[string]corev1.Node) { subBlockIDs := sets.New[string]() // we already validated that all assignments have a valid level, // in validateRelevantWorkload. @@ -529,10 +518,10 @@ func (r *WorkloadReconciler) createSlice(ctx context.Context, wl *kueue.Workload if err := controllerutil.SetControllerReference(wl, slice, r.client.Scheme()); err != nil { return nil, err } - parseTopologyAssignmentIntoNodeSelector(slice, psa.TopologyAssignment, nodes) + parseTopologyAssignmentIntoPartitionIDs(slice, psa.TopologyAssignment, nodes) ps := podset.FindPodSetByName(wl.Spec.PodSets, psa.Name) - slice.Spec.Type = core.GetTPUAccelerator(ps.Template) + slice.Spec.Type = v1alpha1.Type(core.GetTPUAccelerator(ps.Template)) slice.Spec.Topology = core.GetTPUTopology(ps.Template) if err := r.client.Create(ctx, slice); err != nil { @@ -594,8 +583,8 @@ func (r *WorkloadReconciler) syncAdmissionCheckStatus(ctx context.Context, wl *k if ac.Message != originalMessage { // Logging error messages if exists for _, slice := range slices { - cond := meta.FindStatusCondition(slice.Status.Conditions, string(v1alpha1.Error)) - if cond != nil && cond.Status == metav1.ConditionTrue { + cond := meta.FindStatusCondition(slice.Status.Conditions, v1alpha1.SliceStateConditionType) + if cond != nil && cond.Status == metav1.ConditionFalse && cond.Reason == string(core.MMIGHealthStatusFailed) { log.V(2).Info( "WARNING: The Slice is not operational due to an error", "slice", klog.KObj(&slice), @@ -608,21 +597,16 @@ func (r *WorkloadReconciler) syncAdmissionCheckStatus(ctx context.Context, wl *k return nil } -func groupSlicesByState(slices []v1alpha1.Slice) (map[v1alpha1.SliceConditionType][]v1alpha1.Slice, 
[]v1alpha1.Slice) { - slicesByState := make(map[v1alpha1.SliceConditionType][]v1alpha1.Slice) +func groupSlicesByState(slices []v1alpha1.Slice) (map[core.MMIGHealthStatus][]v1alpha1.Slice, []v1alpha1.Slice) { + slicesByState := make(map[core.MMIGHealthStatus][]v1alpha1.Slice) var noState []v1alpha1.Slice for _, slice := range slices { - foundState := false - for _, status := range core.SliceStates { - if meta.IsStatusConditionTrue(slice.Status.Conditions, string(status)) { - slicesByState[status] = append(slicesByState[status], slice) - foundState = true - break - } - } - if !foundState { + cond := meta.FindStatusCondition(slice.Status.Conditions, string(v1alpha1.SliceStateConditionType)) + if cond == nil { noState = append(noState, slice) + continue } + slicesByState[core.MMIGHealthStatus(cond.Reason)] = append(slicesByState[core.MMIGHealthStatus(cond.Reason)], slice) } return slicesByState, noState } @@ -631,17 +615,17 @@ func prepareAdmissionCheckStatus(ac *kueue.AdmissionCheckState, slices []v1alpha slicesByState, noState := groupSlicesByState(slices) switch { - case len(slicesByState[v1alpha1.Deformed]) > 0: - ac.State = kueue.CheckStateRejected - case len(slicesByState[v1alpha1.Error]) > 0: - ac.State = kueue.CheckStateRetry - case len(slices) == len(slicesByState[v1alpha1.Degraded])+len(slicesByState[v1alpha1.Ready]): + case len(slices) == len(slicesByState[core.MMIGHealthStatusActiveDegraded])+len(slicesByState[core.MMIGHealthStatusActive]): ac.State = kueue.CheckStateReady + case len(slicesByState[core.MMIGHealthStatusFailed]) > 0: + ac.State = kueue.CheckStateRetry + case len(slicesByState[core.MMIGHealthStatusDeactivating])+len(slicesByState[core.MMIGHealthStatusIncomplete])+len(slicesByState[core.MMIGHealthStatusUnknown]) > 0: + ac.State = kueue.CheckStateRejected } var stateMessages []string if len(noState) > 0 { - stateMessages = append(stateMessages, fmt.Sprintf("%d Created", len(noState))) + stateMessages = append(stateMessages, fmt.Sprintf("%d 
CREATED", len(noState))) } for _, state := range core.SliceStates { @@ -652,10 +636,10 @@ func prepareAdmissionCheckStatus(ac *kueue.AdmissionCheckState, slices []v1alpha ac.Message = fmt.Sprintf("Slices are in states: %s", strings.Join(stateMessages, ", ")) - if len(slicesByState[v1alpha1.Error]) > 0 { + if len(slicesByState[core.MMIGHealthStatusFailed]) > 0 { var errMessages []string - for _, slice := range slicesByState[v1alpha1.Error] { - cond := meta.FindStatusCondition(slice.Status.Conditions, string(v1alpha1.Error)) + for _, slice := range slicesByState[core.MMIGHealthStatusFailed] { + cond := meta.FindStatusCondition(slice.Status.Conditions, string(v1alpha1.SliceStateConditionType)) errMessages = append(errMessages, cond.Message) } ac.Message += ". Errors: " + strings.Join(errMessages, "; ") diff --git a/slice/internal/controller/workload_controller_test.go b/slice/internal/controller/workload_controller_test.go index 4ed8ce2c7..758f8f2e7 100644 --- a/slice/internal/controller/workload_controller_test.go +++ b/slice/internal/controller/workload_controller_test.go @@ -298,29 +298,6 @@ func TestWorkloadReconciler(t *testing.T) { Obj(), }, }, - "should delete the finalizer because Slices with status Deformed": { - request: baseRequest, - objs: []client.Object{ - baseAdmissionCheckWrapper.DeepCopy(), - baseJobSetWrapper.DeepCopy(), - basePod1Wrapper.DeepCopy(), - baseWorkloadWrapper.Clone(). - PodSets(basePodSets...). - ReserveQuota(baseAdmission, now). - ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). - Active(false). - Finalizers(SliceControllerName). - Obj(), - baseSlice1Wrapper.Clone().Deformed().Obj(), - baseSlice2Wrapper.Clone().Deformed().Obj(), - }, - wantWorkloads: []kueue.Workload{ - *baseWorkloadWrapper.Clone(). - PodSets(basePodSets...). - ReserveQuota(baseAdmission, now). 
- ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName).Active(false).Obj(), - }, - }, "shouldn't delete the finalizer because Slices status Degraded": { request: baseRequest, objs: []client.Object{ @@ -752,7 +729,7 @@ func TestWorkloadReconciler(t *testing.T) { ReserveQuota(baseAdmission, now). ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). Finalizers(SliceControllerName). - AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Slices are in states: 2 Created`)). + AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Slices are in states: 2 CREATED`)). Obj(), }, wantSlices: []slice.Slice{ @@ -805,7 +782,7 @@ func TestWorkloadReconciler(t *testing.T) { NodeSelector("cloud.google.com/gke-tpu-accelerator", "tpu-v8x"). Obj(), ). - AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Slices are in states: 1 Created`)). + AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Slices are in states: 1 CREATED`)). Obj(), }, wantSlices: []slice.Slice{ @@ -853,7 +830,7 @@ func TestWorkloadReconciler(t *testing.T) { }, }, now, ). - AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Slices are in states: 1 Created`)). + AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Slices are in states: 1 CREATED`)). Obj(), }, wantSlices: []slice.Slice{ @@ -884,7 +861,7 @@ func TestWorkloadReconciler(t *testing.T) { ReserveQuota(baseAdmission, now). ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). Finalizers(SliceControllerName). - AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Slices are in states: 2 Created`)). + AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Slices are in states: 2 CREATED`)). Obj(), }, wantSlices: []slice.Slice{ @@ -915,7 +892,7 @@ func TestWorkloadReconciler(t *testing.T) { ReserveQuota(baseAdmission, now). ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). Finalizers(SliceControllerName). 
- AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Slices are in states: 2 Created`)). + AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Slices are in states: 2 CREATED`)). Obj(), }, wantSlices: []slice.Slice{ @@ -980,7 +957,7 @@ func TestWorkloadReconciler(t *testing.T) { }, now). ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). Finalizers(SliceControllerName). - AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Slices are in states: 2 Created`)). + AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Slices are in states: 2 CREATED`)). Obj(), }, wantSlices: []slice.Slice{ @@ -1047,7 +1024,7 @@ func TestWorkloadReconciler(t *testing.T) { ReserveQuota(baseAdmission, now). ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). Finalizers(SliceControllerName). - AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Slices are in states: 2 Created`)). + AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Slices are in states: 2 CREATED`)). Obj(), }, wantSlices: []slice.Slice{ @@ -1055,7 +1032,7 @@ func TestWorkloadReconciler(t *testing.T) { *baseSlice2Wrapper.DeepCopy(), }, }, - "should update the Workload's AdmissionCheckState when one Slice is in the Forming state": { + "should update the Workload's AdmissionCheckState when one Slice is in the Activating state": { request: baseRequest, objs: []client.Object{ worker1Node.DeepCopy(), @@ -1067,7 +1044,7 @@ func TestWorkloadReconciler(t *testing.T) { ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). Finalizers(SliceControllerName). Obj(), - baseSlice1Wrapper.Clone().Forming().Obj(), + baseSlice1Wrapper.Clone().Activating().Obj(), baseSlice2Wrapper.DeepCopy(), }, wantWorkloads: []kueue.Workload{ @@ -1076,15 +1053,15 @@ func TestWorkloadReconciler(t *testing.T) { ReserveQuota(baseAdmission, now). ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). Finalizers(SliceControllerName). 
- AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Slices are in states: 1 Created, 1 Forming`)). + AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Slices are in states: 1 CREATED, 1 ACTIVATING`)). Obj(), }, wantSlices: []slice.Slice{ - *baseSlice1Wrapper.Clone().Forming().Obj(), + *baseSlice1Wrapper.Clone().Activating().Obj(), *baseSlice2Wrapper.DeepCopy(), }, }, - "should update the Workload's AdmissionCheckState when all Slices are in the Forming state": { + "should update the Workload's AdmissionCheckState when all Slices are in the Activating state": { request: baseRequest, objs: []client.Object{ worker1Node.DeepCopy(), @@ -1096,8 +1073,8 @@ func TestWorkloadReconciler(t *testing.T) { ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). Finalizers(SliceControllerName). Obj(), - baseSlice1Wrapper.Clone().Forming().Obj(), - baseSlice2Wrapper.Clone().Forming().Obj(), + baseSlice1Wrapper.Clone().Activating().Obj(), + baseSlice2Wrapper.Clone().Activating().Obj(), }, wantWorkloads: []kueue.Workload{ *baseWorkloadWrapper.Clone(). @@ -1105,12 +1082,12 @@ func TestWorkloadReconciler(t *testing.T) { ReserveQuota(baseAdmission, now). ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). Finalizers(SliceControllerName). - AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Slices are in states: 2 Forming`)). + AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Slices are in states: 2 ACTIVATING`)). Obj(), }, wantSlices: []slice.Slice{ - *baseSlice1Wrapper.Clone().Forming().Obj(), - *baseSlice2Wrapper.Clone().Forming().Obj(), + *baseSlice1Wrapper.Clone().Activating().Obj(), + *baseSlice2Wrapper.Clone().Activating().Obj(), }, }, "should update the Workload's AdmissionCheckState when one Slice is in the Ready state": { @@ -1125,8 +1102,8 @@ func TestWorkloadReconciler(t *testing.T) { ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). Finalizers(SliceControllerName). 
Obj(), - baseSlice1Wrapper.Clone().Ready().Obj(), - baseSlice2Wrapper.Clone().Forming().Obj(), + baseSlice1Wrapper.Clone().Active().Obj(), + baseSlice2Wrapper.Clone().Activating().Obj(), }, wantWorkloads: []kueue.Workload{ *baseWorkloadWrapper.Clone(). @@ -1134,12 +1111,12 @@ func TestWorkloadReconciler(t *testing.T) { ReserveQuota(baseAdmission, now). ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). Finalizers(SliceControllerName). - AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Slices are in states: 1 Forming, 1 Ready`)). + AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Slices are in states: 1 ACTIVATING, 1 ACTIVE`)). Obj(), }, wantSlices: []slice.Slice{ - *baseSlice1Wrapper.Clone().Ready().Obj(), - *baseSlice2Wrapper.Clone().Forming().Obj(), + *baseSlice1Wrapper.Clone().Active().Obj(), + *baseSlice2Wrapper.Clone().Activating().Obj(), }, }, "should update the Workload's AdmissionCheckState when all Slices are in the Ready state": { @@ -1154,8 +1131,8 @@ func TestWorkloadReconciler(t *testing.T) { ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). Finalizers(SliceControllerName). Obj(), - baseSlice1Wrapper.Clone().Ready().Obj(), - baseSlice2Wrapper.Clone().Ready().Obj(), + baseSlice1Wrapper.Clone().Active().Obj(), + baseSlice2Wrapper.Clone().Active().Obj(), }, wantWorkloads: []kueue.Workload{ *baseWorkloadWrapper.Clone(). @@ -1163,12 +1140,12 @@ func TestWorkloadReconciler(t *testing.T) { ReserveQuota(baseAdmission, now). ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). Finalizers(SliceControllerName). - AdmissionCheck(buildAdmissionCheckState(kueue.CheckStateReady, `Slices are in states: 2 Ready`)). + AdmissionCheck(buildAdmissionCheckState(kueue.CheckStateReady, `Slices are in states: 2 ACTIVE`)). 
Obj(), }, wantSlices: []slice.Slice{ - *baseSlice1Wrapper.Clone().Ready().Obj(), - *baseSlice2Wrapper.Clone().Ready().Obj(), + *baseSlice1Wrapper.Clone().Active().Obj(), + *baseSlice2Wrapper.Clone().Active().Obj(), }, wantEvents: []utiltesting.EventRecord{ buildEventRecord(corev1.EventTypeNormal, AdmissionCheckUpdatedEventType, @@ -1187,7 +1164,7 @@ func TestWorkloadReconciler(t *testing.T) { ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). Finalizers(SliceControllerName). Obj(), - baseSlice1Wrapper.Clone().Ready().Obj(), + baseSlice1Wrapper.Clone().Active().Obj(), baseSlice2Wrapper.Clone().Degraded().Obj(), }, wantWorkloads: []kueue.Workload{ @@ -1196,18 +1173,18 @@ func TestWorkloadReconciler(t *testing.T) { ReserveQuota(baseAdmission, now). ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). Finalizers(SliceControllerName). - AdmissionCheck(buildAdmissionCheckState(kueue.CheckStateReady, `Slices are in states: 1 Degraded, 1 Ready`)). + AdmissionCheck(buildAdmissionCheckState(kueue.CheckStateReady, `Slices are in states: 1 ACTIVE, 1 ACTIVE_DEGRADED`)). Obj(), }, wantSlices: []slice.Slice{ - *baseSlice1Wrapper.Clone().Ready().Obj(), + *baseSlice1Wrapper.Clone().Active().Obj(), *baseSlice2Wrapper.Clone().Degraded().Obj()}, wantEvents: []utiltesting.EventRecord{ buildEventRecord(corev1.EventTypeNormal, AdmissionCheckUpdatedEventType, fmt.Sprintf(`Admission check %q updated state from "Pending" to "Ready"`, baseACName)), }, }, - "should update the Workload's AdmissionCheckState when one Slice is in the Error state": { + "should update the Workload's AdmissionCheckState when one Slice is in the Failed state": { request: baseRequest, objs: []client.Object{ worker1Node.DeepCopy(), @@ -1219,8 +1196,8 @@ func TestWorkloadReconciler(t *testing.T) { ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). Finalizers(SliceControllerName). 
Obj(), - baseSlice1Wrapper.Clone().Ready().Obj(), - baseSlice2Wrapper.Clone().Error().Obj(), + baseSlice1Wrapper.Clone().Active().Obj(), + baseSlice2Wrapper.Clone().Failed().Obj(), }, wantWorkloads: []kueue.Workload{ *baseWorkloadWrapper.Clone(). @@ -1228,18 +1205,18 @@ ReserveQuota(baseAdmission, now). ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). Finalizers(SliceControllerName). - AdmissionCheck(buildAdmissionCheckState(kueue.CheckStateRetry, `Slices are in states: 1 Error, 1 Ready. Errors: Error by test`)). + AdmissionCheck(buildAdmissionCheckState(kueue.CheckStateRetry, `Slices are in states: 1 ACTIVE, 1 FAILED. Errors: Error by test`)). Obj(), }, wantSlices: []slice.Slice{ - *baseSlice1Wrapper.Clone().Ready().Obj(), + *baseSlice1Wrapper.Clone().Active().Obj(), }, wantEvents: []utiltesting.EventRecord{ buildEventRecord(corev1.EventTypeNormal, AdmissionCheckUpdatedEventType, fmt.Sprintf(`Admission check %q updated state from "Pending" to "Retry"`, baseACName)), }, }, - "should update the Workload's AdmissionCheckState when one Slice is in the Deformed state": { + "should delete a stale slice": { request: baseRequest, objs: []client.Object{ worker1Node.DeepCopy(), @@ -1251,8 +1228,8 @@ ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). Finalizers(SliceControllerName). Obj(), - baseSlice1Wrapper.Clone().Ready().Obj(), - baseSlice2Wrapper.Clone().Deformed().Obj(), + baseSlice1Wrapper.Clone().Active().Obj(), + baseSlice2Wrapper.Clone().Stale().Obj(), }, wantWorkloads: []kueue.Workload{ *baseWorkloadWrapper.Clone(). @@ -1260,15 +1237,11 @@ ReserveQuota(baseAdmission, now). ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). Finalizers(SliceControllerName). - AdmissionCheck(buildAdmissionCheckState(kueue.CheckStateRejected, `Slices are in states: 1 Deformed, 1 Ready`)). 
+ AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Slices are in states: 1 ACTIVATING, 1 ACTIVE`)). Obj(), }, wantSlices: []slice.Slice{ - *baseSlice1Wrapper.Clone().Ready().Obj(), - *baseSlice2Wrapper.Clone().Deformed().Obj()}, - wantEvents: []utiltesting.EventRecord{ - buildEventRecord(corev1.EventTypeNormal, AdmissionCheckUpdatedEventType, - fmt.Sprintf(`Admission check %q updated state from "Pending" to "Rejected"`, baseACName)), + *baseSlice1Wrapper.Clone().Active().Obj(), }, }, "should use the first AdmissionCheck if more than one is found": { @@ -1284,8 +1257,8 @@ func TestWorkloadReconciler(t *testing.T) { ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). Finalizers(SliceControllerName). Obj(), - baseSlice1Wrapper.Clone().Ready().Obj(), - baseSlice2Wrapper.Clone().Ready().Obj(), + baseSlice1Wrapper.Clone().Active().Obj(), + baseSlice2Wrapper.Clone().Active().Obj(), }, wantWorkloads: []kueue.Workload{ *baseWorkloadWrapper.Clone(). @@ -1293,12 +1266,12 @@ func TestWorkloadReconciler(t *testing.T) { ReserveQuota(baseAdmission, now). ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). Finalizers(SliceControllerName). - AdmissionCheck(buildAdmissionCheckState(kueue.CheckStateReady, `Slices are in states: 2 Ready`)). + AdmissionCheck(buildAdmissionCheckState(kueue.CheckStateReady, `Slices are in states: 2 ACTIVE`)). Obj(), }, wantSlices: []slice.Slice{ - *baseSlice1Wrapper.Clone().Ready().Obj(), - *baseSlice2Wrapper.Clone().Ready().Obj(), + *baseSlice1Wrapper.Clone().Active().Obj(), + *baseSlice2Wrapper.Clone().Active().Obj(), }, wantEvents: []utiltesting.EventRecord{ buildEventRecord(corev1.EventTypeNormal, AdmissionCheckUpdatedEventType, diff --git a/slice/internal/core/constants.go b/slice/internal/core/constants.go index 84d49e47b..15770ac91 100644 --- a/slice/internal/core/constants.go +++ b/slice/internal/core/constants.go @@ -16,6 +16,8 @@ limitations under the License. 
package core +type MMIGHealthStatus string + const ( TPUTopologyAnnotation = "cloud.google.com/gke-tpu-topology" TPUAcceleratorLabel = "cloud.google.com/gke-tpu-accelerator" @@ -26,4 +28,19 @@ const ( TPUSliceHealthNodeSelectorValue = "true" AcceleratorTpu7x = "tpu-v7x" + + // MMIGHealthStatusIncomplete indicates the MMIG is incomplete. + MMIGHealthStatusIncomplete MMIGHealthStatus = "INCOMPLETE" + // MMIGHealthStatusActivating indicates the MMIG is activating. + MMIGHealthStatusActivating MMIGHealthStatus = "ACTIVATING" + // MMIGHealthStatusActive indicates the MMIG is active. + MMIGHealthStatusActive MMIGHealthStatus = "ACTIVE" + // MMIGHealthStatusActiveDegraded indicates the MMIG is active but degraded. + MMIGHealthStatusActiveDegraded MMIGHealthStatus = "ACTIVE_DEGRADED" + // MMIGHealthStatusDeactivating indicates the MMIG is deactivating. + MMIGHealthStatusDeactivating MMIGHealthStatus = "DEACTIVATING" + // MMIGHealthStatusFailed indicates the MMIG has failed. + MMIGHealthStatusFailed MMIGHealthStatus = "FAILED" + // MMIGHealthStatusUnknown indicates the MMIG health is unknown. 
+ MMIGHealthStatusUnknown MMIGHealthStatus = "UNKNOWN" ) diff --git a/slice/internal/core/core.go b/slice/internal/core/core.go index f604c43c4..7fae43536 100644 --- a/slice/internal/core/core.go +++ b/slice/internal/core/core.go @@ -20,12 +20,12 @@ import ( "regexp" corev1 "k8s.io/api/core/v1" - - "tpu-slice-controller/api/v1alpha1" ) -var SliceStates = []v1alpha1.SliceConditionType{ - v1alpha1.Error, v1alpha1.Deformed, v1alpha1.Forming, v1alpha1.Degraded, v1alpha1.Ready, +var SliceStates = []MMIGHealthStatus{ + MMIGHealthStatusActivating, MMIGHealthStatusActive, MMIGHealthStatusActiveDegraded, + MMIGHealthStatusDeactivating, MMIGHealthStatusFailed, MMIGHealthStatusIncomplete, + MMIGHealthStatusUnknown, } func IsValidTPUTopology(tpuTopology string) bool { diff --git a/slice/internal/core/slice.go b/slice/internal/core/slice.go index 4cc7a66fb..a8e1a73bb 100644 --- a/slice/internal/core/slice.go +++ b/slice/internal/core/slice.go @@ -16,6 +16,7 @@ package core import ( "fmt" + "time" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -25,6 +26,10 @@ import ( "tpu-slice-controller/api/v1alpha1" ) +const ( + activationTimeout = 3 * time.Minute +) + func SliceKeyFromWorkload(wl *kueue.Workload, podSetName kueue.PodSetReference) client.ObjectKey { slice := SliceWithMetadata(wl, podSetName) return client.ObjectKeyFromObject(slice) @@ -43,10 +48,12 @@ func SliceName(workloadName string, podSetName kueue.PodSetReference) string { return fmt.Sprintf("%s-%s", workloadName, podSetName) } -func Deformed(slice *v1alpha1.Slice) bool { - return meta.IsStatusConditionTrue(slice.Status.Conditions, string(v1alpha1.Deformed)) +func IsStale(slice *v1alpha1.Slice) bool { + cond := meta.FindStatusCondition(slice.Status.Conditions, string(v1alpha1.SliceStateConditionType)) + return cond != nil && cond.Status == metav1.ConditionFalse && time.Since(cond.LastTransitionTime.Time) >= activationTimeout } func IsError(slice *v1alpha1.Slice) bool { - return 
meta.IsStatusConditionTrue(slice.Status.Conditions, string(v1alpha1.Error)) + cond := meta.FindStatusCondition(slice.Status.Conditions, string(v1alpha1.SliceStateConditionType)) + return cond != nil && cond.Status == metav1.ConditionFalse && cond.Reason == string(MMIGHealthStatusFailed) } diff --git a/slice/internal/util/testing/wrappers.go b/slice/internal/util/testing/wrappers.go index 39d81ea15..de93bc4e1 100644 --- a/slice/internal/util/testing/wrappers.go +++ b/slice/internal/util/testing/wrappers.go @@ -33,6 +33,7 @@ import ( kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1" "tpu-slice-controller/api/v1alpha1" + "tpu-slice-controller/internal/core" ) // JobTemplateWrapper wraps a JobTemplateSpec. @@ -274,7 +275,7 @@ func (s *SliceWrapper) Name(name string) *SliceWrapper { } func (s *SliceWrapper) Type(Type string) *SliceWrapper { - s.Spec.Type = Type + s.Spec.Type = v1alpha1.Type(Type) return s } @@ -293,61 +294,61 @@ func (s *SliceWrapper) PartitionIDs(ids ...string) *SliceWrapper { return s } -func (s *SliceWrapper) Ready() *SliceWrapper { +func (s *SliceWrapper) Active() *SliceWrapper { cond := metav1.Condition{ - Type: string(v1alpha1.Ready), + Type: string(v1alpha1.SliceStateConditionType), Status: metav1.ConditionTrue, LastTransitionTime: metav1.Now(), - Reason: "ByTest", - Message: "Ready by test", + Reason: string(core.MMIGHealthStatusActive), + Message: "Active by test", } apimeta.SetStatusCondition(&s.Status.Conditions, cond) return s } -func (s *SliceWrapper) Forming() *SliceWrapper { +func (s *SliceWrapper) Activating() *SliceWrapper { cond := metav1.Condition{ - Type: string(v1alpha1.Forming), - Status: metav1.ConditionTrue, + Type: string(v1alpha1.SliceStateConditionType), + Status: metav1.ConditionFalse, LastTransitionTime: metav1.Now(), - Reason: "ByTest", - Message: "Forming by test", + Reason: string(core.MMIGHealthStatusActivating), + Message: "Activating by test", } apimeta.SetStatusCondition(&s.Status.Conditions, cond) return s } -func (s 
*SliceWrapper) Deformed() *SliceWrapper { +func (s *SliceWrapper) Degraded() *SliceWrapper { cond := metav1.Condition{ - Type: string(v1alpha1.Deformed), + Type: string(v1alpha1.SliceStateConditionType), Status: metav1.ConditionTrue, LastTransitionTime: metav1.Now(), - Reason: "ByTest", - Message: "Deformed by test", + Reason: string(core.MMIGHealthStatusActiveDegraded), + Message: "Degraded by test", } apimeta.SetStatusCondition(&s.Status.Conditions, cond) return s } -func (s *SliceWrapper) Degraded() *SliceWrapper { +func (s *SliceWrapper) Failed() *SliceWrapper { cond := metav1.Condition{ - Type: string(v1alpha1.Degraded), - Status: metav1.ConditionTrue, + Type: string(v1alpha1.SliceStateConditionType), + Status: metav1.ConditionFalse, LastTransitionTime: metav1.Now(), - Reason: "ByTest", - Message: "Degraded by test", + Reason: string(core.MMIGHealthStatusFailed), + Message: "Error by test", } apimeta.SetStatusCondition(&s.Status.Conditions, cond) return s } -func (s *SliceWrapper) Error() *SliceWrapper { +func (s *SliceWrapper) Stale() *SliceWrapper { cond := metav1.Condition{ - Type: string(v1alpha1.Error), - Status: metav1.ConditionTrue, - LastTransitionTime: metav1.Now(), - Reason: "ByTest", - Message: "Error by test", + Type: string(v1alpha1.SliceStateConditionType), + Status: metav1.ConditionFalse, + LastTransitionTime: metav1.NewTime(time.Now().Add(-3 * time.Minute)), + Reason: string(core.MMIGHealthStatusActivating), + Message: "Stale by test", } apimeta.SetStatusCondition(&s.Status.Conditions, cond) return s diff --git a/slice/test/e2e/jobset_test.go b/slice/test/e2e/jobset_test.go index 07b70c1f0..cb344e5c5 100644 --- a/slice/test/e2e/jobset_test.go +++ b/slice/test/e2e/jobset_test.go @@ -19,6 +19,7 @@ package e2e import ( "fmt" "strconv" + "time" "github.com/google/go-cmp/cmp/cmpopts" "github.com/onsi/ginkgo/v2" @@ -222,7 +223,7 @@ var _ = ginkgo.Describe("JobSet", func() { 
g.Expect(createdSlice.Spec.PartitionIDs).To(gomega.HaveLen(len(tc.wantPartitionIDs))) g.Expect(createdSlice.Spec.PartitionIDs).To(gomega.BeComparableTo(tc.wantPartitionIDs)) g.Expect(createdSlice.Spec.Topology).To(gomega.Equal(tc.tpuTopology)) - g.Expect(createdSlice.Spec.Type).To(gomega.Equal("tpu-v7x")) + g.Expect(createdSlice.Spec.Type).To(gomega.Equal(slice.Type("tpu-v7x"))) }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) }) @@ -233,7 +234,7 @@ var _ = ginkgo.Describe("JobSet", func() { g.Expect(createdWorkload.Status.AdmissionChecks).Should(gomega.BeComparableTo([]kueue.AdmissionCheckState{{ Name: kueue.AdmissionCheckReference(ac.Name), State: kueue.CheckStatePending, - Message: `Slices are in states: 1 Created`, + Message: `Slices are in states: 1 CREATED`, }}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime", "PodSetUpdates"))) }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) }) @@ -242,9 +243,9 @@ var _ = ginkgo.Describe("JobSet", func() { gomega.Eventually(func(g gomega.Gomega) { g.Expect(k8sClient.Get(ctx, sliceKey, createdSlice)).To(gomega.Succeed()) meta.SetStatusCondition(&createdSlice.Status.Conditions, metav1.Condition{ - Type: string(slice.Forming), - Status: metav1.ConditionTrue, - Reason: "Test", + Type: string(slice.SliceStateConditionType), + Status: metav1.ConditionFalse, + Reason: string(core.MMIGHealthStatusActivating), Message: "Test", }) g.Expect(k8sClient.Status().Update(ctx, createdSlice)).To(gomega.Succeed()) @@ -258,7 +259,7 @@ var _ = ginkgo.Describe("JobSet", func() { g.Expect(createdWorkload.Status.AdmissionChecks).Should(gomega.BeComparableTo([]kueue.AdmissionCheckState{{ Name: kueue.AdmissionCheckReference(ac.Name), State: kueue.CheckStatePending, - Message: `Slices are in states: 1 Forming`, + Message: `Slices are in states: 1 ACTIVATING`, }}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime", "PodSetUpdates"))) }, utils.LongTimeout, 
utils.Interval).Should(gomega.Succeed()) }) @@ -267,15 +268,9 @@ var _ = ginkgo.Describe("JobSet", func() { gomega.Eventually(func(g gomega.Gomega) { g.Expect(k8sClient.Get(ctx, sliceKey, createdSlice)).To(gomega.Succeed()) meta.SetStatusCondition(&createdSlice.Status.Conditions, metav1.Condition{ - Type: string(slice.Forming), - Status: metav1.ConditionFalse, - Reason: "Test", - Message: "Test", - }) - meta.SetStatusCondition(&createdSlice.Status.Conditions, metav1.Condition{ - Type: string(slice.Ready), + Type: string(slice.SliceStateConditionType), Status: metav1.ConditionTrue, - Reason: "Test", + Reason: string(core.MMIGHealthStatusActive), Message: "Test", }) g.Expect(k8sClient.Status().Update(ctx, createdSlice)).To(gomega.Succeed()) @@ -289,7 +284,7 @@ var _ = ginkgo.Describe("JobSet", func() { g.Expect(createdWorkload.Status.AdmissionChecks).Should(gomega.BeComparableTo([]kueue.AdmissionCheckState{{ Name: kueue.AdmissionCheckReference(ac.Name), State: kueue.CheckStateReady, - Message: `Slices are in states: 1 Ready`, + Message: `Slices are in states: 1 ACTIVE`, }}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime", "PodSetUpdates"))) }, utils.LongTimeout, utils.Timeout).Should(gomega.Succeed()) }) @@ -497,15 +492,9 @@ var _ = ginkgo.Describe("JobSet", func() { gomega.Eventually(func(g gomega.Gomega) { g.Expect(k8sClient.Get(ctx, sliceKey, createdSlice)).To(gomega.Succeed()) meta.SetStatusCondition(&createdSlice.Status.Conditions, metav1.Condition{ - Type: string(slice.Forming), - Status: metav1.ConditionFalse, - Reason: "Test", - Message: "Test", - }) - meta.SetStatusCondition(&createdSlice.Status.Conditions, metav1.Condition{ - Type: string(slice.Ready), + Type: string(slice.SliceStateConditionType), Status: metav1.ConditionTrue, - Reason: "Test", + Reason: string(core.MMIGHealthStatusActive), Message: "Test", }) g.Expect(k8sClient.Status().Update(ctx, createdSlice)).To(gomega.Succeed()) @@ -609,7 +598,7 @@ var _ = 
ginkgo.Describe("JobSet", func() { g.Expect(createdWorkload.Status.AdmissionChecks).Should(gomega.BeComparableTo([]kueue.AdmissionCheckState{{ Name: kueue.AdmissionCheckReference(ac.Name), State: kueue.CheckStatePending, - Message: `Slices are in states: 1 Created`, + Message: `Slices are in states: 1 CREATED`, }}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime", "PodSetUpdates"))) }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) }) @@ -618,9 +607,9 @@ var _ = ginkgo.Describe("JobSet", func() { gomega.Eventually(func(g gomega.Gomega) { g.Expect(k8sClient.Get(ctx, sliceKey, createdSlice)).To(gomega.Succeed()) meta.SetStatusCondition(&createdSlice.Status.Conditions, metav1.Condition{ - Type: string(slice.Error), - Status: metav1.ConditionTrue, - Reason: "TestError", + Type: string(slice.SliceStateConditionType), + Status: metav1.ConditionFalse, + Reason: string(core.MMIGHealthStatusFailed), Message: "Slice has an error", }) g.Expect(k8sClient.Status().Update(ctx, createdSlice)).To(gomega.Succeed()) @@ -640,7 +629,7 @@ var _ = ginkgo.Describe("JobSet", func() { g.Expect(createdWorkload.Status.AdmissionChecks).Should(gomega.BeComparableTo([]kueue.AdmissionCheckState{{ Name: kueue.AdmissionCheckReference(ac.Name), State: kueue.CheckStatePending, - Message: `Slices are in states: 1 Created`, + Message: `Slices are in states: 1 CREATED`, }}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime", "PodSetUpdates"))) }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) }) @@ -649,9 +638,9 @@ var _ = ginkgo.Describe("JobSet", func() { gomega.Eventually(func(g gomega.Gomega) { g.Expect(k8sClient.Get(ctx, sliceKey, createdSlice)).To(gomega.Succeed()) meta.SetStatusCondition(&createdSlice.Status.Conditions, metav1.Condition{ - Type: string(slice.Ready), + Type: string(slice.SliceStateConditionType), Status: metav1.ConditionTrue, - Reason: "TestReady", + Reason: string(core.MMIGHealthStatusActive), Message: "Slice is ready", 
}) g.Expect(k8sClient.Status().Update(ctx, createdSlice)).To(gomega.Succeed()) @@ -664,7 +653,7 @@ var _ = ginkgo.Describe("JobSet", func() { g.Expect(createdWorkload.Status.AdmissionChecks).Should(gomega.BeComparableTo([]kueue.AdmissionCheckState{{ Name: kueue.AdmissionCheckReference(ac.Name), State: kueue.CheckStateReady, - Message: `Slices are in states: 1 Ready`, + Message: `Slices are in states: 1 ACTIVE`, }}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime", "PodSetUpdates"))) }, utils.LongTimeout, utils.Interval).Should(gomega.Succeed()) }) @@ -728,7 +717,12 @@ var _ = ginkgo.Describe("JobSet", func() { gomega.Eventually(func(g gomega.Gomega) { g.Expect(k8sClient.Get(ctx, sliceKey, createdSlice)).To(gomega.Succeed()) oldSliceUID = createdSlice.GetUID() - meta.SetStatusCondition(&createdSlice.Status.Conditions, metav1.Condition{Type: string(slice.Ready), Status: metav1.ConditionTrue, Reason: "TestReady"}) + meta.SetStatusCondition( + &createdSlice.Status.Conditions, + metav1.Condition{ + Type: string(slice.SliceStateConditionType), + Status: metav1.ConditionTrue, + Reason: string(core.MMIGHealthStatusActive)}) g.Expect(k8sClient.Status().Update(ctx, createdSlice)).To(gomega.Succeed()) }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) }) @@ -739,7 +733,7 @@ var _ = ginkgo.Describe("JobSet", func() { g.Expect(createdWorkload.Status.AdmissionChecks).Should(gomega.BeComparableTo([]kueue.AdmissionCheckState{{ Name: kueue.AdmissionCheckReference(ac.Name), State: kueue.CheckStateReady, - Message: `Slices are in states: 1 Ready`, + Message: `Slices are in states: 1 ACTIVE`, }}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime", "PodSetUpdates"))) }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) }) @@ -747,7 +741,12 @@ var _ = ginkgo.Describe("JobSet", func() { ginkgo.By("Changing Slice condition to error", func() { gomega.Eventually(func(g gomega.Gomega) { g.Expect(k8sClient.Get(ctx, sliceKey, 
createdSlice)).To(gomega.Succeed()) - meta.SetStatusCondition(&createdSlice.Status.Conditions, metav1.Condition{Type: string(slice.Error), Status: metav1.ConditionTrue, Reason: "TestError", Message: "Slice has an error"}) + meta.SetStatusCondition(&createdSlice.Status.Conditions, + metav1.Condition{ + Type: string(slice.SliceStateConditionType), + Status: metav1.ConditionFalse, + Reason: string(core.MMIGHealthStatusFailed), + Message: "Slice has an error"}) g.Expect(k8sClient.Status().Update(ctx, createdSlice)).To(gomega.Succeed()) }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) }) @@ -758,7 +757,7 @@ var _ = ginkgo.Describe("JobSet", func() { g.Expect(createdWorkload.Status.AdmissionChecks).Should(gomega.BeComparableTo([]kueue.AdmissionCheckState{{ Name: kueue.AdmissionCheckReference(ac.Name), State: kueue.CheckStatePending, - Message: `Slices are in states: 1 Created`, + Message: `Slices are in states: 1 CREATED`, }}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime", "PodSetUpdates"))) }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) }) @@ -774,9 +773,9 @@ var _ = ginkgo.Describe("JobSet", func() { gomega.Eventually(func(g gomega.Gomega) { g.Expect(k8sClient.Get(ctx, sliceKey, createdSlice)).To(gomega.Succeed()) meta.SetStatusCondition(&createdSlice.Status.Conditions, metav1.Condition{ - Type: string(slice.Ready), + Type: string(slice.SliceStateConditionType), Status: metav1.ConditionTrue, - Reason: "TestReady", + Reason: string(core.MMIGHealthStatusActive), Message: "Slice is ready", }) g.Expect(k8sClient.Status().Update(ctx, createdSlice)).To(gomega.Succeed()) @@ -789,12 +788,108 @@ var _ = ginkgo.Describe("JobSet", func() { g.Expect(createdWorkload.Status.AdmissionChecks).Should(gomega.BeComparableTo([]kueue.AdmissionCheckState{{ Name: kueue.AdmissionCheckReference(ac.Name), State: kueue.CheckStateReady, - Message: `Slices are in states: 1 Ready`, + Message: `Slices are in states: 1 ACTIVE`, }}, 
cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime", "PodSetUpdates"))) }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) }) }) + ginkgo.It("should recover after Slice is stale during initialization", func() { + jobSet := testingjobsjobset.MakeJobSet("jobset", ns.Name). + Queue(lq.Name). + ReplicatedJobs( + testingjobsjobset.ReplicatedJobRequirements{ + Name: "rj1", + Image: utils.E2eTestAgnHostImage, + Args: utils.BehaviorWaitForDeletion, + Replicas: 1, + Parallelism: 1, + Completions: 1, + PodAnnotations: map[string]string{ + "cloud.google.com/gke-tpu-topology": "4x4x4", + }, + NodeSelector: map[string]string{ + "cloud.google.com/gke-tpu-accelerator": "tpu-v7x", + }, + }, + ). + RequestAndLimit("rj1", extraResource, "1"). + Obj() + + ginkgo.By("Creating a JobSet", func() { + utils.MustCreate(ctx, k8sClient, jobSet) + }) + + createdWorkload := &kueue.Workload{} + wlKey := types.NamespacedName{ + Name: jobsetcontroller.GetWorkloadNameForJobSet(jobSet.Name, jobSet.UID), + Namespace: ns.Name, + } + + ginkgo.By("Waiting for Admission of the Workload", func() { + gomega.Eventually(func(g gomega.Gomega) { + g.Expect(k8sClient.Get(ctx, wlKey, createdWorkload)).Should(gomega.Succeed()) + g.Expect(createdWorkload.Status.Admission).ShouldNot(gomega.BeNil()) + }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) + }) + + createdSlice := &slice.Slice{} + sliceKey := core.SliceKeyFromWorkload(createdWorkload, "rj1") + + var oldSliceUID types.UID + ginkgo.By("Checking that Slice is created", func() { + gomega.Eventually(func(g gomega.Gomega) { + g.Expect(k8sClient.Get(ctx, sliceKey, createdSlice)).To(gomega.Succeed()) + oldSliceUID = createdSlice.GetUID() + }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) + }) + + ginkgo.By("Setting Slice state to forming", func() { + gomega.Eventually(func(g gomega.Gomega) { + g.Expect(k8sClient.Get(ctx, sliceKey, createdSlice)).To(gomega.Succeed()) + 
meta.SetStatusCondition(&createdSlice.Status.Conditions, metav1.Condition{ + Type: string(slice.SliceStateConditionType), + Status: metav1.ConditionFalse, + Reason: string(core.MMIGHealthStatusActivating), + LastTransitionTime: metav1.NewTime(time.Now().Add(-3 * time.Minute)), + Message: "Slice is stale", + }) + g.Expect(k8sClient.Status().Update(ctx, createdSlice)).To(gomega.Succeed()) + }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) + }) + + ginkgo.By("Checking that a new Slice is created", func() { + gomega.Eventually(func(g gomega.Gomega) { + g.Expect(k8sClient.Get(ctx, sliceKey, createdSlice)).To(gomega.Succeed()) + g.Expect(createdSlice.GetUID()).ShouldNot(gomega.Equal(oldSliceUID)) + }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) + }) + + ginkgo.By("Adding ready condition to the new Slice", func() { + gomega.Eventually(func(g gomega.Gomega) { + g.Expect(k8sClient.Get(ctx, sliceKey, createdSlice)).To(gomega.Succeed()) + meta.SetStatusCondition(&createdSlice.Status.Conditions, metav1.Condition{ + Type: string(slice.SliceStateConditionType), + Status: metav1.ConditionTrue, + Reason: string(core.MMIGHealthStatusActive), + Message: "Slice is ready", + }) + g.Expect(k8sClient.Status().Update(ctx, createdSlice)).To(gomega.Succeed()) + }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) + }) + + ginkgo.By("Checking that the Admission Check state is ready", func() { + gomega.Eventually(func(g gomega.Gomega) { + g.Expect(k8sClient.Get(ctx, wlKey, createdWorkload)).Should(gomega.Succeed()) + g.Expect(createdWorkload.Status.AdmissionChecks).Should(gomega.BeComparableTo([]kueue.AdmissionCheckState{{ + Name: kueue.AdmissionCheckReference(ac.Name), + State: kueue.CheckStateReady, + Message: `Slices are in states: 1 ACTIVE`, + }}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime", "PodSetUpdates"))) + }, utils.LongTimeout, utils.Interval).Should(gomega.Succeed()) + }) + }) + ginkgo.It("should create multiple Slices", 
func() { jobSet := testingjobsjobset.MakeJobSet("jobset", ns.Name). Queue(lq.Name). @@ -937,7 +1032,7 @@ var _ = ginkgo.Describe("JobSet", func() { g.Expect(createdWorkload.Status.AdmissionChecks).Should(gomega.BeComparableTo([]kueue.AdmissionCheckState{{ Name: kueue.AdmissionCheckReference(ac.Name), State: kueue.CheckStatePending, - Message: `Slices are in states: 2 Created`, + Message: `Slices are in states: 2 CREATED`, }}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime", "PodSetUpdates"))) }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) }) @@ -946,9 +1041,9 @@ var _ = ginkgo.Describe("JobSet", func() { gomega.Eventually(func(g gomega.Gomega) { g.Expect(k8sClient.Get(ctx, sliceKey1, createdSlice1)).To(gomega.Succeed()) meta.SetStatusCondition(&createdSlice1.Status.Conditions, metav1.Condition{ - Type: string(slice.Forming), - Status: metav1.ConditionTrue, - Reason: "Test", + Type: string(slice.SliceStateConditionType), + Status: metav1.ConditionFalse, + Reason: string(core.MMIGHealthStatusActivating), Message: "Test", }) g.Expect(k8sClient.Status().Update(ctx, createdSlice1)).To(gomega.Succeed()) @@ -962,18 +1057,18 @@ var _ = ginkgo.Describe("JobSet", func() { g.Expect(createdWorkload.Status.AdmissionChecks).Should(gomega.BeComparableTo([]kueue.AdmissionCheckState{{ Name: kueue.AdmissionCheckReference(ac.Name), State: kueue.CheckStatePending, - Message: `Slices are in states: 1 Created, 1 Forming`, + Message: `Slices are in states: 1 CREATED, 1 ACTIVATING`, }}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime", "PodSetUpdates"))) }, utils.LongTimeout, utils.Interval).Should(gomega.Succeed()) }) - ginkgo.By("Adding Forming condition for Slice 2", func() { + ginkgo.By("Adding Activating condition for Slice 2", func() { gomega.Eventually(func(g gomega.Gomega) { g.Expect(k8sClient.Get(ctx, sliceKey2, createdSlice2)).To(gomega.Succeed()) meta.SetStatusCondition(&createdSlice2.Status.Conditions, metav1.Condition{ 
- Type: string(slice.Forming), - Status: metav1.ConditionTrue, - Reason: "Test", + Type: string(slice.SliceStateConditionType), + Status: metav1.ConditionFalse, + Reason: string(core.MMIGHealthStatusActivating), Message: "Test", }) g.Expect(k8sClient.Status().Update(ctx, createdSlice2)).To(gomega.Succeed()) @@ -987,24 +1082,18 @@ var _ = ginkgo.Describe("JobSet", func() { g.Expect(createdWorkload.Status.AdmissionChecks).Should(gomega.BeComparableTo([]kueue.AdmissionCheckState{{ Name: kueue.AdmissionCheckReference(ac.Name), State: kueue.CheckStatePending, - Message: `Slices are in states: 2 Forming`, + Message: `Slices are in states: 2 ACTIVATING`, }}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime", "PodSetUpdates"))) }, utils.LongTimeout, utils.Interval).Should(gomega.Succeed()) }) - ginkgo.By("Adding Ready condition for Slice 1", func() { + ginkgo.By("Adding Active condition for Slice 1", func() { gomega.Eventually(func(g gomega.Gomega) { g.Expect(k8sClient.Get(ctx, sliceKey1, createdSlice1)).To(gomega.Succeed()) meta.SetStatusCondition(&createdSlice1.Status.Conditions, metav1.Condition{ - Type: string(slice.Forming), - Status: metav1.ConditionFalse, - Reason: "Test", - Message: "Test", - }) - meta.SetStatusCondition(&createdSlice1.Status.Conditions, metav1.Condition{ - Type: string(slice.Ready), + Type: string(slice.SliceStateConditionType), Status: metav1.ConditionTrue, - Reason: "Test", + Reason: string(core.MMIGHealthStatusActive), Message: "Test", }) g.Expect(k8sClient.Status().Update(ctx, createdSlice1)).To(gomega.Succeed()) @@ -1018,24 +1107,18 @@ var _ = ginkgo.Describe("JobSet", func() { g.Expect(createdWorkload.Status.AdmissionChecks).Should(gomega.BeComparableTo([]kueue.AdmissionCheckState{{ Name: kueue.AdmissionCheckReference(ac.Name), State: kueue.CheckStatePending, - Message: `Slices are in states: 1 Forming, 1 Ready`, + Message: `Slices are in states: 1 ACTIVATING, 1 ACTIVE`, }}, 
cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime", "PodSetUpdates"))) }, utils.LongTimeout, utils.Interval).Should(gomega.Succeed()) }) - ginkgo.By("Adding Ready condition for Slice 2", func() { + ginkgo.By("Adding Active condition for Slice 2", func() { gomega.Eventually(func(g gomega.Gomega) { g.Expect(k8sClient.Get(ctx, sliceKey2, createdSlice2)).To(gomega.Succeed()) meta.SetStatusCondition(&createdSlice2.Status.Conditions, metav1.Condition{ - Type: string(slice.Forming), - Status: metav1.ConditionFalse, - Reason: "Test", - Message: "Test", - }) - meta.SetStatusCondition(&createdSlice2.Status.Conditions, metav1.Condition{ - Type: string(slice.Ready), + Type: string(slice.SliceStateConditionType), Status: metav1.ConditionTrue, - Reason: "Test", + Reason: string(core.MMIGHealthStatusActive), Message: "Test", }) g.Expect(k8sClient.Status().Update(ctx, createdSlice2)).To(gomega.Succeed()) @@ -1049,7 +1132,7 @@ var _ = ginkgo.Describe("JobSet", func() { g.Expect(createdWorkload.Status.AdmissionChecks).Should(gomega.BeComparableTo([]kueue.AdmissionCheckState{{ Name: kueue.AdmissionCheckReference(ac.Name), State: kueue.CheckStateReady, - Message: `Slices are in states: 2 Ready`, + Message: `Slices are in states: 2 ACTIVE`, }}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime", "PodSetUpdates"))) }, utils.LongTimeout, utils.Timeout).Should(gomega.Succeed()) })