Skip to content

Commit c3775bf

Browse files
authored
[Cherry-pick][Bug][GCS FT] Clean up the Redis key before the head Pod is deleted (#1989) (#2017)
1 parent e64a9b6 commit c3775bf

File tree

2 files changed

+11
-13
lines changed

2 files changed

+11
-13
lines changed

ray-operator/controllers/ray/raycluster_controller.go

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -179,22 +179,22 @@ func (r *RayClusterReconciler) Reconcile(ctx context.Context, request ctrl.Reque
179179
return ctrl.Result{}, client.IgnoreNotFound(err)
180180
}
181181

182-
func (r *RayClusterReconciler) deleteAllPods(ctx context.Context, namespace string, filterLabels client.MatchingLabels) (active int, pods corev1.PodList, err error) {
182+
func (r *RayClusterReconciler) deleteAllPods(ctx context.Context, namespace string, filterLabels client.MatchingLabels) (pods corev1.PodList, err error) {
183183
logger := ctrl.LoggerFrom(ctx)
184184
if err = r.List(ctx, &pods, client.InNamespace(namespace), filterLabels); err != nil {
185-
return 0, pods, err
185+
return pods, err
186186
}
187-
active = 0
187+
active := 0
188188
for _, pod := range pods.Items {
189189
if pod.DeletionTimestamp.IsZero() {
190190
active++
191191
}
192192
}
193193
if active > 0 {
194194
logger.Info("Deleting all Pods with labels", "filterLabels", filterLabels, "Number of active Pods", active)
195-
return active, pods, r.DeleteAllOf(ctx, &corev1.Pod{}, client.InNamespace(namespace), filterLabels)
195+
return pods, r.DeleteAllOf(ctx, &corev1.Pod{}, client.InNamespace(namespace), filterLabels)
196196
}
197-
return active, pods, nil
197+
return pods, nil
198198
}
199199

200200
func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, request ctrl.Request, instance *rayv1.RayCluster) (ctrl.Result, error) {
@@ -230,21 +230,21 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, request
230230
"DeletionTimestamp", instance.ObjectMeta.DeletionTimestamp)
231231

232232
// Delete the head Pod if it exists.
233-
numDeletedHeads, headPods, err := r.deleteAllPods(ctx, instance.Namespace, client.MatchingLabels{
233+
headPods, err := r.deleteAllPods(ctx, instance.Namespace, client.MatchingLabels{
234234
utils.RayClusterLabelKey: instance.Name,
235235
utils.RayNodeTypeLabelKey: string(rayv1.HeadNode),
236236
})
237237
if err != nil {
238238
return ctrl.Result{RequeueAfter: DefaultRequeueDuration}, err
239239
}
240240
// Delete all worker Pods if they exist.
241-
if _, _, err = r.deleteAllPods(ctx, instance.Namespace, client.MatchingLabels{
241+
if _, err = r.deleteAllPods(ctx, instance.Namespace, client.MatchingLabels{
242242
utils.RayClusterLabelKey: instance.Name,
243243
utils.RayNodeTypeLabelKey: string(rayv1.WorkerNode),
244244
}); err != nil {
245245
return ctrl.Result{RequeueAfter: DefaultRequeueDuration}, err
246246
}
247-
if numDeletedHeads > 0 {
247+
if len(headPods.Items) > 0 {
248248
logger.Info(fmt.Sprintf(
249249
"Wait for the head Pod %s to be terminated before initiating the Redis cleanup process. "+
250250
"The storage namespace %s in Redis cannot be fully deleted if the GCS process on the head Pod is still writing to it.",
@@ -632,7 +632,7 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
632632
// if RayCluster is suspended, delete all pods and skip reconcile
633633
if instance.Spec.Suspend != nil && *instance.Spec.Suspend {
634634
clusterLabel := client.MatchingLabels{utils.RayClusterLabelKey: instance.Name}
635-
if _, _, err := r.deleteAllPods(ctx, instance.Namespace, clusterLabel); err != nil {
635+
if _, err := r.deleteAllPods(ctx, instance.Namespace, clusterLabel); err != nil {
636636
return err
637637
}
638638

ray-operator/controllers/ray/raycluster_controller_fake_test.go

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2624,15 +2624,13 @@ func TestDeleteAllPods(t *testing.T) {
26242624
}
26252625
ctx := context.Background()
26262626
// The first `deleteAllPods` function call should delete the "alive" Pod.
2627-
active, pods, err := testRayClusterReconciler.deleteAllPods(ctx, ns, filter)
2627+
pods, err := testRayClusterReconciler.deleteAllPods(ctx, ns, filter)
26282628
assert.Nil(t, err)
2629-
assert.Equal(t, 1, active)
26302629
assert.Equal(t, 2, len(pods.Items))
26312630
assert.Subset(t, []string{"alive", "deleted"}, []string{pods.Items[0].Name, pods.Items[1].Name})
26322631
// The second `deleteAllPods` function call should delete no Pods because none are active.
2633-
active, pods, err = testRayClusterReconciler.deleteAllPods(ctx, ns, filter)
2632+
pods, err = testRayClusterReconciler.deleteAllPods(ctx, ns, filter)
26342633
assert.Nil(t, err)
2635-
assert.Equal(t, 0, active)
26362634
assert.Equal(t, 1, len(pods.Items))
26372635
assert.Equal(t, "deleted", pods.Items[0].Name)
26382636
// Make sure that the above `deleteAllPods` calls didn't remove other Pods.

0 commit comments

Comments
 (0)