Skip to content

Commit bc662b9

Browse files
authored
Merge pull request #588 from brownleej/buggify-crash-loop
Add an option to put pods in a crash loop state for testing purposes
2 parents 3a22f5f + 57dafbe commit bc662b9

14 files changed

+434
-15
lines changed

api/v1beta1/foundationdbcluster_types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2427,6 +2427,10 @@ type RequiredAddressSet struct {
24272427
type BuggifyConfig struct {
24282428
// NoSchedule defines a list of instance IDs that should fail to schedule.
24292429
NoSchedule []string `json:"noSchedule,omitempty"`
2430+
2431+
// CrashLoop defines a list of instance IDs that should be put into a
2432+
// crash looping state.
2433+
CrashLoop []string `json:"crashLoop,omitempty"`
24302434
}
24312435

24322436
// FdbVersion represents a version of FoundationDB.

api/v1beta1/zz_generated.deepcopy.go

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/apps.foundationdb.org_foundationdbclusters.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,10 @@ spec:
6868
type: boolean
6969
buggify:
7070
properties:
71+
crashLoop:
72+
items:
73+
type: string
74+
type: array
7175
noSchedule:
7276
items:
7377
type: string

config/samples/multi_dc/final.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,19 @@ spec:
3030
- id: dc1
3131
priority: 1
3232
- id: dc3
33+
satellite: 1
34+
priority: 2
35+
- id: dc2
3336
satellite: 1
3437
priority: 1
3538
satellite_logs: 3
3639
- datacenters:
3740
- id: dc2
3841
priority: 0
3942
- id: dc3
43+
satellite: 1
44+
priority: 2
45+
- id: dc1
4046
satellite: 1
4147
priority: 1
4248
satellite_logs: 3

config/samples/multi_dc/stage_1.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ spec:
2727
regions:
2828
- datacenters:
2929
- id: $dc
30-
priority: 1
30+
priority: 1
3131
processes:
3232
general:
3333
volumeClaimTemplate:

controllers/cluster_controller.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ func (r *FoundationDBClusterReconciler) Reconcile(ctx context.Context, request c
138138
CheckClientCompatibility{},
139139
ReplaceMisconfiguredPods{},
140140
ReplaceFailedPods{},
141+
DeletePodsForBuggification{},
141142
AddProcessGroups{},
142143
AddServices{},
143144
AddPVCs{},
@@ -839,7 +840,7 @@ type PodLifecycleManager interface {
839840
CanDeletePods(*FoundationDBClusterReconciler, ctx.Context, *fdbtypes.FoundationDBCluster) (bool, error)
840841

841842
// UpdatePods updates a list of pods to match the latest specs.
842-
UpdatePods(*FoundationDBClusterReconciler, ctx.Context, *fdbtypes.FoundationDBCluster, []FdbInstance) error
843+
UpdatePods(reconciler *FoundationDBClusterReconciler, context ctx.Context, cluster *fdbtypes.FoundationDBCluster, instances []FdbInstance, unsafe bool) error
843844

844845
// UpdateImageVersion updates a container's image.
845846
UpdateImageVersion(*FoundationDBClusterReconciler, ctx.Context, *fdbtypes.FoundationDBCluster, FdbInstance, int, string) error
@@ -969,7 +970,7 @@ func (manager StandardPodLifecycleManager) CanDeletePods(r *FoundationDBClusterR
969970
}
970971

971972
// UpdatePods updates a list of pods to match the latest specs.
972-
func (manager StandardPodLifecycleManager) UpdatePods(r *FoundationDBClusterReconciler, context ctx.Context, cluster *fdbtypes.FoundationDBCluster, instances []FdbInstance) error {
973+
func (manager StandardPodLifecycleManager) UpdatePods(r *FoundationDBClusterReconciler, context ctx.Context, cluster *fdbtypes.FoundationDBCluster, instances []FdbInstance, unsafe bool) error {
973974
for _, instance := range instances {
974975
err := r.Delete(context, instance.Pod)
975976
if err != nil {

controllers/cluster_controller_test.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,28 @@ var _ = Describe(string(fdbtypes.ProcessClassClusterController), func() {
257257
})
258258
})
259259

260+
Context("when buggifying a pod to make it crash loop", func() {
	BeforeEach(func() {
		// Request a crash loop for a single storage instance and persist the
		// change so the reconciler picks it up.
		cluster.Spec.Buggify.CrashLoop = []string{"storage-1"}
		err = k8sClient.Update(context.TODO(), cluster)
		Expect(err).NotTo(HaveOccurred())
	})

	It("should add the crash loop flag", func() {
		pods := &corev1.PodList{}
		err = k8sClient.List(context.TODO(), pods, getListOptions(cluster)...)
		// Fail on a list error directly instead of letting it surface as a
		// misleading pod-count mismatch below.
		Expect(err).NotTo(HaveOccurred())
		Expect(len(pods.Items)).To(Equal(len(originalPods.Items)))
		sortPodsByID(pods)

		pod := pods.Items[firstStorageIndex]
		Expect(pod.ObjectMeta.Labels[FDBInstanceIDLabel]).To(Equal("storage-1"))

		// The buggified pod is expected to run the main container with the
		// single "crash-loop" argument.
		mainContainer := pod.Spec.Containers[0]
		Expect(mainContainer.Name).To(Equal("foundationdb"))
		Expect(mainContainer.Args).To(Equal([]string{"crash-loop"}))
	})
})
281+
260282
Context("with a decreased process count", func() {
261283
BeforeEach(func() {
262284
cluster.Spec.ProcessCounts.Storage = 3
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
/*
2+
* delete_pods_for_buggification.go
3+
*
4+
* This source file is part of the FoundationDB open source project
5+
*
6+
* Copyright 2021 Apple Inc. and the FoundationDB project authors
7+
*
8+
* Licensed under the Apache License, Version 2.0 (the "License");
9+
* you may not use this file except in compliance with the License.
10+
* You may obtain a copy of the License at
11+
*
12+
* http://www.apache.org/licenses/LICENSE-2.0
13+
*
14+
* Unless required by applicable law or agreed to in writing, software
15+
* distributed under the License is distributed on an "AS IS" BASIS,
16+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
* See the License for the specific language governing permissions and
18+
* limitations under the License.
19+
*/
20+
21+
package controllers
22+
23+
import (
24+
ctx "context"
25+
"time"
26+
27+
fdbtypes "github.com/FoundationDB/fdb-kubernetes-operator/api/v1beta1"
28+
)
29+
30+
// DeletePodsForBuggification provides a reconciliation step for recreating
31+
// pods whose crash-loop state does not match the cluster's buggify config.
32+
type DeletePodsForBuggification struct{}
33+
34+
// Reconcile runs the reconciler's work.
35+
func (d DeletePodsForBuggification) Reconcile(r *FoundationDBClusterReconciler, context ctx.Context, cluster *fdbtypes.FoundationDBCluster) (bool, error) {
36+
instances, err := r.PodLifecycleManager.GetInstances(r, cluster, context, getPodListOptions(cluster, "", "")...)
37+
if err != nil {
38+
return false, err
39+
}
40+
41+
updates := make([]FdbInstance, 0)
42+
43+
removals := make(map[string]bool)
44+
for _, processGroup := range cluster.Status.ProcessGroups {
45+
if processGroup.Remove {
46+
removals[processGroup.ProcessGroupID] = true
47+
}
48+
}
49+
50+
crashLoopPods := make(map[string]bool, len(cluster.Spec.Buggify.CrashLoop))
51+
crashLoopAll := false
52+
for _, instanceID := range cluster.Spec.Buggify.CrashLoop {
53+
if instanceID == "*" {
54+
crashLoopAll = true
55+
} else {
56+
crashLoopPods[instanceID] = true
57+
}
58+
}
59+
60+
for _, instance := range instances {
61+
if instance.Pod == nil {
62+
continue
63+
}
64+
65+
instanceID := instance.GetInstanceID()
66+
_, pendingRemoval := removals[instanceID]
67+
if pendingRemoval {
68+
continue
69+
}
70+
71+
inCrashLoop := false
72+
for _, container := range instance.Pod.Spec.Containers {
73+
if container.Name == "foundationdb" && len(container.Args) > 0 {
74+
inCrashLoop = container.Args[0] == "crash-loop"
75+
}
76+
}
77+
78+
shouldCrashLoop := crashLoopAll || crashLoopPods[instanceID]
79+
80+
if shouldCrashLoop != inCrashLoop {
81+
log.Info("Deleting pod for buggification", "instanceID", instanceID, "shouldCrashLoop", shouldCrashLoop, "inCrashLoop", inCrashLoop)
82+
updates = append(updates, instance)
83+
}
84+
}
85+
86+
if len(updates) > 0 {
87+
log.Info("Deleting pods", "namespace", cluster.Namespace, "cluster", cluster.Name, "count", len(updates))
88+
r.Recorder.Event(cluster, "Normal", "UpdatingPods", "Recreating pods for buggification")
89+
90+
err = r.PodLifecycleManager.UpdatePods(r, context, cluster, updates, true)
91+
if err != nil {
92+
return false, err
93+
}
94+
95+
return false, nil
96+
}
97+
return true, nil
98+
}
99+
100+
// RequeueAfter returns the delay before we should run the reconciliation
101+
// again.
102+
func (d DeletePodsForBuggification) RequeueAfter() time.Duration {
103+
return 0
104+
}

0 commit comments

Comments
 (0)