Skip to content

Commit 256a576

Browse files
committed
Add featuregate, feature and unit and e2e tests
Signed-off-by: lauralorenz <[email protected]>
Signed-off-by: Laura Lorenz <[email protected]>
1 parent 2261137 commit 256a576

File tree

8 files changed

+204
-32
lines changed

8 files changed

+204
-32
lines changed

pkg/features/kube_features.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -565,6 +565,13 @@ const (
565565
// Allows recursive read-only mounts.
566566
RecursiveReadOnlyMounts featuregate.Feature = "RecursiveReadOnlyMounts"
567567

568+
// owner: @lauralorenz
569+
// kep: https://kep.k8s.io/4603
570+
//
571+
// Enables support for a lower internal cluster-wide backoff maximum for restarting
572+
// containers (aka containers in CrashLoopBackOff)
573+
ReduceDefaultCrashLoopBackOffDecay featuregate.Feature = "ReduceDefaultCrashLoopBackOffDecay"
574+
568575
// owner: @adrianmoisey
569576
// kep: https://kep.k8s.io/4427
570577
//

pkg/features/versioned_kube_features.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -622,6 +622,10 @@ var defaultVersionedKubernetesFeatureGates = map[featuregate.Feature]featuregate
622622
{Version: version.MustParse("1.33"), Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // remove in 1.36
623623
},
624624

625+
ReduceDefaultCrashLoopBackOffDecay: {
626+
{Version: version.MustParse("1.33"), Default: false, PreRelease: featuregate.Alpha},
627+
},
628+
625629
RelaxedDNSSearchValidation: {
626630
{Version: version.MustParse("1.32"), Default: false, PreRelease: featuregate.Alpha},
627631
{Version: version.MustParse("1.33"), Default: true, PreRelease: featuregate.Beta},

pkg/kubelet/kubelet.go

Lines changed: 42 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -153,8 +153,19 @@ const (
153153
// DefaultContainerLogsDir is the location of container logs.
154154
DefaultContainerLogsDir = "/var/log/containers"
155155

156-
// MaxContainerBackOff is the max backoff period for container restarts, exported for the e2e test
157-
MaxContainerBackOff = v1beta1.MaxContainerBackOff
156+
// MaxCrashLoopBackOff is the max backoff period for container restarts, exported for the e2e test
157+
MaxCrashLoopBackOff = v1beta1.MaxContainerBackOff
158+
159+
// reducedMaxCrashLoopBackOff is the default max backoff period for container restarts when the alpha feature
160+
// gate ReduceDefaultCrashLoopBackOffDecay is enabled
161+
reducedMaxCrashLoopBackOff = 60 * time.Second
162+
163+
// Initial period for the exponential backoff for container restarts.
164+
initialCrashLoopBackOff = time.Second * 10
165+
166+
// reducedInitialCrashLoopBackOff is the default initial backoff period for container restarts when the alpha feature
167+
// gate ReduceDefaultCrashLoopBackOffDecay is enabled
168+
reducedInitialCrashLoopBackOff = 1 * time.Second
158169

159170
// MaxImageBackOff is the max backoff period for image pulls, exported for the e2e test
160171
MaxImageBackOff = 300 * time.Second
@@ -204,9 +215,6 @@ const (
204215
// error.
205216
backOffPeriod = time.Second * 10
206217

207-
// Initial period for the exponential backoff for container restarts.
208-
containerBackOffPeriod = time.Second * 10
209-
210218
// Initial period for the exponential backoff for image pulls.
211219
imageBackOffPeriod = time.Second * 10
212220

@@ -320,6 +328,27 @@ type Dependencies struct {
320328
useLegacyCadvisorStats bool
321329
}
322330

331+
// newCrashLoopBackOff configures the backoff maximum to be used
332+
// by kubelet for container restarts depending on the alpha gates
333+
// and kubelet configuration set
334+
func newCrashLoopBackOff(kubeCfg *kubeletconfiginternal.KubeletConfiguration) (time.Duration, time.Duration) {
335+
boMax := MaxCrashLoopBackOff
336+
boInitial := initialCrashLoopBackOff
337+
if utilfeature.DefaultFeatureGate.Enabled(features.ReduceDefaultCrashLoopBackOffDecay) {
338+
boMax = reducedMaxCrashLoopBackOff
339+
boInitial = reducedInitialCrashLoopBackOff
340+
}
341+
342+
if utilfeature.DefaultFeatureGate.Enabled(features.KubeletCrashLoopBackOffMax) {
343+
// operator-invoked configuration always has precedence if valid
344+
boMax = kubeCfg.CrashLoopBackOff.MaxContainerRestartPeriod.Duration
345+
if boMax < boInitial {
346+
boInitial = boMax
347+
}
348+
}
349+
return boMax, boInitial
350+
}
351+
323352
// makePodSourceConfig creates a config.PodConfig from the given
324353
// KubeletConfiguration or returns an error.
325354
func makePodSourceConfig(kubeCfg *kubeletconfiginternal.KubeletConfiguration, kubeDeps *Dependencies, nodeName types.NodeName, nodeHasSynced func() bool) (*config.PodConfig, error) {
@@ -936,16 +965,10 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
936965
kubeDeps.Recorder,
937966
volumepathhandler.NewBlockVolumePathHandler())
938967

939-
boMax := MaxContainerBackOff
940-
base := containerBackOffPeriod
941-
if utilfeature.DefaultFeatureGate.Enabled(features.KubeletCrashLoopBackOffMax) {
942-
boMax = kubeCfg.CrashLoopBackOff.MaxContainerRestartPeriod.Duration
943-
if boMax < containerBackOffPeriod {
944-
base = boMax
945-
}
946-
}
947-
klet.backOff = flowcontrol.NewBackOff(base, boMax)
948-
klet.backOff.HasExpiredFunc = func(eventTime time.Time, lastUpdate time.Time, maxDuration time.Duration) bool {
968+
boMax, base := newCrashLoopBackOff(kubeCfg)
969+
970+
klet.crashLoopBackOff = flowcontrol.NewBackOff(base, boMax)
971+
klet.crashLoopBackOff.HasExpiredFunc = func(eventTime time.Time, lastUpdate time.Time, maxDuration time.Duration) bool {
949972
return eventTime.Sub(lastUpdate) > 600*time.Second
950973
}
951974

@@ -1346,7 +1369,7 @@ type Kubelet struct {
13461369
syncLoopMonitor atomic.Value
13471370

13481371
// Container restart Backoff
1349-
backOff *flowcontrol.Backoff
1372+
crashLoopBackOff *flowcontrol.Backoff
13501373

13511374
// Information about the ports which are opened by daemons on Node running this Kubelet server.
13521375
daemonEndpoints *v1.NodeDaemonEndpoints
@@ -2033,7 +2056,7 @@ func (kl *Kubelet) SyncPod(ctx context.Context, updateType kubetypes.SyncPodType
20332056
// Use WithoutCancel instead of a new context.TODO() to propagate trace context
20342057
// Call the container runtime's SyncPod callback
20352058
sctx := context.WithoutCancel(ctx)
2036-
result := kl.containerRuntime.SyncPod(sctx, pod, podStatus, pullSecrets, kl.backOff)
2059+
result := kl.containerRuntime.SyncPod(sctx, pod, podStatus, pullSecrets, kl.crashLoopBackOff)
20372060
kl.reasonCache.Update(pod.UID, result)
20382061
if err := result.Error(); err != nil {
20392062
// Do not return error if the only failures were pods in backoff
@@ -2922,14 +2945,14 @@ func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod, podStatus *kubecontaine
29222945
for i, container := range pod.Spec.Containers {
29232946
if !apiequality.Semantic.DeepEqual(container.Resources, allocatedPod.Spec.Containers[i].Resources) {
29242947
key := kuberuntime.GetStableKey(pod, &container)
2925-
kl.backOff.Reset(key)
2948+
kl.crashLoopBackOff.Reset(key)
29262949
}
29272950
}
29282951
for i, container := range pod.Spec.InitContainers {
29292952
if podutil.IsRestartableInitContainer(&container) {
29302953
if !apiequality.Semantic.DeepEqual(container.Resources, allocatedPod.Spec.InitContainers[i].Resources) {
29312954
key := kuberuntime.GetStableKey(pod, &container)
2932-
kl.backOff.Reset(key)
2955+
kl.crashLoopBackOff.Reset(key)
29332956
}
29342957
}
29352958
}

pkg/kubelet/kubelet_pods.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1379,7 +1379,7 @@ func (kl *Kubelet) HandlePodCleanups(ctx context.Context) error {
13791379
}
13801380

13811381
// Cleanup any backoff entries.
1382-
kl.backOff.GC()
1382+
kl.crashLoopBackOff.GC()
13831383
return nil
13841384
}
13851385

pkg/kubelet/kubelet_test.go

Lines changed: 89 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ import (
5555
"k8s.io/client-go/kubernetes/fake"
5656
"k8s.io/client-go/tools/record"
5757
"k8s.io/client-go/util/flowcontrol"
58+
"k8s.io/component-base/featuregate"
5859
featuregatetesting "k8s.io/component-base/featuregate/testing"
5960
"k8s.io/component-base/metrics/testutil"
6061
internalapi "k8s.io/cri-api/pkg/apis"
@@ -337,8 +338,8 @@ func newTestKubeletWithImageList(
337338
kubelet.containerGC = containerGC
338339

339340
fakeClock := testingclock.NewFakeClock(time.Now())
340-
kubelet.backOff = flowcontrol.NewBackOff(time.Second, time.Minute)
341-
kubelet.backOff.Clock = fakeClock
341+
kubelet.crashLoopBackOff = flowcontrol.NewBackOff(time.Second, time.Minute)
342+
kubelet.crashLoopBackOff.Clock = fakeClock
342343
kubelet.resyncInterval = 10 * time.Second
343344
kubelet.workQueue = queue.NewBasicWorkQueue(fakeClock)
344345
// Relist period does not affect the tests.
@@ -2895,7 +2896,7 @@ func TestHandlePodResourcesResize(t *testing.T) {
28952896
now := kubelet.clock.Now()
28962897
// Put the container in backoff so we can confirm backoff is reset.
28972898
backoffKey := kuberuntime.GetStableKey(originalPod, originalCtr)
2898-
kubelet.backOff.Next(backoffKey, now)
2899+
kubelet.crashLoopBackOff.Next(backoffKey, now)
28992900

29002901
updatedPod, err := kubelet.handlePodResourcesResize(newPod, podStatus)
29012902
require.NoError(t, err)
@@ -2917,7 +2918,7 @@ func TestHandlePodResourcesResize(t *testing.T) {
29172918
resizeStatus := kubelet.statusManager.GetPodResizeStatus(newPod.UID)
29182919
assert.Equal(t, tt.expectedResize, resizeStatus)
29192920

2920-
isInBackoff := kubelet.backOff.IsInBackOffSince(backoffKey, now)
2921+
isInBackoff := kubelet.crashLoopBackOff.IsInBackOffSince(backoffKey, now)
29212922
if tt.expectBackoffReset {
29222923
assert.False(t, isInBackoff, "container backoff should be reset")
29232924
} else {
@@ -3403,7 +3404,7 @@ func TestSyncPodSpans(t *testing.T) {
34033404
kubelet.os,
34043405
kubelet,
34053406
nil,
3406-
kubelet.backOff,
3407+
kubelet.crashLoopBackOff,
34073408
kubeCfg.SerializeImagePulls,
34083409
kubeCfg.MaxParallelImagePulls,
34093410
float32(kubeCfg.RegistryPullQPS),
@@ -3933,3 +3934,86 @@ func TestIsPodResizeInProgress(t *testing.T) {
39333934
})
39343935
}
39353936
}
3937+
3938+
func TestCrashLoopBackOffConfiguration(t *testing.T) {
3939+
testCases := []struct {
3940+
name string
3941+
featureGates []featuregate.Feature
3942+
nodeDecay metav1.Duration
3943+
expectedInitial time.Duration
3944+
expectedMax time.Duration
3945+
}{
3946+
{
3947+
name: "Prior behavior",
3948+
expectedMax: time.Duration(300 * time.Second),
3949+
expectedInitial: time.Duration(10 * time.Second),
3950+
},
3951+
{
3952+
name: "New default only",
3953+
featureGates: []featuregate.Feature{features.ReduceDefaultCrashLoopBackOffDecay},
3954+
expectedMax: time.Duration(60 * time.Second),
3955+
expectedInitial: time.Duration(1 * time.Second),
3956+
},
3957+
{
3958+
name: "Faster per node config; only node config configured",
3959+
featureGates: []featuregate.Feature{features.KubeletCrashLoopBackOffMax},
3960+
nodeDecay: metav1.Duration{Duration: 2 * time.Second},
3961+
expectedMax: time.Duration(2 * time.Second),
3962+
expectedInitial: time.Duration(2 * time.Second),
3963+
},
3964+
{
3965+
name: "Faster per node config; new default and node config configured",
3966+
featureGates: []featuregate.Feature{features.KubeletCrashLoopBackOffMax, features.ReduceDefaultCrashLoopBackOffDecay},
3967+
nodeDecay: metav1.Duration{Duration: 2 * time.Second},
3968+
expectedMax: time.Duration(2 * time.Second),
3969+
expectedInitial: time.Duration(1 * time.Second),
3970+
},
3971+
{
3972+
name: "Slower per node config; new default and node config configured, set A",
3973+
featureGates: []featuregate.Feature{features.KubeletCrashLoopBackOffMax, features.ReduceDefaultCrashLoopBackOffDecay},
3974+
nodeDecay: metav1.Duration{Duration: 10 * time.Second},
3975+
expectedMax: time.Duration(10 * time.Second),
3976+
expectedInitial: time.Duration(1 * time.Second),
3977+
},
3978+
{
3979+
name: "Slower per node config; new default and node config configured, set B",
3980+
featureGates: []featuregate.Feature{features.KubeletCrashLoopBackOffMax, features.ReduceDefaultCrashLoopBackOffDecay},
3981+
nodeDecay: metav1.Duration{Duration: 300 * time.Second},
3982+
expectedMax: time.Duration(300 * time.Second),
3983+
expectedInitial: time.Duration(1 * time.Second),
3984+
},
3985+
{
3986+
name: "Slower per node config; only node config configured, set A",
3987+
featureGates: []featuregate.Feature{features.KubeletCrashLoopBackOffMax},
3988+
nodeDecay: metav1.Duration{Duration: 11 * time.Second},
3989+
expectedMax: time.Duration(11 * time.Second),
3990+
expectedInitial: time.Duration(10 * time.Second),
3991+
},
3992+
{
3993+
name: "Slower per node config; only node config configured, set B",
3994+
featureGates: []featuregate.Feature{features.KubeletCrashLoopBackOffMax},
3995+
nodeDecay: metav1.Duration{Duration: 300 * time.Second},
3996+
expectedMax: time.Duration(300 * time.Second),
3997+
expectedInitial: time.Duration(10 * time.Second),
3998+
},
3999+
}
4000+
4001+
for _, tc := range testCases {
4002+
t.Run(tc.name, func(t *testing.T) {
4003+
kubeCfg := &kubeletconfiginternal.KubeletConfiguration{}
4004+
4005+
for _, f := range tc.featureGates {
4006+
featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, f, true)
4007+
}
4008+
if tc.nodeDecay.Duration > 0 {
4009+
kubeCfg.CrashLoopBackOff.MaxContainerRestartPeriod = &tc.nodeDecay
4010+
}
4011+
4012+
resultMax, resultInitial := newCrashLoopBackOff(kubeCfg)
4013+
4014+
assert.Equalf(t, tc.expectedMax, resultMax, "wrong max calculated, want: %v, got %v", tc.expectedMax, resultMax)
4015+
assert.Equalf(t, tc.expectedInitial, resultInitial, "wrong base calculated, want: %v, got %v", tc.expectedInitial, resultInitial)
4016+
})
4017+
}
4018+
4019+
}

test/e2e/common/node/pods.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ import (
6464
const (
6565
buildBackOffDuration = time.Minute
6666
syncLoopFrequency = 10 * time.Second
67-
maxBackOffTolerance = time.Duration(1.3 * float64(kubelet.MaxContainerBackOff))
67+
maxBackOffTolerance = time.Duration(1.3 * float64(kubelet.MaxCrashLoopBackOff))
6868
podRetryPeriod = 1 * time.Second
6969
)
7070

@@ -739,7 +739,7 @@ var _ = SIGDescribe("Pods", func() {
739739
})
740740

741741
podClient.CreateSync(ctx, pod)
742-
time.Sleep(2 * kubelet.MaxContainerBackOff) // it takes slightly more than 2*x to get to a back-off of x
742+
time.Sleep(2 * kubelet.MaxCrashLoopBackOff) // it takes slightly more than 2*x to get to a back-off of x
743743

744744
// wait for a delay == capped delay of MaxContainerBackOff
745745
ginkgo.By("getting restart delay when capped")
@@ -753,13 +753,13 @@ var _ = SIGDescribe("Pods", func() {
753753
framework.Failf("timed out waiting for container restart in pod=%s/%s", podName, containerName)
754754
}
755755

756-
if delay1 < kubelet.MaxContainerBackOff {
756+
if delay1 < kubelet.MaxCrashLoopBackOff {
757757
continue
758758
}
759759
}
760760

761-
if (delay1 < kubelet.MaxContainerBackOff) || (delay1 > maxBackOffTolerance) {
762-
framework.Failf("expected %s back-off got=%s in delay1", kubelet.MaxContainerBackOff, delay1)
761+
if (delay1 < kubelet.MaxCrashLoopBackOff) || (delay1 > maxBackOffTolerance) {
762+
framework.Failf("expected %s back-off got=%s in delay1", kubelet.MaxCrashLoopBackOff, delay1)
763763
}
764764

765765
ginkgo.By("getting restart delay after a capped delay")
@@ -768,8 +768,8 @@ var _ = SIGDescribe("Pods", func() {
768768
framework.Failf("timed out waiting for container restart in pod=%s/%s", podName, containerName)
769769
}
770770

771-
if delay2 < kubelet.MaxContainerBackOff || delay2 > maxBackOffTolerance { // syncloop cumulative drift
772-
framework.Failf("expected %s back-off got=%s on delay2", kubelet.MaxContainerBackOff, delay2)
771+
if delay2 < kubelet.MaxCrashLoopBackOff || delay2 > maxBackOffTolerance { // syncloop cumulative drift
772+
framework.Failf("expected %s back-off got=%s on delay2", kubelet.MaxCrashLoopBackOff, delay2)
773773
}
774774
})
775775

test/e2e_node/container_restart_test.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,54 @@ var _ = SIGDescribe("Container Restart", feature.CriProxy, framework.WithSerial(
8989
doTest(ctx, f, 3, containerName, 13)
9090
})
9191
})
92+
93+
ginkgo.Context("Reduced default container restart backs off as expected", func() {
94+
95+
tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
96+
initialConfig.FeatureGates = map[string]bool{"ReduceDefaultCrashLoopBackOffDecay": true}
97+
})
98+
99+
ginkgo.BeforeEach(func() {
100+
if err := resetCRIProxyInjector(e2eCriProxy); err != nil {
101+
ginkgo.Skip("Skip the test since the CRI Proxy is undefined.")
102+
}
103+
})
104+
105+
ginkgo.AfterEach(func() {
106+
err := resetCRIProxyInjector(e2eCriProxy)
107+
framework.ExpectNoError(err)
108+
})
109+
110+
ginkgo.It("Reduced default restart backs off.", func(ctx context.Context) {
111+
// 0s, 0s, 10s, 30s, 60s, 90s, 120s, 150s, 180s, 210s, 240s, 270s, 300s
112+
doTest(ctx, f, 3, containerName, 13)
113+
})
114+
})
115+
116+
ginkgo.Context("Lower node config container restart takes precedence", func() {
117+
118+
tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
119+
initialConfig.FeatureGates = map[string]bool{"ReduceDefaultCrashLoopBackOffDecay": true}
120+
initialConfig.CrashLoopBackOff.MaxContainerRestartPeriod = &metav1.Duration{Duration: time.Duration(1 * time.Second)}
121+
initialConfig.FeatureGates = map[string]bool{"KubeletCrashLoopBackOffMax": true}
122+
})
123+
124+
ginkgo.BeforeEach(func() {
125+
if err := resetCRIProxyInjector(e2eCriProxy); err != nil {
126+
ginkgo.Skip("Skip the test since the CRI Proxy is undefined.")
127+
}
128+
})
129+
130+
ginkgo.AfterEach(func() {
131+
err := resetCRIProxyInjector(e2eCriProxy)
132+
framework.ExpectNoError(err)
133+
})
134+
135+
ginkgo.It("Reduced default restart backs off.", func(ctx context.Context) {
136+
// 0s, 0s, 1s, 2s, 3s, 4s, 5s, 6s, 7s, and so on
137+
doTest(ctx, f, 3, containerName, 298)
138+
})
139+
})
92140
})
93141

94142
func doTest(ctx context.Context, f *framework.Framework, targetRestarts int, containerName string, maxRestarts int) {

test/featuregates_linter/test_data/versioned_feature_list.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1048,6 +1048,12 @@
10481048
lockToDefault: true
10491049
preRelease: GA
10501050
version: "1.33"
1051+
- name: ReduceDefaultCrashLoopBackOffDecay
1052+
versionedSpecs:
1053+
- default: false
1054+
lockToDefault: false
1055+
preRelease: Alpha
1056+
version: "1.33"
10511057
- name: RelaxedDNSSearchValidation
10521058
versionedSpecs:
10531059
- default: false

0 commit comments

Comments (0)