
Commit 950781a

kannon92 authored and bitoku committed
add e2e tests for split filesystem
Signed-off-by: Ayato Tokubi <[email protected]>
1 parent: e328c8f · commit: 950781a

2 files changed: +318 −0 lines changed


test/e2e/nodefeature/nodefeature.go

Lines changed: 3 additions & 0 deletions
@@ -104,6 +104,9 @@ var (
 	// TODO: document the feature (owning SIG, when to use this feature for a test)
 	SidecarContainers = framework.WithNodeFeature(framework.ValidNodeFeatures.Add("SidecarContainers"))
 
+	// Sig-node: add e2e tests for KEP-4191
+	KubeletSeparateDiskGC = framework.WithNodeFeature(framework.ValidNodeFeatures.Add("KubeletSeparateDiskGC"))
+
 	// TODO: document the feature (owning SIG, when to use this feature for a test)
 	SystemNodeCriticalPod = framework.WithNodeFeature(framework.ValidNodeFeatures.Add("SystemNodeCriticalPod"))

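For context (not part of the diff): a label created with framework.WithNodeFeature is attached to a spec so test runs can select or skip it by node feature, and the new test file below attaches the KubeletSeparateDiskGC label exactly this way. A minimal sketch of the pattern, with a placeholder spec name:

// Sketch only; "SomeSplitDiskSpec" is a placeholder. The real usage is in
// test/e2e_node/split_disk_test.go below.
var _ = SIGDescribe("SomeSplitDiskSpec", nodefeature.KubeletSeparateDiskGC, func() {
    // Specs defined here only run when the job targets nodes that
    // advertise the KubeletSeparateDiskGC node feature.
})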
test/e2e_node/split_disk_test.go

Lines changed: 315 additions & 0 deletions
@@ -0,0 +1,315 @@
/*
Copyright 2024 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2enode

import (
    "context"
    "fmt"
    "k8s.io/kubernetes/pkg/features"
    e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
    "os/exec"
    "path/filepath"
    "strings"
    "time"

    v1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
    kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
    evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
    kubeletmetrics "k8s.io/kubernetes/pkg/kubelet/metrics"
    "k8s.io/kubernetes/test/e2e/framework"
    e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
    "k8s.io/kubernetes/test/e2e/nodefeature"
    imageutils "k8s.io/kubernetes/test/utils/image"
    admissionapi "k8s.io/pod-security-admission/api"

    "github.com/onsi/ginkgo/v2"
    "github.com/onsi/gomega"
)

var _ = SIGDescribe("KubeletSeparateDiskGC", nodefeature.KubeletSeparateDiskGC, func() {
    f := framework.NewDefaultFramework("split-disk-test")
    f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
    pressureTimeout := 10 * time.Minute
    expectedNodeCondition := v1.NodeDiskPressure

    ginkgo.BeforeEach(func(ctx context.Context) {
        e2eskipper.SkipUnlessFeatureGateEnabled(features.KubeletSeparateDiskGC)
        if !hasSplitFileSystem(ctx) {
            ginkgo.Skip("it doesn't have split filesystem")
        }
    })

    f.It("should display different stats for imageFs and containerFs", func(ctx context.Context) {
        summary := eventuallyGetSummary(ctx)
        gomega.Expect(summary.Node.Fs.AvailableBytes).ToNot(gomega.Equal(summary.Node.Runtime.ImageFs.AvailableBytes))
        gomega.Expect(summary.Node.Fs.CapacityBytes).ToNot(gomega.Equal(summary.Node.Runtime.ImageFs.CapacityBytes))
        // Node.Fs represents rootfs where /var/lib/kubelet is located.
        // Since graphroot is left as the default in storage.conf, it will use the same filesystem location as rootfs.
        // Therefore, Node.Fs should be the same as Runtime.ContainerFs.
        gomega.Expect(summary.Node.Fs.AvailableBytes).To(gomega.Equal(summary.Node.Runtime.ContainerFs.AvailableBytes))
        gomega.Expect(summary.Node.Fs.CapacityBytes).To(gomega.Equal(summary.Node.Runtime.ContainerFs.CapacityBytes))
    })

    f.Context("when there is disk pressure", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), func() {
        f.Context("on imageFs", func() {
            tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
                initialConfig.EvictionHard = map[string]string{
                    string(evictionapi.SignalNodeFsAvailable):      "30%",
                    string(evictionapi.SignalContainerFsAvailable): "30%",
                    string(evictionapi.SignalImageFsAvailable):     "30%",
                }
                initialConfig.EvictionMinimumReclaim = map[string]string{}
                ginkgo.By(fmt.Sprintf("EvictionHard %s", initialConfig.EvictionHard))
            })

            runImageFsPressureTest(f, pressureTimeout, expectedNodeCondition, logDiskMetrics, []podEvictSpec{
                {
                    evictionPriority: 1,
                    pod:              innocentPod(),
                },
            })
        })

        f.Context("on containerFs", func() {
            expectedStarvedResource := v1.ResourceEphemeralStorage
            diskTestInMb := 5000

            tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
                initialConfig.EvictionHard = map[string]string{
                    string(evictionapi.SignalNodeFsAvailable):  "30%",
                    string(evictionapi.SignalImageFsAvailable): "30%",
                }
                initialConfig.EvictionMinimumReclaim = map[string]string{}
                ginkgo.By(fmt.Sprintf("EvictionHard %s", initialConfig.EvictionHard))
            })
            runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logDiskMetrics, []podEvictSpec{
                {
                    // This pod should exceed disk capacity on nodeFs since it writes a lot to writeable layer.
                    evictionPriority: 1,
                    pod: diskConsumingPod("container-emptydir-disk-limit", diskTestInMb, nil,
                        v1.ResourceRequirements{}),
                },
            })
        })
    })
})

// runImageFsPressureTest tests are similar to eviction tests but will skip the checks on eviction itself,
// as we want to induce disk pressure on the imageFs filesystem.
func runImageFsPressureTest(f *framework.Framework, pressureTimeout time.Duration, expectedNodeCondition v1.NodeConditionType, logFunc func(ctx context.Context), testSpecs []podEvictSpec) {
    // Place the remainder of the test within a context so that the kubelet config is set before and after the test.
    ginkgo.Context("", func() {
        ginkgo.BeforeEach(func(ctx context.Context) {
            // Reduce memory usage in the allocatable cgroup to ensure we do not have MemoryPressure.
            reduceAllocatableMemoryUsageIfCgroupv1()
            // Nodes do not immediately report local storage capacity,
            // so wait a little to allow pods requesting local storage to be scheduled.
            time.Sleep(30 * time.Second)
            ginkgo.By("setting up pods to be used by tests")
            pods := []*v1.Pod{}
            for _, spec := range testSpecs {
                pods = append(pods, spec.pod)
            }
            e2epod.NewPodClient(f).CreateBatch(ctx, pods)
        })

        ginkgo.It("should evict all of the correct pods", func(ctx context.Context) {
            _, is, err := getCRIClient()
            framework.ExpectNoError(err)
            resp, err := is.ImageFsInfo(ctx)
            framework.ExpectNoError(err)
            gomega.Expect(resp.ImageFilesystems).NotTo(gomega.BeEmpty())
            gomega.Expect(resp.ImageFilesystems[0].FsId).NotTo(gomega.BeNil())
            diskToPressure := filepath.Dir(resp.ImageFilesystems[0].FsId.Mountpoint)
            ginkgo.By(fmt.Sprintf("Got imageFs directory: %s", diskToPressure))
            imagesLenBeforeGC := 1
            sizeOfPressure := "8000"
            gomega.Eventually(ctx, func(ctx context.Context) error {
                images, err := is.ListImages(ctx, &runtimeapi.ImageFilter{})
                imagesLenBeforeGC = len(images)
                return err
            }, 1*time.Minute, evictionPollInterval).Should(gomega.Succeed())
            ginkgo.By(fmt.Sprintf("Number of images found before GC was %d", imagesLenBeforeGC))
            ginkgo.By(fmt.Sprintf("Induce disk pressure on %s with size %s", diskToPressure, sizeOfPressure))
            gomega.Expect(runDDOnFilesystem(diskToPressure, sizeOfPressure)).Should(gomega.Succeed())
            ginkgo.By(fmt.Sprintf("Waiting for node to have NodeCondition: %s", expectedNodeCondition))

            gomega.Eventually(ctx, func(ctx context.Context) error {
                logFunc(ctx)
                if expectedNodeCondition == noPressure || hasNodeCondition(ctx, f, expectedNodeCondition) {
                    return nil
                }
                return fmt.Errorf("NodeCondition: %s not encountered", expectedNodeCondition)
            }, pressureTimeout, evictionPollInterval).Should(gomega.BeNil())

            ginkgo.By("Waiting for evictions to occur")
            gomega.Eventually(ctx, func(ctx context.Context) error {
                if expectedNodeCondition != noPressure {
                    if hasNodeCondition(ctx, f, expectedNodeCondition) {
                        framework.Logf("Node has condition: %s", expectedNodeCondition)
                    } else {
                        framework.Logf("Node does NOT have condition: %s", expectedNodeCondition)
                    }
                }
                logKubeletLatencyMetrics(ctx, kubeletmetrics.EvictionStatsAgeKey)
                logFunc(ctx)
                return verifyEvictionOrdering(ctx, f, testSpecs)
            }, pressureTimeout, evictionPollInterval).Should(gomega.Succeed())

            ginkgo.By("checking for the expected pod conditions for evicted pods")
            verifyPodConditions(ctx, f, testSpecs)

            gomega.Eventually(ctx, func(ctx context.Context) error {
                images, err := is.ListImages(ctx, &runtimeapi.ImageFilter{})
                if err != nil {
                    return err
                }
                imagesLenAfterGC := len(images)
                if imagesLenAfterGC < imagesLenBeforeGC {
                    return nil
                }
                return fmt.Errorf("garbage collection of images should have occurred. before: %d after: %d", imagesLenBeforeGC, imagesLenAfterGC)
            }, pressureTimeout, evictionPollInterval).Should(gomega.Succeed())

            gomega.Expect(removeDiskPressure(diskToPressure)).Should(gomega.Succeed(), "removing disk pressure should not fail")

            ginkgo.By("making sure pressure from test has surfaced before continuing")

            ginkgo.By(fmt.Sprintf("Waiting for NodeCondition: %s to no longer exist on the node", expectedNodeCondition))
            gomega.Eventually(ctx, func(ctx context.Context) error {
                logFunc(ctx)
                logKubeletLatencyMetrics(ctx, kubeletmetrics.EvictionStatsAgeKey)
                if expectedNodeCondition != noPressure && hasNodeCondition(ctx, f, expectedNodeCondition) {
                    return fmt.Errorf("conditions haven't returned to normal, node still has: %s", expectedNodeCondition)
                }
                return nil
            }, pressureTimeout, evictionPollInterval).Should(gomega.BeNil())

            ginkgo.By("checking for stable, pressure-free condition without unexpected pod failures")
            gomega.Consistently(ctx, func(ctx context.Context) error {
                if expectedNodeCondition != noPressure && hasNodeCondition(ctx, f, expectedNodeCondition) {
                    return fmt.Errorf("condition %s disappeared and then reappeared", expectedNodeCondition)
                }
                logFunc(ctx)
                logKubeletLatencyMetrics(ctx, kubeletmetrics.EvictionStatsAgeKey)
                return verifyEvictionOrdering(ctx, f, testSpecs)
            }, postTestConditionMonitoringPeriod, evictionPollInterval).Should(gomega.Succeed())
        })

        ginkgo.AfterEach(func(ctx context.Context) {
            prePullImagesIfNecessary := func() {
                if expectedNodeCondition == v1.NodeDiskPressure && framework.TestContext.PrepullImages {
                    // The disk eviction test may cause the pre-pulled images to be evicted,
                    // so pre-pull those images again to ensure this test does not affect subsequent tests.
                    err := PrePullAllImages()
                    framework.ExpectNoError(err)
                }
            }
            // Run pre-pull for images using a `defer` to ensure that images are pulled even when the subsequent assertions fail.
            defer prePullImagesIfNecessary()

            ginkgo.By("deleting pods")
            for _, spec := range testSpecs {
                ginkgo.By(fmt.Sprintf("deleting pod: %s", spec.pod.Name))
                e2epod.NewPodClient(f).DeleteSync(ctx, spec.pod.Name, metav1.DeleteOptions{}, 10*time.Minute)
            }

            // In case a test fails before verifying that the NodeCondition no longer exists on the node,
            // we should wait for the NodeCondition to disappear.
            ginkgo.By(fmt.Sprintf("making sure NodeCondition %s no longer exists on the node", expectedNodeCondition))
            gomega.Eventually(ctx, func(ctx context.Context) error {
                if expectedNodeCondition != noPressure && hasNodeCondition(ctx, f, expectedNodeCondition) {
                    return fmt.Errorf("conditions haven't returned to normal, node still has: %s", expectedNodeCondition)
                }
                return nil
            }, pressureDisappearTimeout, evictionPollInterval).Should(gomega.BeNil())

            reduceAllocatableMemoryUsageIfCgroupv1()
            ginkgo.By("making sure we have all the required images for testing")
            prePullImagesIfNecessary()

            // Ensure that the NodeCondition hasn't returned after pulling images.
            ginkgo.By(fmt.Sprintf("making sure NodeCondition %s doesn't exist again after pulling images", expectedNodeCondition))
            gomega.Eventually(ctx, func(ctx context.Context) error {
                if expectedNodeCondition != noPressure && hasNodeCondition(ctx, f, expectedNodeCondition) {
                    return fmt.Errorf("conditions haven't returned to normal, node still has: %s", expectedNodeCondition)
                }
                return nil
            }, pressureDisappearTimeout, evictionPollInterval).Should(gomega.BeNil())

            ginkgo.By("making sure we can start a new pod after the test")
            podName := "test-admit-pod"
            e2epod.NewPodClient(f).CreateSync(ctx, &v1.Pod{
                ObjectMeta: metav1.ObjectMeta{
                    Name: podName,
                },
                Spec: v1.PodSpec{
                    RestartPolicy: v1.RestartPolicyNever,
                    Containers: []v1.Container{
                        {
                            Image: imageutils.GetPauseImageName(),
                            Name:  podName,
                        },
                    },
                },
            })

            if ginkgo.CurrentSpecReport().Failed() {
                if framework.TestContext.DumpLogsOnFailure {
                    logPodEvents(ctx, f)
                    logNodeEvents(ctx, f)
                }
            }
        })
    })
}

func runDDOnFilesystem(diskToPressure, sizeOfPressure string) error {
    script := strings.Split(fmt.Sprintf("if=/dev/zero of=%s/file.txt bs=1M count=%s", diskToPressure, sizeOfPressure), " ")
    ginkgo.By(fmt.Sprintf("running dd with %s", fmt.Sprintf("if=/dev/zero of=%s/file.txt bs=1M count=%s", diskToPressure, sizeOfPressure)))
    cmd := exec.Command("dd", script...)
    output, err := cmd.CombinedOutput()
    if err != nil {
        fmt.Println(string(output))
        fmt.Println(err)
    }
    return err
}

func removeDiskPressure(diskToPressure string) error {
    fileToRemove := fmt.Sprintf("%s/file.txt", diskToPressure)
    ginkgo.By(fmt.Sprintf("calling rm %s", fileToRemove))
    cmd := exec.Command("rm", fileToRemove)
    _, err := cmd.CombinedOutput()
    return err
}

func hasSplitFileSystem(ctx context.Context) bool {
    _, is, err := getCRIClient()
    framework.ExpectNoError(err)
    resp, err := is.ImageFsInfo(ctx)
    framework.ExpectNoError(err)
    if resp.ContainerFilesystems == nil || resp.ImageFilesystems == nil || len(resp.ContainerFilesystems) == 0 || len(resp.ImageFilesystems) == 0 {
        return false
    }
    if resp.ContainerFilesystems[0].FsId != nil && resp.ImageFilesystems[0].FsId != nil {
        return resp.ContainerFilesystems[0].FsId.Mountpoint != resp.ImageFilesystems[0].FsId.Mountpoint
    }
    return false
}
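For readers of the first spec above ("should display different stats for imageFs and containerFs"): a minimal sketch, not part of this commit, of which kubelet Summary API fields that spec compares. It assumes statsapi is k8s.io/kubelet/pkg/apis/stats/v1alpha1, the type returned by eventuallyGetSummary; the helper name hasSplitStats is hypothetical.

package splitfscheck

import (
    statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
)

// hasSplitStats reports whether the Summary API shows distinct image and
// container filesystems, mirroring the assertions in the spec above: on a
// split-disk node, imageFs reports different capacity than containerFs
// (which shares the root filesystem with /var/lib/kubelet).
func hasSplitStats(s *statsapi.Summary) bool {
    rt := s.Node.Runtime
    if rt == nil || rt.ImageFs == nil || rt.ContainerFs == nil ||
        rt.ImageFs.CapacityBytes == nil || rt.ContainerFs.CapacityBytes == nil {
        return false
    }
    return *rt.ImageFs.CapacityBytes != *rt.ContainerFs.CapacityBytes
}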
