
Commit 3a36af4

Merge pull request #30510 from Neilhamza/tnf-degraded-pdb
OCPEDGE-2232: degraded TNF testing - pdb + MCO reboot validation
2 parents 563a12f + 5a3a370

2 files changed: +416 -0 lines changed

Lines changed: 347 additions & 0 deletions
@@ -0,0 +1,347 @@
package two_node

import (
	"context"
	"encoding/base64"
	"fmt"
	"time"

	g "github.com/onsi/ginkgo/v2"
	o "github.com/onsi/gomega"

	machineconfigv1 "github.com/openshift/api/machineconfiguration/v1"
	machineconfigclient "github.com/openshift/client-go/machineconfiguration/clientset/versioned"
	"github.com/openshift/origin/test/extended/two_node/utils"
	"github.com/openshift/origin/test/extended/util/image"
	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
	policyv1 "k8s.io/api/policy/v1"
	apierrs "k8s.io/apimachinery/pkg/api/errors"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	k8sruntime "k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/util/intstr"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
	"k8s.io/utils/ptr"

	exutil "github.com/openshift/origin/test/extended/util"
)

const (
	pdbLabelKey       = "app"
	pdbLabelValue     = "pdb-demo"
	pdbDeploymentName = "pdb-demo-deployment"
	pdbName           = "pdb-demo"
	rebootTestMCName  = "99-master-tnf-degraded-reboot-block-test"
	rebootTestMCFile  = "/etc/tnf-degraded-reboot-block-test"
)

var _ = g.Describe("[sig-apps][OCPFeatureGate:DualReplica][Suite:openshift/two-node] [Degraded] Two Node Fencing behavior in degraded mode", func() {
	oc := exutil.NewCLI("tnf-degraded").AsAdmin()

	g.BeforeEach(func() {
		utils.EnsureTNFDegradedOrSkip(oc)
	})

	g.It("should allow a single eviction and block the second when PDB minAvailable=1 [apigroup:policy]", func() {
		ctx := context.Background()
		kubeClient := oc.AdminKubeClient()

		ns := oc.Namespace()
		labels := map[string]string{pdbLabelKey: pdbLabelValue}
		selector := fmt.Sprintf("%s=%s", pdbLabelKey, pdbLabelValue)

		// Deployment with 2 busy-loop pods
		deploy, err := createPauseDeployment(ctx, kubeClient, ns, pdbDeploymentName, 2, labels)
		o.Expect(err).NotTo(o.HaveOccurred())

		err = exutil.WaitForDeploymentReadyWithTimeout(oc, deploy.Name, ns, -1, 3*time.Minute)
		o.Expect(err).NotTo(o.HaveOccurred(), "deployment did not reach 2 available replicas")

		// PDB minAvailable=1
		pdb, err := createPDBMinAvailable(ctx, kubeClient, ns, pdbName, labels, 1)
		o.Expect(err).NotTo(o.HaveOccurred())

		// Wait for disruptionsAllowed=1
		err = waitForPDBDisruptionsAllowed(ctx, kubeClient, ns, pdb.Name, 1, 2*time.Minute)
		o.Expect(err).NotTo(o.HaveOccurred(), "PDB did not report disruptionsAllowed=1")

		pods, err := kubeClient.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{
			LabelSelector: selector,
		})
		o.Expect(err).NotTo(o.HaveOccurred())
		o.Expect(len(pods.Items)).To(o.Equal(2), "expected exactly 2 pods before first eviction")

		firstPod := &pods.Items[0]
		secondPod := &pods.Items[1]

		// Evicting the first pod should succeed; wait for the PDB to consume the allowed disruption
		err = evictPod(ctx, kubeClient, firstPod)
		o.Expect(err).NotTo(o.HaveOccurred(), "first eviction should succeed")

		err = waitForPDBDisruptionsAllowed(ctx, kubeClient, ns, pdb.Name, 0, 2*time.Minute)
		o.Expect(err).NotTo(o.HaveOccurred(), "PDB did not update disruptionsAllowed=0 after first eviction")

		// Evicting the second original pod should be blocked with HTTP 429
		err = evictPod(ctx, kubeClient, secondPod)
		o.Expect(err).To(o.HaveOccurred(), "second eviction should be blocked by PDB")

		statusErr, ok := err.(*apierrs.StatusError)
		o.Expect(ok).To(o.BeTrue(), "expected StatusError on blocked eviction")
		o.Expect(statusErr.Status().Code).To(o.Equal(int32(429)), "expected HTTP 429 Too Many Requests for second eviction")

		// PDB disruptionsAllowed must be 0
		currentPDB, err := kubeClient.PolicyV1().PodDisruptionBudgets(ns).Get(ctx, pdb.Name, metav1.GetOptions{})
		o.Expect(err).NotTo(o.HaveOccurred())
		o.Expect(currentPDB.Status.DisruptionsAllowed).To(o.Equal(int32(0)), "expected disruptionsAllowed=0 after second eviction attempt")
	})

	g.It("should block a reboot-required MachineConfig rollout on the remaining master [Serial] [apigroup:machineconfiguration.openshift.io]", func() {
		ctx := context.Background()
		kubeClient := oc.AdminKubeClient()
		ns := oc.Namespace()
		mcoClient := machineconfigclient.NewForConfigOrDie(oc.AdminConfig())

		masterNode, err := utils.GetReadyMasterNode(ctx, oc)
		o.Expect(err).NotTo(o.HaveOccurred(), "failed to find a Ready master node")

		originalBootID := masterNode.Status.NodeInfo.BootID
		originalUnschedulable := masterNode.Spec.Unschedulable

		// Capture the current master MachineConfigPool state so we can assert it never progresses
		masterMCP, err := mcoClient.MachineconfigurationV1().MachineConfigPools().Get(ctx, "master", metav1.GetOptions{})
		o.Expect(err).NotTo(o.HaveOccurred(), "failed to get master MachineConfigPool")

		originalConfigName := masterMCP.Status.Configuration.Name

		// Create a small reboot-required MachineConfig targeting the master pool
		ignFileContents := fmt.Sprintf(" reboot-block test namespace=%s", ns)

		testMC := newMasterRebootRequiredMachineConfig(rebootTestMCName, rebootTestMCFile, ignFileContents)

		g.By(fmt.Sprintf("creating reboot-required MachineConfig %q for master pool", rebootTestMCName))
		_, err = mcoClient.MachineconfigurationV1().MachineConfigs().Create(ctx, testMC, metav1.CreateOptions{})
		o.Expect(err).NotTo(o.HaveOccurred(), "failed to create test MachineConfig")

		// Cleanup
		defer func() {
			cleanupCtx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
			defer cancel()

			_ = mcoClient.MachineconfigurationV1().MachineConfigs().Delete(
				cleanupCtx,
				rebootTestMCName,
				metav1.DeleteOptions{},
			)
		}()

		g.By("observing the degraded window (node safety + MCP blockage)")

		observationWindow := 3 * time.Minute

		err = observeTNFDegradedWindow(
			ctx,
			kubeClient,
			mcoClient,
			masterNode.Name,
			originalBootID,
			originalUnschedulable,
			originalConfigName,
			observationWindow,
		)

		o.Expect(err).NotTo(o.HaveOccurred(), "degraded-mode behavior was not enforced correctly")
	})
},
)

// HELPERS
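
// createPauseDeployment creates a Deployment of `replicas` busy-loop pods carrying the
// given labels, giving the PDB test workload pods to evict.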
func createPauseDeployment(
	ctx context.Context,
	client kubernetes.Interface,
	ns, name string,
	replicas int32,
	labels map[string]string,
) (*appsv1.Deployment, error) {
	deploy := &appsv1.Deployment{
		ObjectMeta: metav1.ObjectMeta{
			Name:      name,
			Namespace: ns,
		},
		Spec: appsv1.DeploymentSpec{
			Replicas: &replicas,
			Selector: &metav1.LabelSelector{
				MatchLabels: labels,
			},
			Template: corev1.PodTemplateSpec{
				ObjectMeta: metav1.ObjectMeta{
					Labels: labels,
				},
				Spec: corev1.PodSpec{
					Containers: []corev1.Container{
						{
							Name:  "busy-work",
							Image: image.ShellImage(),
							Command: []string{
								"/bin/bash",
								"-c",
								`while true; do echo "Busy working, cycling through the ones and zeros"; sleep 5; done`,
							},
						},
					},
				},
			},
		},
	}

	return client.AppsV1().Deployments(ns).Create(ctx, deploy, metav1.CreateOptions{})
}

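// createPDBMinAvailable creates a PodDisruptionBudget that selects the given labels and
// sets spec.minAvailable to minAvailable.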
func createPDBMinAvailable(
	ctx context.Context,
	client kubernetes.Interface,
	ns, name string,
	labels map[string]string,
	minAvailable int,
) (*policyv1.PodDisruptionBudget, error) {
	pdb := &policyv1.PodDisruptionBudget{
		ObjectMeta: metav1.ObjectMeta{
			Name:      name,
			Namespace: ns,
		},
		Spec: policyv1.PodDisruptionBudgetSpec{
			MinAvailable: ptr.To(intstr.FromInt(minAvailable)),
			Selector: &metav1.LabelSelector{
				MatchLabels: labels,
			},
		},
	}
	return client.PolicyV1().PodDisruptionBudgets(ns).Create(ctx, pdb, metav1.CreateOptions{})
}

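// waitForPDBDisruptionsAllowed polls the PDB until its status has caught up with its spec
// (observedGeneration) and reports the expected disruptionsAllowed value.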
func waitForPDBDisruptionsAllowed(
	ctx context.Context,
	client kubernetes.Interface,
	namespace, name string,
	expected int32,
	timeout time.Duration,
) error {
	interval := 2 * time.Second

	return wait.PollUntilContextTimeout(ctx, interval, timeout, true, func(ctx context.Context) (bool, error) {
		pdb, err := client.PolicyV1().PodDisruptionBudgets(namespace).Get(ctx, name, metav1.GetOptions{})
		if err != nil {
			return false, err
		}
		if pdb.Generation != pdb.Status.ObservedGeneration {
			return false, nil
		}
		return pdb.Status.DisruptionsAllowed == expected, nil
	})
}

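// evictPod issues a policy/v1 Eviction through the pod's eviction subresource, so the
// request is subject to PDB enforcement.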
func evictPod(
	ctx context.Context,
	client kubernetes.Interface,
	pod *corev1.Pod,
) error {
	eviction := &policyv1.Eviction{
		TypeMeta: metav1.TypeMeta{
			APIVersion: "policy/v1",
			Kind:       "Eviction",
		},
		ObjectMeta: metav1.ObjectMeta{
			Name:      pod.Name,
			Namespace: pod.Namespace,
		},
	}
	return client.CoreV1().Pods(pod.Namespace).EvictV1(ctx, eviction)
}

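// newMasterRebootRequiredMachineConfig builds a MachineConfig for the master pool whose
// Ignition payload writes a small file at path; rolling it out would normally require the
// MCO to drain and reboot the node.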
func newMasterRebootRequiredMachineConfig(name, path, contents string) *machineconfigv1.MachineConfig {
	encoded := base64.StdEncoding.EncodeToString([]byte(contents))

	ignJSON := fmt.Sprintf(`{
		"ignition": { "version": "3.2.0" },
		"storage": {
			"files": [{
				"path": "%s",
				"mode": 420,
				"overwrite": true,
				"contents": {
					"source": "data:text/plain;base64,%s"
				}
			}]
		}
	}`, path, encoded)

	return &machineconfigv1.MachineConfig{
		ObjectMeta: metav1.ObjectMeta{
			Name: name,
			Labels: map[string]string{
				"machineconfiguration.openshift.io/role": "master",
			},
		},
		Spec: machineconfigv1.MachineConfigSpec{
			Config: k8sruntime.RawExtension{
				Raw: []byte(ignJSON),
			},
		},
	}
}

// We don't use PollUntilContextTimeout here because it treats reaching the timeout as an
// error. Instead we run our own loop in which only real reboot/drain/API/MCP errors fail
// the test; reaching the deadline means the window passed cleanly.
func observeTNFDegradedWindow(
	ctx context.Context,
	kubeClient kubernetes.Interface,
	mcoClient machineconfigclient.Interface,
	nodeName, originalBootID string,
	originalUnschedulable bool,
	originalConfigName string,
	duration time.Duration,
) error {
	interval := 10 * time.Second
	deadline := time.Now().Add(duration)

	for {
		select {
		case <-ctx.Done():
			return fmt.Errorf("context cancelled during TNF degraded observation: %w", ctx.Err())
		default:
		}

		if time.Now().After(deadline) {
			return nil // SUCCESS: node safe + MCP blocked
		}

		// NODE SAFETY CHECKS
		node, err := kubeClient.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
		if err != nil {
			return fmt.Errorf("failed to get node %q during observation: %w", nodeName, err)
		}

		if node.Status.NodeInfo.BootID != originalBootID {
			return fmt.Errorf("node %q reboot detected (BootID changed)", nodeName)
		}

		if node.Spec.Unschedulable && !originalUnschedulable {
			return fmt.Errorf("node %q became unschedulable (drain detected)", nodeName)
		}

		// MCP BLOCKAGE CHECK
		mcp, err := mcoClient.MachineconfigurationV1().
			MachineConfigPools().
			Get(context.Background(), "master", metav1.GetOptions{})
		if err != nil {
			return fmt.Errorf("failed to get master MCP during observation: %w", err)
		}

		cfg := mcp.Status.Configuration.Name
		if cfg != "" && cfg != originalConfigName {
			return fmt.Errorf("master MCP progressed to configuration %q (expected %q while degraded)", cfg, originalConfigName)
		}
		time.Sleep(interval)
	}
}
