Skip to content

Commit 5023168

Browse files
mnencia and armru authored
test(e2e): enhance cluster state diagnostics on timeout (cloudnative-pg#10305)
Add missing diagnostics to the cluster state dump printed when a cluster fails to become ready within the timeout. The existing dump only showed instance pods (filtered by label) and minimal PVC info, making it impossible to diagnose scheduling and volume provisioning failures. Add a section listing all pods in the namespace with their phase, node assignment, conditions, and container waiting states, so that initdb job pods stuck in Pending with no node are visible. Enhance PVC details with storage class, bound volume name, selected-node annotation, and PVC conditions to surface provisioning errors. Add namespace events sorted chronologically, capturing FailedScheduling, ProvisioningFailed, and other events that explain why pods or volumes are stuck. Signed-off-by: Marco Nenciarini <marco.nenciarini@enterprisedb.com> Signed-off-by: Armando Ruocco <armando.ruocco@enterprisedb.com> Co-authored-by: Armando Ruocco <armando.ruocco@enterprisedb.com>
1 parent 956d0c3 commit 5023168

File tree

2 files changed

+115
-9
lines changed

2 files changed

+115
-9
lines changed

.wordlist-en-custom.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1058,10 +1058,10 @@ livenessProbe
10581058
livenessProbeTimeout
10591059
livenessprobe
10601060
lm
1061+
loadBalancerSourceRanges
10611062
localeCType
10621063
localeCollate
10631064
localeProvider
1064-
loadBalancerSourceRanges
10651065
localhost
10661066
localobjectreference
10671067
locktype
@@ -1270,8 +1270,8 @@ postgresqlcnpgiov
12701270
ppc
12711271
pprof
12721272
pre
1273-
prefetched
12741273
preferredDuringSchedulingIgnoredDuringExecution
1274+
prefetched
12751275
preload
12761276
prepended
12771277
primaryUpdateMethod

tests/utils/utils.go

Lines changed: 113 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,21 @@ import (
2323
"bytes"
2424
"context"
2525
"fmt"
26+
"sort"
2627
"text/tabwriter"
28+
"time"
2729

2830
"github.com/cheynewallace/tabby"
2931
batchv1 "k8s.io/api/batch/v1"
32+
corev1 "k8s.io/api/core/v1"
33+
eventsv1 "k8s.io/api/events/v1"
3034
"sigs.k8s.io/controller-runtime/pkg/client"
3135

3236
apiv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1"
3337
utils2 "github.com/cloudnative-pg/cloudnative-pg/pkg/utils"
3438
"github.com/cloudnative-pg/cloudnative-pg/tests/utils/clusterutils"
39+
"github.com/cloudnative-pg/cloudnative-pg/tests/utils/namespaces"
40+
"github.com/cloudnative-pg/cloudnative-pg/tests/utils/pods"
3541
"github.com/cloudnative-pg/cloudnative-pg/tests/utils/run"
3642
"github.com/cloudnative-pg/cloudnative-pg/tests/utils/storage"
3743
)
@@ -68,7 +74,8 @@ func PrintClusterResources(ctx context.Context, crudClient client.Client, namesp
6874
clusterInfo.AddLine("Ready pod number: ", utils2.CountReadyPods(podList.Items))
6975
clusterInfo.AddLine()
7076
clusterInfo.AddHeader("Items", "Values")
71-
for _, pod := range podList.Items {
77+
for i := range podList.Items {
78+
pod := &podList.Items[i]
7279
clusterInfo.AddLine("Pod name", pod.Name)
7380
clusterInfo.AddLine("Pod phase", pod.Status.Phase)
7481
if cluster.Status.InstancesReportedState != nil {
@@ -89,29 +96,38 @@ func PrintClusterResources(ctx context.Context, crudClient client.Client, namesp
8996
_ = crudClient.List(
9097
ctx, jobList, client.InNamespace(namespace),
9198
)
92-
for _, job := range jobList.Items {
99+
for i := range jobList.Items {
100+
job := &jobList.Items[i]
93101
clusterInfo.AddLine("Job name", job.Name)
94102
clusterInfo.AddLine("Job status", fmt.Sprintf("%#v", job.Status))
95103
}
96104

105+
allPodList, _ := pods.List(ctx, crudClient, namespace)
106+
clusterInfo.AddLine()
107+
clusterInfo.AddLine("All namespace pods:")
108+
clusterInfo.AddLine()
109+
clusterInfo.AddHeader("Items", "Values")
110+
for i := range allPodList.Items {
111+
printPodDiagnostics(clusterInfo, &allPodList.Items[i])
112+
}
113+
97114
pvcList, _ := storage.GetPVCList(ctx, crudClient, cluster.GetNamespace())
98115
clusterInfo.AddLine()
99116
clusterInfo.AddLine("Cluster PVC information: (dumping all pvc under the namespace)")
100117
clusterInfo.AddLine("Available Cluster PVCCount", cluster.Status.PVCCount)
101118
clusterInfo.AddLine()
102119
clusterInfo.AddHeader("Items", "Values")
103-
for _, pvc := range pvcList.Items {
104-
clusterInfo.AddLine("PVC name", pvc.Name)
105-
clusterInfo.AddLine("PVC phase", pvc.Status.Phase)
106-
clusterInfo.AddLine("---", "---")
120+
for i := range pvcList.Items {
121+
printPVCDiagnostics(clusterInfo, &pvcList.Items[i])
107122
}
108123

109124
snapshotList, _ := storage.GetSnapshotList(ctx, crudClient, cluster.Namespace)
110125
clusterInfo.AddLine()
111126
clusterInfo.AddLine("Cluster Snapshot information: (dumping all snapshot under the namespace)")
112127
clusterInfo.AddLine()
113128
clusterInfo.AddHeader("Items", "Values")
114-
for _, snapshot := range snapshotList.Items {
129+
for i := range snapshotList.Items {
130+
snapshot := &snapshotList.Items[i]
115131
clusterInfo.AddLine("Snapshot name", snapshot.Name)
116132
if snapshot.Status.ReadyToUse != nil {
117133
clusterInfo.AddLine("Snapshot ready to use", *snapshot.Status.ReadyToUse)
@@ -121,12 +137,102 @@ func PrintClusterResources(ctx context.Context, crudClient client.Client, namesp
121137
clusterInfo.AddLine("---", "---")
122138
}
123139

140+
eventList, _ := namespaces.GetEventList(ctx, crudClient, namespace)
141+
printNamespaceEvents(clusterInfo, eventList)
142+
124143
// do not remove, this is needed to ensure that the writer cache is always flushed.
125144
clusterInfo.Print()
126145

127146
return buffer.String()
128147
}
129148

149+
func printPodDiagnostics(clusterInfo *tabby.Tabby, pod *corev1.Pod) {
150+
clusterInfo.AddLine("Pod name", pod.Name)
151+
clusterInfo.AddLine("Pod phase", pod.Status.Phase)
152+
clusterInfo.AddLine("Pod node", pod.Spec.NodeName)
153+
if pod.Status.Reason != "" {
154+
clusterInfo.AddLine("Pod reason", pod.Status.Reason)
155+
}
156+
if pod.Status.Message != "" {
157+
clusterInfo.AddLine("Pod message", pod.Status.Message)
158+
}
159+
for _, cond := range pod.Status.Conditions {
160+
if cond.Status == corev1.ConditionFalse {
161+
clusterInfo.AddLine(
162+
fmt.Sprintf("Condition %s", cond.Type),
163+
fmt.Sprintf("%s: %s", cond.Reason, cond.Message),
164+
)
165+
}
166+
}
167+
for _, cs := range pod.Status.InitContainerStatuses {
168+
if cs.State.Waiting != nil {
169+
clusterInfo.AddLine(
170+
fmt.Sprintf("Init container %s waiting", cs.Name),
171+
fmt.Sprintf("%s: %s", cs.State.Waiting.Reason, cs.State.Waiting.Message),
172+
)
173+
}
174+
}
175+
for _, cs := range pod.Status.ContainerStatuses {
176+
if cs.State.Waiting != nil {
177+
clusterInfo.AddLine(
178+
fmt.Sprintf("Container %s waiting", cs.Name),
179+
fmt.Sprintf("%s: %s", cs.State.Waiting.Reason, cs.State.Waiting.Message),
180+
)
181+
}
182+
}
183+
clusterInfo.AddLine("---", "---")
184+
}
185+
186+
func printPVCDiagnostics(clusterInfo *tabby.Tabby, pvc *corev1.PersistentVolumeClaim) {
187+
clusterInfo.AddLine("PVC name", pvc.Name)
188+
clusterInfo.AddLine("PVC phase", pvc.Status.Phase)
189+
if pvc.Spec.StorageClassName != nil {
190+
clusterInfo.AddLine("PVC storage class", *pvc.Spec.StorageClassName)
191+
}
192+
if pvc.Spec.VolumeName != "" {
193+
clusterInfo.AddLine("PVC volume", pvc.Spec.VolumeName)
194+
}
195+
if node, ok := pvc.Annotations["volume.kubernetes.io/selected-node"]; ok {
196+
clusterInfo.AddLine("PVC selected node", node)
197+
}
198+
for _, cond := range pvc.Status.Conditions {
199+
clusterInfo.AddLine(
200+
fmt.Sprintf("PVC condition %s", cond.Type),
201+
fmt.Sprintf("%s: %s", cond.Reason, cond.Message),
202+
)
203+
}
204+
clusterInfo.AddLine("---", "---")
205+
}
206+
207+
func printNamespaceEvents(clusterInfo *tabby.Tabby, eventList *eventsv1.EventList) {
208+
if len(eventList.Items) == 0 {
209+
return
210+
}
211+
eventTimeOf := func(ev *eventsv1.Event) time.Time {
212+
if ev.EventTime.Time != (time.Time{}) {
213+
return ev.EventTime.Time
214+
}
215+
return ev.CreationTimestamp.Time
216+
}
217+
sort.Slice(eventList.Items, func(i, j int) bool {
218+
return eventTimeOf(&eventList.Items[i]).Before(eventTimeOf(&eventList.Items[j]))
219+
})
220+
clusterInfo.AddLine()
221+
clusterInfo.AddLine("Namespace events:")
222+
clusterInfo.AddLine()
223+
clusterInfo.AddHeader("Time", "Type", "Reason", "Object", "Message")
224+
for i := range eventList.Items {
225+
ev := &eventList.Items[i]
226+
clusterInfo.AddLine(
227+
eventTimeOf(ev).Format(time.RFC3339),
228+
ev.Type,
229+
ev.Reason,
230+
fmt.Sprintf("%s/%s", ev.Regarding.Kind, ev.Regarding.Name),
231+
ev.Note,
232+
)
233+
}
234+
}
235+
130236
// ForgeArchiveWalOnMinio instead of using `switchWalCmd` to generate a real WAL archive, directly forges a WAL archive
131237
// file on Minio by copying and renaming an existing WAL archive file for the sake of more control of testing. To make
132238
// sure the forged one won't be a real WAL archive, we let the sequence in newWALName to be big enough so that it can't

0 commit comments

Comments
 (0)