Skip to content

Commit f69ae77

Browse files
authored
feat(analyzer): add detailed message for unhealthy pod (#1261)
* feat(analyzer): add detailed message for unhealthy pod
1 parent b02d12f commit f69ae77

14 files changed

+9185
-9
lines changed

pkg/analyze/cluster_pod_statuses.go

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package analyzer
33
import (
44
"bytes"
55
"encoding/json"
6+
"fmt"
67
"path/filepath"
78
"strings"
89
"text/template"
@@ -32,7 +33,8 @@ func (a *AnalyzeClusterPodStatuses) IsExcluded() (bool, error) {
3233
}
3334

3435
func (a *AnalyzeClusterPodStatuses) Analyze(getFile getCollectedFileContents, findFiles getChildCollectedFileContents) ([]*AnalyzeResult, error) {
35-
results, err := clusterPodStatuses(a.analyzer, findFiles)
36+
// findFiles is used to get the pod status and events files
37+
results, err := clusterPodStatuses(a.analyzer, findFiles, findFiles)
3638
if err != nil {
3739
return nil, err
3840
}
@@ -42,7 +44,7 @@ func (a *AnalyzeClusterPodStatuses) Analyze(getFile getCollectedFileContents, fi
4244
return results, nil
4345
}
4446

45-
func clusterPodStatuses(analyzer *troubleshootv1beta2.ClusterPodStatuses, getChildCollectedFileContents getChildCollectedFileContents) ([]*AnalyzeResult, error) {
47+
func clusterPodStatuses(analyzer *troubleshootv1beta2.ClusterPodStatuses, getChildCollectedFileContents getChildCollectedFileContents, getChildCollectedFileContentsEvents getChildCollectedFileContents) ([]*AnalyzeResult, error) {
4648
excludeFiles := []string{}
4749
collected, err := getChildCollectedFileContents(filepath.Join(constants.CLUSTER_RESOURCES_DIR, constants.CLUSTER_RESOURCES_PODS, "*.json"), excludeFiles)
4850
if err != nil {
@@ -77,7 +79,38 @@ func clusterPodStatuses(analyzer *troubleshootv1beta2.ClusterPodStatuses, getChi
7779

7880
for _, pod := range pods {
7981
if pod.Status.Reason == "" {
80-
pod.Status.Reason = k8sutil.GetPodStatusReason(&pod)
82+
// get pod status reason and message from the pod
83+
pod.Status.Reason, pod.Status.Message = k8sutil.GetPodStatusReason(&pod)
84+
}
85+
86+
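// If the pod still has no status message (for example, it is pending or its containers are still being created, so there is no last termination message), fall back to the messages of the Warning events recorded for this pod. Failures to read or parse the events are only logged, in which case the message stays empty.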
87+
if pod.Status.Message == "" {
88+
messages := []string{}
89+
collectedEvents, err := getChildCollectedFileContentsEvents(filepath.Join(constants.CLUSTER_RESOURCES_DIR, "events", fmt.Sprintf("%s.json", pod.Namespace)), excludeFiles)
90+
if err != nil {
91+
klog.V(2).Infof("failed to read collected events for namespace %s: %v", pod.Namespace, err)
92+
}
93+
94+
for _, fileContent := range collectedEvents {
95+
var nsEvents []corev1.Event
96+
if err := json.Unmarshal(fileContent, &nsEvents); err != nil {
97+
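// the file is not a plain JSON array of events; try the new format, where the events are wrapped in a corev1.EventList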
98+
var nsEventsList corev1.EventList
99+
if err := json.Unmarshal(fileContent, &nsEventsList); err != nil {
100+
klog.V(2).Infof("failed to unmarshal events for namespace %s: %v", pod.Namespace, err)
101+
}
102+
nsEvents = nsEventsList.Items
103+
}
104+
105+
for _, event := range nsEvents {
106+
if event.InvolvedObject.Kind == "Pod" && event.InvolvedObject.Name == pod.Name && event.InvolvedObject.Namespace == pod.Namespace {
107+
if event.Type == "Warning" && event.Message != "" {
108+
messages = append(messages, event.Message)
109+
}
110+
}
111+
}
112+
}
113+
pod.Status.Message = strings.Join(messages, ". ")
81114
}
82115

83116
for _, outcome := range analyzer.Outcomes {
@@ -149,7 +182,12 @@ func clusterPodStatuses(analyzer *troubleshootv1beta2.ClusterPodStatuses, getChi
149182
}
150183

151184
if r.Message == "" {
152-
r.Message = "Pod {{ .Namespace }}/{{ .Name }} status is {{ .Status.Reason }}"
185+
r.Message = "Pod {{ .Namespace }}/{{ .Name }} status is {{ .Status.Reason }}. Message is: {{ .Status.Message }}"
186+
}
187+
188+
// if the pod has no status message, set it to None
189+
if pod.Status.Message == "" {
190+
pod.Status.Message = "None"
153191
}
154192

155193
tmpl := template.New("pod")
@@ -176,7 +214,7 @@ func clusterPodStatuses(analyzer *troubleshootv1beta2.ClusterPodStatuses, getChi
176214
if err != nil {
177215
return nil, errors.Wrap(err, "failed to execute template")
178216
}
179-
r.Message = m.String()
217+
r.Message = strings.TrimSpace(m.String())
180218

181219
// add to results, break and check the next pod
182220
allResults = append(allResults, &r)

0 commit comments

Comments
 (0)