@@ -18,6 +18,7 @@ package status
1818import (
1919 "context"
2020 "fmt"
21+ "regexp"
2122 "strings"
2223
2324 "github.com/devfile/devworkspace-operator/pkg/common"
@@ -145,10 +146,15 @@ func CheckPodEvents(pod *corev1.Pod, workspaceID string, ignoredEvents []string,
145146 if maxCount , isUnrecoverableEvent := unrecoverablePodEventReasons [ev .Reason ]; isUnrecoverableEvent {
146147 if ! checkIfUnrecoverableEventIgnored (ev .Reason , ignoredEvents ) && getEventCount (ev ) >= maxCount {
147148 var msg string
149+ eventMessage := ev .Message // Original Kubelet message from the event
150+ if ev .Reason == "FailedPostStartHook" {
151+ eventMessage = getConcisePostStartFailureMessage (ev .Message )
152+ }
153+
148154 if getEventCount (ev ) > 1 {
149- msg = fmt .Sprintf ("Detected unrecoverable event %s %d times: %s. " , ev .Reason , getEventCount (ev ), ev . Message )
155+ msg = fmt .Sprintf ("Detected unrecoverable event %s %d times: %s" , ev .Reason , getEventCount (ev ), eventMessage )
150156 } else {
151- msg = fmt .Sprintf ("Detected unrecoverable event %s: %s. " , ev .Reason , ev . Message )
157+ msg = fmt .Sprintf ("Detected unrecoverable event %s: %s" , ev .Reason , eventMessage )
152158 }
153159 return msg , nil
154160 }
@@ -157,22 +163,110 @@ func CheckPodEvents(pod *corev1.Pod, workspaceID string, ignoredEvents []string,
157163 return "" , nil
158164}
159165
// Regular expressions used by getConcisePostStartFailureMessage. They are compiled
// once at package initialisation instead of on every call, since the patterns are
// constant and the function runs each time pod events/statuses are inspected.
var (
	// matches: "[postStart hook] Commands terminated by SIGTERM (likely timed out after ...s). Exit code 143."
	postStartTerminatedSigtermRe = regexp.MustCompile(`(\[postStart hook\] Commands terminated by SIGTERM \(likely timed out after [^)]+?\)\. Exit code 143\.)`)

	// matches: "[postStart hook] Commands forcefully killed by SIGKILL (likely after --kill-after ...s expired). Exit code 137."
	postStartKilledSigkillRe = regexp.MustCompile(`(\[postStart hook\] Commands forcefully killed by SIGKILL \(likely after --kill-after [^)]+?\)\. Exit code 137\.)`)

	// matches: "[postStart hook] Commands failed with exit code ..." (any other script-reported exit code)
	postStartGenericFailedExitCodeRe = regexp.MustCompile(`(\[postStart hook\] Commands failed with exit code \d+\.)`)

	// captures the content of Kubelet's explicit `message: "..."` field, if present
	postStartKubeletInternalMessageRe = regexp.MustCompile(`message:\s*"([^"]*)"`)

	// captures the exit code Kubelet reports for the hook command ("exited with N:")
	postStartKubeletExitCodeRe = regexp.MustCompile(`exited with (\d+):`)

	// script-output patterns in the order they are tried: SIGTERM, SIGKILL, generic failure
	postStartScriptOutputRes = []*regexp.Regexp{
		postStartTerminatedSigtermRe,
		postStartKilledSigkillRe,
		postStartGenericFailedExitCodeRe,
	}
)

// findPostStartScriptMessage returns the first postStart script output line found in
// text, trying the patterns in postStartScriptOutputRes in order. It returns "" when
// none of them match.
func findPostStartScriptMessage(text string) string {
	for _, re := range postStartScriptOutputRes {
		if match := re.FindString(text); match != "" {
			return match
		}
	}
	return ""
}

// getConcisePostStartFailureMessage tries to parse the Kubelet's verbose message
// for a PostStartHookError into a more user-friendly one.
//
// The lookup proceeds in four steps and returns at the first one that succeeds:
//  1. a known script output line inside Kubelet's `message: "..."` field;
//  2. a message derived from the exit code in Kubelet's "exited with N:" text
//     (143 -> SIGTERM, 137 -> SIGKILL, any other non-zero code -> generic failure);
//  3. a known script output line anywhere in kubeletMsg;
//  4. a fixed fallback message.
//
// The returned string is never empty.
func getConcisePostStartFailureMessage(kubeletMsg string) string {
	/* 1: check Kubelet's explicit `message: "..."` field for the specific output */
	if fields := postStartKubeletInternalMessageRe.FindStringSubmatch(kubeletMsg); len(fields) > 1 && fields[1] != "" {
		if scriptMsg := findPostStartScriptMessage(fields[1]); scriptMsg != "" {
			return scriptMsg
		}
	}

	/* 2: parse Kubelet's reported exit code for the entire hook command */
	if fields := postStartKubeletExitCodeRe.FindStringSubmatch(kubeletMsg); len(fields) > 1 {
		exitCodeStr := fields[1]
		var exitCode int
		// The pattern guarantees digits only, so a scan error can only mean the
		// number does not fit in an int; in that case skip this step rather than
		// act on a zero value.
		if _, err := fmt.Sscanf(exitCodeStr, "%d", &exitCode); err == nil {
			// messages here indicate the source is Kubelet's reported exit code
			switch exitCode {
			case 143: // SIGTERM
				return "[postStart hook] Commands terminated by SIGTERM due to timeout"
			case 137: // SIGKILL
				return "[postStart hook] Commands forcefully killed by SIGKILL due to timeout"
			case 0:
				// a zero exit code carries no failure information; keep looking
			default: // other non-zero exit codes (e.g., 124, 127)
				return fmt.Sprintf("[postStart hook] Commands failed (Kubelet reported exit code %s)", exitCodeStr)
			}
		}
	}

	/* 3: try to match specific script outputs against the *entire* Kubelet message */
	if scriptMsg := findPostStartScriptMessage(kubeletMsg); scriptMsg != "" {
		return scriptMsg
	}

	/* 4: fallback */
	return "[postStart hook] failed with an unknown error (see pod events or container logs for more details)"
}
237+
160238func CheckContainerStatusForFailure (containerStatus * corev1.ContainerStatus , ignoredEvents []string ) (ok bool , reason string ) {
161239 if containerStatus .State .Waiting != nil {
240+ // Explicitly check for PostStartHookError
241+ if containerStatus .State .Waiting .Reason == "PostStartHookError" { // Kubelet uses this reason
242+ conciseMsg := getConcisePostStartFailureMessage (containerStatus .State .Waiting .Message )
243+ return checkIfUnrecoverableEventIgnored ("FailedPostStartHook" , ignoredEvents ), conciseMsg
244+ }
245+ // Check against other generic failure reasons
162246 for _ , failureReason := range containerFailureStateReasons {
163247 if containerStatus .State .Waiting .Reason == failureReason {
164- return checkIfUnrecoverableEventIgnored (containerStatus .State .Waiting .Reason , ignoredEvents ), containerStatus .State .Waiting .Reason
248+ return checkIfUnrecoverableEventIgnored (containerStatus .State .Waiting .Reason , ignoredEvents ),
249+ containerStatus .State .Waiting .Reason
165250 }
166251 }
167252 }
168253
169254 if containerStatus .State .Terminated != nil {
255+ // Check if termination was due to a generic error, which might include postStart issues
256+ // if the container failed to run.
257+ if containerStatus .State .Terminated .Reason == "Error" || containerStatus .State .Terminated .Reason == "ContainerCannotRun" {
258+ return checkIfUnrecoverableEventIgnored (containerStatus .State .Terminated .Reason , ignoredEvents ),
259+ fmt .Sprintf ("%s: %s" , containerStatus .State .Terminated .Reason , containerStatus .State .Terminated .Message )
260+ }
261+ // Check against other generic failure reasons for terminated state
170262 for _ , failureReason := range containerFailureStateReasons {
171263 if containerStatus .State .Terminated .Reason == failureReason {
172- return checkIfUnrecoverableEventIgnored (containerStatus .State .Terminated .Reason , ignoredEvents ), containerStatus .State .Terminated .Reason
264+ return checkIfUnrecoverableEventIgnored (containerStatus .State .Terminated .Reason , ignoredEvents ),
265+ containerStatus .State .Terminated .Reason
173266 }
174267 }
175268 }
269+
176270 return true , ""
177271}
178272
0 commit comments