@@ -56,7 +56,7 @@ func intervalsFromNodeLogs(ctx context.Context, kubeClient kubernetes.Interface,
5656 errCh <- err
5757 return
5858 }
59- newOVSEvents := eventsFromOVSVswitchdLogs (nodeName , ovsVswitchdLogs )
59+ newOVSEvents := intervalsFromOVSVswitchdLogs (nodeName , ovsVswitchdLogs )
6060
6161 networkManagerLogs , err := getNodeLog (ctx , kubeClient , nodeName , "NetworkManager" )
6262 if err != nil {
@@ -66,11 +66,20 @@ func intervalsFromNodeLogs(ctx context.Context, kubeClient kubernetes.Interface,
6666 }
6767 newNetworkManagerIntervals := intervalsFromNetworkManagerLogs (nodeName , networkManagerLogs )
6868
69+ systemdCoreDumpLogs , err := getNodeLog (ctx , kubeClient , nodeName , "systemd-coredump" )
70+ if err != nil {
71+ fmt .Fprintf (os .Stderr , "Error getting node systemd-coredump logs from %s: %s" , nodeName , err .Error ())
72+ errCh <- err
73+ return
74+ }
75+ newSystemdCoreDumpIntervals := intervalsFromSystemdCoreDumpLogs (nodeName , systemdCoreDumpLogs )
76+
6977 lock .Lock ()
7078 defer lock .Unlock ()
7179 ret = append (ret , newEvents ... )
7280 ret = append (ret , newOVSEvents ... )
7381 ret = append (ret , newNetworkManagerIntervals ... )
82+ ret = append (ret , newSystemdCoreDumpIntervals ... )
7483 }(ctx , node .Name )
7584 }
7685 wg .Wait ()
@@ -114,9 +123,9 @@ func eventsFromKubeletLogs(nodeName string, kubeletLog []byte) monitorapi.Interv
114123 return ret
115124}
116125
117- // eventsFromOVSVswitchdLogs returns the produced intervals. Any errors during this creation are logged, but
126+ // intervalsFromOVSVswitchdLogs returns the produced intervals. Any errors during this creation are logged, but
118127// not returned because this is a best effort step
119- func eventsFromOVSVswitchdLogs (nodeName string , ovsLogs []byte ) monitorapi.Intervals {
128+ func intervalsFromOVSVswitchdLogs (nodeName string , ovsLogs []byte ) monitorapi.Intervals {
120129 nodeLocator := monitorapi .NewLocator ().NodeFromName (nodeName )
121130 ret := monitorapi.Intervals {}
122131
@@ -164,6 +173,55 @@ func unreasonablyLongPollInterval(logLine string, nodeLocator monitorapi.Locator
164173
165174var unreasonablyLongPollIntervalRE = regexp .MustCompile (`Unreasonably long (\d+)ms poll interval` )
166175
176+ // intervalsFromSystemdCoreDumpLogs returns the produced intervals. Any errors during this creation are logged, but
177+ // not returned because this is a best effort step
178+ func intervalsFromSystemdCoreDumpLogs (nodeName string , coreDumpLogs []byte ) monitorapi.Intervals {
179+ nodeLocator := monitorapi .NewLocator ().NodeFromName (nodeName )
180+ ret := monitorapi.Intervals {}
181+
182+ scanner := bufio .NewScanner (bytes .NewBuffer (coreDumpLogs ))
183+ for scanner .Scan () {
184+ currLine := scanner .Text ()
185+ ret = append (ret , processCoreDump (currLine , nodeLocator )... )
186+ }
187+
188+ return ret
189+ }
190+
191+ // processCoreDump searches for core dump events with process information
192+ //
193+ // Process 7798 (haproxy) of user 1000680000 dumped core.
194+ func processCoreDump (logLine string , nodeLocator monitorapi.Locator ) monitorapi.Intervals {
195+ if ! strings .Contains (logLine , "dumped core" ) {
196+ return nil
197+ }
198+
199+ logTime := utility .SystemdJournalLogTime (logLine , time .Now ().Year ())
200+
201+ // Extract the process name from within parentheses
202+ var processName string
203+ match := coreDumpProcessRE .FindStringSubmatch (logLine )
204+ if match != nil && len (match ) > 1 {
205+ processName = match [1 ]
206+ }
207+
208+ message := logLine [strings .Index (logLine , "Process" ):]
209+
210+ // Build the message with process annotation if we extracted it
211+ messageBuilder := monitorapi .NewMessage ().HumanMessage (message ).Reason (monitorapi .ReasonProcessDumpedCore )
212+ if processName != "" {
213+ messageBuilder = messageBuilder .WithAnnotation ("process" , processName )
214+ }
215+
216+ interval := monitorapi .NewInterval (monitorapi .SourceSystemdCoreDumpLog , monitorapi .Warning ).Locator (
217+ nodeLocator ).Message (messageBuilder ).
218+ Display ().Build (logTime , logTime .Add (1 * time .Second ))
219+
220+ return monitorapi.Intervals {interval }
221+ }
222+
223+ var coreDumpProcessRE = regexp .MustCompile (`Process \d+ \(([^)]+)\) of user \d+ dumped core` )
224+
167225// intervalsFromNetworkManagerLogs returns the produced intervals. Any errors during this creation are logged, but
168226// not returned because this is a best effort step
169227func intervalsFromNetworkManagerLogs (nodeName string , ovsLogs []byte ) monitorapi.Intervals {
0 commit comments