@@ -89,6 +89,79 @@ type MemoryMonitorConfig struct {
8989 KmsgPath string `json:"kmsgPath,omitempty"`
9090}
9191
92+ // KmsgReader interface abstracts kernel message reading for testability.
93+ // This interface properly handles /dev/kmsg as a character device that never returns EOF.
94+ type KmsgReader interface {
95+ ReadKmsg (ctx context.Context , path string ) ([]byte , error )
96+ }
97+
98+ // maxKmsgBufferSize limits memory consumption when reading from /dev/kmsg
99+ const maxKmsgBufferSize = 1 * 1024 * 1024 // 1MB
100+
101+ // defaultKmsgReader implements KmsgReader using non-blocking syscall operations.
102+ // This properly handles /dev/kmsg as a character device that never returns EOF.
103+ type defaultKmsgReader struct {}
104+
105+ func (r * defaultKmsgReader ) ReadKmsg (ctx context.Context , path string ) ([]byte , error ) {
106+ // Open with O_NONBLOCK to prevent blocking on the character device
107+ fd , err := syscall .Open (path , syscall .O_RDONLY | syscall .O_NONBLOCK , 0 )
108+ if err != nil {
109+ return nil , err
110+ }
111+ defer syscall .Close (fd )
112+
113+ // Seek to end to only read new messages (skip existing ring buffer).
114+ // This prevents reading stale messages on every check.
115+ // Note: We intentionally skip historical OOM events on startup; the monitor
116+ // will detect new OOM kills going forward. Errors are ignored as seek may
117+ // not be supported on all platforms.
118+ _ , _ = syscall .Seek (fd , 0 , 2 ) // SEEK_END = 2
119+
120+ // Read messages with non-blocking I/O
121+ var allData []byte
122+ buf := make ([]byte , 8192 )
123+ totalRead := 0
124+
125+ for {
126+ // Check context cancellation
127+ select {
128+ case <- ctx .Done ():
129+ return allData , ctx .Err ()
130+ default :
131+ }
132+
133+ n , err := syscall .Read (fd , buf )
134+ if err != nil {
135+ // EINTR means interrupted by signal - retry
136+ if err == syscall .EINTR {
137+ continue
138+ }
139+ // EAGAIN/EWOULDBLOCK means no more data available (expected with O_NONBLOCK)
140+ if err == syscall .EAGAIN || err == syscall .EWOULDBLOCK {
141+ break
142+ }
143+ // EPIPE can occur if we've read past end of ring buffer
144+ if err == syscall .EPIPE {
145+ break
146+ }
147+ return allData , err
148+ }
149+
150+ if n == 0 {
151+ break
152+ }
153+
154+ totalRead += n
155+ if totalRead > maxKmsgBufferSize {
156+ break
157+ }
158+
159+ allData = append (allData , buf [:n ]... )
160+ }
161+
162+ return allData , nil
163+ }
164+
92165// MemoryMonitor monitors memory health conditions including usage and OOM kills.
93166// It tracks sustained high memory conditions and generates appropriate events and conditions.
94167type MemoryMonitor struct {
@@ -105,6 +178,7 @@ type MemoryMonitor struct {
105178
106179 // System interfaces (for testing)
107180 fileReader FileReader
181+ kmsgReader KmsgReader
108182}
109183
110184// MemoryInfo represents parsed memory data from /proc/meminfo.
@@ -145,6 +219,7 @@ func NewMemoryMonitor(ctx context.Context, config types.MonitorConfig) (types.Mo
145219 baseMonitor : baseMonitor ,
146220 config : memoryConfig ,
147221 fileReader : & defaultFileReader {},
222+ kmsgReader : & defaultKmsgReader {},
148223 }
149224
150225 // Set the check function
@@ -378,41 +453,66 @@ func (m *MemoryMonitor) checkSwapUsage(ctx context.Context, status *types.Status
378453
379454// checkOOMKills monitors for out-of-memory kills and generates appropriate events.
380455func (m * MemoryMonitor ) checkOOMKills (ctx context.Context , status * types.Status ) error {
381- // Try to read kernel messages for OOM kills
382- data , err := m .fileReader .ReadFile (m .config .KmsgPath )
456+ // Try to read kernel messages for OOM kills using non-blocking I/O
457+ // This properly handles /dev/kmsg as a character device that never returns EOF
458+ data , err := m .kmsgReader .ReadKmsg (ctx , m .config .KmsgPath )
383459 if err != nil {
460+ // Handle context cancellation
461+ if errors .Is (err , context .Canceled ) || errors .Is (err , context .DeadlineExceeded ) {
462+ return nil // Gracefully handle timeout/cancellation
463+ }
464+
384465 // Handle permission denied gracefully (log warning once, continue)
385- if os .IsPermission (err ) {
466+ if os .IsPermission (err ) || errors . Is ( err , syscall . EACCES ) {
386467 m .mu .Lock ()
387- if ! m .oomPermissionWarned {
468+ shouldWarn := ! m .oomPermissionWarned
469+ if shouldWarn {
388470 m .oomPermissionWarned = true
389- m .mu .Unlock ()
471+ }
472+ m .mu .Unlock ()
473+ if shouldWarn {
390474 status .AddEvent (types .NewEvent (
391475 types .EventWarning ,
392476 "OOMCheckPermissionDenied" ,
393477 fmt .Sprintf ("Cannot access %s for OOM detection: %v" , m .config .KmsgPath , err ),
394478 ))
395- } else {
396- m .mu .Unlock ()
397479 }
398480 return nil
399481 }
400482
401483 // Handle EINVAL errors gracefully (occurs on ARM64 platforms when reading /dev/kmsg)
402484 // This is a platform-specific limitation, not a critical failure
403- var pathErr * os.PathError
404- if errors .As (err , & pathErr ) && errors .Is (pathErr .Err , syscall .EINVAL ) {
485+ if errors .Is (err , syscall .EINVAL ) {
405486 m .mu .Lock ()
406- if ! m .oomReadErrorWarned {
487+ shouldWarn := ! m .oomReadErrorWarned
488+ if shouldWarn {
407489 m .oomReadErrorWarned = true
408- m .mu .Unlock ()
490+ }
491+ m .mu .Unlock ()
492+ if shouldWarn {
409493 status .AddEvent (types .NewEvent (
410494 types .EventWarning ,
411495 "OOMCheckNotSupported" ,
412496 fmt .Sprintf ("OOM detection via %s not supported on this platform (ARM64): %v. Memory monitoring will continue without OOM detection." , m .config .KmsgPath , err ),
413497 ))
414- } else {
415- m .mu .Unlock ()
498+ }
499+ return nil
500+ }
501+
502+ // Handle file not found gracefully (kmsg may not exist in some environments)
503+ if errors .Is (err , syscall .ENOENT ) || os .IsNotExist (err ) {
504+ m .mu .Lock ()
505+ shouldWarn := ! m .oomReadErrorWarned
506+ if shouldWarn {
507+ m .oomReadErrorWarned = true
508+ }
509+ m .mu .Unlock ()
510+ if shouldWarn {
511+ status .AddEvent (types .NewEvent (
512+ types .EventWarning ,
513+ "OOMCheckNotAvailable" ,
514+ fmt .Sprintf ("Kernel message file %s not available: %v. Memory monitoring will continue without OOM detection." , m .config .KmsgPath , err ),
515+ ))
416516 }
417517 return nil
418518 }
0 commit comments