Skip to content

Commit 4e17700

Browse files
committed
fix(memory): use non-blocking I/O for /dev/kmsg OOM detection
The checkOOMKills() function was using os.ReadFile() to read /dev/kmsg, which blocks forever because /dev/kmsg is a character device that never returns EOF. Changes: - Add KmsgReader interface for testability - Implement defaultKmsgReader with syscall.O_NONBLOCK - Handle EINTR, EAGAIN/EWOULDBLOCK, EPIPE errors gracefully - Seek to end to skip stale messages on each check - Respect context cancellation for clean shutdown - Fix race condition in warning flag patterns - Add mock KmsgReader and tests for edge cases Fixes #4265
1 parent 8eab4b9 commit 4e17700

File tree

3 files changed

+300
-42
lines changed

3 files changed

+300
-42
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
bin/
88
*.out
99

10+
11+
.claude
12+
1013
# Test binary, built with `go test -c`
1114
*.test
1215

@@ -135,3 +138,4 @@ _site/
135138

136139
# Log Bundles
137140
logs
141+

pkg/monitors/system/memory.go

Lines changed: 113 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,79 @@ type MemoryMonitorConfig struct {
8989
KmsgPath string `json:"kmsgPath,omitempty"`
9090
}
9191

92+
// KmsgReader interface abstracts kernel message reading for testability.
93+
// This interface properly handles /dev/kmsg as a character device that never returns EOF.
94+
type KmsgReader interface {
95+
ReadKmsg(ctx context.Context, path string) ([]byte, error)
96+
}
97+
98+
// maxKmsgBufferSize limits memory consumption when reading from /dev/kmsg
99+
const maxKmsgBufferSize = 1 * 1024 * 1024 // 1MB
100+
101+
// defaultKmsgReader implements KmsgReader using non-blocking syscall operations.
102+
// This properly handles /dev/kmsg as a character device that never returns EOF.
103+
type defaultKmsgReader struct{}
104+
105+
func (r *defaultKmsgReader) ReadKmsg(ctx context.Context, path string) ([]byte, error) {
106+
// Open with O_NONBLOCK to prevent blocking on the character device
107+
fd, err := syscall.Open(path, syscall.O_RDONLY|syscall.O_NONBLOCK, 0)
108+
if err != nil {
109+
return nil, err
110+
}
111+
defer syscall.Close(fd)
112+
113+
// Seek to end to only read new messages (skip existing ring buffer).
114+
// This prevents reading stale messages on every check.
115+
// Note: We intentionally skip historical OOM events on startup; the monitor
116+
// will detect new OOM kills going forward. Errors are ignored as seek may
117+
// not be supported on all platforms.
118+
_, _ = syscall.Seek(fd, 0, 2) // SEEK_END = 2
119+
120+
// Read messages with non-blocking I/O
121+
var allData []byte
122+
buf := make([]byte, 8192)
123+
totalRead := 0
124+
125+
for {
126+
// Check context cancellation
127+
select {
128+
case <-ctx.Done():
129+
return allData, ctx.Err()
130+
default:
131+
}
132+
133+
n, err := syscall.Read(fd, buf)
134+
if err != nil {
135+
// EINTR means interrupted by signal - retry
136+
if err == syscall.EINTR {
137+
continue
138+
}
139+
// EAGAIN/EWOULDBLOCK means no more data available (expected with O_NONBLOCK)
140+
if err == syscall.EAGAIN || err == syscall.EWOULDBLOCK {
141+
break
142+
}
143+
// EPIPE can occur if we've read past end of ring buffer
144+
if err == syscall.EPIPE {
145+
break
146+
}
147+
return allData, err
148+
}
149+
150+
if n == 0 {
151+
break
152+
}
153+
154+
totalRead += n
155+
if totalRead > maxKmsgBufferSize {
156+
break
157+
}
158+
159+
allData = append(allData, buf[:n]...)
160+
}
161+
162+
return allData, nil
163+
}
164+
92165
// MemoryMonitor monitors memory health conditions including usage and OOM kills.
93166
// It tracks sustained high memory conditions and generates appropriate events and conditions.
94167
type MemoryMonitor struct {
@@ -105,6 +178,7 @@ type MemoryMonitor struct {
105178

106179
// System interfaces (for testing)
107180
fileReader FileReader
181+
kmsgReader KmsgReader
108182
}
109183

110184
// MemoryInfo represents parsed memory data from /proc/meminfo.
@@ -145,6 +219,7 @@ func NewMemoryMonitor(ctx context.Context, config types.MonitorConfig) (types.Mo
145219
baseMonitor: baseMonitor,
146220
config: memoryConfig,
147221
fileReader: &defaultFileReader{},
222+
kmsgReader: &defaultKmsgReader{},
148223
}
149224

150225
// Set the check function
@@ -378,41 +453,66 @@ func (m *MemoryMonitor) checkSwapUsage(ctx context.Context, status *types.Status
378453

379454
// checkOOMKills monitors for out-of-memory kills and generates appropriate events.
380455
func (m *MemoryMonitor) checkOOMKills(ctx context.Context, status *types.Status) error {
381-
// Try to read kernel messages for OOM kills
382-
data, err := m.fileReader.ReadFile(m.config.KmsgPath)
456+
// Try to read kernel messages for OOM kills using non-blocking I/O
457+
// This properly handles /dev/kmsg as a character device that never returns EOF
458+
data, err := m.kmsgReader.ReadKmsg(ctx, m.config.KmsgPath)
383459
if err != nil {
460+
// Handle context cancellation
461+
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
462+
return nil // Gracefully handle timeout/cancellation
463+
}
464+
384465
// Handle permission denied gracefully (log warning once, continue)
385-
if os.IsPermission(err) {
466+
if os.IsPermission(err) || errors.Is(err, syscall.EACCES) {
386467
m.mu.Lock()
387-
if !m.oomPermissionWarned {
468+
shouldWarn := !m.oomPermissionWarned
469+
if shouldWarn {
388470
m.oomPermissionWarned = true
389-
m.mu.Unlock()
471+
}
472+
m.mu.Unlock()
473+
if shouldWarn {
390474
status.AddEvent(types.NewEvent(
391475
types.EventWarning,
392476
"OOMCheckPermissionDenied",
393477
fmt.Sprintf("Cannot access %s for OOM detection: %v", m.config.KmsgPath, err),
394478
))
395-
} else {
396-
m.mu.Unlock()
397479
}
398480
return nil
399481
}
400482

401483
// Handle EINVAL errors gracefully (occurs on ARM64 platforms when reading /dev/kmsg)
402484
// This is a platform-specific limitation, not a critical failure
403-
var pathErr *os.PathError
404-
if errors.As(err, &pathErr) && errors.Is(pathErr.Err, syscall.EINVAL) {
485+
if errors.Is(err, syscall.EINVAL) {
405486
m.mu.Lock()
406-
if !m.oomReadErrorWarned {
487+
shouldWarn := !m.oomReadErrorWarned
488+
if shouldWarn {
407489
m.oomReadErrorWarned = true
408-
m.mu.Unlock()
490+
}
491+
m.mu.Unlock()
492+
if shouldWarn {
409493
status.AddEvent(types.NewEvent(
410494
types.EventWarning,
411495
"OOMCheckNotSupported",
412496
fmt.Sprintf("OOM detection via %s not supported on this platform (ARM64): %v. Memory monitoring will continue without OOM detection.", m.config.KmsgPath, err),
413497
))
414-
} else {
415-
m.mu.Unlock()
498+
}
499+
return nil
500+
}
501+
502+
// Handle file not found gracefully (kmsg may not exist in some environments)
503+
if errors.Is(err, syscall.ENOENT) || os.IsNotExist(err) {
504+
m.mu.Lock()
505+
shouldWarn := !m.oomReadErrorWarned
506+
if shouldWarn {
507+
m.oomReadErrorWarned = true
508+
}
509+
m.mu.Unlock()
510+
if shouldWarn {
511+
status.AddEvent(types.NewEvent(
512+
types.EventWarning,
513+
"OOMCheckNotAvailable",
514+
fmt.Sprintf("Kernel message file %s not available: %v. Memory monitoring will continue without OOM detection.", m.config.KmsgPath, err),
515+
))
416516
}
417517
return nil
418518
}

0 commit comments

Comments
 (0)