Skip to content

Commit 59326a9

Browse files
Add host-level memory pressure metrics to system memory check (#44808)
### What does this PR do? Adds memory pressure metrics from `/proc/vmstat` to the Linux system memory check. **New metrics:** | Metric | Type | Tags | Description | |--------|------|------|-------------| | `system.mem.allocstall` | MonotonicCount | `zone` | Direct reclaim stalls per memory zone | | `system.mem.pgscan_direct` | MonotonicCount | - | Pages scanned during direct (synchronous) reclaim | | `system.mem.pgsteal_direct` | MonotonicCount | - | Pages reclaimed during direct reclaim | | `system.mem.pgscan_kswapd` | MonotonicCount | - | Pages scanned by kswapd background daemon | | `system.mem.pgsteal_kswapd` | MonotonicCount | - | Pages reclaimed by kswapd | Similar to #1605 (commit_limit/committed_as) and #6708 (context_switches). ### Motivation These counters expose kernel memory reclaim activity: - **allocstall**: Increments when a process enters direct reclaim, blocking until memory is freed. Indicates memory pressure where kswapd cannot keep up with demand. Per-zone counters (`dma`, `dma32`, `normal`, `movable`) help distinguish zone-specific pressure from system-wide pressure. ([kernel docs](https://www.kernel.org/doc/html/latest/admin-guide/mm/concepts.html), [Red Hat on memory zones](https://www.redhat.com/en/blog/memory-zones-and-linux-driver-issue)) - **pgscan/pgsteal**: Track pages scanned vs reclaimed during memory reclaim. The ratio `pgsteal/pgscan` indicates reclaim efficiency—values below 30% suggest the kernel is scanning many pages to find few reclaimable ones. `_direct` variants measure synchronous reclaim in the allocation path; `_kswapd` variants measure background reclaim. 
([kernel memory management docs](https://www.kernel.org/doc/gorman/html/understand/understand013.html)) ### Describe how you validated your changes - `go test -tags test ./pkg/collector/corechecks/system/memory/...` — all tests pass - `TestMemoryCheckLinux` updated to verify new metrics with correct values and zone tags - `TestSwapMemoryError` updated to handle vmstat unavailability path ### Additional Notes - Linux-only (relies on `/proc/vmstat`) - Respects `procfs_path` config for containerized environments - Per-zone `allocstall_*` counters available since kernel 4.8; older kernels have a single global counter (not collected) - Opt-in: collection is disabled by default; set `collect_memory_pressure: true` in the memory check instance configuration to enable it - This PR was created with the help of [Claude Code](https://github.com/anthropics/claude-code) Co-authored-by: fabbing <fabbing@free.fr> Co-authored-by: fabrice.buoro <fabrice.buoro@datadoghq.com>
1 parent 2b6687a commit 59326a9

File tree

4 files changed

+172
-5
lines changed

4 files changed

+172
-5
lines changed
Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,16 @@
1+
## This file is overwritten upon Agent upgrade.
2+
## To make modifications to the check configuration, copy this file
3+
## to `memory.yaml` and make your changes on that file.
4+
5+
init_config:
6+
17
instances:
2-
- {}
8+
-
9+
10+
## @param collect_memory_pressure - boolean - optional - default: false
11+
## Set to true to collect memory pressure metrics from /proc/vmstat
12+
## (allocstall, pgscan_direct, pgsteal_direct, pgscan_kswapd, pgsteal_kswapd).
13+
## These metrics help detect kernel memory pressure before OOM events occur.
14+
## Linux only.
15+
#
16+
# collect_memory_pressure: false

pkg/collector/corechecks/system/memory/memory_nix.go

Lines changed: 98 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,24 +8,39 @@
88
package memory
99

1010
import (
11+
"bufio"
1112
"errors"
13+
"os"
1214
"runtime"
15+
"strconv"
16+
"strings"
1317

1418
"github.com/shirou/gopsutil/v4/mem"
19+
"gopkg.in/yaml.v2"
1520

16-
"github.com/DataDog/datadog-agent/pkg/util/log"
17-
21+
"github.com/DataDog/datadog-agent/comp/core/autodiscovery/integration"
22+
"github.com/DataDog/datadog-agent/pkg/aggregator/sender"
1823
core "github.com/DataDog/datadog-agent/pkg/collector/corechecks"
24+
pkgconfigsetup "github.com/DataDog/datadog-agent/pkg/config/setup"
25+
"github.com/DataDog/datadog-agent/pkg/util/log"
1926
)
2027

2128
// For testing purpose
2229
var virtualMemory = mem.VirtualMemory
2330
var swapMemory = mem.SwapMemory
2431
var runtimeOS = runtime.GOOS
32+
// openProcFile opens a procfs file for reading. It is kept as a package-level
// variable so tests can swap in their own opener.
var openProcFile = os.Open
35+
36+
// memoryInstanceConfig holds the per-instance options of the memory check.
// Configure decodes the raw instance YAML into it.
type memoryInstanceConfig struct {
37+
// CollectMemoryPressure enables collection of the vmstat memory pressure
// metrics on Linux; it maps to the collect_memory_pressure YAML key.
CollectMemoryPressure bool `yaml:"collect_memory_pressure"`
38+
}
2539

26-
// Check doesn't need additional fields
40+
// Check collects memory metrics. It embeds core.CheckBase and carries the
// instance options that Configure decodes from the instance YAML.
2741
type Check struct {
2842
core.CheckBase
43+
// instanceConfig is populated by Configure; its CollectMemoryPressure flag
// gates the vmstat pressure metrics in the Linux-specific check.
instanceConfig memoryInstanceConfig
2944
}
3045

3146
const mbSize float64 = 1024 * 1024
@@ -96,6 +111,11 @@ func (c *Check) linuxSpecificVirtualMemoryCheck(v *mem.VirtualMemoryStat) error
96111
sender.Gauge("system.mem.commit_limit", float64(v.CommitLimit)/mbSize, "", nil)
97112
sender.Gauge("system.mem.committed_as", float64(v.CommittedAS)/mbSize, "", nil)
98113
sender.Gauge("system.swap.cached", float64(v.SwapCached)/mbSize, "", nil)
114+
115+
if c.instanceConfig.CollectMemoryPressure {
116+
c.collectVMStatPressureMetrics(sender)
117+
}
118+
99119
return nil
100120
}
101121

@@ -108,3 +128,78 @@ func (c *Check) freebsdSpecificVirtualMemoryCheck(v *mem.VirtualMemoryStat) erro
108128
sender.Gauge("system.mem.cached", float64(v.Cached)/mbSize, "", nil)
109129
return nil
110130
}
131+
132+
// Configure configures the memory check
133+
func (c *Check) Configure(senderManager sender.SenderManager, _ uint64, rawInstance integration.Data, rawInitConfig integration.Data, source string) error {
134+
err := c.CommonConfigure(senderManager, rawInitConfig, rawInstance, source)
135+
if err != nil {
136+
return err
137+
}
138+
139+
s, err := c.GetSender()
140+
if err != nil {
141+
return err
142+
}
143+
s.FinalizeCheckServiceTag()
144+
145+
return yaml.Unmarshal(rawInstance, &c.instanceConfig)
146+
}
147+
148+
func (c *Check) collectVMStatPressureMetrics(sender sender.Sender) {
149+
procfsPath := "/proc"
150+
if pkgconfigsetup.Datadog().IsSet("procfs_path") {
151+
procfsPath = pkgconfigsetup.Datadog().GetString("procfs_path")
152+
}
153+
154+
filePath := procfsPath + "/vmstat"
155+
file, err := openProcFile(filePath)
156+
if err != nil {
157+
log.Debugf("memory.Check: could not open %s: %v", filePath, err)
158+
return
159+
}
160+
defer file.Close()
161+
162+
allocstallByZone := make(map[string]uint64)
163+
var pgscanDirect, pgstealDirect uint64
164+
var pgscanKswapd, pgstealKswapd uint64
165+
166+
scanner := bufio.NewScanner(file)
167+
for scanner.Scan() {
168+
fields := strings.Fields(scanner.Text())
169+
if len(fields) < 2 {
170+
continue
171+
}
172+
173+
value, err := strconv.ParseUint(fields[1], 10, 64)
174+
if err != nil {
175+
continue
176+
}
177+
178+
switch {
179+
case strings.HasPrefix(fields[0], "allocstall_"):
180+
zoneName := strings.TrimPrefix(fields[0], "allocstall_")
181+
allocstallByZone[zoneName] = value
182+
case fields[0] == "pgscan_direct":
183+
pgscanDirect = value
184+
case fields[0] == "pgsteal_direct":
185+
pgstealDirect = value
186+
case fields[0] == "pgscan_kswapd":
187+
pgscanKswapd = value
188+
case fields[0] == "pgsteal_kswapd":
189+
pgstealKswapd = value
190+
}
191+
}
192+
193+
if err := scanner.Err(); err != nil {
194+
log.Debugf("memory.Check: error reading %s: %v", filePath, err)
195+
return
196+
}
197+
198+
for zoneName, value := range allocstallByZone {
199+
sender.MonotonicCount("system.mem.allocstall", float64(value), "", []string{"zone:" + zoneName})
200+
}
201+
sender.MonotonicCount("system.mem.pgscan_direct", float64(pgscanDirect), "", nil)
202+
sender.MonotonicCount("system.mem.pgsteal_direct", float64(pgstealDirect), "", nil)
203+
sender.MonotonicCount("system.mem.pgscan_kswapd", float64(pgscanKswapd), "", nil)
204+
sender.MonotonicCount("system.mem.pgsteal_kswapd", float64(pgstealKswapd), "", nil)
205+
}

pkg/collector/corechecks/system/memory/memory_nix_test.go

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ package memory
99

1010
import (
1111
"errors"
12+
"os"
13+
"strings"
1214
"testing"
1315

1416
"github.com/shirou/gopsutil/v4/mem"
@@ -60,6 +62,29 @@ func TestMemoryCheckLinux(t *testing.T) {
6062
virtualMemory = VirtualMemory
6163
swapMemory = SwapMemory
6264
memCheck := new(Check)
65+
memCheck.instanceConfig.CollectMemoryPressure = true
66+
67+
vmstatContent := `nr_free_pages 12345
68+
allocstall_dma 10
69+
allocstall_dma32 20
70+
allocstall_normal 30
71+
allocstall_movable 40
72+
pgscan_kswapd 1000
73+
pgsteal_kswapd 900
74+
pgscan_direct 500
75+
pgsteal_direct 450
76+
`
77+
openProcFile = func(path string) (*os.File, error) {
78+
if strings.HasSuffix(path, "/vmstat") {
79+
r, w, _ := os.Pipe()
80+
go func() {
81+
w.WriteString(vmstatContent)
82+
w.Close()
83+
}()
84+
return r, nil
85+
}
86+
return nil, errors.New("file not found")
87+
}
6388

6489
mock := mocksender.NewMockSender(memCheck.ID())
6590

@@ -85,14 +110,26 @@ func TestMemoryCheckLinux(t *testing.T) {
85110
mock.On("Gauge", "system.swap.cached", 25000000000.0/mbSize, "", []string(nil)).Return().Times(1)
86111
mock.On("Rate", "system.swap.swap_in", 21.0/mbSize, "", []string(nil)).Return().Times(1)
87112
mock.On("Rate", "system.swap.swap_out", 22.0/mbSize, "", []string(nil)).Return().Times(1)
113+
114+
// vmstat pressure metrics with zone tags
115+
mock.On("MonotonicCount", "system.mem.allocstall", float64(10), "", []string{"zone:dma"}).Return().Times(1)
116+
mock.On("MonotonicCount", "system.mem.allocstall", float64(20), "", []string{"zone:dma32"}).Return().Times(1)
117+
mock.On("MonotonicCount", "system.mem.allocstall", float64(30), "", []string{"zone:normal"}).Return().Times(1)
118+
mock.On("MonotonicCount", "system.mem.allocstall", float64(40), "", []string{"zone:movable"}).Return().Times(1)
119+
mock.On("MonotonicCount", "system.mem.pgscan_direct", float64(500), "", []string(nil)).Return().Times(1)
120+
mock.On("MonotonicCount", "system.mem.pgsteal_direct", float64(450), "", []string(nil)).Return().Times(1)
121+
mock.On("MonotonicCount", "system.mem.pgscan_kswapd", float64(1000), "", []string(nil)).Return().Times(1)
122+
mock.On("MonotonicCount", "system.mem.pgsteal_kswapd", float64(900), "", []string(nil)).Return().Times(1)
123+
88124
mock.On("FinalizeCheckServiceTag").Return().Times(1)
89125
mock.On("Commit").Return().Times(1)
90126
memCheck.Configure(mock.GetSenderManager(), 0, nil, nil, "")
91127
err := memCheck.Run()
92128
require.Nil(t, err)
93129

94130
mock.AssertExpectations(t)
95-
mock.AssertNumberOfCalls(t, "Gauge", 18)
131+
mock.AssertNumberOfCalls(t, "Gauge", 18) // Standard memory metrics
132+
mock.AssertNumberOfCalls(t, "MonotonicCount", 8) // 4 allocstall zones + 4 other vmstat
96133
mock.AssertNumberOfCalls(t, "Rate", 2)
97134
mock.AssertNumberOfCalls(t, "Commit", 1)
98135
}
@@ -186,6 +223,11 @@ func TestSwapMemoryError(t *testing.T) {
186223
swapMemory = func() (*mem.SwapMemoryStat, error) { return nil, errors.New("some error") }
187224
memCheck := new(Check)
188225

226+
// Mock proc files to return errors (simulates non-Linux or unavailable)
227+
openProcFile = func(_ string) (*os.File, error) {
228+
return nil, errors.New("file not found")
229+
}
230+
189231
mock := mocksender.NewMockSender(memCheck.ID())
190232

191233
runtimeOS = "linux"
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Each section from every release note is combined when the
2+
# CHANGELOG.rst is rendered. So the text needs to be worded so that
3+
# it does not depend on any information only available in another
4+
# section. This may mean repeating some details, but each section
5+
# must be readable independently of the other.
6+
#
7+
# Each section note must be formatted as reStructuredText.
8+
---
9+
features:
10+
- |
11+
The system memory check on Linux can now collect memory pressure metrics
12+
from /proc/vmstat to help detect memory pressure before OOM events occur.
13+
To enable, set ``collect_memory_pressure: true`` in the memory check configuration.
14+
New metrics: ``system.mem.allocstall`` (with ``zone`` tag),
15+
``system.mem.pgscan_direct``, ``system.mem.pgsteal_direct``,
16+
``system.mem.pgscan_kswapd``, ``system.mem.pgsteal_kswapd``.

0 commit comments

Comments
 (0)