Skip to content

Commit 8a404ee

Browse files
feat(envd): add mem_cache metric to track page cache usage (#2269)
* feat(envd): add mem_cache metric to track page cache usage across full pipeline

  Expose the Linux page cache (Cached from /proc/meminfo) as a new mem_cache metric. Previously only mem_used was reported, which excludes page cache and can underreport actual memory pressure inside Firecracker VMs. The metric flows through: envd -> orchestrator OTel gauges -> ClickHouse -> public and edge APIs. Version-gated to envd >= 0.5.9 for backward compatibility.

* fix(api): make memCache a required field in SandboxMetric schemas

  All other metric fields (memUsed, memTotal, diskUsed, etc.) are required in the OpenAPI specs. Leaving memCache optional caused oapi-codegen to generate it as a pointer (*int64) with omitempty, which is inconsistent with the rest of the struct and would make it the only metric that could be absent from API responses.

* chore: auto-commit generated changes

* fix(telemetry): add description and unit for ram cache gauge

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent 87a0e2d commit 8a404ee

File tree

18 files changed

+205
-143
lines changed

18 files changed

+205
-143
lines changed

packages/api/internal/api/api.gen.go

Lines changed: 142 additions & 139 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/api/internal/clusters/resources_local.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ func (l *LocalClusterResourceProvider) GetSandboxMetrics(ctx context.Context, te
7878
CpuCount: int32(m.CPUCount),
7979
MemTotal: int64(m.MemTotal),
8080
MemUsed: int64(m.MemUsed),
81+
MemCache: int64(m.MemCache),
8182
DiskTotal: int64(m.DiskTotal),
8283
DiskUsed: int64(m.DiskUsed),
8384
}
@@ -105,6 +106,7 @@ func (l *LocalClusterResourceProvider) GetSandboxesMetrics(ctx context.Context,
105106
CpuCount: int32(m.CPUCount),
106107
MemTotal: int64(m.MemTotal),
107108
MemUsed: int64(m.MemUsed),
109+
MemCache: int64(m.MemCache),
108110
DiskTotal: int64(m.DiskTotal),
109111
DiskUsed: int64(m.DiskUsed),
110112
}

packages/api/internal/clusters/resources_remote.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ func (r *ClusterResourceProviderImpl) GetSandboxMetrics(ctx context.Context, tea
6666
CpuCount: m.CpuCount,
6767
MemTotal: m.MemTotal,
6868
MemUsed: m.MemUsed,
69+
MemCache: m.MemCache,
6970
DiskTotal: m.DiskTotal,
7071
DiskUsed: m.DiskUsed,
7172
}
@@ -98,6 +99,7 @@ func (r *ClusterResourceProviderImpl) GetSandboxesMetrics(ctx context.Context, t
9899
CpuCount: v.CpuCount,
99100
MemTotal: v.MemTotal,
100101
MemUsed: v.MemUsed,
102+
MemCache: v.MemCache,
101103
DiskTotal: v.DiskTotal,
102104
DiskUsed: v.DiskUsed,
103105
}

packages/clickhouse/pkg/sandbox.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ type Metrics struct {
1919
CPUUsedPercent float64 `ch:"cpu_used"`
2020
MemTotal float64 `ch:"ram_total"`
2121
MemUsed float64 `ch:"ram_used"`
22+
MemCache float64 `ch:"ram_cache"`
2223
DiskTotal float64 `ch:"disk_total"`
2324
DiskUsed float64 `ch:"disk_used"`
2425
}
@@ -30,6 +31,7 @@ SELECT sandbox_id,
3031
argMaxIf(value, timestamp, metric_name = '%s') AS cpu_used,
3132
argMaxIf(value, timestamp, metric_name = '%s') AS ram_total,
3233
argMaxIf(value, timestamp, metric_name = '%s') AS ram_used,
34+
argMaxIf(value, timestamp, metric_name = '%s') AS ram_cache,
3335
argMaxIf(value, timestamp, metric_name = '%s') AS disk_total,
3436
argMaxIf(value, timestamp, metric_name = '%s') AS disk_used,
3537
-- All metrics are recorded at the same time, so we can use max(timestamp) to get the latest one
@@ -39,7 +41,7 @@ WHERE sandbox_id IN ?
3941
AND team_id = ?
4042
GROUP BY sandbox_id,
4143
team_id;
42-
`, telemetry.SandboxCpuTotalGaugeName, telemetry.SandboxCpuUsedGaugeName, telemetry.SandboxRamTotalGaugeName, telemetry.SandboxRamUsedGaugeName, telemetry.SandboxDiskTotalGaugeName, telemetry.SandboxDiskUsedGaugeName)
44+
`, telemetry.SandboxCpuTotalGaugeName, telemetry.SandboxCpuUsedGaugeName, telemetry.SandboxRamTotalGaugeName, telemetry.SandboxRamUsedGaugeName, telemetry.SandboxRamCacheGaugeName, telemetry.SandboxDiskTotalGaugeName, telemetry.SandboxDiskUsedGaugeName)
4345

4446
// QueryLatestMetrics returns the most recent value of each metric for the given sandboxes of a team, one row per sandbox.
4547
func (c *Client) QueryLatestMetrics(ctx context.Context, sandboxIDs []string, teamID string) ([]Metrics, error) {
@@ -82,6 +84,7 @@ SELECT toStartOfInterval(timestamp, interval {step:UInt32} second) AS ts,
8284
maxIf(value, metric_name = '%s') AS cpu_used,
8385
maxIf(value, metric_name = '%s') AS ram_total,
8486
maxIf(value, metric_name = '%s') AS ram_used,
87+
maxIf(value, metric_name = '%s') AS ram_cache,
8588
maxIf(value, metric_name = '%s') AS disk_total,
8689
maxIf(value, metric_name = '%s') AS disk_used
8790
FROM sandbox_metrics_gauge s
@@ -91,7 +94,7 @@ AND timestamp >= {start_time:DateTime64}
9194
AND timestamp <= {end_time:DateTime64}
9295
GROUP BY ts
9396
ORDER BY ts;
94-
`, telemetry.SandboxCpuTotalGaugeName, telemetry.SandboxCpuUsedGaugeName, telemetry.SandboxRamTotalGaugeName, telemetry.SandboxRamUsedGaugeName, telemetry.SandboxDiskTotalGaugeName, telemetry.SandboxDiskUsedGaugeName)
97+
`, telemetry.SandboxCpuTotalGaugeName, telemetry.SandboxCpuUsedGaugeName, telemetry.SandboxRamTotalGaugeName, telemetry.SandboxRamUsedGaugeName, telemetry.SandboxRamCacheGaugeName, telemetry.SandboxDiskTotalGaugeName, telemetry.SandboxDiskUsedGaugeName)
9598

9699
func (c *Client) QuerySandboxTimeRange(ctx context.Context, sandboxID string, teamID string) (time.Time, time.Time, error) {
97100
var start, end time.Time

packages/envd/internal/api/api.gen.go

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/envd/internal/host/metrics.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ type Metrics struct {
2525

2626
MemTotal uint64 `json:"mem_total"` // Total virtual memory in bytes
2727
MemUsed uint64 `json:"mem_used"` // Used virtual memory in bytes
28+
MemCache uint64 `json:"mem_cache"` // Cached memory (page cache) in bytes
2829

2930
DiskUsed uint64 `json:"disk_used"` // Used disk space in bytes
3031
DiskTotal uint64 `json:"disk_total"` // Total disk space in bytes
@@ -68,6 +69,7 @@ func GetMetrics() (*Metrics, error) {
6869
MemTotalMiB: memTotalMiB,
6970
MemTotal: v.Total,
7071
MemUsed: v.Used,
72+
MemCache: v.Cached,
7173
DiskUsed: diskMetrics.Total - diskMetrics.Available,
7274
DiskTotal: diskMetrics.Total,
7375
}, nil

packages/envd/pkg/version.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
package pkg
22

3-
const Version = "0.5.8"
3+
const Version = "0.5.9"

packages/envd/spec/envd.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,9 @@ components:
349349
mem_used:
350350
type: integer
351351
description: Used virtual memory in bytes
352+
mem_cache:
353+
type: integer
354+
description: Cached memory (page cache) in bytes
352355
mem_total_mib:
353356
type: integer
354357
description: Total virtual memory in MiB

packages/orchestrator/pkg/metrics/sandboxes.go

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ const (
3333
minEnvVersionForMetricsTimestamp = "0.1.3"
3434
minEnvdVersionForMemoryPrecise = "0.2.4"
3535
minEnvdVersionForDiskMetrics = "0.2.4"
36+
minEnvdVersionForCacheMetrics = "0.5.9"
3637

3738
timeoutGetMetrics = 100 * time.Millisecond
3839
metricsParallelismFactor = 5 // Used to calculate number of concurrently sandbox metrics requests
@@ -57,6 +58,7 @@ type SandboxObserver struct {
5758
cpuUsed metric.Float64ObservableGauge
5859
memoryTotal metric.Int64ObservableGauge
5960
memoryUsed metric.Int64ObservableGauge
61+
memoryCache metric.Int64ObservableGauge
6062
diskTotal metric.Int64ObservableGauge
6163
diskUsed metric.Int64ObservableGauge
6264
}
@@ -108,6 +110,11 @@ func NewSandboxObserver(ctx context.Context, nodeID, serviceName, serviceCommit,
108110
return nil, fmt.Errorf("failed to create memory used gauge: %w", err)
109111
}
110112

113+
memoryCache, err := telemetry.GetGaugeInt(meter, telemetry.SandboxRamCacheGaugeName)
114+
if err != nil {
115+
return nil, fmt.Errorf("failed to create memory cache gauge: %w", err)
116+
}
117+
111118
diskTotal, err := telemetry.GetGaugeInt(meter, telemetry.SandboxDiskTotalGaugeName)
112119
if err != nil {
113120
return nil, fmt.Errorf("failed to create disk total gauge: %w", err)
@@ -127,6 +134,7 @@ func NewSandboxObserver(ctx context.Context, nodeID, serviceName, serviceCommit,
127134
cpuUsed: cpuUsed,
128135
memoryTotal: memoryTotal,
129136
memoryUsed: memoryUsed,
137+
memoryCache: memoryCache,
130138
diskTotal: diskTotal,
131139
diskUsed: diskUsed,
132140
}
@@ -229,6 +237,14 @@ func (so *SandboxObserver) startObserving() (metric.Registration, error) {
229237
o.ObserveInt64(so.memoryTotal, memoryTotal, attributes)
230238
o.ObserveInt64(so.memoryUsed, memoryUsed, attributes)
231239

240+
ok, err = utils.IsGTEVersion(sbx.Config.Envd.Version, minEnvdVersionForCacheMetrics)
241+
if err != nil {
242+
logger.L().Error(ctx, "Failed to check envd version for cache metrics", zap.Error(err), logger.WithSandboxID(sbx.Runtime.SandboxID))
243+
}
244+
if ok {
245+
o.ObserveInt64(so.memoryCache, sbxMetrics.MemCache, attributes)
246+
}
247+
232248
ok, err = utils.IsGTEVersion(sbx.Config.Envd.Version, minEnvdVersionForDiskMetrics)
233249
if err != nil {
234250
logger.L().Error(ctx, "Failed to check envd version for disk metrics", zap.Error(err), logger.WithSandboxID(sbx.Runtime.SandboxID))
@@ -266,7 +282,7 @@ func (so *SandboxObserver) startObserving() (metric.Registration, error) {
266282
}
267283

268284
return nil
269-
}, so.cpuTotal, so.cpuUsed, so.memoryTotal, so.memoryUsed, so.diskTotal, so.diskUsed)
285+
}, so.cpuTotal, so.cpuUsed, so.memoryTotal, so.memoryUsed, so.memoryCache, so.diskTotal, so.diskUsed)
270286
if err != nil {
271287
return nil, err
272288
}

packages/orchestrator/pkg/sandbox/envd/envd.gen.go

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)