Skip to content

Commit ac189cc

Browse files
authored
refactor(orch): remove unused PID-based host stats, rely on cgroup stats (#2251)
* refactor(orch): remove PID-based firecracker stats from host metrics Cgroups now provide all CPU and memory accounting for sandboxes, making the gopsutil/process-based stats from /proc/<pid> obsolete. Remove the Firecracker PID fields from the collector, data model, and ClickHouse INSERT query. The ClickHouse column drop migration will follow separately. * refactor(orch): widen host stats collection to cover full sandbox lifecycle Start collecting cgroup stats immediately after sandbox/cgroup creation (before FC boot/resume) and stop right before cgroup removal (after process exit). This captures resource usage during VM boot, envd init, and the full shutdown sequence — previously missed billing data. * refactor(orch): move host stats and cgroup cleanup to creation methods Move cgroup removal and host stats collector stop out of doStop() and createCgroup() into the Create/Resume sandbox methods as explicit cleanup.Add registrations. This makes each creation path own its full lifecycle and follows the project's cleanup pattern consistently. * refactor(orch): add NoopManager to ensure cgroupManager is never nil Introduce cgroup.NoopManager for environments that don't need cgroup accounting (CLI tools, tests). This eliminates nil checks on both cgroupManager and cgroupHandle throughout the sandbox lifecycle. Noop handles use an explicit noop field so GetStats returns (nil, nil) for intentional no-ops, while nil handles from real manager failures still return an error. * refactor(orch): add NoopDelivery to ensure hostStatsDelivery is never nil Introduce hoststats.NoopDelivery for environments that don't need host stats collection (CLI tools, tests). This eliminates the nil check on hostStatsDelivery in initializeHostStatsCollector, matching the pattern used for cgroupManager. * fix(orch): use NoopDelivery in production when ClickHouse is unconfigured The production path in run.go left hostStatsDelivery nil when ClickhouseConnectionString is empty. With the nil guard removed, this would panic. Initialize it with NoopDelivery instead. Also add compile-time interface compliance checks for noopManager and noopDelivery.
1 parent 911760b commit ac189cc

File tree

12 files changed

+136
-142
lines changed

12 files changed

+136
-142
lines changed

packages/clickhouse/pkg/hoststats/delivery.go

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,18 +23,14 @@ const InsertSandboxHostStatQuery = `INSERT INTO sandbox_host_stats
2323
sandbox_team_id,
2424
sandbox_vcpu_count,
2525
sandbox_memory_mb,
26-
firecracker_cpu_user_time,
27-
firecracker_cpu_system_time,
28-
firecracker_memory_rss,
29-
firecracker_memory_vms,
3026
cgroup_cpu_usage_usec,
3127
cgroup_cpu_user_usec,
3228
cgroup_cpu_system_usec,
3329
cgroup_memory_usage_bytes,
3430
cgroup_memory_peak_bytes,
3531
sandbox_type
3632
)
37-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
33+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
3834

3935
type ClickhouseDelivery struct {
4036
batcher *batcher.Batcher[SandboxHostStat]
@@ -115,10 +111,6 @@ func (c *ClickhouseDelivery) batchInserter(ctx context.Context, stats []SandboxH
115111
stat.SandboxTeamID,
116112
stat.SandboxVCPUCount,
117113
stat.SandboxMemoryMB,
118-
stat.FirecrackerCPUUserTime,
119-
stat.FirecrackerCPUSystemTime,
120-
stat.FirecrackerMemoryRSS,
121-
stat.FirecrackerMemoryVMS,
122114
stat.CgroupCPUUsageUsec,
123115
stat.CgroupCPUUserUsec,
124116
stat.CgroupCPUSystemUsec,

packages/clickhouse/pkg/hoststats/hoststats.go

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,6 @@ type SandboxHostStat struct {
2020
SandboxVCPUCount int64 `ch:"sandbox_vcpu_count"` // number of virtual CPUs allocated to the sandbox
2121
SandboxMemoryMB int64 `ch:"sandbox_memory_mb"` // total memory allocated to the sandbox in megabytes
2222

23-
FirecrackerCPUUserTime float64 `ch:"firecracker_cpu_user_time"` // cumulative user CPU time in seconds
24-
FirecrackerCPUSystemTime float64 `ch:"firecracker_cpu_system_time"` // cumulative system CPU time in seconds
25-
FirecrackerMemoryRSS uint64 `ch:"firecracker_memory_rss"` // Resident Set Size in bytes
26-
FirecrackerMemoryVMS uint64 `ch:"firecracker_memory_vms"` // Virtual Memory Size in bytes
27-
2823
// Cgroup v2 accounting — cumulative CPU values, deltas calculated in queries
2924
CgroupCPUUsageUsec uint64 `ch:"cgroup_cpu_usage_usec"` // cumulative, microseconds
3025
CgroupCPUUserUsec uint64 `ch:"cgroup_cpu_user_usec"` // cumulative, microseconds
@@ -41,3 +36,17 @@ type Delivery interface {
4136
Push(stat SandboxHostStat) error
4237
Close(ctx context.Context) error
4338
}
39+
40+
// noopDelivery is a Delivery that discards all stats.
41+
// Used in environments where host stats collection is not needed (CLI tools, tests).
42+
type noopDelivery struct{}
43+
44+
var _ Delivery = (*noopDelivery)(nil)
45+
46+
// NewNoopDelivery returns a Delivery that silently discards all stats.
47+
func NewNoopDelivery() Delivery {
48+
return &noopDelivery{}
49+
}
50+
51+
func (d *noopDelivery) Push(_ SandboxHostStat) error { return nil }
52+
func (d *noopDelivery) Close(_ context.Context) error { return nil }

packages/orchestrator/benchmark_test.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,12 @@ import (
2020
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
2121
"go.opentelemetry.io/otel/metric/noop"
2222

23+
"github.com/e2b-dev/infra/packages/clickhouse/pkg/hoststats"
2324
"github.com/e2b-dev/infra/packages/orchestrator/pkg/cfg"
2425
"github.com/e2b-dev/infra/packages/orchestrator/pkg/proxy"
2526
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox"
2627
blockmetrics "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/block/metrics"
28+
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/cgroup"
2729
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/fc"
2830
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/nbd"
2931
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/network"
@@ -183,7 +185,7 @@ func BenchmarkBaseImageLaunch(b *testing.B) {
183185
b.Cleanup(templateCache.Stop)
184186

185187
sandboxes := sandbox.NewSandboxesMap()
186-
sandboxFactory := sandbox.NewFactory(config.BuilderConfig, networkPool, devicePool, featureFlags, nil, nil, sandboxes)
188+
sandboxFactory := sandbox.NewFactory(config.BuilderConfig, networkPool, devicePool, featureFlags, hoststats.NewNoopDelivery(), cgroup.NewNoopManager(), sandboxes)
187189

188190
dockerhubRepository, err := dockerhub.GetRemoteRepository(b.Context())
189191
require.NoError(b, err)

packages/orchestrator/cmd/create-build/main.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,13 @@ import (
1919
"go.uber.org/zap"
2020
"go.uber.org/zap/zapcore"
2121

22+
"github.com/e2b-dev/infra/packages/clickhouse/pkg/hoststats"
2223
"github.com/e2b-dev/infra/packages/orchestrator/cmd/internal/cmdutil"
2324
"github.com/e2b-dev/infra/packages/orchestrator/pkg/cfg"
2425
"github.com/e2b-dev/infra/packages/orchestrator/pkg/proxy"
2526
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox"
2627
blockmetrics "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/block/metrics"
28+
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/cgroup"
2729
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/nbd"
2830
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/network"
2931
sbxtemplate "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/template"
@@ -297,7 +299,7 @@ func doBuild(
297299
defer templateCache.Stop()
298300

299301
buildMetrics, _ := metrics.NewBuildMetrics(noop.MeterProvider{})
300-
sandboxFactory := sandbox.NewFactory(c.BuilderConfig, networkPool, devicePool, featureFlags, nil, nil, sandboxes)
302+
sandboxFactory := sandbox.NewFactory(c.BuilderConfig, networkPool, devicePool, featureFlags, hoststats.NewNoopDelivery(), cgroup.NewNoopManager(), sandboxes)
301303

302304
builder := build.NewBuilder(
303305
builderConfig, l, featureFlags, sandboxFactory,

packages/orchestrator/cmd/resume-build/main.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,13 @@ import (
2121
"go.opentelemetry.io/otel/metric/noop"
2222
"golang.org/x/sys/unix"
2323

24+
"github.com/e2b-dev/infra/packages/clickhouse/pkg/hoststats"
2425
"github.com/e2b-dev/infra/packages/orchestrator/cmd/internal/cmdutil"
2526
"github.com/e2b-dev/infra/packages/orchestrator/pkg/cfg"
2627
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox"
2728
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/block"
2829
blockmetrics "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/block/metrics"
30+
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/cgroup"
2931
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/fc"
3032
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/nbd"
3133
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/network"
@@ -1050,7 +1052,7 @@ func run(ctx context.Context, buildID string, iterations int, coldStart, noPrefe
10501052
if verbose {
10511053
fmt.Println("🔧 Creating sandbox factory...")
10521054
}
1053-
factory := sandbox.NewFactory(config.BuilderConfig, networkPool, devicePool, flags, nil, nil, sandboxes)
1055+
factory := sandbox.NewFactory(config.BuilderConfig, networkPool, devicePool, flags, hoststats.NewNoopDelivery(), cgroup.NewNoopManager(), sandboxes)
10541056

10551057
fmt.Printf("📦 Loading %s...\n", buildID)
10561058
tmpl, err := cache.GetTemplate(ctx, buildID, false, false)

packages/orchestrator/cmd/smoketest/smoke_test.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,12 @@ import (
1717
"github.com/stretchr/testify/require"
1818
"go.opentelemetry.io/otel/metric/noop"
1919

20+
"github.com/e2b-dev/infra/packages/clickhouse/pkg/hoststats"
2021
"github.com/e2b-dev/infra/packages/orchestrator/pkg/cfg"
2122
"github.com/e2b-dev/infra/packages/orchestrator/pkg/proxy"
2223
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox"
2324
blockmetrics "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/block/metrics"
25+
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/cgroup"
2426
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/fc"
2527
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/nbd"
2628
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/network"
@@ -226,7 +228,7 @@ func newTestInfra(t *testing.T, ctx context.Context) *testInfra {
226228
ti.closers = append(ti.closers, func(ctx context.Context) { sandboxProxy.Close(ctx) })
227229

228230
// Factory + Builder
229-
factory := sandbox.NewFactory(orcConfig.BuilderConfig, networkPool, devicePool, flags, nil, nil, sandboxes)
231+
factory := sandbox.NewFactory(orcConfig.BuilderConfig, networkPool, devicePool, flags, hoststats.NewNoopDelivery(), cgroup.NewNoopManager(), sandboxes)
230232
ti.factory = factory
231233

232234
buildMetrics, _ := metrics.NewBuildMetrics(noop.MeterProvider{})

packages/orchestrator/pkg/factories/run.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,7 @@ func run(config cfg.Config, opts Options) (success bool) {
410410

411411
sbxEventsDeliveryTargets := make([]event.Delivery[event.SandboxEvent], 0)
412412

413-
var hostStatsDelivery clickhousehoststats.Delivery
413+
hostStatsDelivery := clickhousehoststats.NewNoopDelivery()
414414

415415
// Clickhouse sandbox events and host stats delivery
416416
if config.ClickhouseConnectionString != "" {

packages/orchestrator/pkg/sandbox/cgroup/manager.go

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ type CgroupHandle struct {
5151
memoryPeakFile *os.File // Open FD to memory.peak for per-FD reset (nil after Remove or if not available)
5252
manager *managerImpl
5353
removed bool
54+
noop bool // true for handles created by NoopManager (no real cgroup backing)
5455
}
5556

5657
// GetFD returns the file descriptor for use with SysProcAttr.CgroupFD.
@@ -84,13 +85,18 @@ func (h *CgroupHandle) ReleaseCgroupFD() error {
8485
return err
8586
}
8687

87-
// GetStats retrieves current resource usage statistics for this cgroup
88-
// Returns error if cgroup has been removed or stats cannot be read
88+
// GetStats retrieves current resource usage statistics for this cgroup.
89+
// Returns (nil, nil) for noop handles (no real cgroup backing).
90+
// Returns error if the handle is nil (unexpected) or stats cannot be read.
8991
func (h *CgroupHandle) GetStats(ctx context.Context) (*Stats, error) {
9092
if h == nil {
9193
return nil, fmt.Errorf("cgroup handle is nil")
9294
}
9395

96+
if h.noop {
97+
return nil, nil
98+
}
99+
94100
return h.manager.getStatsForPath(ctx, h.path, h.memoryPeakFile)
95101
}
96102

@@ -99,7 +105,7 @@ func (h *CgroupHandle) GetStats(ctx context.Context) (*Stats, error) {
99105
// Safe to call multiple times. Returns error if removal fails
100106
// (but tolerates the cgroup having been auto-cleaned by the kernel).
101107
func (h *CgroupHandle) Remove(ctx context.Context) error {
102-
if h == nil || h.removed {
108+
if h == nil || h.noop || h.removed {
103109
return nil
104110
}
105111

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
package cgroup
2+
3+
import "context"
4+
5+
// noopManager is a cgroup Manager that does not create real cgroups.
6+
// Used in environments where cgroup accounting is not needed (CLI tools, tests).
7+
type noopManager struct{}
8+
9+
var _ Manager = (*noopManager)(nil)
10+
11+
// NewNoopManager returns a Manager that creates noop cgroup handles.
12+
// The handles are safe to use but do not perform any actual cgroup operations.
13+
func NewNoopManager() Manager {
14+
return &noopManager{}
15+
}
16+
17+
func (m *noopManager) Initialize(_ context.Context) error {
18+
return nil
19+
}
20+
21+
func (m *noopManager) Create(_ context.Context, cgroupName string) (*CgroupHandle, error) {
22+
return newNoopHandle(cgroupName), nil
23+
}
24+
25+
// newNoopHandle creates a CgroupHandle that performs no real cgroup operations.
26+
// GetFD returns NoCgroupFD, GetStats returns (nil, nil), Remove is a no-op.
27+
func newNoopHandle(cgroupName string) *CgroupHandle {
28+
return &CgroupHandle{
29+
cgroupName: cgroupName,
30+
noop: true,
31+
}
32+
}

packages/orchestrator/pkg/sandbox/hoststats.go

Lines changed: 4 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ import (
88
"go.uber.org/zap"
99

1010
"github.com/e2b-dev/infra/packages/clickhouse/pkg/hoststats"
11-
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/fc"
1211
"github.com/e2b-dev/infra/packages/shared/pkg/logger"
1312
)
1413

@@ -17,36 +16,18 @@ import (
1716
func initializeHostStatsCollector(
1817
ctx context.Context,
1918
sbx *Sandbox,
20-
fcHandle *fc.Process,
2119
runtime RuntimeMetadata,
2220
config *Config,
2321
hostStatsDelivery hoststats.Delivery,
2422
samplingInterval time.Duration,
2523
) {
26-
if hostStatsDelivery == nil {
27-
return
28-
}
29-
30-
firecrackerPID, err := fcHandle.Pid()
31-
if err != nil {
32-
logger.L().Error(ctx, "failed to get firecracker PID for host stats",
33-
logger.WithSandboxID(runtime.SandboxID),
34-
zap.Error(err))
35-
36-
return
37-
}
38-
3924
teamID, err := uuid.Parse(runtime.TeamID)
4025
if err != nil {
41-
logger.L().Error(ctx, "error parsing team ID", logger.WithTeamID(runtime.TeamID), zap.Error(err))
26+
logger.L().Warn(ctx, "invalid team ID for host stats, using zero UUID",
27+
logger.WithTeamID(runtime.TeamID), zap.Error(err))
4228
}
4329

44-
var cgroupStats CgroupStatsFunc
45-
if sbx.cgroupHandle != nil {
46-
cgroupStats = sbx.cgroupHandle.GetStats
47-
}
48-
49-
collector, err := NewHostStatsCollector(
30+
collector := NewHostStatsCollector(
5031
HostStatsMetadata{
5132
SandboxID: runtime.SandboxID,
5233
ExecutionID: runtime.ExecutionID,
@@ -57,18 +38,10 @@ func initializeHostStatsCollector(
5738
MemoryMB: config.RamMB,
5839
SandboxType: runtime.SandboxType,
5940
},
60-
int32(firecrackerPID),
6141
hostStatsDelivery,
6242
samplingInterval,
63-
cgroupStats,
43+
sbx.cgroupHandle.GetStats,
6444
)
65-
if err != nil {
66-
logger.L().Error(ctx, "failed to create host stats collector",
67-
logger.WithSandboxID(runtime.SandboxID),
68-
zap.Error(err))
69-
70-
return
71-
}
7245

7346
sbx.hostStatsCollector = collector
7447

0 commit comments

Comments
 (0)