Skip to content

Commit f90ce78

Browse files
authored
fix(runner): make metrics snapshot intervals configurable (#3460)
Signed-off-by: Luka Brecic <luka.brecic3@gmail.com>
1 parent 767d312 commit f90ce78

File tree

5 files changed

+112
-85
lines changed

5 files changed

+112
-85
lines changed

apps/docs/src/content/docs/en/oss-deployment.mdx

Lines changed: 27 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -214,31 +214,33 @@ Below is a full list of environment variables with their default values:
214214

215215
### Runner
216216

217-
| Variable | Type | Default Value | Description |
218-
| ----------------------------- | ------- | --------------------------------- | ---------------------------------------------------- |
219-
| `DAYTONA_API_URL` | string | `http://api:3000/api` | Daytona API URL |
220-
| `DAYTONA_RUNNER_TOKEN` | string | `secret_api_token` | Runner API authentication token |
221-
| `VERSION` | string | `0.0.1` | Runner service version |
222-
| `ENVIRONMENT` | string | `development` | Application environment |
223-
| `API_PORT` | number | `3003` | Runner API service port |
224-
| `LOG_FILE_PATH` | string | `/home/daytona/runner/runner.log` | Path to runner log file |
225-
| `RESOURCE_LIMITS_DISABLED` | boolean | `true` | Disable resource limits for sandboxes |
226-
| `AWS_ENDPOINT_URL` | string | `http://minio:9000` | AWS S3-compatible storage endpoint |
227-
| `AWS_REGION` | string | `us-east-1` | AWS region |
228-
| `AWS_ACCESS_KEY_ID` | string | `minioadmin` | AWS access key ID |
229-
| `AWS_SECRET_ACCESS_KEY` | string | `minioadmin` | AWS secret access key |
230-
| `AWS_DEFAULT_BUCKET` | string | `daytona` | AWS default bucket name |
231-
| `DAEMON_START_TIMEOUT_SEC` | number | `60` | Daemon start timeout in seconds |
232-
| `SANDBOX_START_TIMEOUT_SEC` | number | `30` | Sandbox start timeout in seconds |
233-
| `USE_SNAPSHOT_ENTRYPOINT` | boolean | `false` | Use snapshot entrypoint for sandbox |
234-
| `RUNNER_DOMAIN` | string | (none) | Runner domain name (hostname for runner URLs) |
235-
| `VOLUME_CLEANUP_INTERVAL_SEC` | number | `30` | Volume cleanup interval in seconds (minimum: 10s) |
236-
| `POLL_TIMEOUT` | string | `30s` | Poller service timeout duration (e.g., `30s`, `1m`) |
237-
| `POLL_LIMIT` | number | `10` | Maximum poll attempts per request (min: 1, max: 100) |
238-
| `COLLECTOR_WINDOW_SIZE` | number | `60` | Metrics collector window size (number of samples) |
239-
| `HEALTHCHECK_INTERVAL` | string | `30s` | Interval between health checks (minimum: 10s) |
240-
| `HEALTHCHECK_TIMEOUT` | string | `10s` | Health check timeout duration |
241-
| `API_VERSION` | number | `2` | Runner API version (default: 2) |
217+
| Variable | Type | Default Value | Description |
218+
| --------------------------------------- | ------- | --------------------------------- | -------------------------------------------------------------- |
219+
| `DAYTONA_API_URL` | string | `http://api:3000/api` | Daytona API URL |
220+
| `DAYTONA_RUNNER_TOKEN` | string | `secret_api_token` | Runner API authentication token |
221+
| `VERSION` | string | `0.0.1` | Runner service version |
222+
| `ENVIRONMENT` | string | `development` | Application environment |
223+
| `API_PORT` | number | `3003` | Runner API service port |
224+
| `LOG_FILE_PATH` | string | `/home/daytona/runner/runner.log` | Path to runner log file |
225+
| `RESOURCE_LIMITS_DISABLED` | boolean | `true` | Disable resource limits for sandboxes |
226+
| `AWS_ENDPOINT_URL` | string | `http://minio:9000` | AWS S3-compatible storage endpoint |
227+
| `AWS_REGION` | string | `us-east-1` | AWS region |
228+
| `AWS_ACCESS_KEY_ID` | string | `minioadmin` | AWS access key ID |
229+
| `AWS_SECRET_ACCESS_KEY` | string | `minioadmin` | AWS secret access key |
230+
| `AWS_DEFAULT_BUCKET` | string | `daytona` | AWS default bucket name |
231+
| `DAEMON_START_TIMEOUT_SEC` | number | `60` | Daemon start timeout in seconds |
232+
| `SANDBOX_START_TIMEOUT_SEC` | number | `30` | Sandbox start timeout in seconds |
233+
| `USE_SNAPSHOT_ENTRYPOINT` | boolean | `false` | Use snapshot entrypoint for sandbox |
234+
| `RUNNER_DOMAIN` | string | (none) | Runner domain name (hostname for runner URLs) |
235+
| `VOLUME_CLEANUP_INTERVAL_SEC` | number | `30` | Volume cleanup interval in seconds (minimum: 10s) |
236+
| `COLLECTOR_WINDOW_SIZE` | number | `60` | Metrics collector window size (number of samples) |
237+
| `CPU_USAGE_SNAPSHOT_INTERVAL` | string | `5s` | CPU usage snapshot interval duration (minimum: 1s) |
238+
| `ALLOCATED_RESOURCES_SNAPSHOT_INTERVAL` | string | `5s` | Allocated resources snapshot interval (minimum: 1s) |
239+
| `POLL_TIMEOUT` | string | `30s` | Poller service timeout duration (e.g., `30s`, `1m`) |
240+
| `POLL_LIMIT` | number | `10` | Maximum poll attempts per request (min: 1, max: 100) |
241+
| `HEALTHCHECK_INTERVAL` | string | `30s` | Interval between health checks (minimum: 10s) |
242+
| `HEALTHCHECK_TIMEOUT` | string | `10s` | Health check timeout duration |
243+
| `API_VERSION` | number | `2` | Runner API version (default: 2) |
242244

243245
### SSH Gateway
244246

apps/runner/cmd/runner/config/config.go

Lines changed: 31 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -17,35 +17,37 @@ import (
1717
)
1818

1919
type Config struct {
20-
DaytonaApiUrl string `envconfig:"DAYTONA_API_URL"`
21-
ApiToken string `envconfig:"DAYTONA_RUNNER_TOKEN"`
22-
ApiPort int `envconfig:"API_PORT"`
23-
TLSCertFile string `envconfig:"TLS_CERT_FILE"`
24-
TLSKeyFile string `envconfig:"TLS_KEY_FILE"`
25-
EnableTLS bool `envconfig:"ENABLE_TLS"`
26-
CacheRetentionDays int `envconfig:"CACHE_RETENTION_DAYS"`
27-
Environment string `envconfig:"ENVIRONMENT"`
28-
ContainerRuntime string `envconfig:"CONTAINER_RUNTIME"`
29-
ContainerNetwork string `envconfig:"CONTAINER_NETWORK"`
30-
LogFilePath string `envconfig:"LOG_FILE_PATH"`
31-
AWSRegion string `envconfig:"AWS_REGION"`
32-
AWSEndpointUrl string `envconfig:"AWS_ENDPOINT_URL"`
33-
AWSAccessKeyId string `envconfig:"AWS_ACCESS_KEY_ID"`
34-
AWSSecretAccessKey string `envconfig:"AWS_SECRET_ACCESS_KEY"`
35-
AWSDefaultBucket string `envconfig:"AWS_DEFAULT_BUCKET"`
36-
ResourceLimitsDisabled bool `envconfig:"RESOURCE_LIMITS_DISABLED"`
37-
DaemonStartTimeoutSec int `envconfig:"DAEMON_START_TIMEOUT_SEC"`
38-
SandboxStartTimeoutSec int `envconfig:"SANDBOX_START_TIMEOUT_SEC"`
39-
UseSnapshotEntrypoint bool `envconfig:"USE_SNAPSHOT_ENTRYPOINT"`
40-
Domain string `envconfig:"RUNNER_DOMAIN" validate:"omitempty,hostname|ip"`
41-
VolumeCleanupIntervalSec int `envconfig:"VOLUME_CLEANUP_INTERVAL_SEC" default:"30" validate:"min=10"`
42-
PollTimeout time.Duration `envconfig:"POLL_TIMEOUT" default:"30s"`
43-
PollLimit int `envconfig:"POLL_LIMIT" default:"10" validate:"min=1,max=100"`
44-
CollectorWindowSize int `envconfig:"COLLECTOR_WINDOW_SIZE" default:"60" validate:"min=1"`
45-
HealthcheckInterval time.Duration `envconfig:"HEALTHCHECK_INTERVAL" default:"30s" validate:"min=10s"`
46-
HealthcheckTimeout time.Duration `envconfig:"HEALTHCHECK_TIMEOUT" default:"10s"`
47-
BackupTimeoutMin int `envconfig:"BACKUP_TIMEOUT_MIN" default:"60" validate:"min=1"`
48-
ApiVersion int `envconfig:"API_VERSION" default:"2"`
20+
DaytonaApiUrl string `envconfig:"DAYTONA_API_URL"`
21+
ApiToken string `envconfig:"DAYTONA_RUNNER_TOKEN"`
22+
ApiPort int `envconfig:"API_PORT"`
23+
TLSCertFile string `envconfig:"TLS_CERT_FILE"`
24+
TLSKeyFile string `envconfig:"TLS_KEY_FILE"`
25+
EnableTLS bool `envconfig:"ENABLE_TLS"`
26+
CacheRetentionDays int `envconfig:"CACHE_RETENTION_DAYS"`
27+
Environment string `envconfig:"ENVIRONMENT"`
28+
ContainerRuntime string `envconfig:"CONTAINER_RUNTIME"`
29+
ContainerNetwork string `envconfig:"CONTAINER_NETWORK"`
30+
LogFilePath string `envconfig:"LOG_FILE_PATH"`
31+
AWSRegion string `envconfig:"AWS_REGION"`
32+
AWSEndpointUrl string `envconfig:"AWS_ENDPOINT_URL"`
33+
AWSAccessKeyId string `envconfig:"AWS_ACCESS_KEY_ID"`
34+
AWSSecretAccessKey string `envconfig:"AWS_SECRET_ACCESS_KEY"`
35+
AWSDefaultBucket string `envconfig:"AWS_DEFAULT_BUCKET"`
36+
ResourceLimitsDisabled bool `envconfig:"RESOURCE_LIMITS_DISABLED"`
37+
DaemonStartTimeoutSec int `envconfig:"DAEMON_START_TIMEOUT_SEC"`
38+
SandboxStartTimeoutSec int `envconfig:"SANDBOX_START_TIMEOUT_SEC"`
39+
UseSnapshotEntrypoint bool `envconfig:"USE_SNAPSHOT_ENTRYPOINT"`
40+
Domain string `envconfig:"RUNNER_DOMAIN" validate:"omitempty,hostname|ip"`
41+
VolumeCleanupIntervalSec int `envconfig:"VOLUME_CLEANUP_INTERVAL_SEC" default:"30" validate:"min=10"`
42+
PollTimeout time.Duration `envconfig:"POLL_TIMEOUT" default:"30s"`
43+
PollLimit int `envconfig:"POLL_LIMIT" default:"10" validate:"min=1,max=100"`
44+
CollectorWindowSize int `envconfig:"COLLECTOR_WINDOW_SIZE" default:"60" validate:"min=1"`
45+
CPUUsageSnapshotInterval time.Duration `envconfig:"CPU_USAGE_SNAPSHOT_INTERVAL" default:"5s" validate:"min=1s"`
46+
AllocatedResourcesSnapshotInterval time.Duration `envconfig:"ALLOCATED_RESOURCES_SNAPSHOT_INTERVAL" default:"5s" validate:"min=1s"`
47+
HealthcheckInterval time.Duration `envconfig:"HEALTHCHECK_INTERVAL" default:"30s" validate:"min=10s"`
48+
HealthcheckTimeout time.Duration `envconfig:"HEALTHCHECK_TIMEOUT" default:"10s"`
49+
BackupTimeoutMin int `envconfig:"BACKUP_TIMEOUT_MIN" default:"60" validate:"min=1"`
50+
ApiVersion int `envconfig:"API_VERSION" default:"2"`
4951
}
5052

5153
var DEFAULT_API_PORT int = 8080

apps/runner/cmd/runner/main.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,13 @@ func main() {
145145
slogLogger := newSLogger()
146146

147147
// Create metrics collector
148-
metricsCollector := metrics.NewCollector(slogLogger, dockerClient, cfg.CollectorWindowSize)
148+
metricsCollector := metrics.NewCollector(metrics.CollectorConfig{
149+
Logger: slogLogger,
150+
Docker: dockerClient,
151+
WindowSize: cfg.CollectorWindowSize,
152+
CPUUsageSnapshotInterval: cfg.CPUUsageSnapshotInterval,
153+
AllocatedResourcesSnapshotInterval: cfg.AllocatedResourcesSnapshotInterval,
154+
})
149155
metricsCollector.Start(ctx)
150156

151157
_ = runner.GetInstance(&runner.RunnerInstanceConfig{

apps/runner/internal/metrics/collector.go

Lines changed: 37 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,35 @@ import (
2323
"github.com/shirou/gopsutil/v4/mem"
2424
)
2525

26+
// CollectorConfig holds configuration for the metrics collector.
// It is passed by value to NewCollector, which copies each field into the
// Collector it returns.
27+
type CollectorConfig struct {
28+
Logger *slog.Logger // base logger; NewCollector derives a logger tagged component=metrics from it
29+
Docker *docker.DockerClient // Docker client stored on the collector
30+
WindowSize int // number of slots in the CPU usage ring buffer (passed to ring.New)
31+
CPUUsageSnapshotInterval time.Duration // ticker period used by snapshotCPUUsage
32+
AllocatedResourcesSnapshotInterval time.Duration // ticker period used by snapshotAllocatedResources
33+
}
34+
35+
// Collector collects system metrics.
// It keeps a sliding window of CPU samples in a ring buffer and the most
// recent allocated-resource totals, each refreshed on its own ticker.
36+
type Collector struct {
37+
docker *docker.DockerClient
38+
log *slog.Logger
39+
40+
// CPU usage - ring buffer for sliding window
41+
cpuRing *ring.Ring
42+
cpuMutex sync.RWMutex // presumably guards cpuRing; confirm against the snapshot/collect methods
43+
44+
resourcesMutex sync.RWMutex // presumably guards the allocated*/startedSandboxCount fields below; confirm
45+
allocatedCPU float32
46+
allocatedMemoryGiB float32
47+
allocatedDiskGiB float32
48+
startedSandboxCount float32
49+
50+
// Intervals between metric snapshots. These are time.Duration values
// (not a number of seconds) and are used directly as ticker periods.
51+
cpuUsageSnapshotInterval time.Duration
52+
allocatedResourcesSnapshotInterval time.Duration
53+
}
54+
2655
// CPUSnapshot represents a point-in-time CPU measurement
2756
type CPUSnapshot struct {
2857
timestamp time.Time
@@ -45,33 +74,14 @@ type Metrics struct {
4574
StartedSandboxCount float32
4675
}
4776

48-
// Collector collects system metrics
49-
type Collector struct {
50-
docker *docker.DockerClient
51-
log *slog.Logger
52-
53-
// CPU usage - ring buffer for sliding window
54-
cpuRing *ring.Ring
55-
cpuMutex sync.RWMutex
56-
57-
resourcesMutex sync.RWMutex
58-
allocatedCPU float32
59-
allocatedMemoryGiB float32
60-
allocatedDiskGiB float32
61-
startedSandboxCount float32
62-
}
63-
6477
// NewCollector creates a new metrics collector
65-
func NewCollector(logger *slog.Logger, docker *docker.DockerClient, windowSize int) *Collector {
66-
if windowSize <= 0 {
67-
// Default to size 60
68-
windowSize = 60
69-
}
70-
78+
func NewCollector(cfg CollectorConfig) *Collector {
7179
return &Collector{
72-
log: logger.With(slog.String("component", "metrics")),
73-
docker: docker,
74-
cpuRing: ring.New(windowSize),
80+
log: cfg.Logger.With(slog.String("component", "metrics")),
81+
docker: cfg.Docker,
82+
cpuRing: ring.New(cfg.WindowSize),
83+
cpuUsageSnapshotInterval: cfg.CPUUsageSnapshotInterval,
84+
allocatedResourcesSnapshotInterval: cfg.AllocatedResourcesSnapshotInterval,
7585
}
7686
}
7787

@@ -169,7 +179,7 @@ func (c *Collector) collect(ctx context.Context) (*Metrics, error) {
169179

170180
// snapshotCPUUsage runs in a background goroutine, continuously monitoring CPU usage
171181
func (c *Collector) snapshotCPUUsage(ctx context.Context) {
172-
ticker := time.NewTicker(5 * time.Second)
182+
ticker := time.NewTicker(c.cpuUsageSnapshotInterval)
173183
defer ticker.Stop()
174184

175185
for {
@@ -224,7 +234,7 @@ func (c *Collector) collectCPUUsageAverage() (float64, error) {
224234
}
225235

226236
func (c *Collector) snapshotAllocatedResources(ctx context.Context) {
227-
ticker := time.NewTicker(5 * time.Second)
237+
ticker := time.NewTicker(c.allocatedResourcesSnapshotInterval)
228238
defer ticker.Stop()
229239

230240
for {

0 commit comments

Comments
 (0)