Skip to content

Commit c5e3b24

Browse files
Merge pull request #50 from WarpBuilds/obs
Observability improvements and github log collector
2 parents 2b1b6ac + c4786c6 commit c5e3b24

File tree

256 files changed

+5795
-58584
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

256 files changed

+5795
-58584
lines changed

.github/workflows/release-testing.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ name: Branch Build and Upload to R2
33
on:
44
push:
55
branches:
6+
- "obs"
67
- feat-transparent-cache
78

89
env:

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,13 @@
11
# warpbuild-agent
2+
23
Application for runner lifecycle management
4+
5+
## Generating the API client from the OpenAPI specification in backend-core
6+
7+
Use Redocly to generate the API client.
8+
9+
```bash
10+
./scripts/generate-from-openapi-redocly.sh pkg/warpbuild --go 1.0.0
11+
```
12+
13+
The filters defined in `redocly.yaml` determine which parts of the OpenAPI specification are included in the generated API client.

pkg/telemetry/binaries/windows/amd64/README.md

Lines changed: 0 additions & 1 deletion
This file was deleted.

pkg/telemetry/manager.go

Lines changed: 116 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ import (
88
"path/filepath"
99
"runtime"
1010
"sync"
11-
"syscall"
1211
"text/template"
1312
"time"
1413

@@ -25,8 +24,9 @@ type TelemetryManager struct {
2524
mu sync.RWMutex
2625

2726
// Components
28-
receiver *uploader.Receiver
29-
s3Uploader *uploader.S3Uploader
27+
receiver *uploader.Receiver
28+
s3Uploader *uploader.S3Uploader
29+
otelCollectorCmd *exec.Cmd
3030

3131
// Configuration
3232
port int
@@ -78,6 +78,10 @@ func (tm *TelemetryManager) Start() error {
7878
tm.wg.Add(1)
7979
go tm.startOtelCollector()
8080

81+
// Start telemetry status monitoring
82+
tm.wg.Add(1)
83+
go tm.monitorTelemetryStatus()
84+
8185
log.Logger().Infof("Telemetry manager started successfully")
8286
return nil
8387
}
@@ -129,26 +133,16 @@ func (tm *TelemetryManager) startOtelCollector() {
129133

130134
log.Logger().Infof("OpenTelemetry Collector configuration written successfully")
131135

132-
// Channel to signal when the application should terminate
133-
done := make(chan bool, 1)
134-
135-
// Start OpenTelemetry Collector Contrib
136-
go func() {
137-
defer tm.handlePanic()
138-
log.Logger().Infof("Launching OpenTelemetry Collector in background...")
139-
tm.runOtelCollector(collectorPath, done)
140-
}()
136+
log.Logger().Infof("Launching OpenTelemetry Collector in background...")
141137

142-
// Wait for context cancellation
143-
<-tm.ctx.Done()
144-
log.Logger().Infof("Context cancelled, stopping OTEL collector...")
138+
// Run the OTEL collector; this call blocks until the collector fails to start, the process exits, or the context is cancelled
139+
tm.runOtelCollector(collectorPath)
145140

146-
// Signal the OpenTelemetry Collector process to terminate
147-
done <- true
141+
log.Logger().Infof("OTEL collector goroutine exited")
148142
}
149143

150144
// runOtelCollector runs the OTEL collector process
151-
func (tm *TelemetryManager) runOtelCollector(collectorPath string, done chan bool) {
145+
func (tm *TelemetryManager) runOtelCollector(collectorPath string) {
152146
configPath := tm.getConfigFilePath()
153147
log.Logger().Infof("Starting OpenTelemetry Collector with config: %s", configPath)
154148

@@ -160,29 +154,57 @@ func (tm *TelemetryManager) runOtelCollector(collectorPath string, done chan boo
160154

161155
log.Logger().Infof("OpenTelemetry Collector command: %s --config %s", collectorPath, configPath)
162156

163-
err := cmd.Start()
164-
if err != nil {
157+
if err := cmd.Start(); err != nil {
165158
log.Logger().Errorf("Failed to start OpenTelemetry Collector: %v", err)
166159
return
167160
}
168161

162+
// Store the command reference so we can stop it later
163+
tm.mu.Lock()
164+
tm.otelCollectorCmd = cmd
165+
tm.mu.Unlock()
166+
169167
log.Logger().Infof("OpenTelemetry Collector started with PID: %d", cmd.Process.Pid)
170168

169+
// Channel to track when cmd.Wait() completes
170+
waitDone := make(chan error, 1)
171+
172+
// Wait for the process to exit in a separate goroutine
171173
go func() {
172-
<-done
173-
log.Logger().Infof("Signaling OpenTelemetry Collector to terminate...")
174-
if err := cmd.Process.Signal(syscall.SIGTERM); err != nil {
175-
log.Logger().Errorf("Failed to terminate OpenTelemetry Collector: %v", err)
176-
}
174+
waitDone <- cmd.Wait()
177175
}()
178176

179-
go func() {
180-
if err := cmd.Wait(); err != nil {
177+
// Wait for either context cancellation or process exit
178+
select {
179+
case <-tm.ctx.Done():
180+
log.Logger().Infof("Context cancelled, stopping OTEL collector (PID: %d)...", cmd.Process.Pid)
181+
182+
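// Kill the process outright instead of sending SIGTERM as before; Process.Kill is portable across operating systems but gives the collector no chance to shut down gracefully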
183+
if err := cmd.Process.Kill(); err != nil {
184+
log.Logger().Errorf("Failed to kill OpenTelemetry Collector process: %v", err)
185+
}
186+
187+
// Wait for the process to actually exit (with timeout)
188+
select {
189+
case err := <-waitDone:
190+
if err != nil {
191+
log.Logger().Infof("OpenTelemetry Collector terminated with error: %v", err)
192+
} else {
193+
log.Logger().Infof("OpenTelemetry Collector terminated successfully")
194+
}
195+
case <-time.After(5 * time.Second):
196+
log.Logger().Warnf("Timeout waiting for OpenTelemetry Collector to exit after 5 seconds")
197+
}
198+
199+
case err := <-waitDone:
200+
if err != nil {
181201
log.Logger().Errorf("OpenTelemetry Collector exited with error: %v", err)
182202
} else {
183203
log.Logger().Infof("OpenTelemetry Collector exited successfully")
184204
}
185-
}()
205+
}
206+
207+
log.Logger().Infof("OpenTelemetry Collector process handler completed")
186208
}
187209

188210
// handlePanic handles panics in goroutines
@@ -311,3 +333,69 @@ func (tm *TelemetryManager) getOtelCollectorOutputFilePath(isMetrics bool) strin
311333
}
312334
return filepath.Join(tm.baseDirectory, "otel-out.log")
313335
}
336+
337+
// monitorTelemetryStatus polls the runner allocation details API once per second and cancels the telemetry manager's context as soon as the API reports telemetry as disabled
338+
func (tm *TelemetryManager) monitorTelemetryStatus() {
339+
defer tm.wg.Done()
340+
341+
log.Logger().Infof("Starting telemetry status monitoring...")
342+
343+
ticker := time.NewTicker(1 * time.Second)
344+
defer ticker.Stop()
345+
346+
for {
347+
select {
348+
case <-ticker.C:
349+
// Poll the API to check telemetry status
350+
allocationDetails, resp, err := tm.warpbuildAPI.V1RunnerInstanceAPI.
351+
GetRunnerInstanceAllocationDetails(tm.ctx, tm.runnerID).
352+
XPOLLINGSECRET(tm.pollingSecret).
353+
Execute()
354+
355+
if err != nil {
356+
log.Logger().Debugf("Failed to get runner instance allocation details: %v", err)
357+
if resp != nil {
358+
log.Logger().Debugf("Response: %+v", resp)
359+
}
360+
continue
361+
}
362+
363+
if allocationDetails == nil {
364+
log.Logger().Debugf("No runner instance allocation details found")
365+
continue
366+
}
367+
368+
// Act only when the response carries the telemetry flag and it is false; a response without the flag leaves telemetry running
369+
if allocationDetails.HasTelemetryEnabled() {
370+
telemetryEnabled := allocationDetails.GetTelemetryEnabled()
371+
372+
if !telemetryEnabled {
373+
log.Logger().Infof("Telemetry has been disabled via API. Stopping telemetry collection...")
374+
tm.stopOtelCollector()
375+
376+
// Cancel the context to stop the entire telemetry manager
377+
tm.cancel()
378+
return
379+
}
380+
}
381+
382+
case <-tm.ctx.Done():
383+
log.Logger().Infof("Context cancelled, stopping telemetry status monitoring...")
384+
return
385+
}
386+
}
387+
}
388+
389+
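// stopOtelCollector only logs that the OTEL collector is being stopped; it does not terminate the process itself, so the caller must cancel the context for runOtelCollector to kill it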
390+
func (tm *TelemetryManager) stopOtelCollector() {
391+
tm.mu.RLock()
392+
defer tm.mu.RUnlock()
393+
394+
if tm.otelCollectorCmd == nil || tm.otelCollectorCmd.Process == nil {
395+
log.Logger().Infof("OTEL collector process not running")
396+
return
397+
}
398+
399+
log.Logger().Infof("Stopping OTEL collector process (PID: %d)...", tm.otelCollectorCmd.Process.Pid)
400+
// The actual termination will be handled by the context cancellation in runOtelCollector
401+
}

pkg/telemetry/otel-collector-config.tmpl

Lines changed: 77 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,30 @@ receivers:
1414
- 'C:\warpbuilds\warpbuild-agentd-restarter.stderr.log'
1515
- 'C:\warpbuilds\warpbuild-agentd.stdout.log'
1616
- 'C:\warpbuilds\warpbuild-agentd.stderr.log'
17+
- 'C:\warpbuild-agentd-debug.log'
1718
- 'C:\warpbuilds\warpbuild-telemetryd.stdout.log'
1819
- 'C:\warpbuilds\warpbuild-telemetryd.stderr.log'
20+
- 'C:\warpbuilds\warpbuild-proxyd.stdout.log'
21+
- 'C:\warpbuilds\warpbuild-proxyd.stderr.log'
1922
start_at: end
2023
max_log_size: 200KiB
2124
include_file_name: true
2225
include_file_path: true
2326
poll_interval: 20s
27+
28+
# GitHub Actions job logs and diagnostic logs
29+
filelog/gha_logs:
30+
include:
31+
- 'C:\warpbuilds\runner\_diag\*.log'
32+
- 'C:\warpbuilds\runner\_diag\**\*.log'
33+
start_at: end
34+
max_log_size: 500KiB
35+
include_file_name: true
36+
include_file_path: true
37+
include_file_path_resolved: true
38+
poll_interval: 5s
39+
attributes:
40+
log.type: github_actions
2441
{{- else}}
2542
filelog:
2643
{{- if eq .OS "darwin"}}
@@ -33,6 +50,24 @@ receivers:
3350
include_file_name: false
3451
include_file_path: false
3552
poll_interval: 1s
53+
54+
# GitHub Actions job logs and diagnostic logs
55+
filelog/gha_logs:
56+
include:
57+
{{- if eq .OS "darwin"}}
58+
- '/Users/runner/.warpbuild/github-runner/runner-app-new/_diag/*.log'
59+
- '/Users/runner/.warpbuild/github-runner/runner-app-new/_diag/**/*.log'
60+
{{- else if eq .OS "linux"}}
61+
- '/runner/_diag/*.log'
62+
- '/runner/_diag/**/*.log'
63+
{{- end}}
64+
start_at: 'end'
65+
max_log_size: 500KiB
66+
include_file_name: true
67+
include_file_path: true
68+
poll_interval: 10s
69+
attributes:
70+
log.type: github_actions
3671
{{- end}}
3772

3873
hostmetrics:
@@ -65,6 +100,23 @@ receivers:
65100
enabled: false
66101
system.network.dropped:
67102
enabled: false
103+
disk:
104+
metrics:
105+
# collect disk throughput and IOPS metrics
106+
system.disk.io:
107+
enabled: true
108+
system.disk.operations:
109+
enabled: true
110+
system.disk.io_time:
111+
enabled: true
112+
system.disk.operation_time:
113+
enabled: true
114+
system.disk.pending_operations:
115+
enabled: false
116+
system.disk.merged:
117+
enabled: false
118+
system.disk.weighted_io_time:
119+
enabled: false
68120
filesystem:
69121
# Drop pseudo/virtual/container overlay mounts; keep real disks
70122
exclude_fs_types:
@@ -93,7 +145,7 @@ receivers:
93145

94146
{{- if eq .OS "windows"}}
95147
windowsperfcounters/processor:
96-
collection_interval: 30s
148+
collection_interval: 1s
97149
metrics:
98150
system.cpu.time:
99151
description: percentage of cpu time
@@ -122,7 +174,7 @@ processors:
122174
timeout: 30s
123175
send_batch_size: 10_000_000
124176

125-
# Keep ONLY these 4 metrics in the pipeline
177+
# Keep ONLY these metrics in the pipeline
126178
filter/only_needed:
127179
metrics:
128180
include:
@@ -132,24 +184,36 @@ processors:
132184
- system.memory.utilization
133185
- system.filesystem.utilization
134186
- system.network.io
187+
- system.disk.io
188+
- system.disk.operations
189+
- system.disk.io_time
190+
- system.disk.operation_time
135191

136192
filter/drop_non_internet_nics:
137193
error_mode: ignore
138194
metrics:
139195
datapoint:
140196
- 'metric.name == "system.network.io" and IsMatch(attributes["device"], "^(lo|lo0|docker.*|cni.*|veth.*|br-.*|virbr.*|wg.*|tun.*|tap.*)$")'
141197

142-
# Convert cumulative network counters -> delta
198+
# Convert cumulative counters -> delta
143199
cumulativetodelta:
144200
include:
145201
match_type: strict
146202
metrics:
147203
- system.network.io
204+
- system.disk.io
205+
- system.disk.operations
206+
- system.disk.io_time
207+
- system.disk.operation_time
148208

149209
# Convert delta -> per-second rate (required list syntax)
150210
deltatorate:
151211
metrics:
152212
- system.network.io
213+
- system.disk.io
214+
- system.disk.operations
215+
- system.disk.io_time
216+
- system.disk.operation_time
153217

154218
transform/normalize:
155219
metric_statements:
@@ -243,16 +307,25 @@ exporters:
243307
metrics_endpoint: "http://localhost:{{.Port}}/v1/metrics"
244308
encoding: "json"
245309
compression: "none"
310+
311+
otlphttp/gha_logs:
312+
logs_endpoint: "http://localhost:{{.Port}}/v1/gha-logs"
313+
encoding: "json"
314+
compression: "none"
246315

247316
service:
248317
telemetry:
249318
metrics:
250319
level: none
251320
pipelines:
252321
logs:
253-
receivers: [{{- if eq .OS "windows"}}windowseventlog/application, windowseventlog/security, filelog/services{{- else if eq .OS "darwin"}}filelog{{- else if eq .OS "linux"}}filelog{{- end}}]
322+
receivers: [{{- if eq .OS "windows"}}windowseventlog/application, windowseventlog/security, filelog/services{{- else }}filelog{{- end}}]
254323
processors: [batch/logs]
255324
exporters: [otlphttp] # send logs to your HTTP service
325+
logs/gha_logs:
326+
receivers: [filelog/gha_logs]
327+
processors: [batch/logs]
328+
exporters: [otlphttp/gha_logs]
256329
metrics:
257330
receivers: [hostmetrics{{- if eq .OS "windows"}}, windowsperfcounters/processor{{- end}}]
258331
processors:

0 commit comments

Comments
 (0)