Skip to content

Commit 6ce7080

Browse files
authored
parameterize graph labels and generalize metrics (#24)
2 parents 306d644 + 4a0af92 commit 6ce7080

File tree

14 files changed

+185
-71
lines changed

14 files changed

+185
-71
lines changed

.github/workflows/ci.yml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
name: CI
2+
3+
on:
4+
push:
5+
branches: [master]
6+
pull_request:
7+
branches: [master]
8+
9+
jobs:
10+
test:
11+
runs-on: ubuntu-latest
12+
steps:
13+
- uses: actions/checkout@v4
14+
15+
- uses: cachix/install-nix-action@v31
16+
with:
17+
nix_path: nixpkgs=channel:nixos-unstable
18+
19+
- uses: cachix/cachix-action@v16
20+
with:
21+
name: devenv
22+
23+
- name: Run tests
24+
run: nix develop --command make test-ci

Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@ test:
44
staticcheck ./...
55
go test -count=1 --race -v ./...
66

7+
test-ci:
8+
staticcheck ./...
9+
go test -short -count=1 --race -v ./...
10+
711
build:
812
go build -o out/wasgehtd cmd/wasgehtd/main.go
913

README.md

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
# Was Geht
22

3+
[![CI](https://github.com/kylerisse/wasgeht/actions/workflows/ci.yml/badge.svg)](https://github.com/kylerisse/wasgeht/actions/workflows/ci.yml)
4+
35
## Overview
46

5-
**Was Geht** is a small Go application that pings a list of hosts at regular intervals, tracks their availability (UP or DOWN), and records the latency in a Round Robin Database (RRD). A lightweight web interface serves host status information and interactive graphs of the recorded latency.
7+
**Was Geht** is a small Go application that monitors a list of hosts at regular intervals, tracks their availability and metrics, and records the data in Round Robin Databases (RRD). A lightweight web interface serves host status information and interactive graphs of the recorded metrics.
68

79
## Features
810

11+
- **Extensible Check System**: Modular check types (ping, with more planned) via a Registry/Factory pattern.
912
- **Ping Monitoring**: Sends ICMP Echo Requests to check host availability.
1013
- **Latency Logging**: Uses RRD to store latency data over time.
1114
- **Graph Generation**: Generates historical latency graphs (15 minutes, 4 hours, 8 hours, etc.) for each host.
@@ -27,7 +30,7 @@ nix develop
2730

2831
You will need the experimental features `flakes` and `nix-command` enabled.
2932

30-
This loads the environment specified in `shell.nix`:
33+
This loads the environment specified in `flake.nix`:
3134

3235
- Go (for building),
3336
- gnumake (for Makefile),
@@ -41,7 +44,7 @@ Once inside the shell, you can run the usual make commands
4144

4245
Ensure the following are installed:
4346

44-
- **Go** (1.23+ recommended)
47+
- **Go** (1.25+ recommended)
4548
- **air** (for live reload during development, optional)
4649
- **rrdtool** and **unixtools ping** must be installed and available on the system path.
4750
- Basic Unix tools for building and running (`make`, etc.).
@@ -126,6 +129,7 @@ Each check type gets its own RRD file (e.g., `ping.rrd`), making it straightforw
126129

127130
## Makefile Targets
128131

132+
- **test**: Runs staticcheck and `go test` with race detection.
129133
- **build**: Compiles the Go code and produces `wasgehtd`.
130134
- **deps**: Verifies module dependencies and updates `go.mod` and `go.sum`.
131135
- **clean**: Removes the `wasgehtd` binary and any generated graphs.

pkg/check/check.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,12 @@ type MetricDef struct {
5959

6060
// DSName is the RRD data source name (e.g. "latency").
6161
DSName string
62+
63+
// Label is a human-readable label for graphs and display (e.g. "latency").
64+
Label string
65+
66+
// Unit is the unit of measurement for graphs and display (e.g. "ms").
67+
Unit string
6268
}
6369

6470
// Descriptor declares static metadata about a check type, including

pkg/check/check_test.go

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ func failingFactory(config map[string]any) (Check, error) {
2929
}
3030

3131
var stubDescriptor = Descriptor{
32-
Metrics: []MetricDef{{ResultKey: "value", DSName: "value"}},
32+
Metrics: []MetricDef{{ResultKey: "value", DSName: "value", Label: "value", Unit: "units"}},
3333
}
3434

3535
func TestResult_ZeroValue(t *testing.T) {
@@ -223,7 +223,7 @@ func TestRegistry_Describe(t *testing.T) {
223223

224224
desc := Descriptor{
225225
Metrics: []MetricDef{
226-
{ResultKey: "latency_us", DSName: "latency"},
226+
{ResultKey: "latency_us", DSName: "latency", Label: "latency", Unit: "ms"},
227227
},
228228
}
229229
reg.Register("ping", stubFactory("ping", Result{}), desc)
@@ -241,6 +241,12 @@ func TestRegistry_Describe(t *testing.T) {
241241
if got.Metrics[0].DSName != "latency" {
242242
t.Errorf("expected DSName 'latency', got %q", got.Metrics[0].DSName)
243243
}
244+
if got.Metrics[0].Label != "latency" {
245+
t.Errorf("expected Label 'latency', got %q", got.Metrics[0].Label)
246+
}
247+
if got.Metrics[0].Unit != "ms" {
248+
t.Errorf("expected Unit 'ms', got %q", got.Metrics[0].Unit)
249+
}
244250
}
245251

246252
func TestRegistry_DescribeUnknownType(t *testing.T) {
@@ -257,8 +263,8 @@ func TestRegistry_DescribeMultipleMetrics(t *testing.T) {
257263

258264
desc := Descriptor{
259265
Metrics: []MetricDef{
260-
{ResultKey: "rx_bytes", DSName: "rx"},
261-
{ResultKey: "tx_bytes", DSName: "tx"},
266+
{ResultKey: "rx_bytes", DSName: "rx", Label: "received", Unit: "bytes"},
267+
{ResultKey: "tx_bytes", DSName: "tx", Label: "transmitted", Unit: "bytes"},
262268
},
263269
}
264270
reg.Register("bandwidth", stubFactory("bandwidth", Result{}), desc)
@@ -276,7 +282,7 @@ func TestRegistry_ConcurrentDescribe(t *testing.T) {
276282
reg := NewRegistry()
277283

278284
desc := Descriptor{
279-
Metrics: []MetricDef{{ResultKey: "val", DSName: "val"}},
285+
Metrics: []MetricDef{{ResultKey: "val", DSName: "val", Label: "value", Unit: "units"}},
280286
}
281287
reg.Register("test", stubFactory("test", Result{}), desc)
282288

pkg/check/ping/ping.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ const (
3030
// Desc describes the metrics produced by a ping check.
3131
var Desc = check.Descriptor{
3232
Metrics: []check.MetricDef{
33-
{ResultKey: "latency_us", DSName: "latency"},
33+
{ResultKey: "latency_us", DSName: "latency", Label: "latency", Unit: "ms"},
3434
},
3535
}
3636

pkg/check/ping/ping_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,12 @@ func TestDesc(t *testing.T) {
261261
if m.DSName != "latency" {
262262
t.Errorf("expected DSName 'latency', got %q", m.DSName)
263263
}
264+
if m.Label != "latency" {
265+
t.Errorf("expected Label 'latency', got %q", m.Label)
266+
}
267+
if m.Unit != "ms" {
268+
t.Errorf("expected Unit 'ms', got %q", m.Unit)
269+
}
264270
}
265271

266272
func TestRegistryIntegration(t *testing.T) {

pkg/check/status.go

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,18 @@ package check
22

33
import (
44
"sync"
5-
"time"
65
)
76

87
// Status tracks the latest result of a check execution.
98
// It is safe for concurrent reads via the exported accessor methods,
109
// but writes should be done through SetResult and SetLastUpdate.
1110
type Status struct {
1211
mu sync.RWMutex
13-
alive bool
14-
latency time.Duration
12+
lastResult Result
1513
lastUpdate int64
1614
}
1715

18-
// NewStatus creates a Status with zero values (not alive, no latency).
16+
// NewStatus creates a Status with zero values (not alive, no metrics).
1917
func NewStatus() *Status {
2018
return &Status{}
2119
}
@@ -24,14 +22,20 @@ func NewStatus() *Status {
2422
func (s *Status) Alive() bool {
2523
s.mu.RLock()
2624
defer s.mu.RUnlock()
27-
return s.alive
25+
return s.lastResult.Success
2826
}
2927

30-
// Latency returns the latency from the check's last execution.
31-
func (s *Status) Latency() time.Duration {
28+
// Metric returns the value of a named metric from the last result.
29+
// Returns the value and true if found, or 0 and false if not present
30+
// or the last check failed.
31+
func (s *Status) Metric(key string) (float64, bool) {
3232
s.mu.RLock()
3333
defer s.mu.RUnlock()
34-
return s.latency
34+
if !s.lastResult.Success || s.lastResult.Metrics == nil {
35+
return 0, false
36+
}
37+
v, ok := s.lastResult.Metrics[key]
38+
return v, ok
3539
}
3640

3741
// LastUpdate returns the unix timestamp of the last successful RRD update.
@@ -41,19 +45,11 @@ func (s *Status) LastUpdate() int64 {
4145
return s.lastUpdate
4246
}
4347

44-
// SetResult updates the status from a check Result.
45-
// For successful results, it extracts latency from the "latency_us" metric
46-
// if present. For failed results, alive is set to false.
48+
// SetResult stores the latest check result.
4749
func (s *Status) SetResult(result Result) {
4850
s.mu.Lock()
4951
defer s.mu.Unlock()
50-
51-
s.alive = result.Success
52-
if result.Success {
53-
if latencyUs, ok := result.Metrics["latency_us"]; ok {
54-
s.latency = time.Duration(latencyUs) * time.Microsecond
55-
}
56-
}
52+
s.lastResult = result
5753
}
5854

5955
// SetLastUpdate records the unix timestamp of the last successful RRD update.
@@ -68,16 +64,26 @@ func (s *Status) SetLastUpdate(ts int64) {
6864
func (s *Status) Snapshot() StatusSnapshot {
6965
s.mu.RLock()
7066
defer s.mu.RUnlock()
67+
68+
// Deep copy the metrics map so the snapshot is independent
69+
var metrics map[string]float64
70+
if s.lastResult.Metrics != nil {
71+
metrics = make(map[string]float64, len(s.lastResult.Metrics))
72+
for k, v := range s.lastResult.Metrics {
73+
metrics[k] = v
74+
}
75+
}
76+
7177
return StatusSnapshot{
72-
Alive: s.alive,
73-
Latency: s.latency,
78+
Alive: s.lastResult.Success,
79+
Metrics: metrics,
7480
LastUpdate: s.lastUpdate,
7581
}
7682
}
7783

7884
// StatusSnapshot is a point-in-time copy of Status fields.
7985
type StatusSnapshot struct {
8086
Alive bool
81-
Latency time.Duration
87+
Metrics map[string]float64
8288
LastUpdate int64
8389
}

pkg/check/status_test.go

Lines changed: 62 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,15 @@ package check
33
import (
44
"sync"
55
"testing"
6-
"time"
76
)
87

98
func TestNewStatus_ZeroValues(t *testing.T) {
109
s := NewStatus()
1110
if s.Alive() {
1211
t.Error("new status should not be alive")
1312
}
14-
if s.Latency() != 0 {
15-
t.Errorf("new status should have zero latency, got %v", s.Latency())
13+
if v, ok := s.Metric("latency_us"); ok {
14+
t.Errorf("new status should have no metrics, got latency_us=%f", v)
1615
}
1716
if s.LastUpdate() != 0 {
1817
t.Errorf("new status should have zero last update, got %d", s.LastUpdate())
@@ -29,9 +28,12 @@ func TestStatus_SetResult_Success(t *testing.T) {
2928
if !s.Alive() {
3029
t.Error("expected alive after successful result")
3130
}
32-
expected := time.Duration(1234) * time.Microsecond
33-
if s.Latency() != expected {
34-
t.Errorf("expected latency %v, got %v", expected, s.Latency())
31+
v, ok := s.Metric("latency_us")
32+
if !ok {
33+
t.Fatal("expected latency_us metric to be present")
34+
}
35+
if v != 1234.0 {
36+
t.Errorf("expected latency_us=1234.0, got %f", v)
3537
}
3638
}
3739

@@ -50,9 +52,12 @@ func TestStatus_SetResult_Failure(t *testing.T) {
5052
if s.Alive() {
5153
t.Error("expected not alive after failed result")
5254
}
55+
if _, ok := s.Metric("latency_us"); ok {
56+
t.Error("expected no metrics after failed result")
57+
}
5358
}
5459

55-
func TestStatus_SetResult_SuccessWithoutLatency(t *testing.T) {
60+
func TestStatus_SetResult_SuccessWithoutMetrics(t *testing.T) {
5661
s := NewStatus()
5762
s.SetResult(Result{
5863
Success: true,
@@ -62,8 +67,31 @@ func TestStatus_SetResult_SuccessWithoutLatency(t *testing.T) {
6267
if !s.Alive() {
6368
t.Error("expected alive after successful result")
6469
}
65-
if s.Latency() != 0 {
66-
t.Errorf("expected zero latency when metric absent, got %v", s.Latency())
70+
if _, ok := s.Metric("latency_us"); ok {
71+
t.Error("expected no latency_us when metric absent")
72+
}
73+
}
74+
75+
func TestStatus_Metric_MultipleMetrics(t *testing.T) {
76+
s := NewStatus()
77+
s.SetResult(Result{
78+
Success: true,
79+
Metrics: map[string]float64{
80+
"latency_us": 1234.0,
81+
"response_code": 200.0,
82+
},
83+
})
84+
85+
v, ok := s.Metric("latency_us")
86+
if !ok || v != 1234.0 {
87+
t.Errorf("expected latency_us=1234.0, got %f (ok=%v)", v, ok)
88+
}
89+
v, ok = s.Metric("response_code")
90+
if !ok || v != 200.0 {
91+
t.Errorf("expected response_code=200.0, got %f (ok=%v)", v, ok)
92+
}
93+
if _, ok := s.Metric("nonexistent"); ok {
94+
t.Error("expected nonexistent metric to not be found")
6795
}
6896
}
6997

@@ -89,9 +117,8 @@ func TestStatus_Snapshot(t *testing.T) {
89117
if !snap.Alive {
90118
t.Error("snapshot should be alive")
91119
}
92-
expected := time.Duration(5678) * time.Microsecond
93-
if snap.Latency != expected {
94-
t.Errorf("snapshot latency: expected %v, got %v", expected, snap.Latency)
120+
if v, ok := snap.Metrics["latency_us"]; !ok || v != 5678.0 {
121+
t.Errorf("snapshot metrics: expected latency_us=5678.0, got %v (ok=%v)", v, ok)
95122
}
96123
if snap.LastUpdate != 1700000000 {
97124
t.Errorf("snapshot last update: expected 1700000000, got %d", snap.LastUpdate)
@@ -114,6 +141,28 @@ func TestStatus_Snapshot_Independent(t *testing.T) {
114141
if !snap.Alive {
115142
t.Error("snapshot should be independent of subsequent mutations")
116143
}
144+
if _, ok := snap.Metrics["latency_us"]; !ok {
145+
t.Error("snapshot metrics should be independent of subsequent mutations")
146+
}
147+
}
148+
149+
func TestStatus_Snapshot_MetricsMapIndependent(t *testing.T) {
150+
s := NewStatus()
151+
s.SetResult(Result{
152+
Success: true,
153+
Metrics: map[string]float64{"latency_us": 1000.0},
154+
})
155+
156+
snap := s.Snapshot()
157+
158+
// Mutate the snapshot's metrics map
159+
snap.Metrics["latency_us"] = 9999.0
160+
161+
// Status should be unaffected
162+
v, ok := s.Metric("latency_us")
163+
if !ok || v != 1000.0 {
164+
t.Errorf("mutating snapshot should not affect status, got %f", v)
165+
}
117166
}
118167

119168
func TestStatus_ConcurrentAccess(t *testing.T) {
@@ -139,7 +188,7 @@ func TestStatus_ConcurrentAccess(t *testing.T) {
139188
go func() {
140189
defer wg.Done()
141190
_ = s.Alive()
142-
_ = s.Latency()
191+
_, _ = s.Metric("latency_us")
143192
_ = s.LastUpdate()
144193
_ = s.Snapshot()
145194
}()

0 commit comments

Comments
 (0)