Skip to content

Commit bd2aa33

Browse files
committed
pkg/services/otelhealth: add otelhealth; update promhealth
1 parent 410cca3 commit bd2aa33

File tree

4 files changed

+116
-32
lines changed

4 files changed

+116
-32
lines changed

pkg/loop/server.go

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
"github.com/smartcontractkit/chainlink-common/pkg/config/build"
1616
"github.com/smartcontractkit/chainlink-common/pkg/logger"
1717
"github.com/smartcontractkit/chainlink-common/pkg/services"
18+
"github.com/smartcontractkit/chainlink-common/pkg/services/otelhealth"
1819
"github.com/smartcontractkit/chainlink-common/pkg/settings/limits"
1920
"github.com/smartcontractkit/chainlink-common/pkg/services/promhealth"
2021
"github.com/smartcontractkit/chainlink-common/pkg/sqlutil"
@@ -154,7 +155,17 @@ func (s *Server) start() error {
154155
return fmt.Errorf("error starting prometheus server: %w", err)
155156
}
156157

157-
s.checker = promhealth.NewChecker("", "")
158+
var healthCfg services.HealthCheckerConfig
159+
healthCfg = promhealth.ConfigureHooks(healthCfg)
160+
if bc := beholder.GetClient(); bc != nil {
161+
var err error
162+
healthCfg, err = otelhealth.ConfigureHooks(healthCfg, bc.Meter)
163+
if err != nil {
164+
return fmt.Errorf("failed to configure health checker otel hooks: %w", err)
165+
}
166+
}
167+
s.checker = healthCfg.New()
168+
158169
if err := s.checker.Start(); err != nil {
159170
return fmt.Errorf("error starting health checker: %w", err)
160171
}

pkg/services/health.go

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package services
22

33
import (
4+
"context"
45
"errors"
56
"fmt"
67
"maps"
@@ -88,10 +89,10 @@ type HealthCheckerConfig struct {
8889
// Optionally override debug.BuildInfo
8990
Ver, Sha string
9091
// Optional hooks for reporting.
91-
IncVersion func(ver string, sha string)
92-
AddUptime func(duration time.Duration)
93-
SetStatus func(name string, status int)
94-
Delete func(name string)
92+
IncVersion func(ctx context.Context, ver string, sha string)
93+
AddUptime func(ctx context.Context, duration time.Duration)
94+
SetStatus func(ctx context.Context, name string, status int)
95+
Delete func(ctx context.Context, name string)
9596
}
9697

9798
func (cfg HealthCheckerConfig) initVerSha() {
@@ -112,16 +113,16 @@ func (cfg HealthCheckerConfig) initVerSha() {
112113

113114
// setNoopHooks replaces every nil reporting hook with a no-op so the hooks
// can be invoked without nil checks.
//
// The receiver is a pointer: with a value receiver the assignments below would
// only modify a copy of the config and be discarded when the method returns,
// leaving the caller's hooks nil.
func (cfg *HealthCheckerConfig) setNoopHooks() {
	if cfg.IncVersion == nil {
		cfg.IncVersion = func(context.Context, string, string) {}
	}
	if cfg.AddUptime == nil {
		cfg.AddUptime = func(context.Context, time.Duration) {}
	}
	if cfg.SetStatus == nil {
		cfg.SetStatus = func(context.Context, string, int) {}
	}
	if cfg.Delete == nil {
		cfg.Delete = func(context.Context, string) {}
	}
}
127128

@@ -140,10 +141,11 @@ func (cfg HealthCheckerConfig) New() *HealthChecker {
140141

141142
func (c *HealthChecker) Start() error {
142143
return c.StartOnce("HealthCheck", func() error {
143-
c.cfg.IncVersion(c.cfg.Ver, c.cfg.Sha)
144+
ctx := context.Background()
145+
c.cfg.IncVersion(ctx, c.cfg.Ver, c.cfg.Sha)
144146

145147
// update immediately
146-
c.update()
148+
c.update(ctx)
147149

148150
go c.run()
149151

@@ -162,19 +164,20 @@ func (c *HealthChecker) Close() error {
162164
func (c *HealthChecker) run() {
163165
defer close(c.chDone)
164166

167+
ctx := context.Background()
165168
ticker := time.NewTicker(interval)
166169

167170
for {
168171
select {
169172
case <-ticker.C:
170-
c.update()
173+
c.update(ctx)
171174
case <-c.chStop:
172175
return
173176
}
174177
}
175178
}
176179

177-
func (c *HealthChecker) update() {
180+
func (c *HealthChecker) update(ctx context.Context) {
178181
// copy services into a new map to avoid lock contention while doing checks
179182
c.servicesMu.RLock()
180183
l := len(c.services)
@@ -196,10 +199,10 @@ func (c *HealthChecker) update() {
196199
}
197200

198201
// report metrics via the configured SetStatus hook
199-
c.cfg.SetStatus(name, value)
202+
c.cfg.SetStatus(ctx, name, value)
200203
}
201204
}
202-
c.cfg.AddUptime(interval)
205+
c.cfg.AddUptime(ctx, interval)
203206

204207
// save state
205208
c.stateMu.Lock()
@@ -231,11 +234,12 @@ func (c *HealthChecker) Unregister(name string) error {
231234
if name == "" {
232235
return fmt.Errorf("name cannot be empty")
233236
}
237+
ctx := context.Background()
234238

235239
c.servicesMu.Lock()
236240
defer c.servicesMu.Unlock()
237241
delete(c.services, name)
238-
c.cfg.Delete(name)
242+
c.cfg.Delete(ctx, name)
239243
return nil
240244
}
241245

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
package otelhealth
2+
3+
import (
4+
"context"
5+
"time"
6+
7+
"go.opentelemetry.io/otel/attribute"
8+
"go.opentelemetry.io/otel/metric"
9+
10+
"github.com/smartcontractkit/chainlink-common/pkg/services"
11+
)
12+
13+
func NewChecker(ver, sha string, meter metric.Meter) (*services.HealthChecker, error) {
14+
cfg, err := ConfigureHooks(services.HealthCheckerConfig{Ver: ver, Sha: sha}, meter)
15+
if err != nil {
16+
return nil, err
17+
}
18+
return cfg.New(), nil
19+
}
20+
21+
func ConfigureHooks(orig services.HealthCheckerConfig, meter metric.Meter) (services.HealthCheckerConfig, error) {
22+
cfg := orig // copy
23+
healthStatus, err := meter.Int64Gauge("health", metric.WithDescription("Health status by service"))
24+
if err != nil {
25+
return services.HealthCheckerConfig{}, err
26+
}
27+
version, err := meter.Int64Counter("version", metric.WithDescription("Version and SHA of the service"))
28+
if err != nil {
29+
return services.HealthCheckerConfig{}, err
30+
}
31+
uptimeSeconds, err := meter.Float64Gauge("uptime_seconds", metric.WithDescription("Uptime of the service"))
32+
if err != nil {
33+
return services.HealthCheckerConfig{}, err
34+
}
35+
cfg.AddUptime = func(ctx context.Context, d time.Duration) {
36+
if orig.AddUptime != nil {
37+
orig.AddUptime(ctx, d)
38+
}
39+
uptimeSeconds.Record(ctx, d.Seconds())
40+
}
41+
cfg.IncVersion = func(ctx context.Context, ver string, sha string) {
42+
if orig.IncVersion != nil {
43+
orig.IncVersion(ctx, ver, sha)
44+
}
45+
version.Add(ctx, 1, metric.WithAttributes(attribute.String("version", ver), attribute.String("commit", sha)))
46+
}
47+
cfg.SetStatus = func(ctx context.Context, name string, value int) {
48+
if orig.SetStatus != nil {
49+
orig.SetStatus(ctx, name, value)
50+
}
51+
healthStatus.Record(ctx, int64(value), metric.WithAttributes(attribute.String("service_id", name)))
52+
}
53+
return cfg, nil
54+
}

pkg/services/promhealth/promhealth.go

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package promhealth
22

33
import (
4+
"context"
45
"time"
56

67
"github.com/prometheus/client_golang/prometheus"
@@ -34,20 +35,34 @@ var (
3435

3536
// NewChecker returns a *services.HealthChecker with hooks for prometheus metrics.
3637
func NewChecker(ver, sha string) *services.HealthChecker {
37-
return services.HealthCheckerConfig{
38-
Ver: ver,
39-
Sha: sha,
40-
AddUptime: func(d time.Duration) {
41-
uptimeSeconds.Add(d.Seconds())
42-
},
43-
IncVersion: func(ver string, sha string) {
44-
version.WithLabelValues(ver, sha).Inc()
45-
},
46-
SetStatus: func(name string, value int) {
47-
healthStatus.WithLabelValues(name).Set(float64(value))
48-
},
49-
Delete: func(name string) {
50-
healthStatus.DeleteLabelValues(name)
51-
},
52-
}.New()
38+
return ConfigureHooks(services.HealthCheckerConfig{Ver: ver, Sha: sha}).New()
39+
}
40+
41+
func ConfigureHooks(orig services.HealthCheckerConfig) services.HealthCheckerConfig {
42+
cfg := orig // copy
43+
cfg.AddUptime = func(ctx context.Context, d time.Duration) {
44+
if orig.AddUptime != nil {
45+
orig.AddUptime(ctx, d)
46+
}
47+
uptimeSeconds.Add(d.Seconds())
48+
}
49+
cfg.IncVersion = func(ctx context.Context, ver string, sha string) {
50+
if orig.IncVersion != nil {
51+
orig.IncVersion(ctx, ver, sha)
52+
}
53+
version.WithLabelValues(ver, sha).Inc()
54+
}
55+
cfg.SetStatus = func(ctx context.Context, name string, value int) {
56+
if orig.SetStatus != nil {
57+
orig.SetStatus(ctx, name, value)
58+
}
59+
healthStatus.WithLabelValues(name).Set(float64(value))
60+
}
61+
cfg.Delete = func(ctx context.Context, name string) {
62+
if orig.Delete != nil {
63+
orig.Delete(ctx, name)
64+
}
65+
healthStatus.DeleteLabelValues(name)
66+
}
67+
return cfg
5368
}

0 commit comments

Comments
 (0)