Skip to content

Commit eedc13b

Browse files
committed
fix: do not register CRDB metrics in init()
1 parent 0e16dd1 commit eedc13b

File tree

12 files changed

+177
-129
lines changed

12 files changed

+177
-129
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
1111
### Fixed
1212
- Regression introduced in 1.49.2: missing spans in ReadSchema calls (https://github.com/authzed/spicedb/pull/2947)
1313
- Long standing bug in the way postgres revisions were being compared. Sometimes revisions that were actually overlapping were erroneously being ordered. (https://github.com/authzed/spicedb/pull/2958)
14+
- Do not register CRDB metrics in `init()` functions (https://github.com/authzed/spicedb/pull/2966)
1415

1516
## [1.49.2] - 2026-03-02
1617
### Added

internal/datastore/crdb/crdb.go

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,7 @@ func newCRDBDatastore(ctx context.Context, url string, options ...Option) (datas
197197
gcWindow: config.gcWindow,
198198
watchEnabled: !config.watchDisabled,
199199
schema: *schema.Schema(config.columnOptimizationOption, config.withIntegrity, false),
200+
healthTracker: healthChecker,
200201
}
201202
ds.SetNowFunc(ds.headRevisionInternal)
202203

@@ -226,14 +227,14 @@ func newCRDBDatastore(ctx context.Context, url string, options ...Option) (datas
226227
if config.enableConnectionBalancing {
227228
log.Ctx(initCtx).Info().Msg("starting cockroach connection balancer")
228229
ds.pruneGroup, ds.ctx = errgroup.WithContext(ds.ctx)
229-
writePoolBalancer := pool.NewNodeConnectionBalancer(ds.writePool, healthChecker, 5*time.Second)
230-
readPoolBalancer := pool.NewNodeConnectionBalancer(ds.readPool, healthChecker, 5*time.Second)
230+
ds.writePoolBalancer = pool.NewNodeConnectionBalancer(ds.writePool, healthChecker, 5*time.Second)
231+
ds.readPoolBalancer = pool.NewNodeConnectionBalancer(ds.readPool, healthChecker, 5*time.Second)
231232
ds.pruneGroup.Go(func() error {
232-
writePoolBalancer.Prune(ds.ctx)
233+
ds.writePoolBalancer.Prune(ds.ctx)
233234
return nil
234235
})
235236
ds.pruneGroup.Go(func() error {
236-
readPoolBalancer.Prune(ds.ctx)
237+
ds.readPoolBalancer.Prune(ds.ctx)
237238
return nil
238239
})
239240
ds.pruneGroup.Go(func() error {
@@ -261,19 +262,20 @@ type crdbDatastore struct {
261262
revisions.CommonDecoder
262263
*common.MigrationValidator
263264

264-
dburl string
265-
readPool, writePool *pool.RetryPool
266-
collectors []prometheus.Collector
267-
watchBufferLength uint16
268-
watchChangeBufferMaximumSize uint64
269-
watchBufferWriteTimeout time.Duration
270-
watchConnectTimeout time.Duration
271-
writeOverlapKeyer overlapKeyer
272-
overlapKeyInit func(ctx context.Context) keySet
273-
analyzeBeforeStatistics bool
274-
gcWindow time.Duration
275-
schema common.SchemaInformation
276-
acquireTimeout time.Duration
265+
dburl string
266+
readPool, writePool *pool.RetryPool
267+
readPoolBalancer, writePoolBalancer *pool.NodeConnectionBalancer
268+
collectors []prometheus.Collector
269+
watchBufferLength uint16
270+
watchChangeBufferMaximumSize uint64
271+
watchBufferWriteTimeout time.Duration
272+
watchConnectTimeout time.Duration
273+
writeOverlapKeyer overlapKeyer
274+
overlapKeyInit func(ctx context.Context) keySet
275+
analyzeBeforeStatistics bool
276+
gcWindow time.Duration
277+
schema common.SchemaInformation
278+
acquireTimeout time.Duration
277279

278280
beginChangefeedQuery string
279281
transactionNowQuery string
@@ -289,6 +291,8 @@ type crdbDatastore struct {
289291
supportsIntegrity bool
290292
watchEnabled bool
291293

294+
healthTracker *pool.NodeHealthTracker
295+
292296
uniqueID atomic.Pointer[string]
293297
}
294298

@@ -481,6 +485,15 @@ func (cds *crdbDatastore) Close() error {
481485
}
482486
cds.readPool.Close()
483487
cds.writePool.Close()
488+
if cds.writePoolBalancer != nil {
489+
cds.writePoolBalancer.Close()
490+
}
491+
if cds.readPoolBalancer != nil {
492+
cds.readPoolBalancer.Close()
493+
}
494+
if cds.healthTracker != nil {
495+
cds.healthTracker.Close()
496+
}
484497
for _, collector := range cds.collectors {
485498
ok := prometheus.Unregister(collector)
486499
if !ok {

internal/datastore/crdb/crdb_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -904,7 +904,7 @@ func TestRegisterPrometheusCollectors(t *testing.T) {
904904

905905
writePoolConfig, err := pgxpool.ParseConfig(fmt.Sprintf("postgres://db:password@pg.example.com:5432/mydb?pool_max_conns=%d", writeMaxConns))
906906
require.NoError(t, err)
907-
writePool, err := pool.NewRetryPool(t.Context(), "read", writePoolConfig, nil, 18, 20)
907+
writePool, err := pool.NewRetryPool(t.Context(), "write", writePoolConfig, nil, 18, 20)
908908
require.NoError(t, err)
909909

910910
// Create datastore with those pools

internal/datastore/crdb/options.go

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,19 @@ type crdbOptions struct {
1515
readPoolOpts, writePoolOpts pgxcommon.PoolOptions
1616
connectRate time.Duration
1717

18-
watchBufferLength uint16
19-
watchChangeBufferMaximumSize uint64
20-
watchBufferWriteTimeout time.Duration
21-
watchConnectTimeout time.Duration
22-
revisionQuantization time.Duration
23-
followerReadDelay time.Duration
24-
maxRevisionStalenessPercent float64
25-
gcWindow time.Duration
26-
maxRetries uint8
27-
overlapStrategy string
28-
overlapKey string
29-
enableConnectionBalancing bool
18+
watchBufferLength uint16
19+
watchChangeBufferMaximumSize uint64
20+
watchBufferWriteTimeout time.Duration
21+
watchConnectTimeout time.Duration
22+
revisionQuantization time.Duration
23+
followerReadDelay time.Duration
24+
maxRevisionStalenessPercent float64
25+
gcWindow time.Duration
26+
maxRetries uint8
27+
overlapStrategy string
28+
overlapKey string
29+
enableConnectionBalancing bool
30+
3031
analyzeBeforeStatistics bool
3132
filterMaximumIDCount uint16
3233
enablePrometheusStats bool

internal/datastore/crdb/pool/balancer.go

Lines changed: 39 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -20,24 +20,6 @@ import (
2020
"github.com/authzed/spicedb/pkg/genutil"
2121
)
2222

23-
var (
24-
connectionsPerCRDBNodeCountGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
25-
Name: "crdb_connections_per_node",
26-
Help: "The number of active connections SpiceDB holds to each CockroachDB node, by pool (read/write). Imbalanced values across nodes suggest the connection balancer is unable to redistribute connections evenly.",
27-
}, []string{"pool", "node_id"})
28-
29-
pruningTimeHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{
30-
Name: "crdb_pruning_duration",
31-
Help: "Duration in milliseconds of one iteration of the CockroachDB connection balancer pruning excess connections from over-represented nodes. Elevated values indicate the balancer is struggling to rebalance connections.",
32-
Buckets: []float64{.1, .2, .5, 1, 2, 5, 10, 20, 50, 100},
33-
}, []string{"pool"})
34-
)
35-
36-
func init() {
37-
prometheus.MustRegister(connectionsPerCRDBNodeCountGauge)
38-
prometheus.MustRegister(pruningTimeHistogram)
39-
}
40-
4123
type balancePoolConn[C balanceConn] interface {
4224
Conn() C
4325
Release()
@@ -81,6 +63,9 @@ type nodeConnectionBalancer[P balancePoolConn[C], C balanceConn] struct {
8163
healthTracker *NodeHealthTracker
8264
rnd *rand.Rand
8365
seed int64
66+
67+
connectionsPerCRDBNodeCountGauge *prometheus.GaugeVec
68+
pruningTimeHistogram prometheus.Histogram
8469
}
8570

8671
// newNodeConnectionBalancer is generic over underlying connection types for
@@ -99,12 +84,35 @@ func newNodeConnectionBalancer[P balancePoolConn[C], C balanceConn](pool balance
9984
seed = 0
10085
}
10186
}
87+
var (
88+
connectionsPerCRDBNodeCountGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
89+
Name: "crdb_connections_per_node",
90+
ConstLabels: prometheus.Labels{
91+
"pool": pool.ID(),
92+
},
93+
Help: "The number of active connections SpiceDB holds to each CockroachDB node, by pool (read/write). Imbalanced values across nodes suggest the connection balancer is unable to redistribute connections evenly.",
94+
}, []string{"node_id"})
95+
96+
pruningTimeHistogram = prometheus.NewHistogram(prometheus.HistogramOpts{
97+
Name: "crdb_pruning_duration",
98+
ConstLabels: prometheus.Labels{
99+
"pool": pool.ID(),
100+
},
101+
Help: "Duration in milliseconds of one iteration of the CockroachDB connection balancer pruning excess connections from over-represented nodes. Elevated values indicate the balancer is struggling to rebalance connections.",
102+
Buckets: []float64{.1, .2, .5, 1, 2, 5, 10, 20, 50, 100},
103+
})
104+
)
105+
106+
prometheus.MustRegister(connectionsPerCRDBNodeCountGauge)
107+
prometheus.MustRegister(pruningTimeHistogram)
102108
return &nodeConnectionBalancer[P, C]{
103-
ticker: time.NewTicker(interval),
104-
sem: semaphore.NewWeighted(1),
105-
healthTracker: healthTracker,
106-
pool: pool,
107-
seed: seed,
109+
ticker: time.NewTicker(interval),
110+
sem: semaphore.NewWeighted(1),
111+
healthTracker: healthTracker,
112+
pool: pool,
113+
seed: seed,
114+
connectionsPerCRDBNodeCountGauge: connectionsPerCRDBNodeCountGauge,
115+
pruningTimeHistogram: pruningTimeHistogram,
108116
// nolint:gosec
109117
// use of non cryptographically secure random number generator is not concern here,
110118
// as it's used for shuffling the nodes to balance the connections when the number of
@@ -113,6 +121,11 @@ func newNodeConnectionBalancer[P balancePoolConn[C], C balanceConn](pool balance
113121
}
114122
}
115123

124+
func (p *nodeConnectionBalancer[P, C]) Close() {
125+
prometheus.Unregister(p.connectionsPerCRDBNodeCountGauge)
126+
prometheus.Unregister(p.pruningTimeHistogram)
127+
}
128+
116129
// Prune starts periodically checking idle connections and killing ones that are determined to be unbalanced.
117130
func (p *nodeConnectionBalancer[P, C]) Prune(ctx context.Context) {
118131
for {
@@ -137,7 +150,7 @@ func (p *nodeConnectionBalancer[P, C]) Prune(ctx context.Context) {
137150
func (p *nodeConnectionBalancer[P, C]) mustPruneConnections(ctx context.Context) {
138151
start := time.Now()
139152
defer func() {
140-
pruningTimeHistogram.WithLabelValues(p.pool.ID()).Observe(float64(time.Since(start).Milliseconds()))
153+
p.pruningTimeHistogram.Observe(float64(time.Since(start).Milliseconds()))
141154
}()
142155
conns := p.pool.AcquireAllIdle(ctx)
143156
defer func() {
@@ -182,8 +195,7 @@ func (p *nodeConnectionBalancer[P, C]) mustPruneConnections(ctx context.Context)
182195
p.healthTracker.RLock()
183196
for node := range p.healthTracker.nodesEverSeen {
184197
if _, ok := connectionCounts[node]; !ok {
185-
connectionsPerCRDBNodeCountGauge.DeletePartialMatch(map[string]string{
186-
"pool": p.pool.ID(),
198+
p.connectionsPerCRDBNodeCountGauge.DeletePartialMatch(map[string]string{
187199
"node_id": strconv.FormatUint(uint64(node), 10),
188200
})
189201
}
@@ -205,8 +217,7 @@ func (p *nodeConnectionBalancer[P, C]) mustPruneConnections(ctx context.Context)
205217
initialPerNodeMax := p.pool.MaxConns() / nodeCount
206218
for i, node := range nodes {
207219
count := connectionCounts[node]
208-
connectionsPerCRDBNodeCountGauge.WithLabelValues(
209-
p.pool.ID(),
220+
p.connectionsPerCRDBNodeCountGauge.WithLabelValues(
210221
strconv.FormatUint(uint64(node), 10),
211222
).Set(float64(count))
212223

internal/datastore/crdb/pool/balancer_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,9 @@ func TestNodeConnectionBalancerPrune(t *testing.T) {
141141
t.Run(tt.name, func(t *testing.T) {
142142
tracker, err := NewNodeHealthChecker("")
143143
require.NoError(t, err)
144+
t.Cleanup(func() {
145+
tracker.Close()
146+
})
144147
for _, n := range tt.nodes {
145148
tracker.healthyNodes[n] = struct{}{}
146149
}

internal/datastore/crdb/pool/health.go

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package pool
22

33
import (
44
"context"
5+
"fmt"
56
"math/rand"
67
"sync"
78
"time"
@@ -18,26 +19,18 @@ import (
1819

1920
const errorBurst = 2
2021

21-
var healthyCRDBNodeCountGauge = prometheus.NewGauge(prometheus.GaugeOpts{
22-
Name: "crdb_healthy_nodes",
23-
Help: "the number of healthy crdb nodes detected by spicedb",
24-
})
25-
26-
func init() {
27-
prometheus.MustRegister(healthyCRDBNodeCountGauge)
28-
}
29-
3022
// NodeHealthTracker detects changes in the node pool by polling the cluster periodically and recording
3123
// the node ids that are seen. This is used to detect new nodes that come online that have either previously
3224
// been marked unhealthy due to connection errors or due to scale up.
3325
//
3426
// Consumers can manually mark a node healthy or unhealthy as well.
3527
type NodeHealthTracker struct {
3628
sync.RWMutex
37-
connConfig *pgx.ConnConfig
38-
healthyNodes map[uint32]struct{} // GUARDED_BY(RWMutex)
39-
nodesEverSeen map[uint32]*rate.Limiter // GUARDED_BY(RWMutex)
40-
newLimiter func() *rate.Limiter
29+
connConfig *pgx.ConnConfig
30+
healthyNodes map[uint32]struct{} // GUARDED_BY(RWMutex)
31+
nodesEverSeen map[uint32]*rate.Limiter // GUARDED_BY(RWMutex)
32+
newLimiter func() *rate.Limiter
33+
healthyCRDBNodeCountGauge prometheus.Gauge
4134
}
4235

4336
// NewNodeHealthChecker builds a health checker that polls the cluster at the given url.
@@ -47,16 +40,31 @@ func NewNodeHealthChecker(url string) (*NodeHealthTracker, error) {
4740
return nil, err
4841
}
4942

43+
healthyCRDBNodeCountGauge := prometheus.NewGauge(prometheus.GaugeOpts{
44+
Name: "crdb_healthy_nodes",
45+
Help: "the number of healthy crdb nodes detected by spicedb",
46+
})
47+
48+
err = prometheus.Register(healthyCRDBNodeCountGauge)
49+
if err != nil {
50+
return nil, fmt.Errorf("failed to register crdb healthy nodes metric: %w", err)
51+
}
52+
5053
return &NodeHealthTracker{
51-
connConfig: connConfig,
52-
healthyNodes: make(map[uint32]struct{}, 0),
53-
nodesEverSeen: make(map[uint32]*rate.Limiter, 0),
54+
healthyCRDBNodeCountGauge: healthyCRDBNodeCountGauge,
55+
connConfig: connConfig,
56+
healthyNodes: make(map[uint32]struct{}, 0),
57+
nodesEverSeen: make(map[uint32]*rate.Limiter, 0),
5458
newLimiter: func() *rate.Limiter {
5559
return rate.NewLimiter(rate.Every(1*time.Minute), errorBurst)
5660
},
5761
}, nil
5862
}
5963

64+
func (t *NodeHealthTracker) Close() {
65+
_ = prometheus.Unregister(t.healthyCRDBNodeCountGauge)
66+
}
67+
6068
// Poll starts polling the cluster and recording the node IDs that it sees.
6169
func (t *NodeHealthTracker) Poll(ctx context.Context, interval time.Duration) {
6270
ticker := jitterbug.New(interval, jitterbug.Uniform{
@@ -104,7 +112,7 @@ func (t *NodeHealthTracker) SetNodeHealth(nodeID uint32, healthy bool) {
104112
t.Lock()
105113
defer t.Unlock()
106114
defer func() {
107-
healthyCRDBNodeCountGauge.Set(float64(len(t.healthyNodes)))
115+
t.healthyCRDBNodeCountGauge.Set(float64(len(t.healthyNodes)))
108116
}()
109117

110118
if _, ok := t.nodesEverSeen[nodeID]; !ok {

internal/datastore/crdb/pool/health_test.go

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,16 @@ package pool
22

33
import (
44
"testing"
5-
"time"
65

76
"github.com/stretchr/testify/require"
8-
"golang.org/x/time/rate"
97
)
108

119
func TestNodeHealthTracker(t *testing.T) {
12-
tracker := &NodeHealthTracker{
13-
healthyNodes: make(map[uint32]struct{}),
14-
nodesEverSeen: make(map[uint32]*rate.Limiter),
15-
newLimiter: func() *rate.Limiter {
16-
return rate.NewLimiter(rate.Every(1*time.Minute), 2)
17-
},
18-
}
10+
tracker, err := NewNodeHealthChecker("postgres://user:password@localhost:5432/dbname")
11+
require.NoError(t, err)
12+
t.Cleanup(func() {
13+
tracker.Close()
14+
})
1915

2016
tracker.SetNodeHealth(1, true)
2117
require.True(t, tracker.IsHealthy(1))

0 commit comments

Comments
 (0)