Skip to content

Commit 8696560

Browse files
committed
mmaprototype: introduce Status
1 parent d00e9c3 commit 8696560

File tree

2 files changed

+257
-0
lines changed

2 files changed

+257
-0
lines changed

pkg/kv/kvserver/allocator/mmaprototype/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ go_library(
1515
"rebalance_advisor.go",
1616
"store_load_summary.go",
1717
"store_set.go",
18+
"store_status.go",
1819
"top_k_replicas.go",
1920
],
2021
importpath = "github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/mmaprototype",
Lines changed: 256 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,256 @@
1+
// Copyright 2025 The Cockroach Authors.
2+
//
3+
// Use of this software is governed by the CockroachDB Software License
4+
// included in the /LICENSE file.
5+
6+
package mmaprototype
7+
8+
import (
9+
"github.com/cockroachdb/errors"
10+
"github.com/cockroachdb/redact"
11+
)
12+
13+
type (
14+
// Health represents whether the store "is around", i.e. able to participate
15+
// in replication. What this means exactly is left to the outside world. MMA
16+
// relies on the Disposition as much as possible without consulting the health
17+
// status, aided by the invariant described on Status (only healthy nodes can
18+
// have "green" dispositions). Its only (envisioned) use for explicit health
19+
// tracking are sanity checks and preferring unhealthy candidates when picking
20+
// sources (for example, when rebalancing, it may make sense to rebalance off
21+
// an unhealthy or dead node over a healthy one as this actually improves
22+
// availability).
23+
Health byte
24+
LeaseDisposition byte
25+
ReplicaDisposition byte
26+
)
27+
28+
const (
29+
// HealthUnknown is the initial state for health. It's treated as an unhealthy
30+
// state, distinguished from HealthUnhealthy to tailor logging and metrics.
31+
HealthUnknown Health = iota
32+
// HealthOK describes a store that has been determined to be alive and well.
33+
HealthOK
34+
// HealthUnhealthy describes a store that has recently become unhealthy, but
35+
// which has not been unhealthy for "extended periods of time" and is expected
36+
// to return (as determined by a higher layer)
37+
HealthUnhealthy
38+
// HealthDead describes a store that is unhealthy and has been in that state
39+
// for what is considered a "long time" or which may not return.
40+
HealthDead
41+
healthCount // sentinel
42+
)
43+
44+
const (
45+
// LeaseDispositionOK denotes that the store is in principle willing to
46+
// receive leases.
47+
LeaseDispositionOK LeaseDisposition = iota
48+
// LeaseDispositionRefusing denotes that the store should not be considered
49+
// as a target for leases.
50+
LeaseDispositionRefusing
51+
// LeaseDispositionShedding denotes that the store should actively be
52+
// considered for removal of leases.
53+
LeaseDispositionShedding
54+
leaseDispositionCount // sentinel
55+
)
56+
57+
const (
58+
// ReplicaDispositionRefusing denotes that the store can receive replicas.
59+
ReplicaDispositionOK ReplicaDisposition = iota
60+
// ReplicaDispositionRefusing denotes that the store should not be considered as
61+
// a target for replica placement.
62+
ReplicaDispositionRefusing
63+
// ReplicaDispositionShedding denotes that the store should actively be
64+
// considered for removal of replicas.
65+
ReplicaDispositionShedding
66+
replicaDispositionCount // sentinel
67+
)
68+
69+
// Disposition represents the interest a store has in accepting or
70+
// shedding leases or replicas.
71+
type Disposition struct {
72+
Lease LeaseDisposition
73+
Replica ReplicaDisposition
74+
}
75+
76+
// Status represents the health and disposition of a store. It serves as a
77+
// translation layer between the "messy" outside world in which we may have
78+
// multiple possibly conflicting health signals, and various lifecycle states.
79+
// The Status is never derived internally in mma, but is always passed in and
80+
// kept up to date through the Allocator interface.
81+
//
82+
// The contained Health primarily plays a role when looking for (lease or
83+
// replicas) sources: all other things being equal, we want to be able to
84+
// identify suspect or dead stores as more beneficial sources for, say, a
85+
// replica removal. We also want to be able to tell when a range has lost quorum
86+
// so that we don't even try to make changes in that case. On the flip side, due
87+
// to the INVARIANT below, when looking for targets, health does not typically
88+
// need to be considered as we prohibit "non-healthy" stores from claiming to
89+
// accept replicas or leases.
90+
//
91+
// The wrapped Disposition is important when looking for both targets and
92+
// sources. In principle, stores can reject leases but accept replicas and vice
93+
// versa, though not all combinations may be used in practice (because they do
94+
// not all make sense).
95+
//
96+
// The multi-metric allocator is not currently responsible for establishing
97+
// constraint conformance or for shedding, so not all aspects discussed above
98+
// are necessarily used by it yet.
99+
//
100+
// INVARIANT: if Health != HealthOK, Disposition.{Lease,Replica} = Shedding.
101+
type Status struct {
102+
Health Health
103+
Disposition Disposition
104+
}
105+
106+
func MakeStatus(h Health, l LeaseDisposition, r ReplicaDisposition) Status {
107+
s := Status{
108+
Health: h,
109+
Disposition: Disposition{Lease: l, Replica: r},
110+
}
111+
s.assertOrPanic()
112+
return s
113+
}
114+
115+
func (ss *Status) assert() error {
116+
// NB: byte values can't be negative, so we don't have to check.
117+
118+
if ss.Health >= healthCount {
119+
return errors.AssertionFailedf("invalid health status %d", ss.Health)
120+
}
121+
d := ss.Disposition
122+
if d.Lease >= leaseDispositionCount {
123+
return errors.AssertionFailedf("invalid lease disposition %d", d.Lease)
124+
}
125+
if d.Replica >= replicaDispositionCount {
126+
return errors.AssertionFailedf("invalid replica disposition %d", d.Replica)
127+
}
128+
if ss.Health != HealthOK && (ss.Disposition.Replica == ReplicaDispositionOK || ss.Disposition.Lease == LeaseDispositionOK) {
129+
return errors.AssertionFailedf("non-healthy status must not accept leases or replicas: %+v", ss)
130+
}
131+
return nil
132+
}
133+
134+
func (ss *Status) assertOrPanic() {
135+
if err := ss.assert(); err != nil {
136+
panic(err)
137+
}
138+
}
139+
140+
func (h Health) String() string {
141+
return redact.StringWithoutMarkers(h)
142+
}
143+
144+
// SafeFormat implements the redact.SafeFormatter interface.
145+
func (h Health) SafeFormat(w redact.SafePrinter, _ rune) {
146+
switch h {
147+
case HealthUnknown:
148+
w.Print("unknown")
149+
case HealthOK:
150+
w.Print("ok")
151+
case HealthUnhealthy:
152+
w.Print("unhealthy")
153+
case HealthDead:
154+
w.Print("dead")
155+
default:
156+
w.Print("unknown")
157+
}
158+
}
159+
160+
func (l LeaseDisposition) String() string {
161+
return redact.StringWithoutMarkers(l)
162+
}
163+
164+
// SafeFormat implements the redact.SafeFormatter interface.
165+
func (l LeaseDisposition) SafeFormat(w redact.SafePrinter, _ rune) {
166+
switch l {
167+
case LeaseDispositionOK:
168+
w.Print("ok")
169+
case LeaseDispositionRefusing:
170+
w.Print("refusing")
171+
case LeaseDispositionShedding:
172+
w.Print("shedding")
173+
default:
174+
w.Print("unknown")
175+
}
176+
}
177+
178+
func (r ReplicaDisposition) String() string {
179+
return redact.StringWithoutMarkers(r)
180+
}
181+
182+
// SafeFormat implements the redact.SafeFormatter interface.
183+
func (r ReplicaDisposition) SafeFormat(w redact.SafePrinter, _ rune) {
184+
switch r {
185+
case ReplicaDispositionOK:
186+
w.Print("ok")
187+
case ReplicaDispositionRefusing:
188+
w.Print("refusing")
189+
case ReplicaDispositionShedding:
190+
w.Print("shedding")
191+
default:
192+
w.Print("unknown")
193+
}
194+
}
195+
196+
func (d Disposition) String() string {
197+
return redact.StringWithoutMarkers(d)
198+
}
199+
200+
// SafeFormat implements the redact.SafeFormatter interface.
201+
func (d Disposition) SafeFormat(w redact.SafePrinter, _ rune) {
202+
if d.Lease == LeaseDispositionOK && d.Replica == ReplicaDispositionOK {
203+
w.Print("accepting all")
204+
return
205+
}
206+
207+
var refusing []redact.SafeString
208+
var shedding []redact.SafeString
209+
210+
if d.Lease == LeaseDispositionRefusing {
211+
refusing = append(refusing, redact.SafeString("leases"))
212+
} else if d.Lease == LeaseDispositionShedding {
213+
shedding = append(shedding, redact.SafeString("leases"))
214+
}
215+
216+
if d.Replica == ReplicaDispositionRefusing {
217+
refusing = append(refusing, redact.SafeString("replicas"))
218+
} else if d.Replica == ReplicaDispositionShedding {
219+
shedding = append(shedding, redact.SafeString("replicas"))
220+
}
221+
222+
first := true
223+
if len(refusing) > 0 {
224+
w.Print("refusing=")
225+
for i, s := range refusing {
226+
if i > 0 {
227+
w.Print(",")
228+
}
229+
w.Print(s)
230+
}
231+
first = false
232+
}
233+
if len(shedding) > 0 {
234+
if !first {
235+
w.Print(" ")
236+
}
237+
w.Print("shedding=")
238+
for i, s := range shedding {
239+
if i > 0 {
240+
w.Print(",")
241+
}
242+
w.Print(s)
243+
}
244+
}
245+
}
246+
247+
func (ss Status) String() string {
248+
return redact.StringWithoutMarkers(ss)
249+
}
250+
251+
// SafeFormat implements the redact.SafeFormatter interface.
252+
func (ss Status) SafeFormat(w redact.SafePrinter, _ rune) {
253+
ss.Health.SafeFormat(w, 's')
254+
w.Print(" ")
255+
ss.Disposition.SafeFormat(w, 's')
256+
}

0 commit comments

Comments
 (0)