Skip to content

Commit d00e9c3

Browse files
craig[bot]tbg
andcommitted
Merge #156933
156933: mmaprototype: remove health tracking, do some housekeeping r=tbg a=tbg Epic: CRDB-55052 Co-authored-by: Tobias Grieger <[email protected]>
2 parents 944a705 + 4822f99 commit d00e9c3

18 files changed

+271
-403
lines changed

pkg/kv/kvserver/allocator/mmaprototype/BUILD.bazel

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ go_library(
1313
"memo_helper.go",
1414
"messages.go",
1515
"rebalance_advisor.go",
16+
"store_load_summary.go",
17+
"store_set.go",
1618
"top_k_replicas.go",
1719
],
1820
importpath = "github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/mmaprototype",

pkg/kv/kvserver/allocator/mmaprototype/allocator.go

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,6 @@ type Allocator interface {
3939
// about the nodes in the cluster is a side effect of this method.
4040
SetStore(store StoreAttributesAndLocality)
4141

42-
// RemoveNodeAndStores tells the allocator to remove the NodeID and all its
43-
// stores.
44-
RemoveNodeAndStores(nodeID roachpb.NodeID) error
45-
46-
// UpdateFailureDetectionSummary tells the allocator about the current
47-
// failure detection state for a node. A node starts in the fdOK state.
48-
UpdateFailureDetectionSummary(nodeID roachpb.NodeID, fd failureDetectionSummary) error
49-
5042
// ProcessStoreLoadMsg provides frequent the state of every store and its
5143
// associated node in the cluster.
5244
ProcessStoreLoadMsg(ctx context.Context, msg *StoreLoadMsg)

pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go

Lines changed: 6 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -353,8 +353,8 @@ func (a *allocatorState) rebalanceStores(
353353

354354
var changes []PendingRangeChange
355355
var disj [1]constraintsConj
356-
var storesToExclude storeIDPostingList
357-
var storesToExcludeForRange storeIDPostingList
356+
var storesToExclude storeSet
357+
var storesToExcludeForRange storeSet
358358
scratchNodes := map[roachpb.NodeID]*NodeLoad{}
359359
scratchStores := map[roachpb.StoreID]struct{}{}
360360
// The caller has a fixed concurrency limit it can move ranges at, when it
@@ -461,7 +461,7 @@ func (a *allocatorState) rebalanceStores(
461461
store.StoreID, rangeID))
462462
}
463463
cands, _ := rstate.constraints.candidatesToMoveLease()
464-
var candsPL storeIDPostingList
464+
var candsPL storeSet
465465
for _, cand := range cands {
466466
candsPL.insert(cand.storeID)
467467
}
@@ -493,10 +493,6 @@ func (a *allocatorState) rebalanceStores(
493493
continue
494494
}
495495
candSls := a.cs.computeLoadSummary(ctx, cand.storeID, &means.storeLoad, &means.nodeLoad)
496-
if sls.fd != fdOK {
497-
log.KvDistribution.VInfof(ctx, 2, "skipping store s%d: failure detection status not OK", cand.storeID)
498-
continue
499-
}
500496
candsSet.candidates = append(candsSet.candidates, candidateInfo{
501497
StoreID: cand.storeID,
502498
storeLoadSummary: candSls,
@@ -608,7 +604,7 @@ func (a *allocatorState) rebalanceStores(
608604
// If the node is cpu overloaded, or the store/node is not fdOK, exclude
609605
// the other stores on this node from receiving replicas shed by this
610606
// store.
611-
excludeStoresOnNode := store.nls > overloadSlow || store.fd != fdOK
607+
excludeStoresOnNode := store.nls > overloadSlow
612608
storesToExclude = storesToExclude[:0]
613609
if excludeStoresOnNode {
614610
nodeID := ss.NodeID
@@ -812,22 +808,6 @@ func (a *allocatorState) SetStore(store StoreAttributesAndLocality) {
812808
a.cs.setStore(store)
813809
}
814810

815-
// RemoveNodeAndStores implements the Allocator interface.
816-
func (a *allocatorState) RemoveNodeAndStores(nodeID roachpb.NodeID) error {
817-
a.mu.Lock()
818-
defer a.mu.Unlock()
819-
panic("unimplemented")
820-
}
821-
822-
// UpdateFailureDetectionSummary implements the Allocator interface.
823-
func (a *allocatorState) UpdateFailureDetectionSummary(
824-
nodeID roachpb.NodeID, fd failureDetectionSummary,
825-
) error {
826-
a.mu.Lock()
827-
defer a.mu.Unlock()
828-
panic("unimplemented")
829-
}
830-
831811
// ProcessStoreLeaseholderMsg implements the Allocator interface.
832812
func (a *allocatorState) ProcessStoreLoadMsg(ctx context.Context, msg *StoreLoadMsg) {
833813
a.mu.Lock()
@@ -1222,7 +1202,7 @@ func sortTargetCandidateSetAndPick(
12221202
// ones that have notMatchedLeasePreferenceIndex.
12231203
j = 0
12241204
for _, cand := range cands.candidates {
1225-
if cand.leasePreferenceIndex == notMatchedLeasePreferencIndex {
1205+
if cand.leasePreferenceIndex == notMatchedLeasePreferenceIndex {
12261206
break
12271207
}
12281208
j++
@@ -1360,7 +1340,7 @@ func (a *allocatorState) ensureAnalyzedConstraints(rstate *rangeState) bool {
13601340
func (a *allocatorState) computeCandidatesForRange(
13611341
ctx context.Context,
13621342
expr constraintsDisj,
1363-
storesToExclude storeIDPostingList,
1343+
storesToExclude storeSet,
13641344
loadSheddingStore roachpb.StoreID,
13651345
) (_ candidateSet, sheddingSLS storeLoadSummary) {
13661346
means := a.cs.meansMemo.getMeans(expr)
@@ -1381,9 +1361,6 @@ func (a *allocatorState) computeCandidatesForRange(
13811361
}
13821362
ss := a.cs.stores[storeID]
13831363
csls := a.cs.meansMemo.getStoreLoadSummary(ctx, means, storeID, ss.loadSeqNum)
1384-
if csls.fd != fdOK {
1385-
continue
1386-
}
13871364
cset.candidates = append(cset.candidates, candidateInfo{
13881365
StoreID: storeID,
13891366
storeLoadSummary: csls,

pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go

Lines changed: 0 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -597,37 +597,9 @@ type pendingReplicaChange struct {
597597
enactedAtTime time.Time
598598
}
599599

600-
// TODO(kvoli): This will eventually be used to represent the state of a node's
601-
// membership. This corresponds to non-decommissioning, decommissioning and
602-
// decommissioned. Fill in commentary and use.
603-
type storeMembership int8
604-
605-
const (
606-
storeMembershipMember storeMembership = iota
607-
storeMembershipRemoving
608-
storeMembershipRemoved
609-
)
610-
611-
func (s storeMembership) String() string {
612-
return redact.StringWithoutMarkers(s)
613-
}
614-
615-
// SafeFormat implements the redact.SafeFormatter interface.
616-
func (s storeMembership) SafeFormat(w redact.SafePrinter, _ rune) {
617-
switch s {
618-
case storeMembershipMember:
619-
w.Print("full")
620-
case storeMembershipRemoving:
621-
w.Print("removing")
622-
case storeMembershipRemoved:
623-
w.Print("removed")
624-
}
625-
}
626-
627600
// storeState maintains the complete state about a store as known to the
628601
// allocator.
629602
type storeState struct {
630-
storeMembership
631603
storeLoad
632604
StoreAttributesAndLocality
633605
adjusted struct {
@@ -829,47 +801,10 @@ func newStoreState() *storeState {
829801
return ss
830802
}
831803

832-
// failureDetectionSummary is provided by an external entity and never
833-
// computed inside the allocator.
834-
type failureDetectionSummary uint8
835-
836-
// All state transitions are permitted by the allocator. For example, fdDead
837-
// => fdOk is allowed since the allocator can simply stop shedding replicas
838-
// and then start adding replicas (if underloaded).
839-
const (
840-
fdOK failureDetectionSummary = iota
841-
// Don't add replicas or leases.
842-
fdSuspect
843-
// Move leases away. Don't add replicas or leases.
844-
fdDrain
845-
// Node is dead, so move leases and replicas away from it.
846-
fdDead
847-
)
848-
849-
func (fds failureDetectionSummary) String() string {
850-
return redact.StringWithoutMarkers(fds)
851-
}
852-
853-
// SafeFormat implements the redact.SafeFormatter interface.
854-
func (fds failureDetectionSummary) SafeFormat(w redact.SafePrinter, _ rune) {
855-
switch fds {
856-
case fdOK:
857-
w.Print("ok")
858-
case fdSuspect:
859-
w.Print("suspect")
860-
case fdDrain:
861-
w.Print("drain")
862-
case fdDead:
863-
w.Print("dead")
864-
}
865-
}
866-
867804
type nodeState struct {
868805
stores []roachpb.StoreID
869806
NodeLoad
870807
adjustedCPU LoadValue
871-
872-
fdSummary failureDetectionSummary
873808
}
874809

875810
func newNodeState(nodeID roachpb.NodeID) *nodeState {
@@ -1897,21 +1832,6 @@ func (cs *clusterState) setStore(sal StoreAttributesAndLocality) {
18971832
}
18981833
}
18991834

1900-
func (cs *clusterState) setStoreMembership(storeID roachpb.StoreID, state storeMembership) {
1901-
if ss, ok := cs.stores[storeID]; ok {
1902-
ss.storeMembership = state
1903-
} else {
1904-
panic(fmt.Sprintf("store %d not found in cluster state", storeID))
1905-
}
1906-
}
1907-
1908-
func (cs *clusterState) updateFailureDetectionSummary(
1909-
nodeID roachpb.NodeID, fd failureDetectionSummary,
1910-
) {
1911-
ns := cs.nodes[nodeID]
1912-
ns.fdSummary = fd
1913-
}
1914-
19151835
//======================================================================
19161836
// clusterState accessors:
19171837
//
@@ -2193,7 +2113,6 @@ func computeLoadSummary(
21932113
nls: nls,
21942114
dimSummary: dimSummary,
21952115
highDiskSpaceUtilization: highDiskSpaceUtil,
2196-
fd: ns.fdSummary,
21972116
maxFractionPendingIncrease: ss.maxFractionPendingIncrease,
21982117
maxFractionPendingDecrease: ss.maxFractionPendingDecrease,
21992118
loadSeqNum: ss.loadSeqNum,

pkg/kv/kvserver/allocator/mmaprototype/cluster_state_test.go

Lines changed: 8 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -207,8 +207,8 @@ func printPendingChangesTest(changes []*pendingReplicaChange) string {
207207
return buf.String()
208208
}
209209

210-
func testingGetStoreList(t *testing.T, cs *clusterState) (member, removed storeIDPostingList) {
211-
var clusterStoreList, nodeStoreList storeIDPostingList
210+
func testingGetStoreList(t *testing.T, cs *clusterState) storeSet {
211+
var clusterStoreList, nodeStoreList storeSet
212212
// Ensure that the storeIDs in the cluster store map and the stores listed
213213
// under each node are the same.
214214
for storeID := range cs.stores {
@@ -222,15 +222,7 @@ func testingGetStoreList(t *testing.T, cs *clusterState) (member, removed storeI
222222
require.True(t, clusterStoreList.isEqual(nodeStoreList),
223223
"expected store lists to be equal %v != %v", clusterStoreList, nodeStoreList)
224224

225-
for storeID, ss := range cs.stores {
226-
switch ss.storeMembership {
227-
case storeMembershipMember, storeMembershipRemoving:
228-
member.insert(storeID)
229-
case storeMembershipRemoved:
230-
removed.insert(storeID)
231-
}
232-
}
233-
return member, removed
225+
return clusterStoreList
234226
}
235227

236228
func testingGetPendingChanges(t *testing.T, cs *clusterState) []*pendingReplicaChange {
@@ -301,12 +293,12 @@ func TestClusterState(t *testing.T) {
301293
var buf strings.Builder
302294
for _, nodeID := range nodeList {
303295
ns := cs.nodes[roachpb.NodeID(nodeID)]
304-
fmt.Fprintf(&buf, "node-id=%s failure-summary=%s locality-tiers=%s\n",
305-
ns.NodeID, ns.fdSummary, cs.stores[ns.stores[0]].StoreAttributesAndLocality.locality())
296+
fmt.Fprintf(&buf, "node-id=%s locality-tiers=%s\n",
297+
ns.NodeID, cs.stores[ns.stores[0]].StoreAttributesAndLocality.locality())
306298
for _, storeID := range ns.stores {
307299
ss := cs.stores[storeID]
308-
fmt.Fprintf(&buf, " store-id=%v membership=%v attrs=%s locality-code=%s\n",
309-
ss.StoreID, ss.storeMembership, ss.StoreAttrs, ss.localityTiers.str)
300+
fmt.Fprintf(&buf, " store-id=%v attrs=%s locality-code=%s\n",
301+
ss.StoreID, ss.StoreAttrs, ss.localityTiers.str)
310302
}
311303
}
312304
return buf.String()
@@ -336,7 +328,7 @@ func TestClusterState(t *testing.T) {
336328

337329
case "get-load-info":
338330
var buf strings.Builder
339-
memberStores, _ := testingGetStoreList(t, cs)
331+
memberStores := testingGetStoreList(t, cs)
340332
for _, storeID := range memberStores {
341333
ss := cs.stores[storeID]
342334
ns := cs.nodes[ss.NodeID]
@@ -365,40 +357,6 @@ func TestClusterState(t *testing.T) {
365357
}
366358
return printNodeListMeta()
367359

368-
case "set-store-membership":
369-
storeID := dd.ScanArg[roachpb.StoreID](t, d, "store-id")
370-
var storeMembershipVal storeMembership
371-
switch str := dd.ScanArg[string](t, d, "membership"); str {
372-
case "member":
373-
storeMembershipVal = storeMembershipMember
374-
case "removing":
375-
storeMembershipVal = storeMembershipRemoving
376-
case "removed":
377-
storeMembershipVal = storeMembershipRemoved
378-
}
379-
cs.setStoreMembership(storeID, storeMembershipVal)
380-
381-
var buf strings.Builder
382-
nonRemovedStores, removedStores := testingGetStoreList(t, cs)
383-
buf.WriteString("member store-ids: ")
384-
printPostingList(&buf, nonRemovedStores)
385-
buf.WriteString("\nremoved store-ids: ")
386-
printPostingList(&buf, removedStores)
387-
return buf.String()
388-
389-
case "update-failure-detection":
390-
nodeID := dd.ScanArg[roachpb.NodeID](t, d, "node-id")
391-
failureDetectionString := dd.ScanArg[string](t, d, "summary")
392-
var fd failureDetectionSummary
393-
for i := fdOK; i < fdDead+1; i++ {
394-
if i.String() == failureDetectionString {
395-
fd = i
396-
break
397-
}
398-
}
399-
cs.updateFailureDetectionSummary(nodeID, fd)
400-
return printNodeListMeta()
401-
402360
case "store-load-msg":
403361
msg := parseStoreLoadMsg(t, d.Input)
404362
cs.processStoreLoadMsg(context.Background(), &msg)

0 commit comments

Comments
 (0)