
Commit 255f60e

craig[bot] and Matt Whelan committed
Merge #142930
142930: server: new endpoint to expose node criticality status r=arulajmani,kyle-a-wong a=MattWhelan

We expose range status via the cluster API metrics, which is sufficient to derive node criticality information. However, it's complicated, and customers want a simple, opinionated way to know whether a particular node is safe to terminate when orchestrating a rolling restart.

This change exposes a new endpoint, analogous to a healthcheck, but instead of indicating node health, it indicates when it's unsafe to terminate a node due to impact on cluster quorum health.

Epic: none
Part of: https://cockroachlabs.atlassian.net/browse/TREQ-929

Release note (ops change): The restart safety endpoint indicates when it is unsafe to terminate a node.

Co-authored-by: Matt Whelan <[email protected]>
2 parents 63545b4 + bd8ee04 · commit 255f60e
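The commit message frames the endpoint as a healthcheck-style probe: 200 means the node is safe to terminate, 503 means it is not, and any other status means the check itself failed. A minimal sketch of how a rolling-restart orchestrator might consume it is shown below; the /api/v2/ URL prefix, host and port, TLS handling, and absence of authentication headers are assumptions for illustration, not details fixed by this commit.

// restart_probe.go: poll a node's restart-safety endpoint before stopping it.
// The URL layout (/api/v2/health/restart_safety/) and the lack of an auth
// header are assumptions for illustration; adjust for your deployment.
package main

import (
	"fmt"
	"net/http"
	"time"
)

func waitUntilRestartSafe(baseURL string, allowMinimumQuorum bool, timeout time.Duration) error {
	url := baseURL + "/api/v2/health/restart_safety/"
	if allowMinimumQuorum {
		url += "?allow_minimum_quorum=true"
	}
	deadline := time.Now().Add(timeout)
	for time.Now().Before(deadline) {
		resp, err := http.Get(url)
		if err != nil {
			return err
		}
		resp.Body.Close()
		switch resp.StatusCode {
		case http.StatusOK:
			return nil // safe to terminate this node
		case http.StatusServiceUnavailable:
			time.Sleep(5 * time.Second) // node is still critical; retry
		default:
			return fmt.Errorf("restart safety check failed: %s", resp.Status)
		}
	}
	return fmt.Errorf("node still unsafe to restart after %s", timeout)
}

func main() {
	if err := waitUntilRestartSafe("https://localhost:8080", false, 5*time.Minute); err != nil {
		fmt.Println("not restarting:", err)
		return
	}
	fmt.Println("safe to restart")
}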

File tree

3 files changed: +456 −1 lines changed


pkg/server/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -162,6 +162,7 @@ go_library(
         "//pkg/multitenant/tenantcapabilitiespb",
         "//pkg/multitenant/tenantcostmodel",
         "//pkg/raft",
+        "//pkg/raft/raftpb",
         "//pkg/roachpb",
         "//pkg/rpc",
         "//pkg/rpc/nodedialer",

pkg/server/api_v2.go

Lines changed: 171 additions & 1 deletion
@@ -33,6 +33,11 @@ import (
 	"strconv"

 	"github.com/cockroachdb/cockroach/pkg/kv"
+	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
+	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/allocatorimpl"
+	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb"
+	"github.com/cockroachdb/cockroach/pkg/raft/raftpb"
+	"github.com/cockroachdb/cockroach/pkg/roachpb"
 	"github.com/cockroachdb/cockroach/pkg/security/username"
 	"github.com/cockroachdb/cockroach/pkg/server/apiconstants"
 	"github.com/cockroachdb/cockroach/pkg/server/apiutil"
@@ -55,6 +60,7 @@ const (

 type ApiV2System interface {
 	health(w http.ResponseWriter, r *http.Request)
+	restartSafetyCheck(w http.ResponseWriter, r *http.Request)
 	listNodes(w http.ResponseWriter, r *http.Request)
 	listNodeRanges(w http.ResponseWriter, r *http.Request)
 }
@@ -95,7 +101,7 @@ type apiV2SystemServer struct {
 }

 var _ ApiV2System = &apiV2SystemServer{}
-var _ http.Handler = &apiV2Server{}
+var _ http.Handler = &apiV2SystemServer{}

 // newAPIV2Server returns a new apiV2Server.
 func newAPIV2Server(ctx context.Context, opts *apiV2ServerOpts) http.Handler {
@@ -180,6 +186,7 @@ func registerRoutes(
 		{"nodes/{node_id}/ranges/", systemRoutes.listNodeRanges, true, authserver.ViewClusterMetadataRole, false},
 		{"ranges/hot/", a.listHotRanges, true, authserver.ViewClusterMetadataRole, false},
 		{"ranges/{range_id:[0-9]+}/", a.listRange, true, authserver.ViewClusterMetadataRole, false},
+		{"health/restart_safety/", systemRoutes.restartSafetyCheck, false, authserver.RegularRole, false},
 		{"health/", systemRoutes.health, false, authserver.RegularRole, false},
 		{"users/", a.listUsers, true, authserver.RegularRole, false},
 		{"events/", a.listEvents, true, authserver.ViewClusterMetadataRole, false},
@@ -394,6 +401,169 @@ func (a *apiV2Server) health(w http.ResponseWriter, r *http.Request) {
 	healthInternal(w, r, a.admin.checkReadinessForHealthCheck)
 }

+func (a *apiV2Server) restartSafetyCheck(w http.ResponseWriter, r *http.Request) {
+	apiutil.WriteJSONResponse(r.Context(), w, http.StatusNotImplemented, nil)
+}
+
+// # Restart Safety
+//
+// Endpoint to expose restart safety status. A 200 response indicates that
+// terminating the node in question won't cause any ranges to become
+// unavailable, as of the time the response was prepared. Users may use this
+// check as a precondition for advancing a rolling restart process. Checks fail
+// with a 503, or with a 500 if an error occurs while evaluating safety.
+func (a *apiV2SystemServer) restartSafetyCheck(w http.ResponseWriter, r *http.Request) {
+	ctx := r.Context()
+
+	if r.Method != http.MethodGet {
+		http.Error(w, http.StatusText(http.StatusMethodNotAllowed), http.StatusMethodNotAllowed)
+		return
+	}
+
+	const AllowMinimumQuorumFlag = "allow_minimum_quorum"
+
+	query := r.URL.Query()
+	allowMinimumQuorum := false
+	var err error
+	if query.Has(AllowMinimumQuorumFlag) {
+		allowMinimumQuorum, err = strconv.ParseBool(query.Get(AllowMinimumQuorumFlag))
+		if err != nil {
+			http.Error(w, "invalid allow_minimum_quorum value; should be true or false", http.StatusBadRequest)
+			return
+		}
+	}
+
+	nodeID := a.systemStatus.node.Descriptor.NodeID
+
+	res, err := checkRestartSafe(
+		ctx,
+		nodeID,
+		a.systemStatus.nodeLiveness,
+		a.systemStatus.stores,
+		a.systemStatus.storePool.ClusterNodeCount(),
+		allowMinimumQuorum,
+	)
+	if err != nil {
+		http.Error(w, "Error checking store status", http.StatusInternalServerError)
+		return
+	}
+
+	if !res.IsRestartSafe {
+		// In the style of health check endpoints, we respond with a server error
+		// to indicate unhealthy. This makes it much easier to integrate this
+		// endpoint with relatively unsophisticated clients, such as shell scripts.
+		apiutil.WriteJSONResponse(ctx, w, http.StatusServiceUnavailable, res)
+		return
+	}
+	apiutil.WriteJSONResponse(ctx, w, http.StatusOK, res)
+}
+
+type storeVisitor interface {
+	VisitStores(visitor func(s *kvserver.Store) error) error
+}
+
+// RestartSafetyResponse indicates whether the current node is critical
+// (cannot be restarted safely).
+type RestartSafetyResponse struct {
+	NodeID int32 `json:"node_id,omitempty"`
+	// IsRestartSafe is true if restarting this node is safe, under the
+	// quorum restrictions requested.
+	IsRestartSafe bool `json:"is_restart_safe,omitempty"`
+	// UnavailableRangeCount indicates how many currently unavailable
+	// ranges contribute to restart being unsafe.
+	UnavailableRangeCount int32 `json:"unavailable_range_count,omitempty"`
+	// UnderreplicatedRangeCount indicates how many currently
+	// underreplicated ranges contribute to restart being unsafe.
+	UnderreplicatedRangeCount int32 `json:"underreplicated_range_count,omitempty"`
+	// RaftLeadershipOnNodeCount indicates how many ranges this node leads.
+	RaftLeadershipOnNodeCount int32 `json:"raft_leadership_on_node_count,omitempty"`
+	// StoreNotDrainingCount indicates how many of this node's stores are
+	// not draining.
+	StoreNotDrainingCount int32 `json:"store_not_draining_count,omitempty"`
+}
+
+func checkRestartSafe(
+	ctx context.Context,
+	nodeID roachpb.NodeID,
+	nodeLiveness livenesspb.NodeVitalityInterface,
+	stores storeVisitor,
+	nodeCount int,
+	allowMinimumQuorum bool,
+) (*RestartSafetyResponse, error) {
+	res := &RestartSafetyResponse{
+		IsRestartSafe: true,
+		NodeID:        int32(nodeID),
+	}
+
+	vitality := nodeLiveness.ScanNodeVitalityFromCache()
+	// For each of the node's stores, check each replica's status.
+	err := stores.VisitStores(func(store *kvserver.Store) error {
+		if int32(store.NodeID()) != res.NodeID {
+			return nil
+		}
+
+		if !store.IsDraining() {
+			res.IsRestartSafe = false
+			res.StoreNotDrainingCount++
+		}
+
+		store.VisitReplicas(func(replica *kvserver.Replica) bool {
+			desc, spanCfg := replica.DescAndSpanConfig()
+
+			neededVoters := allocatorimpl.GetNeededVoters(spanCfg.GetNumVoters(), nodeCount)
+			rangeStatus := desc.Replicas().ReplicationStatus(func(rd roachpb.ReplicaDescriptor) bool {
+				return vitality[rd.NodeID].IsLive(livenesspb.Metrics)
+			}, neededVoters, -1)
+
+			isLeader := replica.RaftBasicStatus().RaftState == raftpb.StateLeader
+
+			// Reject Unavailable, Underreplicated, and Leader replicas as unsafe
+			// to terminate. Additionally, reject when the store (really the node,
+			// but the status is reported at the store level) is not draining.
+			if !rangeStatus.Available {
+				res.IsRestartSafe = false
+				res.UnavailableRangeCount++
+			}
+			if isLeader {
+				res.IsRestartSafe = false
+				res.RaftLeadershipOnNodeCount++
+			}
+
+			if rangeStatus.UnderReplicated {
+				if neededVoters >= 5 && allowMinimumQuorum {
+					// When neededVoters >= 5, present underreplication doesn't
+					// necessarily imply unavailability after we terminate this
+					// node. The caller has opted in to allowMinimumQuorum
+					// restarts, so check whether this node being down actually
+					// causes unavailability.
+					futureStatus := desc.Replicas().ReplicationStatus(func(rd roachpb.ReplicaDescriptor) bool {
+						if rd.NodeID == nodeID {
+							return false
+						}
+						return vitality[rd.NodeID].IsLive(livenesspb.Metrics)
+					}, neededVoters, -1)
+
+					if !futureStatus.Available {
+						// Even minimum quorum won't be maintained if we down this node.
+						res.IsRestartSafe = false
+						res.UnderreplicatedRangeCount++
+					}
+				} else {
+					// No allowMinimumQuorum (or the replication factor isn't high
+					// enough for it to matter), so underreplication alone makes
+					// the restart unsafe.
+					res.IsRestartSafe = false
+					res.UnderreplicatedRangeCount++
+				}
+			}
+
+			return ctx.Err() == nil
+		})
+
+		if ctx.Err() != nil {
+			return ctx.Err()
+		}
+		return nil
+	})
+	return res, err
+}
+
 // # Get metric recording and alerting rule templates
 //
 // Endpoint to export recommended metric recording and alerting rules.
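The subtlest part of checkRestartSafe above is the allow_minimum_quorum branch: an underreplicated range normally blocks a restart, but with five or more needed voters the caller may opt to proceed as long as a quorum of voters would remain live with this node down. The standalone sketch below restates that arithmetic outside the kvserver types; it is an illustrative approximation of what ReplicationStatus computes over live voters, not the CockroachDB implementation.

// quorum_sketch.go: a standalone restatement of the minimum-quorum check.
// It is not the CockroachDB implementation; it only mirrors the arithmetic
// that the availability check performs over live voters.
package main

import "fmt"

// quorumSize is the majority needed among the configured voters.
func quorumSize(neededVoters int) int { return neededVoters/2 + 1 }

// safeToStop reports whether stopping the local node still leaves a quorum,
// given how many voters are configured and how many non-local voters are live.
func safeToStop(neededVoters, liveVotersExcludingLocal int) bool {
	return liveVotersExcludingLocal >= quorumSize(neededVoters)
}

func main() {
	// RF=5 range with one replica already down: if the local node also stops,
	// 3 of 5 voters remain live, which still meets the quorum of 3, so the
	// range does not block an allow_minimum_quorum restart.
	fmt.Println(safeToStop(5, 3)) // true

	// RF=5 range with two replicas already down: stopping the local node
	// leaves only 2 live voters, below the quorum of 3, so the restart is
	// reported unsafe regardless of allow_minimum_quorum.
	fmt.Println(safeToStop(5, 2)) // false
}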
