Commit bd8ee04

Matt Whelan committed
server: new endpoint to expose node termination safety status
We expose range status via the cluster API metrics, which is sufficient to derive node criticality information. However, it's complicated, and customers want a simple, opinionated way to know whether a particular node is safe to terminate when orchestrating a rolling restart.

This change exposes a new endpoint, analogous to a health check, but instead of indicating node health, it indicates when it's unsafe to terminate a node due to the impact on cluster quorum health. By default, we take a conservative approach: we fail the check on any existing under-replication, regardless of replication factor. As an option, we allow the user to relax our stance, and fail the check only when terminating the node in question would cause definite unavailability.

Epic: none
Part of: https://cockroachlabs.atlassian.net/browse/TREQ-929

Release note (ops change): The /health/restart_safety endpoint indicates when it is unsafe to terminate a node.
1 parent 8b5e34a commit bd8ee04
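
As a rough illustration of how an orchestrator might consume this check (not part of this commit): the route path under /api/v2/ and the allow_minimum_quorum query parameter come from this change, while the host, port, retry budget, polling interval, and helper name below are hypothetical.

	package main

	import (
		"fmt"
		"net/http"
		"time"
	)

	// waitUntilSafeToRestart polls the restart-safety endpoint until it
	// returns 200 (safe to terminate) or the retry budget is exhausted.
	// A 503 means terminating the node is currently unsafe; errors are
	// treated as transient and retried.
	func waitUntilSafeToRestart(baseURL string, allowMinimumQuorum bool) error {
		// Route registered in this commit, mounted under the v2 API prefix.
		url := fmt.Sprintf("%s/api/v2/health/restart_safety/?allow_minimum_quorum=%t",
			baseURL, allowMinimumQuorum)
		client := &http.Client{Timeout: 5 * time.Second}
		for attempt := 0; attempt < 60; attempt++ { // retry budget is arbitrary
			resp, err := client.Get(url)
			if err == nil {
				resp.Body.Close()
				if resp.StatusCode == http.StatusOK {
					return nil // safe to terminate this node now
				}
			}
			time.Sleep(10 * time.Second) // polling interval is arbitrary
		}
		return fmt.Errorf("node did not become safe to restart")
	}

	func main() {
		if err := waitUntilSafeToRestart("http://localhost:8080", false); err != nil {
			fmt.Println(err)
		}
	}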

File tree

3 files changed (+456, -1 lines)

pkg/server/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -162,6 +162,7 @@ go_library(
         "//pkg/multitenant/tenantcapabilitiespb",
         "//pkg/multitenant/tenantcostmodel",
         "//pkg/raft",
+        "//pkg/raft/raftpb",
         "//pkg/roachpb",
         "//pkg/rpc",
         "//pkg/rpc/nodedialer",

pkg/server/api_v2.go

Lines changed: 171 additions & 1 deletion
@@ -33,6 +33,11 @@ import (
 	"strconv"
 
 	"github.com/cockroachdb/cockroach/pkg/kv"
+	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
+	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/allocatorimpl"
+	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb"
+	"github.com/cockroachdb/cockroach/pkg/raft/raftpb"
+	"github.com/cockroachdb/cockroach/pkg/roachpb"
 	"github.com/cockroachdb/cockroach/pkg/security/username"
 	"github.com/cockroachdb/cockroach/pkg/server/apiconstants"
 	"github.com/cockroachdb/cockroach/pkg/server/apiutil"
@@ -55,6 +60,7 @@ const (
 
 type ApiV2System interface {
 	health(w http.ResponseWriter, r *http.Request)
+	restartSafetyCheck(w http.ResponseWriter, r *http.Request)
 	listNodes(w http.ResponseWriter, r *http.Request)
 	listNodeRanges(w http.ResponseWriter, r *http.Request)
 }
@@ -95,7 +101,7 @@ type apiV2SystemServer struct {
 }
 
 var _ ApiV2System = &apiV2SystemServer{}
-var _ http.Handler = &apiV2Server{}
+var _ http.Handler = &apiV2SystemServer{}
 
 // newAPIV2Server returns a new apiV2Server.
 func newAPIV2Server(ctx context.Context, opts *apiV2ServerOpts) http.Handler {
@@ -180,6 +186,7 @@ func registerRoutes(
 	{"nodes/{node_id}/ranges/", systemRoutes.listNodeRanges, true, authserver.ViewClusterMetadataRole, false},
 	{"ranges/hot/", a.listHotRanges, true, authserver.ViewClusterMetadataRole, false},
 	{"ranges/{range_id:[0-9]+}/", a.listRange, true, authserver.ViewClusterMetadataRole, false},
+	{"health/restart_safety/", systemRoutes.restartSafetyCheck, false, authserver.RegularRole, false},
 	{"health/", systemRoutes.health, false, authserver.RegularRole, false},
 	{"users/", a.listUsers, true, authserver.RegularRole, false},
 	{"events/", a.listEvents, true, authserver.ViewClusterMetadataRole, false},
@@ -394,6 +401,169 @@ func (a *apiV2Server) health(w http.ResponseWriter, r *http.Request) {
 	healthInternal(w, r, a.admin.checkReadinessForHealthCheck)
 }
 
+// restartSafetyCheck is not implemented on tenant servers; only the system
+// tenant, which manages KV ranges, can evaluate restart safety.
+func (a *apiV2Server) restartSafetyCheck(w http.ResponseWriter, r *http.Request) {
+	apiutil.WriteJSONResponse(r.Context(), w, http.StatusNotImplemented, nil)
+}
+
+// # Restart Safety
+//
+// Endpoint to expose restart safety status. A 200 response indicates that
+// terminating the node in question won't cause any ranges to become
+// unavailable at the time the response was prepared. Users may use this check
+// as a precondition for advancing a rolling restart process. Checks fail with
+// a 503, or a 500 in the case of an error evaluating safety.
+func (a *apiV2SystemServer) restartSafetyCheck(w http.ResponseWriter, r *http.Request) {
+	ctx := r.Context()
+
+	if r.Method != http.MethodGet {
+		http.Error(w, http.StatusText(http.StatusMethodNotAllowed), http.StatusMethodNotAllowed)
+		return
+	}
+
+	const AllowMinimumQuorumFlag = "allow_minimum_quorum"
+
+	query := r.URL.Query()
+	allowMinimumQuorum := false
+	var err error
+	if query.Has(AllowMinimumQuorumFlag) {
+		allowMinimumQuorum, err = strconv.ParseBool(query.Get(AllowMinimumQuorumFlag))
+		if err != nil {
+			http.Error(w, "invalid allow_minimum_quorum value; should be true or false", http.StatusBadRequest)
+			return
+		}
+	}
+
+	nodeID := a.systemStatus.node.Descriptor.NodeID
+
+	res, err := checkRestartSafe(
+		ctx,
+		nodeID,
+		a.systemStatus.nodeLiveness,
+		a.systemStatus.stores,
+		a.systemStatus.storePool.ClusterNodeCount(),
+		allowMinimumQuorum,
+	)
+	if err != nil {
+		http.Error(w, "Error checking store status", http.StatusInternalServerError)
+		return
+	}
+
+	if !res.IsRestartSafe {
+		// In the style of health check endpoints, we respond with a server error
+		// to indicate unhealthy. This makes it much easier to integrate this
+		// endpoint with relatively unsophisticated clients, such as shell scripts.
+		apiutil.WriteJSONResponse(ctx, w, http.StatusServiceUnavailable, res)
+		return
+	}
+	apiutil.WriteJSONResponse(ctx, w, http.StatusOK, res)
+}
+
+type storeVisitor interface {
+	VisitStores(visitor func(s *kvserver.Store) error) error
+}
+
+// RestartSafetyResponse indicates whether the current node is critical
+// (cannot be restarted safely).
+type RestartSafetyResponse struct {
+	NodeID int32 `json:"node_id,omitempty"`
+	// IsRestartSafe is true if restarting this node is safe, under the
+	// quorum restrictions requested.
+	IsRestartSafe bool `json:"is_restart_safe,omitempty"`
+	// UnavailableRangeCount indicates how many currently unavailable
+	// ranges contribute to restart being unsafe.
+	UnavailableRangeCount int32 `json:"unavailable_range_count,omitempty"`
+	// UnderreplicatedRangeCount indicates how many currently
+	// underreplicated ranges contribute to restart being unsafe.
+	UnderreplicatedRangeCount int32 `json:"underreplicated_range_count,omitempty"`
+	// RaftLeadershipOnNodeCount indicates how many ranges this node leads.
+	RaftLeadershipOnNodeCount int32 `json:"raft_leadership_on_node_count,omitempty"`
+	// StoreNotDrainingCount indicates how many of this node's stores are
+	// not draining.
+	StoreNotDrainingCount int32 `json:"store_not_draining_count,omitempty"`
+}
+
+func checkRestartSafe(
+	ctx context.Context,
+	nodeID roachpb.NodeID,
+	nodeLiveness livenesspb.NodeVitalityInterface,
+	stores storeVisitor,
+	nodeCount int,
+	allowMinimumQuorum bool,
+) (*RestartSafetyResponse, error) {
+	res := &RestartSafetyResponse{
+		IsRestartSafe: true,
+		NodeID:        int32(nodeID),
+	}
+
+	vitality := nodeLiveness.ScanNodeVitalityFromCache()
+	// For each of the node's stores, check each replica's status.
+	err := stores.VisitStores(func(store *kvserver.Store) error {
+		if int32(store.NodeID()) != res.NodeID {
+			return nil
+		}
+
+		if !store.IsDraining() {
+			res.IsRestartSafe = false
+			res.StoreNotDrainingCount++
+		}
+
+		store.VisitReplicas(func(replica *kvserver.Replica) bool {
+			desc, spanCfg := replica.DescAndSpanConfig()
+
+			neededVoters := allocatorimpl.GetNeededVoters(spanCfg.GetNumVoters(), nodeCount)
+			rangeStatus := desc.Replicas().ReplicationStatus(func(rd roachpb.ReplicaDescriptor) bool {
+				return vitality[rd.NodeID].IsLive(livenesspb.Metrics)
+			}, neededVoters, -1)
+
+			isLeader := replica.RaftBasicStatus().RaftState == raftpb.StateLeader
+
+			// Reject Unavailable, Underreplicated, and Leader replicas as unsafe
+			// to terminate. Additionally, reject when the store (really the node,
+			// but the status is reported at the store level) is not draining.
+			if !rangeStatus.Available {
+				res.IsRestartSafe = false
+				res.UnavailableRangeCount++
+			}
+			if isLeader {
+				res.IsRestartSafe = false
+				res.RaftLeadershipOnNodeCount++
+			}
+
+			if rangeStatus.UnderReplicated {
+				if neededVoters >= 5 && allowMinimumQuorum {
+					// When neededVoters >= 5, present under-replication doesn't
+					// actually imply unavailability after we terminate this node.
+					// The caller has opted in to allowMinimumQuorum restarts, so
+					// check whether this node being down actually causes
+					// unavailability.
+					futureStatus := desc.Replicas().ReplicationStatus(func(rd roachpb.ReplicaDescriptor) bool {
+						if rd.NodeID == nodeID {
+							return false
+						}
+						return vitality[rd.NodeID].IsLive(livenesspb.Metrics)
+					}, neededVoters, -1)
+
+					if !futureStatus.Available {
+						// Even minimum quorum won't be maintained if we down this node.
+						res.IsRestartSafe = false
+						res.UnderreplicatedRangeCount++
+					}
+				} else {
+					// No allowMinimumQuorum (or not a big enough RF for that to
+					// matter), so under-replication is enough.
+					res.IsRestartSafe = false
+					res.UnderreplicatedRangeCount++
+				}
+			}
+
+			return ctx.Err() == nil
+		})
+
+		if ctx.Err() != nil {
+			return ctx.Err()
+		}
+		return nil
+	})
+	return res, err
+}
+
 // # Get metric recording and alerting rule templates
 //
 // Endpoint to export recommended metric recording and alerting rules.
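
To make the allow_minimum_quorum behavior concrete, here is a worked example of the quorum arithmetic behind the second ReplicationStatus pass. This is a sketch under assumptions, not code from this commit; the helper names are hypothetical. With neededVoters = 5 and one voter already on a dead node, the range is under-replicated (4 of 5 live), so the default conservative check fails. But terminating this node still leaves 3 live voters, which meets the majority quorum of 3, so the relaxed check passes.

	package main

	import "fmt"

	// quorum is the majority needed for a Raft group with n voting replicas.
	func quorum(n int) int { return n/2 + 1 }

	func main() {
		neededVoters := 5               // replication factor of the range
		liveNow := 4                    // one voter is already on a dead node
		liveAfterRestart := liveNow - 1 // this node's voter goes down too

		// true -> default conservative check fails on existing under-replication
		fmt.Println("under-replicated today:", liveNow < neededVoters)
		// true (3 >= 3) -> relaxed allow_minimum_quorum check passes
		fmt.Println("available after restart:", liveAfterRestart >= quorum(neededVoters))
	}

On an unsafe result, the 503 response body is the JSON-serialized RestartSafetyResponse, so scripts that want details (for example, underreplicated_range_count) can parse it, while simple gates can key off the status code alone.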
