Skip to content

Commit dfea05d

Browse files
committed
Add /raftz monitoring endpoint
We often find ourselves without good visibility into what's really going on in the Raft layer. This endpoint dumps quite a substantial amount of internal Raft node state.

Filters include:

* `?acc=ACCNAME` to filter by account
* `?group=GROUP` to show only specific groups

Otherwise all groups across all accounts are shown.

Signed-off-by: Neil Twigg <neil@nats.io>
1 parent f263d75 commit dfea05d

File tree

2 files changed

+137
-0
lines changed

2 files changed

+137
-0
lines changed

server/monitor.go

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1421,6 +1421,7 @@ func (s *Server) HandleRoot(w http.ResponseWriter, r *http.Request) {
14211421
<a href=.%s>Routes</a>
14221422
<a href=.%s>LeafNodes</a>
14231423
<a href=.%s>Gateways</a>
1424+
<a href=.%s>Raft Groups</a>
14241425
<a href=.%s class=last>Health Probe</a>
14251426
<a href=https://docs.nats.io/running-a-nats-service/nats_admin/monitoring class="help">Help</a>
14261427
</body>
@@ -1436,6 +1437,7 @@ func (s *Server) HandleRoot(w http.ResponseWriter, r *http.Request) {
14361437
s.basePath(RoutezPath),
14371438
s.basePath(LeafzPath),
14381439
s.basePath(GatewayzPath),
1440+
s.basePath(RaftzPath),
14391441
s.basePath(HealthzPath),
14401442
)
14411443
}
@@ -3741,3 +3743,135 @@ func (s *Server) profilez(opts *ProfilezOptions) *ProfilezStatus {
37413743
Profile: buffer.Bytes(),
37423744
}
37433745
}
3746+
3747+
type RaftzGroup struct {
3748+
ID string `json:"id"`
3749+
State string `json:"state"`
3750+
Size int `json:"size"`
3751+
QuorumNeeded int `json:"quorum_needed"`
3752+
Observer bool `json:"observer,omitempty"`
3753+
Paused bool `json:"paused,omitempty"`
3754+
Committed uint64 `json:"committed"`
3755+
Applied uint64 `json:"applied"`
3756+
CatchingUp bool `json:"catching_up,omitempty"`
3757+
Leader string `json:"leader,omitempty"`
3758+
EverHadLeader bool `json:"ever_had_leader"`
3759+
Term uint64 `json:"term"`
3760+
Vote string `json:"voted_for,omitempty"`
3761+
PTerm uint64 `json:"pterm"`
3762+
PIndex uint64 `json:"pindex"`
3763+
IPQPropLen int `json:"ipq_proposal_len"`
3764+
IPQEntryLen int `json:"ipq_entry_len"`
3765+
IPQRespLen int `json:"ipq_resp_len"`
3766+
IPQApplyLen int `json:"ipq_apply_len"`
3767+
WAL StreamState `json:"wal"`
3768+
WALError error `json:"wal_error,omitempty"`
3769+
Peers map[string]RaftzGroupPeer `json:"peers"`
3770+
}
3771+
3772+
// RaftzGroupPeer describes one remote peer of a Raft group as reported by the
// /raftz monitoring endpoint.
//
//   - Name is the server name resolved from the peer's node ID.
//   - Known mirrors the peer's kp flag.
//   - LastReplicatedIndex mirrors the peer's li field and is omitted from
//     the JSON when zero.
//   - LastSeen is the time elapsed since the peer's ts timestamp, formatted
//     as a time.Duration string. It is left empty (and omitted) when the
//     timestamp is not set.
type RaftzGroupPeer struct {
	Name                string `json:"name"`
	Known               bool   `json:"known"`
	LastReplicatedIndex uint64 `json:"last_replicated_index,omitempty"`
	LastSeen            string `json:"last_seen,omitempty"`
}
3778+
3779+
func (s *Server) HandleRaftz(w http.ResponseWriter, r *http.Request) {
3780+
if s.raftNodes == nil {
3781+
w.WriteHeader(404)
3782+
w.Write([]byte("No Raft nodes registered"))
3783+
return
3784+
}
3785+
3786+
gfilter := r.URL.Query().Get("group")
3787+
afilter := r.URL.Query().Get("acc")
3788+
3789+
groups := map[string]RaftNode{}
3790+
infos := map[string]map[string]RaftzGroup{} // account -> group ID
3791+
3792+
s.rnMu.RLock()
3793+
if gfilter != _EMPTY_ {
3794+
if rg, ok := s.raftNodes[gfilter]; ok && rg != nil {
3795+
if n, ok := rg.(*raft); ok {
3796+
if afilter == "" || (afilter != "" && n.accName == afilter) {
3797+
groups[gfilter] = rg
3798+
}
3799+
}
3800+
}
3801+
} else {
3802+
for name, rg := range s.raftNodes {
3803+
if rg == nil {
3804+
continue
3805+
}
3806+
if n, ok := rg.(*raft); ok {
3807+
if afilter != "" && n.accName != afilter {
3808+
continue
3809+
}
3810+
groups[name] = rg
3811+
}
3812+
}
3813+
}
3814+
s.rnMu.RUnlock()
3815+
3816+
if len(groups) == 0 {
3817+
w.WriteHeader(404)
3818+
w.Write([]byte("No Raft nodes found, does the specified account/group exist?"))
3819+
return
3820+
}
3821+
3822+
for name, rg := range groups {
3823+
n, ok := rg.(*raft)
3824+
if n == nil || !ok {
3825+
continue
3826+
}
3827+
if _, ok := infos[n.accName]; !ok {
3828+
infos[n.accName] = map[string]RaftzGroup{}
3829+
}
3830+
// Only take the lock once, using the public RaftNode functions would
3831+
// cause us to take and release the locks over and over again.
3832+
n.RLock()
3833+
info := RaftzGroup{
3834+
ID: n.id,
3835+
State: RaftState(n.state.Load()).String(),
3836+
Size: n.csz,
3837+
QuorumNeeded: n.qn,
3838+
Observer: n.observer,
3839+
Paused: n.paused,
3840+
Committed: n.commit,
3841+
Applied: n.applied,
3842+
CatchingUp: n.catchup != nil,
3843+
Leader: n.leader,
3844+
EverHadLeader: n.pleader,
3845+
Term: n.term,
3846+
Vote: n.vote,
3847+
PTerm: n.pterm,
3848+
PIndex: n.pindex,
3849+
IPQPropLen: n.prop.len(),
3850+
IPQEntryLen: n.entry.len(),
3851+
IPQRespLen: n.resp.len(),
3852+
IPQApplyLen: n.apply.len(),
3853+
WALError: n.werr,
3854+
Peers: map[string]RaftzGroupPeer{},
3855+
}
3856+
n.wal.FastState(&info.WAL)
3857+
for id, p := range n.peers {
3858+
if id == n.id {
3859+
continue
3860+
}
3861+
peer := RaftzGroupPeer{
3862+
Name: s.serverNameForNode(id),
3863+
Known: p.kp,
3864+
LastReplicatedIndex: p.li,
3865+
}
3866+
if p.ts > 0 {
3867+
peer.LastSeen = time.Since(time.Unix(0, p.ts)).String()
3868+
}
3869+
info.Peers[id] = peer
3870+
}
3871+
n.RUnlock()
3872+
infos[n.accName][name] = info
3873+
}
3874+
3875+
b, _ := json.MarshalIndent(infos, "", " ")
3876+
ResponseHandler(w, r, b)
3877+
}

server/server.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2892,6 +2892,7 @@ const (
28922892
JszPath = "/jsz"
28932893
HealthzPath = "/healthz"
28942894
IPQueuesPath = "/ipqueuesz"
2895+
RaftzPath = "/raftz"
28952896
)
28962897

28972898
func (s *Server) basePath(p string) string {
@@ -3006,6 +3007,8 @@ func (s *Server) startMonitoring(secure bool) error {
30063007
mux.HandleFunc(s.basePath(HealthzPath), s.HandleHealthz)
30073008
// IPQueuesz
30083009
mux.HandleFunc(s.basePath(IPQueuesPath), s.HandleIPQueuesz)
3010+
// Raftz
3011+
mux.HandleFunc(s.basePath(RaftzPath), s.HandleRaftz)
30093012

30103013
// Do not set a WriteTimeout because it could cause cURL/browser
30113014
// to return empty response or unable to display page if the

0 commit comments

Comments
 (0)