Skip to content

Commit de5683f

Browse files
authored
derp: change packets_dropped metric to also have reason and kind labels (tailscale#14651)
Metrics currently exist for dropped packets by reason, and total received packets by kind (e.g., `disco` or `other`), but relating these two together to glean information about the drop rate for specific reasons on a per-kind basis is not currently possible. Change `derp_packets_dropped` to use a `metrics.MultiLabelMap` to track both the `reason` and `kind` in the same metric to allow for this desired level of granularity. Drop metrics that this makes unnecessary (namely `packetsDroppedReason` and `packetsDroppedType`). Updates tailscale/corp#25489 Signed-off-by: Mario Minardi <[email protected]>
1 parent 7d73a38 commit de5683f

File tree

2 files changed

+126
-131
lines changed

2 files changed

+126
-131
lines changed

derp/derp_server.go

Lines changed: 126 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,14 @@ const (
112112
disableFighters
113113
)
114114

115+
// packetKind is the kind of packet being sent through DERP
116+
type packetKind string
117+
118+
const (
119+
packetKindDisco packetKind = "disco"
120+
packetKindOther packetKind = "other"
121+
)
122+
115123
type align64 [0]atomic.Int64 // for side effect of its 64-bit alignment
116124

117125
// Server is a DERP server.
@@ -131,44 +139,37 @@ type Server struct {
131139
debug bool
132140

133141
// Counters:
134-
packetsSent, bytesSent expvar.Int
135-
packetsRecv, bytesRecv expvar.Int
136-
packetsRecvByKind metrics.LabelMap
137-
packetsRecvDisco *expvar.Int
138-
packetsRecvOther *expvar.Int
139-
_ align64
140-
packetsDropped expvar.Int
141-
packetsDroppedReason metrics.LabelMap
142-
packetsDroppedReasonCounters []*expvar.Int // indexed by dropReason
143-
packetsDroppedType metrics.LabelMap
144-
packetsDroppedTypeDisco *expvar.Int
145-
packetsDroppedTypeOther *expvar.Int
146-
_ align64
147-
packetsForwardedOut expvar.Int
148-
packetsForwardedIn expvar.Int
149-
peerGoneDisconnectedFrames expvar.Int // number of peer disconnected frames sent
150-
peerGoneNotHereFrames expvar.Int // number of peer not here frames sent
151-
gotPing expvar.Int // number of ping frames from client
152-
sentPong expvar.Int // number of pong frames enqueued to client
153-
accepts expvar.Int
154-
curClients expvar.Int
155-
curClientsNotIdeal expvar.Int
156-
curHomeClients expvar.Int // ones with preferred
157-
dupClientKeys expvar.Int // current number of public keys we have 2+ connections for
158-
dupClientConns expvar.Int // current number of connections sharing a public key
159-
dupClientConnTotal expvar.Int // total number of accepted connections when a dup key existed
160-
unknownFrames expvar.Int
161-
homeMovesIn expvar.Int // established clients announce home server moves in
162-
homeMovesOut expvar.Int // established clients announce home server moves out
163-
multiForwarderCreated expvar.Int
164-
multiForwarderDeleted expvar.Int
165-
removePktForwardOther expvar.Int
166-
sclientWriteTimeouts expvar.Int
167-
avgQueueDuration *uint64 // In milliseconds; accessed atomically
168-
tcpRtt metrics.LabelMap // histogram
169-
meshUpdateBatchSize *metrics.Histogram
170-
meshUpdateLoopCount *metrics.Histogram
171-
bufferedWriteFrames *metrics.Histogram // how many sendLoop frames (or groups of related frames) get written per flush
142+
packetsSent, bytesSent expvar.Int
143+
packetsRecv, bytesRecv expvar.Int
144+
packetsRecvByKind metrics.LabelMap
145+
packetsRecvDisco *expvar.Int
146+
packetsRecvOther *expvar.Int
147+
_ align64
148+
packetsForwardedOut expvar.Int
149+
packetsForwardedIn expvar.Int
150+
peerGoneDisconnectedFrames expvar.Int // number of peer disconnected frames sent
151+
peerGoneNotHereFrames expvar.Int // number of peer not here frames sent
152+
gotPing expvar.Int // number of ping frames from client
153+
sentPong expvar.Int // number of pong frames enqueued to client
154+
accepts expvar.Int
155+
curClients expvar.Int
156+
curClientsNotIdeal expvar.Int
157+
curHomeClients expvar.Int // ones with preferred
158+
dupClientKeys expvar.Int // current number of public keys we have 2+ connections for
159+
dupClientConns expvar.Int // current number of connections sharing a public key
160+
dupClientConnTotal expvar.Int // total number of accepted connections when a dup key existed
161+
unknownFrames expvar.Int
162+
homeMovesIn expvar.Int // established clients announce home server moves in
163+
homeMovesOut expvar.Int // established clients announce home server moves out
164+
multiForwarderCreated expvar.Int
165+
multiForwarderDeleted expvar.Int
166+
removePktForwardOther expvar.Int
167+
sclientWriteTimeouts expvar.Int
168+
avgQueueDuration *uint64 // In milliseconds; accessed atomically
169+
tcpRtt metrics.LabelMap // histogram
170+
meshUpdateBatchSize *metrics.Histogram
171+
meshUpdateLoopCount *metrics.Histogram
172+
bufferedWriteFrames *metrics.Histogram // how many sendLoop frames (or groups of related frames) get written per flush
172173

173174
// verifyClientsLocalTailscaled only accepts client connections to the DERP
174175
// server if the clientKey is a known peer in the network, as specified by a
@@ -351,68 +352,93 @@ type Conn interface {
351352
SetWriteDeadline(time.Time) error
352353
}
353354

355+
var packetsDropped = metrics.NewMultiLabelMap[dropReasonKindLabels](
356+
"derp_packets_dropped",
357+
"counter",
358+
"DERP packets dropped by reason and by kind")
359+
354360
// NewServer returns a new DERP server. It doesn't listen on its own.
355361
// Connections are given to it via Server.Accept.
356362
func NewServer(privateKey key.NodePrivate, logf logger.Logf) *Server {
357363
var ms runtime.MemStats
358364
runtime.ReadMemStats(&ms)
359365

360366
s := &Server{
361-
debug: envknob.Bool("DERP_DEBUG_LOGS"),
362-
privateKey: privateKey,
363-
publicKey: privateKey.Public(),
364-
logf: logf,
365-
limitedLogf: logger.RateLimitedFn(logf, 30*time.Second, 5, 100),
366-
packetsRecvByKind: metrics.LabelMap{Label: "kind"},
367-
packetsDroppedReason: metrics.LabelMap{Label: "reason"},
368-
packetsDroppedType: metrics.LabelMap{Label: "type"},
369-
clients: map[key.NodePublic]*clientSet{},
370-
clientsMesh: map[key.NodePublic]PacketForwarder{},
371-
netConns: map[Conn]chan struct{}{},
372-
memSys0: ms.Sys,
373-
watchers: set.Set[*sclient]{},
374-
peerGoneWatchers: map[key.NodePublic]set.HandleSet[func(key.NodePublic)]{},
375-
avgQueueDuration: new(uint64),
376-
tcpRtt: metrics.LabelMap{Label: "le"},
377-
meshUpdateBatchSize: metrics.NewHistogram([]float64{0, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000}),
378-
meshUpdateLoopCount: metrics.NewHistogram([]float64{0, 1, 2, 5, 10, 20, 50, 100}),
379-
bufferedWriteFrames: metrics.NewHistogram([]float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 50, 100}),
380-
keyOfAddr: map[netip.AddrPort]key.NodePublic{},
381-
clock: tstime.StdClock{},
367+
debug: envknob.Bool("DERP_DEBUG_LOGS"),
368+
privateKey: privateKey,
369+
publicKey: privateKey.Public(),
370+
logf: logf,
371+
limitedLogf: logger.RateLimitedFn(logf, 30*time.Second, 5, 100),
372+
packetsRecvByKind: metrics.LabelMap{Label: "kind"},
373+
clients: map[key.NodePublic]*clientSet{},
374+
clientsMesh: map[key.NodePublic]PacketForwarder{},
375+
netConns: map[Conn]chan struct{}{},
376+
memSys0: ms.Sys,
377+
watchers: set.Set[*sclient]{},
378+
peerGoneWatchers: map[key.NodePublic]set.HandleSet[func(key.NodePublic)]{},
379+
avgQueueDuration: new(uint64),
380+
tcpRtt: metrics.LabelMap{Label: "le"},
381+
meshUpdateBatchSize: metrics.NewHistogram([]float64{0, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000}),
382+
meshUpdateLoopCount: metrics.NewHistogram([]float64{0, 1, 2, 5, 10, 20, 50, 100}),
383+
bufferedWriteFrames: metrics.NewHistogram([]float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 50, 100}),
384+
keyOfAddr: map[netip.AddrPort]key.NodePublic{},
385+
clock: tstime.StdClock{},
382386
}
383387
s.initMetacert()
384-
s.packetsRecvDisco = s.packetsRecvByKind.Get("disco")
385-
s.packetsRecvOther = s.packetsRecvByKind.Get("other")
388+
s.packetsRecvDisco = s.packetsRecvByKind.Get(string(packetKindDisco))
389+
s.packetsRecvOther = s.packetsRecvByKind.Get(string(packetKindOther))
386390

387-
s.packetsDroppedReasonCounters = s.genPacketsDroppedReasonCounters()
388-
389-
s.packetsDroppedTypeDisco = s.packetsDroppedType.Get("disco")
390-
s.packetsDroppedTypeOther = s.packetsDroppedType.Get("other")
391+
genPacketsDroppedCounters()
391392

392393
s.perClientSendQueueDepth = getPerClientSendQueueDepth()
393394
return s
394395
}
395396

396-
func (s *Server) genPacketsDroppedReasonCounters() []*expvar.Int {
397-
getMetric := s.packetsDroppedReason.Get
398-
ret := []*expvar.Int{
399-
dropReasonUnknownDest: getMetric("unknown_dest"),
400-
dropReasonUnknownDestOnFwd: getMetric("unknown_dest_on_fwd"),
401-
dropReasonGoneDisconnected: getMetric("gone_disconnected"),
402-
dropReasonQueueHead: getMetric("queue_head"),
403-
dropReasonQueueTail: getMetric("queue_tail"),
404-
dropReasonWriteError: getMetric("write_error"),
405-
dropReasonDupClient: getMetric("dup_client"),
397+
func genPacketsDroppedCounters() {
398+
initMetrics := func(reason dropReason) {
399+
packetsDropped.Add(dropReasonKindLabels{
400+
Kind: string(packetKindDisco),
401+
Reason: string(reason),
402+
}, 0)
403+
packetsDropped.Add(dropReasonKindLabels{
404+
Kind: string(packetKindOther),
405+
Reason: string(reason),
406+
}, 0)
407+
}
408+
getMetrics := func(reason dropReason) []expvar.Var {
409+
return []expvar.Var{
410+
packetsDropped.Get(dropReasonKindLabels{
411+
Kind: string(packetKindDisco),
412+
Reason: string(reason),
413+
}),
414+
packetsDropped.Get(dropReasonKindLabels{
415+
Kind: string(packetKindOther),
416+
Reason: string(reason),
417+
}),
418+
}
406419
}
407-
if len(ret) != int(numDropReasons) {
408-
panic("dropReason metrics out of sync")
420+
421+
dropReasons := []dropReason{
422+
dropReasonUnknownDest,
423+
dropReasonUnknownDestOnFwd,
424+
dropReasonGoneDisconnected,
425+
dropReasonQueueHead,
426+
dropReasonQueueTail,
427+
dropReasonWriteError,
428+
dropReasonDupClient,
409429
}
410-
for i := range numDropReasons {
411-
if ret[i] == nil {
430+
431+
for _, dr := range dropReasons {
432+
initMetrics(dr)
433+
m := getMetrics(dr)
434+
if len(m) != 2 {
435+
panic("dropReason metrics out of sync")
436+
}
437+
438+
if m[0] == nil || m[1] == nil {
412439
panic("dropReason metrics out of sync")
413440
}
414441
}
415-
return ret
416442
}
417443

418444
// SetMesh sets the pre-shared key that regional DERP servers used to mesh
@@ -1152,31 +1178,36 @@ func (c *sclient) debugLogf(format string, v ...any) {
11521178
}
11531179
}
11541180

1155-
// dropReason is why we dropped a DERP frame.
1156-
type dropReason int
1181+
type dropReasonKindLabels struct {
1182+
Reason string // metric label corresponding to a given dropReason
1183+
Kind string // either `disco` or `other`
1184+
}
11571185

1158-
//go:generate go run tailscale.com/cmd/addlicense -file dropreason_string.go go run golang.org/x/tools/cmd/stringer -type=dropReason -trimprefix=dropReason
1186+
// dropReason is why we dropped a DERP frame.
1187+
type dropReason string
11591188

11601189
const (
1161-
dropReasonUnknownDest dropReason = iota // unknown destination pubkey
1162-
dropReasonUnknownDestOnFwd // unknown destination pubkey on a derp-forwarded packet
1163-
dropReasonGoneDisconnected // destination tailscaled disconnected before we could send
1164-
dropReasonQueueHead // destination queue is full, dropped packet at queue head
1165-
dropReasonQueueTail // destination queue is full, dropped packet at queue tail
1166-
dropReasonWriteError // OS write() failed
1167-
dropReasonDupClient // the public key is connected 2+ times (active/active, fighting)
1168-
numDropReasons // unused; keep last
1190+
dropReasonUnknownDest dropReason = "unknown_dest" // unknown destination pubkey
1191+
dropReasonUnknownDestOnFwd dropReason = "unknown_dest_on_fwd" // unknown destination pubkey on a derp-forwarded packet
1192+
dropReasonGoneDisconnected dropReason = "gone_disconnected" // destination tailscaled disconnected before we could send
1193+
dropReasonQueueHead dropReason = "queue_head" // destination queue is full, dropped packet at queue head
1194+
dropReasonQueueTail dropReason = "queue_tail" // destination queue is full, dropped packet at queue tail
1195+
dropReasonWriteError dropReason = "write_error" // OS write() failed
1196+
dropReasonDupClient dropReason = "dup_client" // the public key is connected 2+ times (active/active, fighting)
11691197
)
11701198

11711199
func (s *Server) recordDrop(packetBytes []byte, srcKey, dstKey key.NodePublic, reason dropReason) {
1172-
s.packetsDropped.Add(1)
1173-
s.packetsDroppedReasonCounters[reason].Add(1)
1200+
labels := dropReasonKindLabels{
1201+
Reason: string(reason),
1202+
}
11741203
looksDisco := disco.LooksLikeDiscoWrapper(packetBytes)
11751204
if looksDisco {
1176-
s.packetsDroppedTypeDisco.Add(1)
1205+
labels.Kind = string(packetKindDisco)
11771206
} else {
1178-
s.packetsDroppedTypeOther.Add(1)
1207+
labels.Kind = string(packetKindOther)
11791208
}
1209+
packetsDropped.Add(labels, 1)
1210+
11801211
if verboseDropKeys[dstKey] {
11811212
// Preformat the log string prior to calling limitedLogf. The
11821213
// limiter acts based on the format string, and we want to
@@ -2095,9 +2126,6 @@ func (s *Server) ExpVar() expvar.Var {
20952126
m.Set("accepts", &s.accepts)
20962127
m.Set("bytes_received", &s.bytesRecv)
20972128
m.Set("bytes_sent", &s.bytesSent)
2098-
m.Set("packets_dropped", &s.packetsDropped)
2099-
m.Set("counter_packets_dropped_reason", &s.packetsDroppedReason)
2100-
m.Set("counter_packets_dropped_type", &s.packetsDroppedType)
21012129
m.Set("counter_packets_received_kind", &s.packetsRecvByKind)
21022130
m.Set("packets_sent", &s.packetsSent)
21032131
m.Set("packets_received", &s.packetsRecv)

derp/dropreason_string.go

Lines changed: 0 additions & 33 deletions
This file was deleted.

0 commit comments

Comments
 (0)