Skip to content

Commit 499981c

Browse files
committed
feat: Add Prometheus metrics for network latency monitoring
Add dedicated Prometheus metrics for all network latency measurements:

Gateway metrics:
- node_doctor_gateway_latency_seconds (gauge)
- node_doctor_gateway_latency_histogram_seconds (histogram)

Peer/CNI metrics:
- node_doctor_peer_latency_seconds (gauge)
- node_doctor_peer_latency_avg_seconds (gauge)
- node_doctor_peer_reachable (gauge)
- node_doctor_peers_total (gauge)
- node_doctor_peers_reachable_total (gauge)
- node_doctor_peer_latency_histogram_seconds (histogram)

DNS metrics:
- node_doctor_dns_latency_seconds (gauge)
- node_doctor_dns_latency_histogram_seconds (histogram)

API Server metrics:
- node_doctor_apiserver_latency_seconds (gauge)
- node_doctor_apiserver_latency_histogram_seconds (histogram)

Implementation:
- Added LatencyMetrics types to pkg/types/types.go
- Added metric definitions to pkg/exporters/prometheus/metrics.go
- Updated exporter to extract latency from Status.Metadata
- Updated Gateway, CNI, DNS, and API Server monitors to report latency
1 parent e4353c8 commit 499981c

File tree

7 files changed

+414
-2
lines changed

7 files changed

+414
-2
lines changed

pkg/exporters/prometheus/exporter.go

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,9 @@ func (e *PrometheusExporter) ExportStatus(ctx context.Context, status *types.Sta
191191
uptime := time.Since(e.startTime).Seconds()
192192
e.metrics.UptimeSeconds.WithLabelValues(e.nodeName).Set(uptime)
193193

194+
// Extract and record latency metrics from status metadata
195+
e.recordLatencyMetrics(status)
196+
194197
// Record successful export
195198
e.metrics.ExportOperationsTotal.WithLabelValues(
196199
e.nodeName, "prometheus", "status", "success").Inc()
@@ -200,6 +203,74 @@ func (e *PrometheusExporter) ExportStatus(ctx context.Context, status *types.Sta
200203
return nil
201204
}
202205

206+
// recordLatencyMetrics extracts latency metrics from status metadata and records them
207+
func (e *PrometheusExporter) recordLatencyMetrics(status *types.Status) {
208+
latencyMetrics := status.GetLatencyMetrics()
209+
if latencyMetrics == nil {
210+
return
211+
}
212+
213+
// Record gateway latency metrics
214+
if latencyMetrics.Gateway != nil {
215+
gw := latencyMetrics.Gateway
216+
latencySeconds := gw.LatencyMs / 1000.0
217+
218+
e.metrics.GatewayLatencySeconds.WithLabelValues(
219+
e.nodeName, gw.GatewayIP).Set(latencySeconds)
220+
221+
e.metrics.GatewayLatencyHistogram.WithLabelValues(
222+
e.nodeName, gw.GatewayIP).Observe(latencySeconds)
223+
}
224+
225+
// Record peer latency metrics
226+
if len(latencyMetrics.Peers) > 0 {
227+
reachableCount := 0
228+
for _, peer := range latencyMetrics.Peers {
229+
latencySeconds := peer.LatencyMs / 1000.0
230+
avgLatencySeconds := peer.AvgLatencyMs / 1000.0
231+
232+
e.metrics.PeerLatencySeconds.WithLabelValues(
233+
e.nodeName, peer.PeerNode, peer.PeerIP).Set(latencySeconds)
234+
235+
e.metrics.PeerLatencyAvgSeconds.WithLabelValues(
236+
e.nodeName, peer.PeerNode, peer.PeerIP).Set(avgLatencySeconds)
237+
238+
reachable := 0.0
239+
if peer.Reachable {
240+
reachable = 1.0
241+
reachableCount++
242+
}
243+
e.metrics.PeerReachable.WithLabelValues(
244+
e.nodeName, peer.PeerNode, peer.PeerIP).Set(reachable)
245+
246+
e.metrics.PeerLatencyHistogram.WithLabelValues(
247+
e.nodeName, peer.PeerNode).Observe(latencySeconds)
248+
}
249+
250+
e.metrics.PeersTotal.WithLabelValues(e.nodeName).Set(float64(len(latencyMetrics.Peers)))
251+
e.metrics.PeersReachableTotal.WithLabelValues(e.nodeName).Set(float64(reachableCount))
252+
}
253+
254+
// Record DNS latency metrics
255+
for _, dns := range latencyMetrics.DNS {
256+
latencySeconds := dns.LatencyMs / 1000.0
257+
258+
e.metrics.DNSLatencySeconds.WithLabelValues(
259+
e.nodeName, dns.DNSServer, dns.Domain, dns.RecordType).Set(latencySeconds)
260+
261+
e.metrics.DNSLatencyHistogram.WithLabelValues(
262+
e.nodeName, dns.DomainType).Observe(latencySeconds)
263+
}
264+
265+
// Record API server latency metrics
266+
if latencyMetrics.APIServer != nil {
267+
latencySeconds := latencyMetrics.APIServer.LatencyMs / 1000.0
268+
269+
e.metrics.APIServerLatencySeconds.WithLabelValues(e.nodeName).Set(latencySeconds)
270+
e.metrics.APIServerLatencyHistogram.WithLabelValues(e.nodeName).Observe(latencySeconds)
271+
}
272+
}
273+
203274
// ExportProblem implements types.Exporter interface for problem exports
204275
func (e *PrometheusExporter) ExportProblem(ctx context.Context, problem *types.Problem) error {
205276
if problem == nil {

pkg/exporters/prometheus/metrics.go

Lines changed: 179 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,23 @@ type Metrics struct {
2323
StartTimeSeconds *prometheus.GaugeVec
2424
UptimeSeconds *prometheus.GaugeVec
2525

26+
// Network latency gauge metrics
27+
GatewayLatencySeconds *prometheus.GaugeVec
28+
PeerLatencySeconds *prometheus.GaugeVec
29+
PeerLatencyAvgSeconds *prometheus.GaugeVec
30+
PeerReachable *prometheus.GaugeVec
31+
PeersTotal *prometheus.GaugeVec
32+
PeersReachableTotal *prometheus.GaugeVec
33+
DNSLatencySeconds *prometheus.GaugeVec
34+
APIServerLatencySeconds *prometheus.GaugeVec
35+
2636
// Histogram metrics
27-
MonitorCheckDuration *prometheus.HistogramVec
28-
ExportDuration *prometheus.HistogramVec
37+
MonitorCheckDuration *prometheus.HistogramVec
38+
ExportDuration *prometheus.HistogramVec
39+
GatewayLatencyHistogram *prometheus.HistogramVec
40+
PeerLatencyHistogram *prometheus.HistogramVec
41+
DNSLatencyHistogram *prometheus.HistogramVec
42+
APIServerLatencyHistogram *prometheus.HistogramVec
2943
}
3044

3145
// NewMetrics creates a new Metrics instance with all metric definitions
@@ -167,6 +181,95 @@ func NewMetrics(namespace, subsystem string, constLabels prometheus.Labels) (*Me
167181
[]string{"node"},
168182
),
169183

184+
// Network latency gauge metrics
185+
GatewayLatencySeconds: prometheus.NewGaugeVec(
186+
prometheus.GaugeOpts{
187+
Namespace: namespace,
188+
Subsystem: subsystem,
189+
Name: "gateway_latency_seconds",
190+
Help: "Current latency to the default gateway in seconds",
191+
ConstLabels: labels,
192+
},
193+
[]string{"node", "gateway_ip"},
194+
),
195+
196+
PeerLatencySeconds: prometheus.NewGaugeVec(
197+
prometheus.GaugeOpts{
198+
Namespace: namespace,
199+
Subsystem: subsystem,
200+
Name: "peer_latency_seconds",
201+
Help: "Last measured latency to peer node in seconds",
202+
ConstLabels: labels,
203+
},
204+
[]string{"node", "peer_node", "peer_ip"},
205+
),
206+
207+
PeerLatencyAvgSeconds: prometheus.NewGaugeVec(
208+
prometheus.GaugeOpts{
209+
Namespace: namespace,
210+
Subsystem: subsystem,
211+
Name: "peer_latency_avg_seconds",
212+
Help: "Average latency to peer node in seconds",
213+
ConstLabels: labels,
214+
},
215+
[]string{"node", "peer_node", "peer_ip"},
216+
),
217+
218+
PeerReachable: prometheus.NewGaugeVec(
219+
prometheus.GaugeOpts{
220+
Namespace: namespace,
221+
Subsystem: subsystem,
222+
Name: "peer_reachable",
223+
Help: "Whether peer node is reachable (1 = reachable, 0 = unreachable)",
224+
ConstLabels: labels,
225+
},
226+
[]string{"node", "peer_node", "peer_ip"},
227+
),
228+
229+
PeersTotal: prometheus.NewGaugeVec(
230+
prometheus.GaugeOpts{
231+
Namespace: namespace,
232+
Subsystem: subsystem,
233+
Name: "peers_total",
234+
Help: "Total number of discovered peer nodes",
235+
ConstLabels: labels,
236+
},
237+
[]string{"node"},
238+
),
239+
240+
PeersReachableTotal: prometheus.NewGaugeVec(
241+
prometheus.GaugeOpts{
242+
Namespace: namespace,
243+
Subsystem: subsystem,
244+
Name: "peers_reachable_total",
245+
Help: "Number of reachable peer nodes",
246+
ConstLabels: labels,
247+
},
248+
[]string{"node"},
249+
),
250+
251+
DNSLatencySeconds: prometheus.NewGaugeVec(
252+
prometheus.GaugeOpts{
253+
Namespace: namespace,
254+
Subsystem: subsystem,
255+
Name: "dns_latency_seconds",
256+
Help: "DNS resolution latency in seconds",
257+
ConstLabels: labels,
258+
},
259+
[]string{"node", "dns_server", "domain", "record_type"},
260+
),
261+
262+
APIServerLatencySeconds: prometheus.NewGaugeVec(
263+
prometheus.GaugeOpts{
264+
Namespace: namespace,
265+
Subsystem: subsystem,
266+
Name: "apiserver_latency_seconds",
267+
Help: "Kubernetes API server response latency in seconds",
268+
ConstLabels: labels,
269+
},
270+
[]string{"node"},
271+
),
272+
170273
// Histogram metrics
171274
MonitorCheckDuration: prometheus.NewHistogramVec(
172275
prometheus.HistogramOpts{
@@ -191,6 +294,54 @@ func NewMetrics(namespace, subsystem string, constLabels prometheus.Labels) (*Me
191294
},
192295
[]string{"node", "exporter", "operation"},
193296
),
297+
298+
GatewayLatencyHistogram: prometheus.NewHistogramVec(
299+
prometheus.HistogramOpts{
300+
Namespace: namespace,
301+
Subsystem: subsystem,
302+
Name: "gateway_latency_histogram_seconds",
303+
Help: "Distribution of gateway latency in seconds",
304+
ConstLabels: labels,
305+
Buckets: []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0},
306+
},
307+
[]string{"node", "gateway_ip"},
308+
),
309+
310+
PeerLatencyHistogram: prometheus.NewHistogramVec(
311+
prometheus.HistogramOpts{
312+
Namespace: namespace,
313+
Subsystem: subsystem,
314+
Name: "peer_latency_histogram_seconds",
315+
Help: "Distribution of peer node latency in seconds",
316+
ConstLabels: labels,
317+
Buckets: []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0},
318+
},
319+
[]string{"node", "peer_node"},
320+
),
321+
322+
DNSLatencyHistogram: prometheus.NewHistogramVec(
323+
prometheus.HistogramOpts{
324+
Namespace: namespace,
325+
Subsystem: subsystem,
326+
Name: "dns_latency_histogram_seconds",
327+
Help: "Distribution of DNS resolution latency in seconds",
328+
ConstLabels: labels,
329+
Buckets: []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.0},
330+
},
331+
[]string{"node", "domain_type"},
332+
),
333+
334+
APIServerLatencyHistogram: prometheus.NewHistogramVec(
335+
prometheus.HistogramOpts{
336+
Namespace: namespace,
337+
Subsystem: subsystem,
338+
Name: "apiserver_latency_histogram_seconds",
339+
Help: "Distribution of API server response latency in seconds",
340+
ConstLabels: labels,
341+
Buckets: []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0},
342+
},
343+
[]string{"node"},
344+
),
194345
}
195346

196347
return m, nil
@@ -212,6 +363,19 @@ func (m *Metrics) Register(registry *prometheus.Registry) error {
212363
m.UptimeSeconds,
213364
m.MonitorCheckDuration,
214365
m.ExportDuration,
366+
// Network latency metrics
367+
m.GatewayLatencySeconds,
368+
m.PeerLatencySeconds,
369+
m.PeerLatencyAvgSeconds,
370+
m.PeerReachable,
371+
m.PeersTotal,
372+
m.PeersReachableTotal,
373+
m.DNSLatencySeconds,
374+
m.APIServerLatencySeconds,
375+
m.GatewayLatencyHistogram,
376+
m.PeerLatencyHistogram,
377+
m.DNSLatencyHistogram,
378+
m.APIServerLatencyHistogram,
215379
}
216380

217381
for _, collector := range collectors {
@@ -239,6 +403,19 @@ func (m *Metrics) Unregister(registry *prometheus.Registry) {
239403
m.UptimeSeconds,
240404
m.MonitorCheckDuration,
241405
m.ExportDuration,
406+
// Network latency metrics
407+
m.GatewayLatencySeconds,
408+
m.PeerLatencySeconds,
409+
m.PeerLatencyAvgSeconds,
410+
m.PeerReachable,
411+
m.PeersTotal,
412+
m.PeersReachableTotal,
413+
m.DNSLatencySeconds,
414+
m.APIServerLatencySeconds,
415+
m.GatewayLatencyHistogram,
416+
m.PeerLatencyHistogram,
417+
m.DNSLatencyHistogram,
418+
m.APIServerLatencyHistogram,
242419
}
243420

244421
for _, collector := range collectors {

pkg/monitors/kubernetes/apiserver.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,14 @@ func (m *APIServerMonitor) checkAPIServer(ctx context.Context) (*types.Status, e
418418
}
419419
// Note: Latency info is already captured in APIServerReachable message above
420420

421+
// Set API server latency metrics for Prometheus export
422+
status.SetLatencyMetrics(&types.LatencyMetrics{
423+
APIServer: &types.APIServerLatency{
424+
LatencyMs: float64(metrics.Latency.Microseconds()) / 1000.0,
425+
Reachable: true,
426+
},
427+
})
428+
421429
return status, nil
422430
}
423431

pkg/monitors/network/cni.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -487,6 +487,26 @@ func (m *CNIMonitor) checkCNI(ctx context.Context) (*types.Status, error) {
487487
reachableCount, totalPeers, reachablePercent, avgLatencyStr),
488488
))
489489

490+
// Set peer latency metrics for Prometheus export
491+
m.mu.Lock()
492+
peerLatencies := make([]types.PeerLatency, 0, len(m.peerStatuses))
493+
for _, ps := range m.peerStatuses {
494+
peerLatencies = append(peerLatencies, types.PeerLatency{
495+
PeerNode: ps.Peer.NodeName,
496+
PeerIP: ps.Peer.NodeIP,
497+
LatencyMs: float64(ps.LastLatency.Microseconds()) / 1000.0,
498+
AvgLatencyMs: float64(ps.AvgLatency.Microseconds()) / 1000.0,
499+
Reachable: ps.Reachable,
500+
})
501+
}
502+
m.mu.Unlock()
503+
504+
if len(peerLatencies) > 0 {
505+
status.SetLatencyMetrics(&types.LatencyMetrics{
506+
Peers: peerLatencies,
507+
})
508+
}
509+
490510
return status, nil
491511
}
492512

0 commit comments

Comments
 (0)