Skip to content

Commit 175d4ab

Browse files
committed
Merge feature/enhanced-dns-monitoring: Enhanced DNS monitoring and network latency metrics
This merge includes: - Per-nameserver DNS testing for custom domains - Success rate tracking with sliding window - DNS error type classification for better diagnostics - DNS consistency check across multiple lookups - Overlay network testing for accurate CNI connectivity - Prometheus metrics for network latency monitoring - Updated DNS monitoring documentation and best practices
2 parents 3dfdd75 + 499981c commit 175d4ab

File tree

12 files changed

+3682
-60
lines changed

12 files changed

+3682
-60
lines changed

docs/monitors.md

Lines changed: 610 additions & 21 deletions
Large diffs are not rendered by default.

helm/node-doctor/templates/configmap.yaml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,35 @@ data:
7979
ioHealthCheck: {{ .Values.monitors.disk.ioHealthCheck }}
8080
{{- end }}
8181
82+
{{- if .Values.overlayTest.enabled }}
83+
# CNI Connectivity Monitor - Tests overlay network connectivity
# This entry is rendered only when overlayTest.enabled is set, the same
# condition that gates the overlay-test DaemonSet template.
84+
# Uses overlay-test pods for accurate CNI testing
85+
- name: cni-connectivity
86+
type: network-cni-check
87+
enabled: true
88+
interval: 30s
89+
# NOTE(review): pingCount x pingTimeout below (3 x 5s) equals this 15s
# timeout - confirm the budget is sufficient when several peers are probed.
timeout: 15s
90+
config:
91+
discovery:
92+
method: kubernetes
93+
# Peers are discovered in the namespace the chart is released into.
namespace: {{ .Release.Namespace }}
94+
# Must stay identical to the "app" label on the overlay-test DaemonSet
# pod template, otherwise no peers are discovered.
labelSelector: app={{ include "node-doctor.name" . }}-overlay-test
95+
refreshInterval: 5m
96+
overlayTestEnabled: true
97+
# Same selector value as discovery.labelSelector above.
overlayTestLabelSelector: app={{ include "node-doctor.name" . }}-overlay-test
98+
connectivity:
99+
pingCount: 3
100+
pingTimeout: 5s
101+
warningLatency: 50ms
102+
criticalLatency: 200ms
103+
failureThreshold: 3
104+
# NOTE(review): presumably a percentage of discovered peers rather than an
# absolute count - confirm against the monitor implementation.
minReachablePeers: 80
105+
cniHealth:
106+
enabled: true
107+
configPath: /etc/cni/net.d
108+
checkInterfaces: true
109+
{{- end }}
110+
82111
exporters:
83112
kubernetes:
84113
enabled: {{ .Values.exporters.kubernetes.enabled }}
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
{{- if .Values.overlayTest.enabled }}
# Overlay-test DaemonSet.
#
# Runs one idle pod per Linux node on the pod (overlay) network. The pods do
# nothing but sleep; they exist so that the cni-connectivity monitor can
# discover them via the "app=<name>-overlay-test" label and ping their pod IPs.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: {{ include "node-doctor.fullname" . }}-overlay-test
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "node-doctor.labels" . | nindent 4 }}
    app.kubernetes.io/component: overlay-test
spec:
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      # Left unquoted on purpose: the field accepts either an integer or a
      # percentage string, and the value is passed through from values.yaml.
      maxUnavailable: {{ .Values.overlayTest.updateStrategy.maxUnavailable | default "25%" }}

  selector:
    matchLabels:
      app: {{ include "node-doctor.name" . }}-overlay-test
      app.kubernetes.io/name: {{ include "node-doctor.name" . }}
      app.kubernetes.io/instance: {{ .Release.Name }}

  template:
    metadata:
      labels:
        # The "app" label is what the cni-connectivity monitor's labelSelector
        # matches; keep the two in sync.
        app: {{ include "node-doctor.name" . }}-overlay-test
        app.kubernetes.io/name: {{ include "node-doctor.name" . }}
        app.kubernetes.io/instance: {{ .Release.Name }}
        app.kubernetes.io/component: overlay-test
      annotations:
        description: "Node Doctor overlay network test pod - used for CNI connectivity testing"
    spec:
      # NO hostNetwork - this pod uses the overlay network
      hostNetwork: false

      # Use cluster DNS (default)
      dnsPolicy: ClusterFirst

      # Optional priority class; the chart default is empty, which leaves the
      # pod on the cluster's default priority.
      priorityClassName: {{ .Values.overlayTest.priorityClassName | default "" | quote }}

      serviceAccountName: {{ include "node-doctor.serviceAccountName" . }}
      # The container only sleeps and never calls the Kubernetes API, so do
      # not mount the service account token into it.
      automountServiceAccountToken: false

      # Tolerate all taints to run on every node
      tolerations:
        - operator: Exists

      nodeSelector:
        kubernetes.io/os: linux

      terminationGracePeriodSeconds: 5

      containers:
        - name: overlay-test
          image: "{{ .Values.overlayTest.image.repository }}:{{ .Values.overlayTest.image.tag }}"
          imagePullPolicy: {{ .Values.overlayTest.image.pullPolicy }}

          # Just sleep forever - we only need the pod to exist and be pingable
          command: ["/bin/sh"]
          args: ["-c", "echo 'Overlay test pod ready'; while true; do sleep 3600; done"]

          # Downward API: expose the scheduling node and the pod's own IP.
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP

          # Minimal security context
          securityContext:
            runAsNonRoot: true
            runAsUser: 65534
            runAsGroup: 65534
            readOnlyRootFilesystem: true
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL

          # Minimal resources
          resources:
            requests:
              cpu: {{ .Values.overlayTest.resources.requests.cpu | default "1m" }}
              memory: {{ .Values.overlayTest.resources.requests.memory | default "4Mi" }}
            limits:
              cpu: {{ .Values.overlayTest.resources.limits.cpu | default "10m" }}
              memory: {{ .Values.overlayTest.resources.limits.memory | default "16Mi" }}

          # Simple liveness check: succeeds as long as a shell can be exec'd
          livenessProbe:
            exec:
              command:
                - /bin/sh
                - -c
                - "true"
            initialDelaySeconds: 5
            periodSeconds: 60
            timeoutSeconds: 1
            failureThreshold: 3

          # Readiness - always ready once running
          readinessProbe:
            exec:
              command:
                - /bin/sh
                - -c
                - "true"
            initialDelaySeconds: 2
            periodSeconds: 10
            timeoutSeconds: 1
            failureThreshold: 1
{{- end }}

helm/node-doctor/values.yaml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,3 +314,31 @@ features:
314314
enableMetrics: true
315315
enableProfiling: false
316316
enableTracing: false
317+
318+
# Overlay Test Configuration
319+
# Deploys lightweight test pods WITHOUT hostNetwork for accurate CNI/overlay testing
320+
overlayTest:
321+
# Enable overlay test pods - required for accurate CNI connectivity testing
# When true, the chart renders both the overlay-test DaemonSet and the
# cni-connectivity monitor entry in the ConfigMap.
322+
enabled: true
323+
324+
# Image for overlay test pods (minimal image, just needs to exist and be pingable)
# The DaemonSet runs /bin/sh from this image (container command and exec
# probes), so the image must ship a shell.
325+
image:
326+
repository: busybox
327+
# Quoted so the tag stays a string and is not parsed as a number.
tag: "1.36"
328+
pullPolicy: IfNotPresent
329+
330+
# Resource limits for overlay test pods (minimal - just needs to sleep)
# These mirror the fallbacks hard-coded in the DaemonSet template.
331+
resources:
332+
requests:
333+
cpu: 1m
334+
memory: 4Mi
335+
limits:
336+
cpu: 10m
337+
memory: 16Mi
338+
339+
# Update strategy
# maxUnavailable is passed to the DaemonSet's rollingUpdate settings; the
# template falls back to "25%" when it is unset.
340+
updateStrategy:
341+
maxUnavailable: "25%"
342+
343+
# Priority class (empty = no priority class)
344+
priorityClassName: ""

pkg/exporters/prometheus/exporter.go

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,9 @@ func (e *PrometheusExporter) ExportStatus(ctx context.Context, status *types.Sta
191191
uptime := time.Since(e.startTime).Seconds()
192192
e.metrics.UptimeSeconds.WithLabelValues(e.nodeName).Set(uptime)
193193

194+
// Extract and record latency metrics from status metadata
195+
e.recordLatencyMetrics(status)
196+
194197
// Record successful export
195198
e.metrics.ExportOperationsTotal.WithLabelValues(
196199
e.nodeName, "prometheus", "status", "success").Inc()
@@ -200,6 +203,74 @@ func (e *PrometheusExporter) ExportStatus(ctx context.Context, status *types.Sta
200203
return nil
201204
}
202205

206+
// recordLatencyMetrics extracts latency metrics from status metadata and records them
207+
func (e *PrometheusExporter) recordLatencyMetrics(status *types.Status) {
208+
latencyMetrics := status.GetLatencyMetrics()
209+
if latencyMetrics == nil {
210+
return
211+
}
212+
213+
// Record gateway latency metrics
214+
if latencyMetrics.Gateway != nil {
215+
gw := latencyMetrics.Gateway
216+
latencySeconds := gw.LatencyMs / 1000.0
217+
218+
e.metrics.GatewayLatencySeconds.WithLabelValues(
219+
e.nodeName, gw.GatewayIP).Set(latencySeconds)
220+
221+
e.metrics.GatewayLatencyHistogram.WithLabelValues(
222+
e.nodeName, gw.GatewayIP).Observe(latencySeconds)
223+
}
224+
225+
// Record peer latency metrics
226+
if len(latencyMetrics.Peers) > 0 {
227+
reachableCount := 0
228+
for _, peer := range latencyMetrics.Peers {
229+
latencySeconds := peer.LatencyMs / 1000.0
230+
avgLatencySeconds := peer.AvgLatencyMs / 1000.0
231+
232+
e.metrics.PeerLatencySeconds.WithLabelValues(
233+
e.nodeName, peer.PeerNode, peer.PeerIP).Set(latencySeconds)
234+
235+
e.metrics.PeerLatencyAvgSeconds.WithLabelValues(
236+
e.nodeName, peer.PeerNode, peer.PeerIP).Set(avgLatencySeconds)
237+
238+
reachable := 0.0
239+
if peer.Reachable {
240+
reachable = 1.0
241+
reachableCount++
242+
}
243+
e.metrics.PeerReachable.WithLabelValues(
244+
e.nodeName, peer.PeerNode, peer.PeerIP).Set(reachable)
245+
246+
e.metrics.PeerLatencyHistogram.WithLabelValues(
247+
e.nodeName, peer.PeerNode).Observe(latencySeconds)
248+
}
249+
250+
e.metrics.PeersTotal.WithLabelValues(e.nodeName).Set(float64(len(latencyMetrics.Peers)))
251+
e.metrics.PeersReachableTotal.WithLabelValues(e.nodeName).Set(float64(reachableCount))
252+
}
253+
254+
// Record DNS latency metrics
255+
for _, dns := range latencyMetrics.DNS {
256+
latencySeconds := dns.LatencyMs / 1000.0
257+
258+
e.metrics.DNSLatencySeconds.WithLabelValues(
259+
e.nodeName, dns.DNSServer, dns.Domain, dns.RecordType).Set(latencySeconds)
260+
261+
e.metrics.DNSLatencyHistogram.WithLabelValues(
262+
e.nodeName, dns.DomainType).Observe(latencySeconds)
263+
}
264+
265+
// Record API server latency metrics
266+
if latencyMetrics.APIServer != nil {
267+
latencySeconds := latencyMetrics.APIServer.LatencyMs / 1000.0
268+
269+
e.metrics.APIServerLatencySeconds.WithLabelValues(e.nodeName).Set(latencySeconds)
270+
e.metrics.APIServerLatencyHistogram.WithLabelValues(e.nodeName).Observe(latencySeconds)
271+
}
272+
}
273+
203274
// ExportProblem implements types.Exporter interface for problem exports
204275
func (e *PrometheusExporter) ExportProblem(ctx context.Context, problem *types.Problem) error {
205276
if problem == nil {

0 commit comments

Comments
 (0)