Skip to content

Commit 8b8cad7

Browse files
Merge pull request #3583 from slashpai/endpointslice
MON-4437: Migrate windows-exporter ServiceMonitor to EndpointSlice
2 parents 18a9d10 + 88be9f8 commit 8b8cad7

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+3056
-1905
lines changed

bundle/manifests/prometheus-k8s_rbac.authorization.k8s.io_v1_role.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,11 @@ rules:
1313
verbs:
1414
- list
1515
- watch
16+
- apiGroups:
17+
- "discovery.k8s.io"
18+
resources:
19+
- endpointslices
20+
verbs:
21+
- get
22+
- list
23+
- watch

config/windows-exporter/windows-exporter-role.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,11 @@ rules:
1313
verbs:
1414
- list
1515
- watch
16+
- apiGroups:
17+
- "discovery.k8s.io"
18+
resources:
19+
- endpointslices
20+
verbs:
21+
- get
22+
- list
23+
- watch

controllers/metric_controller.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ func (r *metricReconciler) ensureServiceMonitor(ctx context.Context) error {
132132
replacement1 := "$1:9182"
133133
replacement2 := metrics.WindowsMetricsResource
134134
attachMetadataBool := true
135+
endpointSliceRole := monv1.EndpointSliceRole
135136
expectedSM := &monv1.ServiceMonitor{
136137
ObjectMeta: metav1.ObjectMeta{
137138
Name: metrics.WindowsMetricsResource,
@@ -141,6 +142,7 @@ func (r *metricReconciler) ensureServiceMonitor(ctx context.Context) error {
141142
},
142143
},
143144
Spec: monv1.ServiceMonitorSpec{
145+
ServiceDiscoveryRole: &endpointSliceRole,
144146
AttachMetadata: &monv1.AttachMetadata{
145147
Node: &attachMetadataBool,
146148
},
@@ -165,7 +167,7 @@ func (r *metricReconciler) ensureServiceMonitor(ctx context.Context) error {
165167
Replacement: &replacement0,
166168
TargetLabel: "instance",
167169
SourceLabels: []monv1.LabelName{
168-
"__meta_kubernetes_endpoint_address_target_name",
170+
"__meta_kubernetes_endpointslice_endpoint_target_name",
169171
},
170172
},
171173
{ // Include only Windows nodes for this serviceMonitor

go.mod

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ require (
2323
github.com/operator-framework/operator-lib v0.4.0
2424
github.com/operator-framework/operator-lifecycle-manager v0.22.0
2525
github.com/pkg/sftp v1.13.10
26-
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.85.0
27-
github.com/prometheus-operator/prometheus-operator/pkg/client v0.85.0
26+
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.86.2
27+
github.com/prometheus-operator/prometheus-operator/pkg/client v0.86.2
2828
github.com/spf13/cobra v1.10.1
2929
github.com/spf13/pflag v1.0.10
3030
github.com/stretchr/testify v1.11.1

go.sum

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -544,10 +544,10 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI
544544
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
545545
github.com/posener/complete v1.1.1/go.mod h1:em0nMJCgc9GFtwrmVmEMR/ZL6WyhyjMBndrE9hABlRI=
546546
github.com/pquerna/cachecontrol v0.0.0-20171018203845-0dec1b30a021/go.mod h1:prYjPmNq4d1NPVmpShWobRqXY3q7Vp+80DqgxxUrUIA=
547-
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.85.0 h1:oY+F5FZFmCjCyzkHWPjVQpzvnvEB/0FP+iyzDUUlqFc=
548-
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.85.0/go.mod h1:VB7wtBmDT6W2RJHzsvPZlBId+EnmeQA0d33fFTXvraM=
549-
github.com/prometheus-operator/prometheus-operator/pkg/client v0.85.0 h1:OdW3Vnmoa2pM5PfRyQaSEO+UN5yamEUS1MuhwE0PxbY=
550-
github.com/prometheus-operator/prometheus-operator/pkg/client v0.85.0/go.mod h1:5ctipSFkXKeXig01Or0aXKNaaBQFWZAAK1zzYWz9YZY=
547+
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.86.2 h1:VRXUgbGmpmjZgFYiUnTwlC+JjfCUs5KKFsorJhI1ZKQ=
548+
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.86.2/go.mod h1:nPk0OteXBkbT0CRCa2oZQL1jRLW6RJ2fuIijHypeJdk=
549+
github.com/prometheus-operator/prometheus-operator/pkg/client v0.86.2 h1:aD+r5a/96ZVT11Uo6Jpt3gO2Hutq3NROB8lXYKMZlOI=
550+
github.com/prometheus-operator/prometheus-operator/pkg/client v0.86.2/go.mod h1:fXZB2vXirMxIjFEIggDrFirYJQh7GkHCZRwm8aKD0e8=
551551
github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
552552
github.com/prometheus/client_golang v0.9.3/go.mod h1:/TN21ttK/J9q6uSwhBd54HahCDft0ttaMvbicHlPoso=
553553
github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=

test/e2e/delete_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -273,8 +273,9 @@ func (tc *testContext) testWindowsNodeDeletion(t *testing.T) {
273273
_, err = tc.waitForWindowsMachines(int(expectedNodeCount), "", true)
274274
require.NoError(t, err, "ConfigMap controller Windows machine deletion failed")
275275

276-
// Test if prometheus configuration is updated to have no node entries in the endpoints object
276+
// Test if prometheus configuration is updated to have no node entries in the endpointslice objects.
277277
t.Run("Prometheus configuration", tc.testPrometheus)
278+
t.Run("Prometheus endpoint slice cleanup", tc.testPrometheusEndpointSliceCleanup)
278279

279280
// Cleanup windows-instances ConfigMap
280281
tc.deleteWindowsInstanceConfigMap()

test/e2e/metrics_test.go

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,41 @@ func (tc *testContext) testPrometheus(t *testing.T) {
8282

8383
}
8484

85+
// testPrometheusEndpointSliceCleanup verifies that no Windows nodes remain in kubelet EndpointSlices.
// It lists the EndpointSlices in the kube-system namespace that carry the label
// kubernetes.io/service-name=kubelet, fetches the Node behind every endpoint whose TargetRef
// kind is "Node", and fails the test if any of those nodes is labeled kubernetes.io/os=windows.
// NOTE(review): the check runs once, with no polling or retry in this function - presumably the
// caller only invokes it after the EndpointSlices have had time to settle; confirm.
86+
func (tc *testContext) testPrometheusEndpointSliceCleanup(t *testing.T) {
87+
// List all EndpointSlices for the kubelet service in kube-system namespace
88+
endpointSlices, err := tc.client.K8s.DiscoveryV1().EndpointSlices("kube-system").List(
89+
context.TODO(),
90+
metav1.ListOptions{
91+
LabelSelector: "kubernetes.io/service-name=kubelet",
92+
},
93+
)
94+
require.NoError(t, err, "error listing EndpointSlices for kubelet service")
95+
96+
// Verify that no Windows node addresses appear in any EndpointSlice.
// Offending node names are collected first so the final assertion can report all of them at once.
97+
var foundWindowsNodes []string
98+
for _, slice := range endpointSlices.Items {
99+
for _, endpoint := range slice.Endpoints {
100+
// Check if this endpoint references a node; endpoints without a TargetRef, or whose
// TargetRef is of another kind, are skipped.
101+
if endpoint.TargetRef == nil || endpoint.TargetRef.Kind != "Node" {
102+
continue
103+
}
104+
// Try to get the node to check its OS. Any error from the Get (including a node that can
// no longer be found) fails the test right here via require.NoError.
// NOTE(review): this issues one Get per endpoint and uses context.Background(), while the
// List call above uses context.TODO().
105+
node, err := tc.client.K8s.CoreV1().Nodes().Get(context.Background(),
106+
endpoint.TargetRef.Name, metav1.GetOptions{})
107+
require.NoError(t, err, "node %s referenced in EndpointSlice not found - EndpointSlice not synced properly",
108+
endpoint.TargetRef.Name)
109+
// Check if this is a Windows node, based on its kubernetes.io/os label
110+
if nodeOS, exists := node.Labels["kubernetes.io/os"]; exists && nodeOS == "windows" {
111+
foundWindowsNodes = append(foundWindowsNodes, node.Name)
112+
}
113+
}
114+
}
115+
116+
require.Empty(t, foundWindowsNodes,
117+
"Found Windows nodes in kubelet EndpointSlices after deletion: %v", foundWindowsNodes)
118+
}
119+
85120
// PrometheusQuery defines the result of the /query request
86121
// Example Reference of Prometheus Query Response: https://prometheus.io/docs/prometheus/latest/querying/api/
87122
type PrometheusQuery struct {
@@ -239,9 +274,12 @@ func (tc *testContext) testPodMetrics(t *testing.T, podName string) {
239274
fmt.Sprintf("pod_interface_network:container_network_receive_bytes:irate5m{pod='%s',namespace='%s'}", podName, tc.workloadNamespace),
240275
fmt.Sprintf("pod_interface_network:container_network_transmit_bytes_total:irate5m{pod='%s',namespace='%s'}", podName, tc.workloadNamespace),
241276
}
277+
// Use extended timeout to account for EndpointSlice propagation, metric scraping,
278+
// and recording rule evaluation when using EndpointSlice-based service discovery
279+
podMetricsTimeout := 5 * time.Minute
242280
for i, query := range queries {
243281
t.Run("query "+strconv.Itoa(i), func(t *testing.T) {
244-
err := wait.PollUntilContextTimeout(context.TODO(), retry.Interval, retry.ResourceChangeTimeout, true,
282+
err := wait.PollUntilContextTimeout(context.TODO(), retry.Interval, podMetricsTimeout, true,
245283
func(ctx context.Context) (done bool, err error) {
246284
results, err := makePrometheusQuery(prometheusRoute.Spec.Host, query, prometheusToken)
247285
if err != nil {

vendor/github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/resource.go

Lines changed: 45 additions & 10 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)