
Commit 682ac55

tiraboschi authored and ingvagabund committed
Add an alert and a condition for RelieveAndMigrate without PSI
The RelieveAndMigrate profile requires PSI metrics to be enabled on the worker nodes (psi=1 kernel parameter). Check this directly for the node that is running the operator pod, by looking whether /proc/pressure/ is available inside the pod, and report the result with a condition. Also check it for all the worker nodes and report that with an alert. Signed-off-by: Simone Tiraboschi <[email protected]>
1 parent 15ea63c commit 682ac55

File tree

5 files changed: +239 -0 lines changed

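For context on the check described in the commit message: when the kernel runs with PSI accounting enabled (psi=1), it exposes pressure-stall information under /proc/pressure/, and that directory is what the operator probes from inside its own pod. A minimal sketch, not part of this commit, of what that interface looks like from a container (the operator itself only checks that the directory exists):

```go
package main

import (
	"fmt"
	"os"
)

func main() {
	// /proc/pressure/ is only present when the kernel has PSI accounting enabled.
	if _, err := os.Stat("/proc/pressure/"); err != nil {
		fmt.Println("PSI not available:", err)
		return
	}
	// The cpu, memory and io pressure files report "some" (and, where supported, "full")
	// stall averages over 10s/60s/300s windows plus a cumulative total in microseconds.
	for _, res := range []string{"cpu", "memory", "io"} {
		data, err := os.ReadFile("/proc/pressure/" + res)
		if err != nil {
			fmt.Printf("cannot read %s pressure: %v\n", res, err)
			continue
		}
		fmt.Printf("%s pressure:\n%s", res, data)
	}
}
```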

README.md

Lines changed: 3 additions & 0 deletions
@@ -180,6 +180,9 @@ relevant asymmetry between the descheduling and successive scheduling decisions.
 The soft taints set by the descheduler soft-tainter act as a hint for the scheduler to mitigate
 this asymmetry and foster a quicker convergence.
 
+This profile requires [PSI](https://docs.kernel.org/accounting/psi.html) metrics to be enabled (psi=1 kernel parameter)
+for all the worker nodes.
+
 The profile exposes the following customization:
 - `devLowNodeUtilizationThresholds`: Sets experimental thresholds for the LowNodeUtilization strategy.
 - `devActualUtilizationProfile`: Enable load-aware descheduling.
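The new README requirement can be verified per node by inspecting the kernel command line. A hedged sketch, not part of this commit; note that some kernels enable PSI accounting by default, so the absence of psi=1 is only a hint:

```go
package main

import (
	"fmt"
	"os"
	"strings"
)

func main() {
	// The psi=1 kernel parameter mentioned in the README shows up in /proc/cmdline.
	cmdline, err := os.ReadFile("/proc/cmdline")
	if err != nil {
		fmt.Println("cannot read the kernel command line:", err)
		return
	}
	for _, arg := range strings.Fields(string(cmdline)) {
		if arg == "psi=1" {
			fmt.Println("psi=1 is set on this node")
			return
		}
	}
	fmt.Println("psi=1 not found on the kernel command line")
}
```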
assets/kube-descheduler/psialert.yaml

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+---
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: descheduler-psi-alert
+  namespace: openshift-kube-descheduler-operator
+spec:
+  groups:
+    - name: recordingRules.alerts
+      rules:
+        - alert: DeschedulerPSIDisabled
+          expr: |-
+            count(kube_node_role{role="worker"}) >
+            (count(
+              descheduler:nodepressure:cpu:avg1m * on (instance) group_left (node) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
+              OR on() vector(0)
+            )
+          for: 0m
+          labels:
+            severity: critical
+          annotations:
+            summary: Kube Descheduler Operator is configured to consume a PSI metric but PSI is not enabled
+            description: "Kube Descheduler Operator is configured (devActualUtilizationProfile) to consume a PSI metric but PSI is not enabled for all the worker nodes (psi=1 kernel argument)"
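Reading the alert expression: the left-hand side counts all worker nodes, while the right-hand side counts the worker nodes for which the descheduler:nodepressure:cpu:avg1m recording rule (joined to kube_node_role on the instance/node labels) actually reports a sample, falling back to 0 when no node reports it at all. Whenever fewer worker nodes expose the PSI-based metric than exist in the cluster, DeschedulerPSIDisabled fires immediately (for: 0m) at critical severity.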

pkg/operator/target_config_reconciler.go

Lines changed: 54 additions & 0 deletions
@@ -3,6 +3,7 @@ package operator
 import (
 	"context"
 	"fmt"
+	"os"
 	"slices"
 	"strconv"
 	"strings"
@@ -67,6 +68,8 @@ import (
 
 const DefaultImage = "quay.io/openshift/origin-descheduler:latest"
 const kubeVirtShedulableLabelSelector = "kubevirt.io/schedulable=true"
+const psiPath = "/proc/pressure/"
+const EXPERIMENTAL_DISABLE_PSI_CHECK = "EXPERIMENTAL_DISABLE_PSI_CHECK"
 
 // deschedulerCommand provides descheduler command with policyconfigfile mounted as volume and log-level for backwards
 // compatibility with 3.11
@@ -88,6 +91,7 @@ type TargetConfigReconciler struct {
 	namespaceLister corev1listers.NamespaceLister
 	nodeLister      corev1listers.NodeLister
 	cache           resourceapply.ResourceCache
+	psiPath         string
 }
 
 func NewTargetConfigReconciler(
@@ -133,6 +137,7 @@ func NewTargetConfigReconciler(
 		namespaceLister: coreInformers.Core().V1().Namespaces().Lister(),
 		nodeLister:      coreInformers.Core().V1().Nodes().Lister(),
 		cache:           resourceapply.NewResourceCache(),
+		psiPath:         psiPath,
 	}
 
 	configInformer.Config().V1().Schedulers().Informer().AddEventHandler(c.eventHandler())
@@ -322,6 +327,16 @@ func (c TargetConfigReconciler) sync() error {
 		specAnnotations["clusterrolebindings/openshift-descheduler-operand"] = resourceVersion
 	}
 
+	if softtainterPSIAlert, _, err := c.managePSIAlert(descheduler, isSoftTainterNeeded); err != nil {
+		return err
+	} else {
+		resourceVersion := "0"
+		if softtainterPSIAlert != nil {
+			resourceVersion = softtainterPSIAlert.GetResourceVersion()
+		}
+		specAnnotations["prometheusrule/descheduler-psi-alert"] = resourceVersion
+	}
+
 	if role, _, err := c.manageRole(descheduler); err != nil {
 		return err
 	} else {
@@ -476,6 +491,22 @@ func (c *TargetConfigReconciler) managePrometheusRule(descheduler *deschedulerv1
 	return resourceapply.ApplyKnownUnstructured(c.ctx, c.dynamicClient, c.eventRecorder, required)
 }
 
+func (c *TargetConfigReconciler) managePSIAlert(descheduler *deschedulerv1.KubeDescheduler, stEnabled bool) (*unstructured.Unstructured, bool, error) {
+	required := resourceread.ReadUnstructuredOrDie(bindata.MustAsset("assets/kube-descheduler/psialert.yaml"))
+	ownerReference := metav1.OwnerReference{
+		APIVersion: "operator.openshift.io/v1",
+		Kind:       "KubeDescheduler",
+		Name:       descheduler.Name,
+		UID:        descheduler.UID,
+	}
+	required.SetOwnerReferences([]metav1.OwnerReference{ownerReference})
+	controller.EnsureOwnerRef(required, ownerReference)
+	if stEnabled {
+		return resourceapply.ApplyKnownUnstructured(c.ctx, c.dynamicClient, c.eventRecorder, required)
+	}
+	return resourceapply.DeleteKnownUnstructured(c.ctx, c.dynamicClient, c.eventRecorder, required)
+}
+
 func (c *TargetConfigReconciler) manageSoftTainterValidatingAdmissionPolicy(descheduler *deschedulerv1.KubeDescheduler, stEnabled bool) (*admissionv1.ValidatingAdmissionPolicy, bool, error) {
 	required := resourceread.ReadValidatingAdmissionPolicyV1OrDie(bindata.MustAsset("assets/kube-descheduler/softtaintervalidatingadmissionpolicy.yaml"))
 	ownerReference := metav1.OwnerReference{
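Like the other operands handled in sync(), the PrometheusRule is tracked through its resourceVersion in specAnnotations. managePSIAlert() applies the rule only while the soft-tainter (that is, the RelieveAndMigrate profile) is enabled and deletes it otherwise, and the owner reference to the KubeDescheduler CR means the alert is also cleaned up if the CR itself is removed.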
@@ -1340,6 +1371,13 @@ func (c *TargetConfigReconciler) manageConfigMap(descheduler *deschedulerv1.Kube
 		if !kvDeployed {
 			return nil, true, fmt.Errorf("profile %v can only be used when KubeVirt is properly deployed", deschedulerv1.RelieveAndMigrate)
 		}
+		psiEnabled, psierr := c.isPSIenabled()
+		if psierr != nil {
+			return nil, false, psierr
+		}
+		if !psiEnabled {
+			return nil, true, fmt.Errorf("profile %v can only be used when PSI metrics are enabled for the worker nodes", deschedulerv1.RelieveAndMigrate)
+		}
 		kubeVirtShedulable := kubeVirtShedulableLabelSelector
 		policy.NodeSelector = &kubeVirtShedulable
 		profile, err = relieveAndMigrateProfile(descheduler.Spec.ProfileCustomizations, includedNamespaces, excludedNamespaces, c.protectedNamespaces)
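This is the condition mentioned in the commit message: when the operator pod itself cannot see /proc/pressure/, manageConfigMap() refuses to render the RelieveAndMigrate policy and returns an error, which the operator's usual status handling is expected to surface as a failing condition on the KubeDescheduler resource.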
@@ -1629,6 +1667,22 @@ func (c *TargetConfigReconciler) isKubeVirtDeployed() (bool, error) {
 	return false, nil
 }
 
+func (c *TargetConfigReconciler) isPSIenabled() (bool, error) {
+	if boolValue, err := strconv.ParseBool(os.Getenv(EXPERIMENTAL_DISABLE_PSI_CHECK)); err == nil && boolValue {
+		return true, nil
+	}
+
+	_, err := os.Stat(c.psiPath)
+	if err == nil {
+		return true, nil
+	} else {
+		if os.IsNotExist(err) {
+			return false, nil
+		}
+		return false, err
+	}
+}
+
 func (c *TargetConfigReconciler) isSoftTainterNeeded(descheduler *deschedulerv1.KubeDescheduler) (bool, error) {
 	if slices.Contains(descheduler.Spec.Profiles, deschedulerv1.RelieveAndMigrate) {
 		return true, nil
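isPSIenabled() also honours an escape hatch: when EXPERIMENTAL_DISABLE_PSI_CHECK parses as true, the filesystem probe is skipped entirely. A hypothetical test, not part of this commit, that illustrates this behaviour from within the same package (it assumes the existing test file's imports; the real tests below instead redirect psiPath):

```go
func TestPSICheckCanBeDisabled(t *testing.T) {
	// The environment variable takes precedence over the (missing) psiPath.
	t.Setenv(EXPERIMENTAL_DISABLE_PSI_CHECK, "true")

	c := &TargetConfigReconciler{psiPath: "/definitely/not/there"}
	enabled, err := c.isPSIenabled()
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if !enabled {
		t.Fatal("expected the PSI check to be bypassed when EXPERIMENTAL_DISABLE_PSI_CHECK=true")
	}
}
```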

pkg/operator/target_config_reconciler_test.go

Lines changed: 63 additions & 0 deletions
@@ -3,6 +3,8 @@ package operator
 import (
 	"context"
 	"fmt"
+	"os"
+	"path"
 	"testing"
 	"time"
 	"unsafe"
@@ -107,6 +109,17 @@ func TestManageConfigMap(t *testing.T) {
 	fiveMinutes := metav1.Duration{Duration: fm}
 	priority := int32(1000)
 
+	tempPSIPath, err := os.MkdirTemp("", "unittest")
+	if err != nil {
+		t.Fatalf("Failed test: %v", err)
+	}
+	defer func(tempPSIPath string) {
+		err := os.RemoveAll(tempPSIPath)
+		if err != nil {
+			t.Fatalf("Failed test: %v", err)
+		}
+	}(tempPSIPath)
+
 	tests := []struct {
 		name            string
 		schedulerConfig *configv1.Scheduler
@@ -116,6 +129,7 @@ func TestManageConfigMap(t *testing.T) {
 		nodes           []runtime.Object
 		err             error
 		forceDeployment bool
+		missingPSI      bool
 	}{
 		{
 			name: "Podlifetime",
@@ -583,6 +597,36 @@ func TestManageConfigMap(t *testing.T) {
 			err:             fmt.Errorf("profile DevKubeVirtRelieveAndMigrate can only be used when KubeVirt is properly deployed"),
 			forceDeployment: true,
 		},
+		{
+			name: "RelieveAndMigrateWithoutPSI",
+			descheduler: &deschedulerv1.KubeDescheduler{
+				Spec: deschedulerv1.KubeDeschedulerSpec{
+					Profiles:              []deschedulerv1.DeschedulerProfile{"DevKubeVirtRelieveAndMigrate"},
+					ProfileCustomizations: &deschedulerv1.ProfileCustomizations{DevLowNodeUtilizationThresholds: &deschedulerv1.LowThreshold},
+				},
+			},
+			want: &corev1.ConfigMap{
+				TypeMeta: metav1.TypeMeta{APIVersion: "v1", Kind: "ConfigMap"},
+				Data:     map[string]string{"policy.yaml": string(bindata.MustAsset("assets/relieveAndMigrateLowConfig.yaml"))},
+			},
+			nodes: []runtime.Object{
+				&corev1.Node{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:   "node1",
+						Labels: map[string]string{"kubevirt.io/schedulable": "true"},
+					},
+				},
+				&corev1.Node{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:   "node2",
+						Labels: map[string]string{"kubevirt.io/schedulable": "true"},
+					},
+				},
+			},
+			missingPSI:      true,
+			err:             fmt.Errorf("profile DevKubeVirtRelieveAndMigrate can only be used when PSI metrics are enabled for the worker nodes"),
+			forceDeployment: true,
+		},
 		{
 			name: "AffinityAndTaintsWithNamespaces",
 			descheduler: &deschedulerv1.KubeDescheduler{
@@ -889,7 +933,13 @@ func TestManageConfigMap(t *testing.T) {
 			ctx, cancelFunc := context.WithCancel(context.TODO())
 			defer cancelFunc()
 
+			testPSIPath := tempPSIPath
+			if tt.missingPSI {
+				testPSIPath = path.Join(tempPSIPath, "MISSING")
+			}
+
 			targetConfigReconciler, _ := initTargetConfigReconciler(ctx, objects, []runtime.Object{tt.schedulerConfig}, tt.routes, nil)
+			targetConfigReconciler.psiPath = testPSIPath
 
 			got, forceDeployment, err := targetConfigReconciler.manageConfigMap(tt.descheduler)
 			if tt.err != nil {
@@ -1100,6 +1150,18 @@ func TestManageDeployment(t *testing.T) {
 func TestManageSoftTainterDeployment(t *testing.T) {
 	ctx, cancelFunc := context.WithCancel(context.TODO())
 	defer cancelFunc()
+
+	tempPSIPath, err := os.MkdirTemp("", "unittest")
+	if err != nil {
+		t.Fatalf("Failed test: %v", err)
+	}
+	defer func(path string) {
+		err := os.RemoveAll(path)
+		if err != nil {
+			t.Fatalf("Failed test: %v", err)
+		}
+	}(tempPSIPath)
+
 	expectedSoftTainterDeployment := &appsv1.Deployment{
 		TypeMeta: metav1.TypeMeta{APIVersion: "apps/v1", Kind: "Deployment"},
 		ObjectMeta: metav1.ObjectMeta{
@@ -1387,6 +1449,7 @@ func TestManageSoftTainterDeployment(t *testing.T) {
 
 func TestSync(t *testing.T) {
 	fakeRecorder := NewFakeRecorder(1024)
+
 	tests := []struct {
 		name                   string
 		targetConfigReconciler *TargetConfigReconciler
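The unit tests avoid any dependency on the host kernel by overriding the reconciler's psiPath: a real temporary directory stands in for /proc/pressure/ when PSI should look enabled, while the non-existent MISSING subdirectory of it simulates a node without PSI for the RelieveAndMigrateWithoutPSI case.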

test/e2e/operator_test.go

Lines changed: 96 additions & 0 deletions
@@ -59,6 +59,8 @@ var operatorConfigs = map[string]string{
 }
 var operatorConfigsAppliers = map[string]func() error{}
 
+const EXPERIMENTAL_DISABLE_PSI_CHECK = "EXPERIMENTAL_DISABLE_PSI_CHECK"
+
 func TestMain(m *testing.M) {
 	if os.Getenv("KUBECONFIG") == "" {
 		klog.Errorf("KUBECONFIG environment variable not set")
@@ -265,6 +267,24 @@ func TestSoftTainterDeployment(t *testing.T) {
 		}
 	}(ctx, kubeClient)
 
+	// patch the operator deployment to mock PSI
+	prevDisablePSIcheck, foundDisablePSIcheck := os.LookupEnv(EXPERIMENTAL_DISABLE_PSI_CHECK)
+	if err := mockPSIEnv(ctx, kubeClient); err != nil {
+		t.Fatalf("Failed mocking the PSI check environment variable on the operator deployment: %v", err)
+	}
+	defer func(ctx context.Context, kubeClient *k8sclient.Clientset, prevDisablePSIcheck string, foundDisablePSIcheck bool) {
+		err := unmockPSIEnv(ctx, kubeClient, prevDisablePSIcheck, foundDisablePSIcheck)
+		if err != nil {
+			t.Fatalf("Failed restoring the PSI check environment variable: %v", err)
+		}
+	}(ctx, kubeClient, prevDisablePSIcheck, foundDisablePSIcheck)
+	// wait for descheduler operator pod to be running
+	deschOpPod, err := waitForPodRunningByNamePrefix(ctx, kubeClient, operatorclient.OperatorNamespace, operatorclient.OperandName, operatorclient.OperandName+"-operator")
+	if err != nil {
+		t.Fatalf("Unable to wait for the Descheduler operator pod to run")
+	}
+	klog.Infof("Descheduler pod running in %v", deschOpPod.Name)
+
 	// apply devKubeVirtRelieveAndMigrate CR for the operator
 	if err := operatorConfigsAppliers[kubeVirtRelieveAndMigrateConf](); err != nil {
 		t.Fatalf("Unable to apply a CR for Descheduler operator: %v", err)
@@ -358,6 +378,24 @@ func TestSoftTainterVAP(t *testing.T) {
 		}
 	}(ctx, kubeClient)
 
+	// patch the operator deployment to mock PSI
+	prevDisablePSIcheck, foundDisablePSIcheck := os.LookupEnv(EXPERIMENTAL_DISABLE_PSI_CHECK)
+	if err := mockPSIEnv(ctx, kubeClient); err != nil {
+		t.Fatalf("Failed mocking the PSI check environment variable on the operator deployment: %v", err)
+	}
+	defer func(ctx context.Context, kubeClient *k8sclient.Clientset, prevDisablePSIcheck string, foundDisablePSIcheck bool) {
+		err := unmockPSIEnv(ctx, kubeClient, prevDisablePSIcheck, foundDisablePSIcheck)
+		if err != nil {
+			t.Fatalf("Failed restoring the PSI check environment variable: %v", err)
+		}
+	}(ctx, kubeClient, prevDisablePSIcheck, foundDisablePSIcheck)
+	// wait for descheduler operator pod to be running
+	deschOpPod, err := waitForPodRunningByNamePrefix(ctx, kubeClient, operatorclient.OperatorNamespace, operatorclient.OperandName, operatorclient.OperandName+"-operator")
+	if err != nil {
+		t.Fatalf("Unable to wait for the Descheduler operator pod to run")
+	}
+	klog.Infof("Descheduler pod running in %v", deschOpPod.Name)
+
 	// apply devKubeVirtRelieveAndMigrate CR for the operator
 	if err := operatorConfigsAppliers[kubeVirtRelieveAndMigrateConf](); err != nil {
 		t.Fatalf("Unable to apply a CR for Descheduler operator: %v", err)
@@ -825,6 +863,27 @@ func updateNodeAndRetryOnConflicts(ctx context.Context, kubeClient *k8sclient.Cl
 	return nil
 }
 
+func updateDeploymentAndRetryOnConflicts(ctx context.Context, kubeClient *k8sclient.Clientset, deployment *appsv1.Deployment, opts metav1.UpdateOptions) error {
+	uDeployment, uerr := kubeClient.AppsV1().Deployments(deployment.Namespace).Update(ctx, deployment, opts)
+	if uerr != nil {
+		if apierrors.IsConflict(uerr) {
+			if uDeployment.Name == "" {
+				uDeployment, uerr = kubeClient.AppsV1().Deployments(deployment.Namespace).Get(ctx, deployment.Name, metav1.GetOptions{})
+				if uerr != nil {
+					return uerr
+				}
+			}
+			deployment.Spec.DeepCopyInto(&uDeployment.Spec)
+			uDeployment.ObjectMeta.Labels = deployment.ObjectMeta.Labels
+			uDeployment.ObjectMeta.Annotations = deployment.ObjectMeta.Annotations
+			_, err := kubeClient.AppsV1().Deployments(deployment.Namespace).Update(ctx, uDeployment, opts)
+			return err
+		}
+		return uerr
+	}
+	return nil
+}
+
 func tryUpdatingTaintWithExpectation(ctx context.Context, t *testing.T, clientSet *k8sclient.Clientset, node *corev1.Node, taint *corev1.Taint, add, expectedSuccess bool) {
 	rnode, err := clientSet.CoreV1().Nodes().Get(ctx, node.Name, metav1.GetOptions{})
 	if err != nil {
@@ -886,3 +945,40 @@ func tryRemovingTaintWithExpectedSuccess(ctx context.Context, t *testing.T, clie
 func tryRemovingTaintWithExpectedFailure(ctx context.Context, t *testing.T, clientSet *k8sclient.Clientset, node *corev1.Node, taint *corev1.Taint) {
 	tryUpdatingTaintWithExpectation(ctx, t, clientSet, node, taint, false, false)
 }
+
+func mockPSIEnv(ctx context.Context, kubeClient *k8sclient.Clientset) error {
+	operatorDeployment, err := kubeClient.AppsV1().Deployments(operatorclient.OperatorNamespace).Get(ctx, operatorclient.OperandName+"-operator", metav1.GetOptions{})
+	if err != nil {
+		return err
+	}
+	operatorDeployment.Spec.Template.Spec.Containers[0].Env = append(
+		operatorDeployment.Spec.Template.Spec.Containers[0].Env,
+		v1.EnvVar{
+			Name:  EXPERIMENTAL_DISABLE_PSI_CHECK,
+			Value: "true",
+		})
+	return updateDeploymentAndRetryOnConflicts(ctx, kubeClient, operatorDeployment, metav1.UpdateOptions{})
+}
+
+func unmockPSIEnv(ctx context.Context, kubeClient *k8sclient.Clientset, prevDisablePSIcheck string, foundDisablePSIcheck bool) error {
+	operatorDeployment, err := kubeClient.AppsV1().Deployments(operatorclient.OperatorNamespace).Get(ctx, operatorclient.OperandName+"-operator", metav1.GetOptions{})
+	if err != nil {
+		return err
+	}
+	var envVars []v1.EnvVar
+	for _, e := range operatorDeployment.Spec.Template.Spec.Containers[0].Env {
+		if e.Name != EXPERIMENTAL_DISABLE_PSI_CHECK {
+			envVars = append(envVars, e)
+		}
+	}
+	if foundDisablePSIcheck {
+		envVars = append(
+			envVars,
+			v1.EnvVar{
+				Name:  EXPERIMENTAL_DISABLE_PSI_CHECK,
+				Value: prevDisablePSIcheck,
+			})
+	}
+	operatorDeployment.Spec.Template.Spec.Containers[0].Env = envVars
+	return updateDeploymentAndRetryOnConflicts(ctx, kubeClient, operatorDeployment, metav1.UpdateOptions{})
+}
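In the e2e suite the cluster's worker nodes may not have PSI enabled, so TestSoftTainterDeployment and TestSoftTainterVAP temporarily set EXPERIMENTAL_DISABLE_PSI_CHECK=true on the operator Deployment (mockPSIEnv), wait for the rolled-out operator pod to come back up, and restore the previous value of the variable afterwards (unmockPSIEnv).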
