Commit fc6d126

Merge pull request #507 from tiraboschi/psireporting
Add an alert and a condition for RelieveAndMigrate without PSI
2 parents 15ea63c + 5c56ad1 commit fc6d126

5 files changed: 239 additions, 0 deletions

5 files changed

+239
-0
lines changed

README.md

Lines changed: 3 additions & 0 deletions
@@ -180,6 +180,9 @@ relevant asymmetry between the descheduling and successive scheduling decisions.
 The soft taints set by the descheduler soft-tainter act as a hint for the scheduler to mitigate
 this asymmetry and foster a quicker convergence.
 
+This profile requires [PSI](https://docs.kernel.org/accounting/psi.html) metrics to be enabled (the `psi=1` kernel parameter)
+on all worker nodes.
+
 The profile exposes the following customization:
 - `devLowNodeUtilizationThresholds`: Sets experimental thresholds for the LowNodeUtilization strategy.
 - `devActualUtilizationProfile`: Enable load-aware descheduling.
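
On OpenShift, kernel arguments such as `psi=1` are typically rolled out to worker nodes with a MachineConfig. A minimal sketch (illustrative only, not part of this commit; the resource name is arbitrary):

apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfig
metadata:
  name: 99-worker-enable-psi
  labels:
    machineconfiguration.openshift.io/role: worker
spec:
  kernelArguments:
    - psi=1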
assets/kube-descheduler/psialert.yaml

Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
+---
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: descheduler-psi-alert
+  namespace: openshift-kube-descheduler-operator
+spec:
+  groups:
+    - name: recordingRules.alerts
+      rules:
+        - alert: DeschedulerPSIDisabled
+          expr: |-
+            count(kube_node_role{role="worker"}) >
+            (count(
+              descheduler:nodepressure:cpu:avg1m * on (instance) group_left (node) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
+              OR on() vector(0)
+            )
+          for: 0m
+          labels:
+            severity: critical
+          annotations:
+            summary: Kube Descheduler Operator is configured to consume a PSI metric but PSI is not enabled
+            description: "Kube Descheduler Operator is configured (devActualUtilizationProfile) to consume a PSI metric but PSI is not enabled for all the worker nodes (psi=1 kernel argument)"

pkg/operator/target_config_reconciler.go

Lines changed: 54 additions & 0 deletions
@@ -3,6 +3,7 @@ package operator
 import (
 	"context"
 	"fmt"
+	"os"
 	"slices"
 	"strconv"
 	"strings"
@@ -67,6 +68,8 @@ import (
 
 const DefaultImage = "quay.io/openshift/origin-descheduler:latest"
 const kubeVirtShedulableLabelSelector = "kubevirt.io/schedulable=true"
+const psiPath = "/proc/pressure/"
+const EXPERIMENTAL_DISABLE_PSI_CHECK = "EXPERIMENTAL_DISABLE_PSI_CHECK"
 
 // deschedulerCommand provides descheduler command with policyconfigfile mounted as volume and log-level for backwards
 // compatibility with 3.11
@@ -88,6 +91,7 @@ type TargetConfigReconciler struct {
 	namespaceLister corev1listers.NamespaceLister
 	nodeLister      corev1listers.NodeLister
 	cache           resourceapply.ResourceCache
+	psiPath         string
 }
 
 func NewTargetConfigReconciler(
@@ -133,6 +137,7 @@ func NewTargetConfigReconciler(
 		namespaceLister: coreInformers.Core().V1().Namespaces().Lister(),
 		nodeLister:      coreInformers.Core().V1().Nodes().Lister(),
 		cache:           resourceapply.NewResourceCache(),
+		psiPath:         psiPath,
 	}
 
 	configInformer.Config().V1().Schedulers().Informer().AddEventHandler(c.eventHandler())
@@ -322,6 +327,16 @@ func (c TargetConfigReconciler) sync() error {
 		specAnnotations["clusterrolebindings/openshift-descheduler-operand"] = resourceVersion
 	}
 
+	if softtainterPSIAlert, _, err := c.managePSIAlert(descheduler, isSoftTainterNeeded); err != nil {
+		return err
+	} else {
+		resourceVersion := "0"
+		if softtainterPSIAlert != nil {
+			resourceVersion = softtainterPSIAlert.GetResourceVersion()
+		}
+		specAnnotations["prometheusrule/descheduler-psi-alert"] = resourceVersion
+	}
+
 	if role, _, err := c.manageRole(descheduler); err != nil {
 		return err
 	} else {
@@ -476,6 +491,22 @@ func (c *TargetConfigReconciler) managePrometheusRule(descheduler *deschedulerv1
 	return resourceapply.ApplyKnownUnstructured(c.ctx, c.dynamicClient, c.eventRecorder, required)
 }
 
+func (c *TargetConfigReconciler) managePSIAlert(descheduler *deschedulerv1.KubeDescheduler, stEnabled bool) (*unstructured.Unstructured, bool, error) {
+	required := resourceread.ReadUnstructuredOrDie(bindata.MustAsset("assets/kube-descheduler/psialert.yaml"))
+	ownerReference := metav1.OwnerReference{
+		APIVersion: "operator.openshift.io/v1",
+		Kind:       "KubeDescheduler",
+		Name:       descheduler.Name,
+		UID:        descheduler.UID,
+	}
+	required.SetOwnerReferences([]metav1.OwnerReference{ownerReference})
+	controller.EnsureOwnerRef(required, ownerReference)
+	if stEnabled {
+		return resourceapply.ApplyKnownUnstructured(c.ctx, c.dynamicClient, c.eventRecorder, required)
+	}
+	return resourceapply.DeleteKnownUnstructured(c.ctx, c.dynamicClient, c.eventRecorder, required)
+}
+
 func (c *TargetConfigReconciler) manageSoftTainterValidatingAdmissionPolicy(descheduler *deschedulerv1.KubeDescheduler, stEnabled bool) (*admissionv1.ValidatingAdmissionPolicy, bool, error) {
 	required := resourceread.ReadValidatingAdmissionPolicyV1OrDie(bindata.MustAsset("assets/kube-descheduler/softtaintervalidatingadmissionpolicy.yaml"))
 	ownerReference := metav1.OwnerReference{
@@ -1340,6 +1371,13 @@ func (c *TargetConfigReconciler) manageConfigMap(descheduler *deschedulerv1.Kube
 		if !kvDeployed {
 			return nil, true, fmt.Errorf("profile %v can only be used when KubeVirt is properly deployed", deschedulerv1.RelieveAndMigrate)
 		}
+		psiEnabled, psierr := c.isPSIenabled()
+		if psierr != nil {
+			return nil, false, psierr
+		}
+		if !psiEnabled {
+			return nil, true, fmt.Errorf("profile %v can only be used when PSI metrics are enabled for the worker nodes", deschedulerv1.RelieveAndMigrate)
+		}
 		kubeVirtShedulable := kubeVirtShedulableLabelSelector
 		policy.NodeSelector = &kubeVirtShedulable
 		profile, err = relieveAndMigrateProfile(descheduler.Spec.ProfileCustomizations, includedNamespaces, excludedNamespaces, c.protectedNamespaces)
@@ -1629,6 +1667,22 @@ func (c *TargetConfigReconciler) isKubeVirtDeployed() (bool, error) {
 	return false, nil
 }
 
+func (c *TargetConfigReconciler) isPSIenabled() (bool, error) {
+	if boolValue, err := strconv.ParseBool(os.Getenv(EXPERIMENTAL_DISABLE_PSI_CHECK)); err == nil && boolValue {
+		return true, nil
+	}
+
+	_, err := os.Stat(c.psiPath)
+	if err == nil {
+		return true, nil
+	} else {
+		if os.IsNotExist(err) {
+			return false, nil
+		}
+		return false, err
+	}
+}
+
 func (c *TargetConfigReconciler) isSoftTainterNeeded(descheduler *deschedulerv1.KubeDescheduler) (bool, error) {
 	if slices.Contains(descheduler.Spec.Profiles, deschedulerv1.RelieveAndMigrate) {
 		return true, nil

pkg/operator/target_config_reconciler_test.go

Lines changed: 63 additions & 0 deletions
@@ -3,6 +3,8 @@ package operator
 import (
 	"context"
 	"fmt"
+	"os"
+	"path"
 	"testing"
 	"time"
 	"unsafe"
@@ -107,6 +109,17 @@ func TestManageConfigMap(t *testing.T) {
 	fiveMinutes := metav1.Duration{Duration: fm}
 	priority := int32(1000)
 
+	tempPSIPath, err := os.MkdirTemp("", "unittest")
+	if err != nil {
+		t.Fatalf("Failed test: %v", err)
+	}
+	defer func(tempPSIPath string) {
+		err := os.RemoveAll(tempPSIPath)
+		if err != nil {
+			t.Fatalf("Failed test: %v", err)
+		}
+	}(tempPSIPath)
+
 	tests := []struct {
 		name            string
 		schedulerConfig *configv1.Scheduler
@@ -116,6 +129,7 @@
 		nodes           []runtime.Object
 		err             error
 		forceDeployment bool
+		missingPSI      bool
 	}{
 		{
 			name: "Podlifetime",
@@ -583,6 +597,36 @@
 			err:             fmt.Errorf("profile DevKubeVirtRelieveAndMigrate can only be used when KubeVirt is properly deployed"),
 			forceDeployment: true,
 		},
+		{
+			name: "RelieveAndMigrateWithoutPSI",
+			descheduler: &deschedulerv1.KubeDescheduler{
+				Spec: deschedulerv1.KubeDeschedulerSpec{
+					Profiles:              []deschedulerv1.DeschedulerProfile{"DevKubeVirtRelieveAndMigrate"},
+					ProfileCustomizations: &deschedulerv1.ProfileCustomizations{DevLowNodeUtilizationThresholds: &deschedulerv1.LowThreshold},
+				},
+			},
+			want: &corev1.ConfigMap{
+				TypeMeta: metav1.TypeMeta{APIVersion: "v1", Kind: "ConfigMap"},
+				Data:     map[string]string{"policy.yaml": string(bindata.MustAsset("assets/relieveAndMigrateLowConfig.yaml"))},
+			},
+			nodes: []runtime.Object{
+				&corev1.Node{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:   "node1",
+						Labels: map[string]string{"kubevirt.io/schedulable": "true"},
+					},
+				},
+				&corev1.Node{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:   "node2",
+						Labels: map[string]string{"kubevirt.io/schedulable": "true"},
+					},
+				},
+			},
+			missingPSI:      true,
+			err:             fmt.Errorf("profile DevKubeVirtRelieveAndMigrate can only be used when PSI metrics are enabled for the worker nodes"),
+			forceDeployment: true,
+		},
 		{
 			name: "AffinityAndTaintsWithNamespaces",
 			descheduler: &deschedulerv1.KubeDescheduler{
@@ -889,7 +933,13 @@
 			ctx, cancelFunc := context.WithCancel(context.TODO())
 			defer cancelFunc()
 
+			testPSIPath := tempPSIPath
+			if tt.missingPSI {
+				testPSIPath = path.Join(tempPSIPath, "MISSING")
+			}
+
 			targetConfigReconciler, _ := initTargetConfigReconciler(ctx, objects, []runtime.Object{tt.schedulerConfig}, tt.routes, nil)
+			targetConfigReconciler.psiPath = testPSIPath
 
 			got, forceDeployment, err := targetConfigReconciler.manageConfigMap(tt.descheduler)
 			if tt.err != nil {
@@ -1100,6 +1150,18 @@ func TestManageDeployment(t *testing.T) {
 func TestManageSoftTainterDeployment(t *testing.T) {
 	ctx, cancelFunc := context.WithCancel(context.TODO())
 	defer cancelFunc()
+
+	tempPSIPath, err := os.MkdirTemp("", "unittest")
+	if err != nil {
+		t.Fatalf("Failed test: %v", err)
+	}
+	defer func(path string) {
+		err := os.RemoveAll(path)
+		if err != nil {
+			t.Fatalf("Failed test: %v", err)
+		}
+	}(tempPSIPath)
+
 	expectedSoftTainterDeployment := &appsv1.Deployment{
 		TypeMeta: metav1.TypeMeta{APIVersion: "apps/v1", Kind: "Deployment"},
 		ObjectMeta: metav1.ObjectMeta{
@@ -1387,6 +1449,7 @@
 
 func TestSync(t *testing.T) {
 	fakeRecorder := NewFakeRecorder(1024)
+
 	tests := []struct {
 		name                   string
 		targetConfigReconciler *TargetConfigReconciler
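
The table-driven cases above exercise the missing /proc/pressure path through psiPath; the EXPERIMENTAL_DISABLE_PSI_CHECK override is not covered. A hypothetical sketch of such a test (not part of this commit, assumed to live in the same operator package):

// Hypothetical test: with EXPERIMENTAL_DISABLE_PSI_CHECK=true, isPSIenabled
// reports PSI as available even when the pressure path does not exist.
func TestIsPSIEnabledOverride(t *testing.T) {
	t.Setenv(EXPERIMENTAL_DISABLE_PSI_CHECK, "true")
	c := &TargetConfigReconciler{psiPath: "/nonexistent/pressure/"}
	enabled, err := c.isPSIenabled()
	if err != nil || !enabled {
		t.Fatalf("expected the PSI check to be skipped, got enabled=%v err=%v", enabled, err)
	}
}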

test/e2e/operator_test.go

Lines changed: 96 additions & 0 deletions
@@ -59,6 +59,8 @@ var operatorConfigs = map[string]string{
 }
 var operatorConfigsAppliers = map[string]func() error{}
 
+const EXPERIMENTAL_DISABLE_PSI_CHECK = "EXPERIMENTAL_DISABLE_PSI_CHECK"
+
 func TestMain(m *testing.M) {
 	if os.Getenv("KUBECONFIG") == "" {
 		klog.Errorf("KUBECONFIG environment variable not set")
@@ -265,6 +267,24 @@ func TestSoftTainterDeployment(t *testing.T) {
 		}
 	}(ctx, kubeClient)
 
+	// patch the operator deployment to mock PSI
+	prevDisablePSIcheck, foundDisablePSIcheck := os.LookupEnv(EXPERIMENTAL_DISABLE_PSI_CHECK)
+	if err := mockPSIEnv(ctx, kubeClient); err != nil {
+		t.Fatalf("Failed mocking the PSI check environment variable on the operator deployment: %v", err)
+	}
+	defer func(ctx context.Context, kubeClient *k8sclient.Clientset, prevDisablePSIcheck string, foundDisablePSIcheck bool) {
+		err := unmockPSIEnv(ctx, kubeClient, prevDisablePSIcheck, foundDisablePSIcheck)
+		if err != nil {
+			t.Fatalf("Failed restoring the PSI check environment variable: %v", err)
+		}
+	}(ctx, kubeClient, prevDisablePSIcheck, foundDisablePSIcheck)
+	// wait for descheduler operator pod to be running
+	deschOpPod, err := waitForPodRunningByNamePrefix(ctx, kubeClient, operatorclient.OperatorNamespace, operatorclient.OperandName, operatorclient.OperandName+"-operator")
+	if err != nil {
+		t.Fatalf("Unable to wait for the Descheduler operator pod to run")
+	}
+	klog.Infof("Descheduler pod running in %v", deschOpPod.Name)
+
 	// apply devKubeVirtRelieveAndMigrate CR for the operator
 	if err := operatorConfigsAppliers[kubeVirtRelieveAndMigrateConf](); err != nil {
 		t.Fatalf("Unable to apply a CR for Descheduler operator: %v", err)
@@ -358,6 +378,24 @@ func TestSoftTainterVAP(t *testing.T) {
 		}
 	}(ctx, kubeClient)
 
+	// patch the operator deployment to mock PSI
+	prevDisablePSIcheck, foundDisablePSIcheck := os.LookupEnv(EXPERIMENTAL_DISABLE_PSI_CHECK)
+	if err := mockPSIEnv(ctx, kubeClient); err != nil {
+		t.Fatalf("Failed mocking the PSI check environment variable on the operator deployment: %v", err)
+	}
+	defer func(ctx context.Context, kubeClient *k8sclient.Clientset, prevDisablePSIcheck string, foundDisablePSIcheck bool) {
+		err := unmockPSIEnv(ctx, kubeClient, prevDisablePSIcheck, foundDisablePSIcheck)
+		if err != nil {
+			t.Fatalf("Failed restoring the PSI check environment variable: %v", err)
+		}
+	}(ctx, kubeClient, prevDisablePSIcheck, foundDisablePSIcheck)
+	// wait for descheduler operator pod to be running
+	deschOpPod, err := waitForPodRunningByNamePrefix(ctx, kubeClient, operatorclient.OperatorNamespace, operatorclient.OperandName, operatorclient.OperandName+"-operator")
+	if err != nil {
+		t.Fatalf("Unable to wait for the Descheduler operator pod to run")
+	}
+	klog.Infof("Descheduler pod running in %v", deschOpPod.Name)
+
 	// apply devKubeVirtRelieveAndMigrate CR for the operator
 	if err := operatorConfigsAppliers[kubeVirtRelieveAndMigrateConf](); err != nil {
 		t.Fatalf("Unable to apply a CR for Descheduler operator: %v", err)
@@ -825,6 +863,27 @@ func updateNodeAndRetryOnConflicts(ctx context.Context, kubeClient *k8sclient.Cl
 	return nil
 }
 
+func updateDeploymentAndRetryOnConflicts(ctx context.Context, kubeClient *k8sclient.Clientset, deployment *appsv1.Deployment, opts metav1.UpdateOptions) error {
+	_, uerr := kubeClient.AppsV1().Deployments(deployment.Namespace).Update(ctx, deployment, opts)
+	if uerr != nil {
+		if apierrors.IsConflict(uerr) {
+			// on conflict, re-read the live deployment and reapply the desired spec and metadata on top of it
+			uDeployment, gerr := kubeClient.AppsV1().Deployments(deployment.Namespace).Get(ctx, deployment.Name, metav1.GetOptions{})
+			if gerr != nil {
+				return gerr
+			}
+			deployment.Spec.DeepCopyInto(&uDeployment.Spec)
+			uDeployment.ObjectMeta.Labels = deployment.ObjectMeta.Labels
+			uDeployment.ObjectMeta.Annotations = deployment.ObjectMeta.Annotations
+			_, err := kubeClient.AppsV1().Deployments(deployment.Namespace).Update(ctx, uDeployment, opts)
+			return err
+		}
+		return uerr
+	}
+	return nil
+}
+
 func tryUpdatingTaintWithExpectation(ctx context.Context, t *testing.T, clientSet *k8sclient.Clientset, node *corev1.Node, taint *corev1.Taint, add, expectedSuccess bool) {
 	rnode, err := clientSet.CoreV1().Nodes().Get(ctx, node.Name, metav1.GetOptions{})
 	if err != nil {
@@ -886,3 +945,40 @@ func tryRemovingTaintWithExpectedSuccess(ctx context.Context, t *testing.T, clie
 func tryRemovingTaintWithExpectedFailure(ctx context.Context, t *testing.T, clientSet *k8sclient.Clientset, node *corev1.Node, taint *corev1.Taint) {
 	tryUpdatingTaintWithExpectation(ctx, t, clientSet, node, taint, false, false)
 }
+
+// mockPSIEnv sets EXPERIMENTAL_DISABLE_PSI_CHECK=true on the operator deployment so the
+// operator skips the /proc/pressure check on clusters where PSI is not enabled.
+func mockPSIEnv(ctx context.Context, kubeClient *k8sclient.Clientset) error {
+	operatorDeployment, err := kubeClient.AppsV1().Deployments(operatorclient.OperatorNamespace).Get(ctx, operatorclient.OperandName+"-operator", metav1.GetOptions{})
+	if err != nil {
+		return err
+	}
+	operatorDeployment.Spec.Template.Spec.Containers[0].Env = append(
+		operatorDeployment.Spec.Template.Spec.Containers[0].Env,
+		v1.EnvVar{
+			Name:  EXPERIMENTAL_DISABLE_PSI_CHECK,
+			Value: "true",
+		})
+	return updateDeploymentAndRetryOnConflicts(ctx, kubeClient, operatorDeployment, metav1.UpdateOptions{})
+}
+
+// unmockPSIEnv removes the injected variable and, if it was set before the test, restores
+// its previous value.
+func unmockPSIEnv(ctx context.Context, kubeClient *k8sclient.Clientset, prevDisablePSIcheck string, foundDisablePSIcheck bool) error {
+	operatorDeployment, err := kubeClient.AppsV1().Deployments(operatorclient.OperatorNamespace).Get(ctx, operatorclient.OperandName+"-operator", metav1.GetOptions{})
+	if err != nil {
+		return err
+	}
+	var envVars []v1.EnvVar
+	for _, e := range operatorDeployment.Spec.Template.Spec.Containers[0].Env {
+		if e.Name != EXPERIMENTAL_DISABLE_PSI_CHECK {
+			envVars = append(envVars, e)
+		}
+	}
+	if foundDisablePSIcheck {
+		envVars = append(
+			envVars,
+			v1.EnvVar{
+				Name:  EXPERIMENTAL_DISABLE_PSI_CHECK,
+				Value: prevDisablePSIcheck,
+			})
+	}
+	operatorDeployment.Spec.Template.Spec.Containers[0].Env = envVars
+	return updateDeploymentAndRetryOnConflicts(ctx, kubeClient, operatorDeployment, metav1.UpdateOptions{})
+}
