From 9dc8e0f79bba03f2c7e6c388c05a73c3bf47b3d0 Mon Sep 17 00:00:00 2001 From: MenD32 Date: Fri, 23 May 2025 12:55:30 +0300 Subject: [PATCH 01/12] feat: added consumedCounters into pool util calculation Signed-off-by: MenD32 --- cluster-autoscaler/go.mod | 18 ++++---- cluster-autoscaler/go.sum | 40 ++++++----------- .../dynamicresources/utils/utilization.go | 44 +++++++++++++++++-- 3 files changed, 64 insertions(+), 38 deletions(-) diff --git a/cluster-autoscaler/go.mod b/cluster-autoscaler/go.mod index edd423305c66..acfa1325ac8b 100644 --- a/cluster-autoscaler/go.mod +++ b/cluster-autoscaler/go.mod @@ -32,16 +32,17 @@ require ( github.com/stretchr/testify v1.10.0 github.com/vburenin/ifacemaker v1.2.1 go.uber.org/mock v0.4.0 - golang.org/x/net v0.33.0 + golang.org/x/crypto v0.36.0 + golang.org/x/net v0.38.0 golang.org/x/oauth2 v0.27.0 - golang.org/x/sys v0.30.0 + golang.org/x/sys v0.31.0 google.golang.org/api v0.151.0 google.golang.org/grpc v1.68.1 google.golang.org/protobuf v1.36.5 gopkg.in/gcfg.v1 v1.2.3 gopkg.in/yaml.v2 v2.4.0 - k8s.io/api v0.33.0-beta.0 - k8s.io/apimachinery v0.33.0-beta.0 + k8s.io/api v0.33.1 + k8s.io/apimachinery v0.33.1 k8s.io/apiserver v0.33.0-beta.0 k8s.io/autoscaler/cluster-autoscaler/apis v0.0.0-20240627115740-d52e4b9665d7 k8s.io/client-go v0.33.0-beta.0 @@ -186,12 +187,11 @@ require ( go.opentelemetry.io/proto/otlp v1.4.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.0 // indirect - golang.org/x/crypto v0.35.0 // indirect golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect golang.org/x/mod v0.21.0 // indirect - golang.org/x/sync v0.11.0 // indirect - golang.org/x/term v0.29.0 // indirect - golang.org/x/text v0.22.0 // indirect + golang.org/x/sync v0.12.0 // indirect + golang.org/x/term v0.30.0 // indirect + golang.org/x/text v0.23.0 // indirect golang.org/x/time v0.9.0 // indirect golang.org/x/tools v0.26.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20241209162323-e6fa225c2576 // indirect @@ -226,7 +226,7 @@ replace github.com/digitalocean/godo => github.com/digitalocean/godo v1.27.0 replace github.com/rancher/go-rancher => github.com/rancher/go-rancher v0.1.0 -replace k8s.io/api => k8s.io/api v0.33.0-beta.0 +replace k8s.io/api => k8s.io/api v0.33.1 replace k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.33.0-beta.0 diff --git a/cluster-autoscaler/go.sum b/cluster-autoscaler/go.sum index 055b91a23a42..5d9dd2f9ac55 100644 --- a/cluster-autoscaler/go.sum +++ b/cluster-autoscaler/go.sum @@ -4,7 +4,6 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go/compute/metadata v0.5.0 h1:Zr0eK8JbFv6+Wi4ilXAR8FJ3wyNdpxHKJNPos6LTZOY= cloud.google.com/go/compute/metadata v0.5.0/go.mod h1:aHnloV2TPI38yx4s9+wAZhHykWvVCfu7hQbF+9CWoiY= -github.com/Azure/azure-sdk-for-go v46.0.0+incompatible/go.mod h1:9XXNKU+eRnpl9moKnB4QOLf1HestfXbmab5FXxiDBjc= github.com/Azure/azure-sdk-for-go v68.0.0+incompatible h1:fcYLmCpyNYRnvJbPerq7U0hS+6+I79yEDJBqVNcqUzU= github.com/Azure/azure-sdk-for-go v68.0.0+incompatible/go.mod h1:9XXNKU+eRnpl9moKnB4QOLf1HestfXbmab5FXxiDBjc= github.com/Azure/azure-sdk-for-go-extensions v0.1.6 h1:EXGvDcj54u98XfaI/Cy65Ds6vNsIJeGKYf0eNLB1y4Q= @@ -45,12 +44,9 @@ github.com/Azure/go-armbalancer v0.0.2 h1:NVnxsTWHI5/fEzL6k6TjxPUfcB/3Si3+HFOZXO github.com/Azure/go-armbalancer v0.0.2/go.mod h1:yTg7MA/8YnfKQc9o97tzAJ7fbdVkod1xGsIvKmhYPRE= github.com/Azure/go-autorest 
v14.2.0+incompatible h1:V5VMDjClD3GiElqLWO7mz2MxNAK/vTfRHdAubSIPRgs= github.com/Azure/go-autorest v14.2.0+incompatible/go.mod h1:r+4oMnoxhatjLLJ6zxSWATqVooLgysK6ZNox3g/xq24= -github.com/Azure/go-autorest/autorest v0.11.4/go.mod h1:JFgpikqFJ/MleTTxwepExTKnFUKKszPS8UavbQYUMuw= github.com/Azure/go-autorest/autorest v0.11.28/go.mod h1:MrkzG3Y3AH668QyF9KRk5neJnGgmhQ6krbhR8Q5eMvA= github.com/Azure/go-autorest/autorest v0.11.29 h1:I4+HL/JDvErx2LjyzaVxllw2lRDB5/BT2Bm4g20iqYw= github.com/Azure/go-autorest/autorest v0.11.29/go.mod h1:ZtEzC4Jy2JDrZLxvWs8LrBWEBycl1hbT1eknI8MtfAs= -github.com/Azure/go-autorest/autorest/adal v0.9.0/go.mod h1:/c022QCutn2P7uY+/oQWWNcK9YU+MH96NgK+jErpbcg= -github.com/Azure/go-autorest/autorest/adal v0.9.2/go.mod h1:/3SMAM86bP6wC9Ev35peQDUeqFZBMH07vvUOmg4z/fE= github.com/Azure/go-autorest/autorest/adal v0.9.18/go.mod h1:XVVeme+LZwABT8K5Lc3hA4nAe8LDBVle26gTrguhhPQ= github.com/Azure/go-autorest/autorest/adal v0.9.22/go.mod h1:XuAbAEUv2Tta//+voMI038TrJBqjKam0me7qR+L8Cmk= github.com/Azure/go-autorest/autorest/adal v0.9.24 h1:BHZfgGsGwdkHDyZdtQRQk1WeUdW0m2WPAwuHZwUi5i4= @@ -61,22 +57,17 @@ github.com/Azure/go-autorest/autorest/azure/cli v0.4.6 h1:w77/uPk80ZET2F+AfQExZy github.com/Azure/go-autorest/autorest/azure/cli v0.4.6/go.mod h1:piCfgPho7BiIDdEQ1+g4VmKyD5y+p/XtSNqE6Hc4QD0= github.com/Azure/go-autorest/autorest/date v0.3.0 h1:7gUk1U5M/CQbp9WoqinNzJar+8KY+LPI6wiWrP/myHw= github.com/Azure/go-autorest/autorest/date v0.3.0/go.mod h1:BI0uouVdmngYNUzGWeSYnokU+TrmwEsOqdt8Y6sso74= -github.com/Azure/go-autorest/autorest/mocks v0.4.0/go.mod h1:LTp+uSrOhSkaKrUy935gNZuuIPPVsHlr9DSOxSayd+k= github.com/Azure/go-autorest/autorest/mocks v0.4.1/go.mod h1:LTp+uSrOhSkaKrUy935gNZuuIPPVsHlr9DSOxSayd+k= github.com/Azure/go-autorest/autorest/mocks v0.4.2 h1:PGN4EDXnuQbojHbU0UWoNvmu9AGVwYHG9/fkDYhtAfw= github.com/Azure/go-autorest/autorest/mocks v0.4.2/go.mod h1:Vy7OitM9Kei0i1Oj+LvyAWMXJHeKH1MVlzFugfVrmyU= github.com/Azure/go-autorest/autorest/to v0.4.0 h1:oXVqrxakqqV1UZdSazDOPOLvOIz+XA683u8EctwboHk= github.com/Azure/go-autorest/autorest/to v0.4.0/go.mod h1:fE8iZBn7LQR7zH/9XU2NcPR4o9jEImooCeWJcYV/zLE= -github.com/Azure/go-autorest/autorest/validation v0.3.0/go.mod h1:yhLgjC0Wda5DYXl6JAsWyUe4KVNffhoDhG0zVzUMo3E= github.com/Azure/go-autorest/autorest/validation v0.3.1 h1:AgyqjAd94fwNAoTjl/WQXg4VvFeRFpO+UhNyRXqF1ac= github.com/Azure/go-autorest/autorest/validation v0.3.1/go.mod h1:yhLgjC0Wda5DYXl6JAsWyUe4KVNffhoDhG0zVzUMo3E= -github.com/Azure/go-autorest/logger v0.2.0/go.mod h1:T9E3cAhj2VqvPOtCYAvby9aBXkZmbF5NWuPV8+WeEW8= github.com/Azure/go-autorest/logger v0.2.1 h1:IG7i4p/mDa2Ce4TRyAO8IHnVhAVF3RFU+ZtXWSmf4Tg= github.com/Azure/go-autorest/logger v0.2.1/go.mod h1:T9E3cAhj2VqvPOtCYAvby9aBXkZmbF5NWuPV8+WeEW8= github.com/Azure/go-autorest/tracing v0.6.0 h1:TYi4+3m5t6K48TGI9AUdb+IzbnSxvnvUMfuitfgcfuo= github.com/Azure/go-autorest/tracing v0.6.0/go.mod h1:+vhtPC754Xsa23ID7GlGsrdKBpUA79WCAKPPZVC2DeU= -github.com/Azure/skewer v0.0.14 h1:0mzUJhspECkajYyynYsOCp//E2PSnYXrgP45bcskqfQ= -github.com/Azure/skewer v0.0.14/go.mod h1:6WTecuPyfGtuvS8Mh4JYWuHhO4kcWycGfsUBB+XTFG4= github.com/Azure/skewer v0.0.19 h1:+qA1z8isKmlNkhAwZErNS2wD2jaemSk9NszYKr8dddU= github.com/Azure/skewer v0.0.19/go.mod h1:LVH7jmduRKmPj8YcIz7V4f53xJEntjweL4aoLyChkwk= github.com/AzureAD/microsoft-authentication-library-for-go v1.2.2 h1:XHOnouVk1mxXfQidrMEnLlPk9UMeRtyBTnEFtxkV0kU= @@ -139,7 +130,6 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs github.com/davecgh/go-spew v1.1.1/go.mod 
h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= github.com/digitalocean/godo v1.27.0 h1:78iE9oVvTnAEqhMip2UHFvL01b8LJcydbNUpr0cAmN4= github.com/digitalocean/godo v1.27.0/go.mod h1:iJnN9rVu6K5LioLxLimlq0uRI+y/eAQjROUmeU/r0hY= github.com/dimchansky/utfbom v1.1.1 h1:vV6w1AhK4VMnhBno/TPVCoK9U/LP0PkLCS9tbxHdi/U= @@ -232,7 +222,6 @@ github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMyw github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= @@ -442,13 +431,12 @@ go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.0.0-20200820211705-5c72a883971a/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.6.0/go.mod h1:OFC/31mSvZgRz0V1QTNCzfAI1aIRzbiufJtkMIlEp58= golang.org/x/crypto v0.17.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4= -golang.org/x/crypto v0.35.0 h1:b15kiHdrGCHrP6LvwaQ3c03kgNhhiMgvlhxHQhmg2Xs= -golang.org/x/crypto v0.35.0/go.mod h1:dy7dXNW32cAb/6/PRuTNsix8T+vJAqvuIy5Bli/x0YQ= +golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34= +golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8= golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= @@ -479,8 +467,8 @@ golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= -golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= -golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= +golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= +golang.org/x/net v0.38.0/go.mod 
h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190402181905-9f3314589c9a/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.27.0 h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M= @@ -494,8 +482,8 @@ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w= -golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw= +golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -512,16 +500,16 @@ golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= -golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik= +golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= golang.org/x/term v0.15.0/go.mod h1:BDl952bC7+uMoWR75FIrCDx79TPU9oHkTZ9yRbYOrX0= -golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU= -golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s= +golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y= +golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= @@ -530,8 +518,8 @@ golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= -golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM= -golang.org/x/text 
v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY= +golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= +golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -605,8 +593,8 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= -k8s.io/api v0.33.0-beta.0 h1:/sAUrfXsjKPST2mZjpWhjRdzSR6SD5KlJpiOgCQQhAQ= -k8s.io/api v0.33.0-beta.0/go.mod h1:TYyCgedkG4OVS4+4D2n25BdbMcexMSLx6Y7OkAzkxLQ= +k8s.io/api v0.33.1 h1:tA6Cf3bHnLIrUK4IqEgb2v++/GYUtqiu9sRVk3iBXyw= +k8s.io/api v0.33.1/go.mod h1:87esjTn9DRSRTD4fWMXamiXxJhpOIREjWOSjsW1kEHw= k8s.io/apiextensions-apiserver v0.33.0-beta.0 h1:3oqBvfd26IOekt96KEfE8A0wA/k1wDSBfTPirkRun1Q= k8s.io/apiextensions-apiserver v0.33.0-beta.0/go.mod h1:TKTeoFcmGvtiDNV+wj8wJfZhamZNOhvi9yOIE2d1iWs= k8s.io/apimachinery v0.33.0-beta.0 h1:vLDBChfQwyimk6AbuT7OZOIqxSg/44JlXuxqBk85j68= diff --git a/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go b/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go index ffdd8e0bcfab..139ecc7ce9a5 100644 --- a/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go +++ b/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go @@ -21,6 +21,7 @@ import ( v1 "k8s.io/api/core/v1" resourceapi "k8s.io/api/resource/v1beta1" + "k8s.io/apimachinery/pkg/api/resource" "k8s.io/autoscaler/cluster-autoscaler/simulator/framework" ) @@ -70,9 +71,46 @@ func HighestDynamicResourceUtilization(nodeInfo *framework.NodeInfo) (v1.Resourc } func calculatePoolUtil(unallocated, allocated []resourceapi.Device) float64 { - numAllocated := float64(len(allocated)) - numUnallocated := float64(len(unallocated)) - return numAllocated / (numAllocated + numUnallocated) + TotalConsumedCounters := calculateConsumedCounters(append(allocated, unallocated...)) + allocatedConsumedCounters := calculateConsumedCounters(allocated) + + // we want to find the counter that is most utilized, since it is the "bottleneck" + maxUtilization := 0.0 + for counterSet, counters := range TotalConsumedCounters { + for counterName, totalValue := range counters { + if allocatedSet, exists := allocatedConsumedCounters[counterSet]; exists { + if allocatedValue, exists := allocatedSet[counterName]; exists && !totalValue.IsZero() { + utilization := float64(allocatedValue.MilliValue()) / float64(totalValue.MilliValue()) + if utilization > maxUtilization { + maxUtilization = utilization + } + } + } + } + } + return maxUtilization +} + +// calculateConsumedCounters calculates the total counters consumed by a list of devices +func calculateConsumedCounters(devices []resourceapi.Device) map[string]map[string]resource.Quantity { + countersConsumed := map[string]map[string]resource.Quantity{} + for _, device := range devices { + for _, consumedCounter := range device.Basic.ConsumesCounters { + if _, ok := countersConsumed[consumedCounter.CounterSet]; !ok { + countersConsumed[consumedCounter.CounterSet] = 
map[string]resource.Quantity{} + } + for counter, value := range consumedCounter.Counters { + if _, ok := countersConsumed[consumedCounter.CounterSet][counter]; !ok { + countersConsumed[consumedCounter.CounterSet][counter] = resource.Quantity{} + } + v := countersConsumed[consumedCounter.CounterSet][counter] + v.Add(value.Value) + countersConsumed[consumedCounter.CounterSet][counter] = v + } + } + } + return countersConsumed + } func splitDevicesByAllocation(devices []resourceapi.Device, allocatedNames []string) (unallocated, allocated []resourceapi.Device) { From 9a450a1e38d4ccaf6314cb9229d773dace23e21e Mon Sep 17 00:00:00 2001 From: MenD32 Date: Fri, 23 May 2025 13:07:31 +0300 Subject: [PATCH 02/12] feat: added consumedCounters into pool util calculation Signed-off-by: MenD32 --- .../dynamicresources/utils/utilization.go | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go b/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go index 139ecc7ce9a5..63e6ea82a91c 100644 --- a/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go +++ b/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go @@ -74,13 +74,29 @@ func calculatePoolUtil(unallocated, allocated []resourceapi.Device) float64 { TotalConsumedCounters := calculateConsumedCounters(append(allocated, unallocated...)) allocatedConsumedCounters := calculateConsumedCounters(allocated) - // we want to find the counter that is most utilized, since it is the "bottleneck" - maxUtilization := 0.0 + // not all devices are partitionable, so fallback to the ratio of non-partionable devices + allocatedDevicesWithoutCounters := 0 + devicesWithoutCounters := 0 + + for _, device := range allocated { + if device.Basic.ConsumesCounters == nil { + devicesWithoutCounters++ + allocatedDevicesWithoutCounters++ + } + } + for _, device := range unallocated { + if device.Basic.ConsumesCounters == nil { + devicesWithoutCounters++ + } + } + + // we want to find the counter that is most utilized, since it is the "bottleneck" of the pool + maxUtilization := float64(allocatedDevicesWithoutCounters) / (float64(allocatedDevicesWithoutCounters) + float64(devicesWithoutCounters)) for counterSet, counters := range TotalConsumedCounters { for counterName, totalValue := range counters { if allocatedSet, exists := allocatedConsumedCounters[counterSet]; exists { if allocatedValue, exists := allocatedSet[counterName]; exists && !totalValue.IsZero() { - utilization := float64(allocatedValue.MilliValue()) / float64(totalValue.MilliValue()) + utilization := float64(allocatedValue.Value()) / float64(totalValue.Value()) if utilization > maxUtilization { maxUtilization = utilization } @@ -95,6 +111,9 @@ func calculatePoolUtil(unallocated, allocated []resourceapi.Device) float64 { func calculateConsumedCounters(devices []resourceapi.Device) map[string]map[string]resource.Quantity { countersConsumed := map[string]map[string]resource.Quantity{} for _, device := range devices { + if device.Basic.ConsumesCounters == nil { + continue + } for _, consumedCounter := range device.Basic.ConsumesCounters { if _, ok := countersConsumed[consumedCounter.CounterSet]; !ok { countersConsumed[consumedCounter.CounterSet] = map[string]resource.Quantity{} @@ -110,7 +129,6 @@ func calculateConsumedCounters(devices []resourceapi.Device) map[string]map[stri } } return countersConsumed - } func splitDevicesByAllocation(devices []resourceapi.Device, allocatedNames 
[]string) (unallocated, allocated []resourceapi.Device) { From ff1d30f4d4bdf2a51b95145aa233d01680659eb6 Mon Sep 17 00:00:00 2001 From: MenD32 Date: Fri, 23 May 2025 16:11:57 +0300 Subject: [PATCH 03/12] tests: added test to check partionable devices are calculated correctly Signed-off-by: MenD32 --- .../dynamicresources/utils/utilization.go | 30 ++++- .../utils/utilization_test.go | 125 ++++++++++++++++++ 2 files changed, 149 insertions(+), 6 deletions(-) diff --git a/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go b/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go index 63e6ea82a91c..bc884a6e6bd8 100644 --- a/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go +++ b/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go @@ -44,7 +44,7 @@ func CalculateDynamicResourceUtilization(nodeInfo *framework.NodeInfo) (map[stri poolDevices := getAllDevices(currentSlices) allocatedDeviceNames := allocatedDevices[driverName][poolName] unallocated, allocated := splitDevicesByAllocation(poolDevices, allocatedDeviceNames) - result[driverName][poolName] = calculatePoolUtil(unallocated, allocated) + result[driverName][poolName] = calculatePoolUtil(unallocated, allocated, currentSlices) } } return result, nil @@ -70,8 +70,18 @@ func HighestDynamicResourceUtilization(nodeInfo *framework.NodeInfo) (v1.Resourc return highestResourceName, highestUtil, nil } -func calculatePoolUtil(unallocated, allocated []resourceapi.Device) float64 { - TotalConsumedCounters := calculateConsumedCounters(append(allocated, unallocated...)) +func calculatePoolUtil(unallocated, allocated []resourceapi.Device, resourceSlices []*resourceapi.ResourceSlice) float64 { + TotalConsumedCounters := map[string]map[string]resource.Quantity{} + for _, resourceSlice := range resourceSlices { + for _, sharedCounter := range resourceSlice.Spec.SharedCounters { + if _, ok := TotalConsumedCounters[sharedCounter.Name]; !ok { + TotalConsumedCounters[sharedCounter.Name] = map[string]resource.Quantity{} + } + for counter, value := range sharedCounter.Counters { + TotalConsumedCounters[sharedCounter.Name][counter] = value.Value + } + } + } allocatedConsumedCounters := calculateConsumedCounters(allocated) // not all devices are partitionable, so fallback to the ratio of non-partionable devices @@ -79,19 +89,24 @@ func calculatePoolUtil(unallocated, allocated []resourceapi.Device) float64 { devicesWithoutCounters := 0 for _, device := range allocated { - if device.Basic.ConsumesCounters == nil { + if device.Basic == nil || device.Basic.ConsumesCounters == nil { devicesWithoutCounters++ allocatedDevicesWithoutCounters++ } } for _, device := range unallocated { - if device.Basic.ConsumesCounters == nil { + if device.Basic == nil || device.Basic.ConsumesCounters == nil { devicesWithoutCounters++ } } // we want to find the counter that is most utilized, since it is the "bottleneck" of the pool - maxUtilization := float64(allocatedDevicesWithoutCounters) / (float64(allocatedDevicesWithoutCounters) + float64(devicesWithoutCounters)) + var maxUtilization float64 + if devicesWithoutCounters == 0 { + maxUtilization = 0 + } else { + maxUtilization = float64(allocatedDevicesWithoutCounters) / float64(devicesWithoutCounters) + } for counterSet, counters := range TotalConsumedCounters { for counterName, totalValue := range counters { if allocatedSet, exists := allocatedConsumedCounters[counterSet]; exists { @@ -111,6 +126,9 @@ func calculatePoolUtil(unallocated, allocated []resourceapi.Device) float64 { 
func calculateConsumedCounters(devices []resourceapi.Device) map[string]map[string]resource.Quantity { countersConsumed := map[string]map[string]resource.Quantity{} for _, device := range devices { + if device.Basic == nil { + continue + } if device.Basic.ConsumesCounters == nil { continue } diff --git a/cluster-autoscaler/simulator/dynamicresources/utils/utilization_test.go b/cluster-autoscaler/simulator/dynamicresources/utils/utilization_test.go index c8d7bbbb57db..35a19a752e9a 100644 --- a/cluster-autoscaler/simulator/dynamicresources/utils/utilization_test.go +++ b/cluster-autoscaler/simulator/dynamicresources/utils/utilization_test.go @@ -25,6 +25,7 @@ import ( apiv1 "k8s.io/api/core/v1" resourceapi "k8s.io/api/resource/v1beta1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/autoscaler/cluster-autoscaler/simulator/framework" @@ -141,7 +142,26 @@ func TestDynamicResourceUtilization(t *testing.T) { wantHighestUtilization: 0.2, wantHighestUtilizationName: apiv1.ResourceName(fmt.Sprintf("%s/%s", fooDriver, "pool1")), }, + { + testName: "", + nodeInfo: framework.NewNodeInfo(node, + mergeLists( + testResourceSlicesWithPartionableDevices(fooDriver, "pool1", "node", 4), + ), + testPodsWithCustomClaims(fooDriver, "pool1", "node", []string{"gpu-0-partition-0", "gpu-0-partition-1"})..., + ), + wantUtilization: map[string]map[string]float64{ + fooDriver: { + "pool1": 0.5, + }, + }, + wantHighestUtilization: 0.5, + wantHighestUtilizationName: apiv1.ResourceName(fmt.Sprintf("%s/%s", fooDriver, "pool1")), + }, } { + if tc.testName != "" { + continue + } t.Run(tc.testName, func(t *testing.T) { utilization, err := CalculateDynamicResourceUtilization(tc.nodeInfo) if diff := cmp.Diff(tc.wantErr, err, cmpopts.EquateErrors()); diff != "" { @@ -190,6 +210,78 @@ func testResourceSlices(driverName, poolName, nodeName string, poolGen, deviceCo return result } +func testResourceSlicesWithPartionableDevices(driverName, poolName, nodeName string, partitionCount int) []*resourceapi.ResourceSlice { + sliceName := fmt.Sprintf("%s-%s-slice", driverName, poolName) + var devices []resourceapi.Device + for i := 0; i < partitionCount; i++ { + devices = append( + devices, + resourceapi.Device{ + Name: fmt.Sprintf("gpu-0-partition-%d", i), + Basic: &resourceapi.BasicDevice{ + Capacity: map[resourceapi.QualifiedName]resourceapi.DeviceCapacity{ + "memory": { + Value: resource.MustParse("10Gi"), + }, + }, + ConsumesCounters: []resourceapi.DeviceCounterConsumption{ + { + CounterSet: "gpu-0-counter-set", + Counters: map[string]resourceapi.Counter{ + "memory": { + Value: resource.MustParse("10Gi"), + }, + }, + }, + }, + }, + }, + ) + } + devices = append(devices, + resourceapi.Device{ + Name: "gpu-0", + Basic: &resourceapi.BasicDevice{ + Capacity: map[resourceapi.QualifiedName]resourceapi.DeviceCapacity{ + "memory": { + Value: resource.MustParse(fmt.Sprintf("%dGi", 10*partitionCount)), + }, + }, + ConsumesCounters: []resourceapi.DeviceCounterConsumption{ + { + CounterSet: "gpu-0-counter-set", + Counters: map[string]resourceapi.Counter{ + "memory": { + Value: resource.MustParse(fmt.Sprintf("%dGi", 10*partitionCount)), + }, + }, + }, + }, + }, + }, + ) + resourceSlice := &resourceapi.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{Name: sliceName, UID: types.UID(sliceName)}, + Spec: resourceapi.ResourceSliceSpec{ + Driver: driverName, + NodeName: nodeName, + Pool: resourceapi.ResourcePool{Name: poolName, Generation: 0, ResourceSliceCount: 1}, + 
Devices: devices, + SharedCounters: []resourceapi.CounterSet{ + { + Name: "gpu-0-counter-set", + Counters: map[string]resourceapi.Counter{ + "memory": { + Value: resource.MustParse(fmt.Sprintf("%dGi", 10*partitionCount)), + }, + }, + }, + }, + }, + } + return []*resourceapi.ResourceSlice{resourceSlice} +} + func testPodsWithClaims(driverName, poolName, nodeName string, deviceCount, devicesPerPod int64) []*framework.PodInfo { podCount := deviceCount / devicesPerPod @@ -220,6 +312,39 @@ func testPodsWithClaims(driverName, poolName, nodeName string, deviceCount, devi return result } +func testPodsWithCustomClaims(driverName, poolName, nodeName string, devices []string) []*framework.PodInfo { + deviceIndex := 0 + var result []*framework.PodInfo + pod := test.BuildTestPod(fmt.Sprintf("%s-%s-pod", driverName, poolName), 1, 1) + var claims []*resourceapi.ResourceClaim + var results []resourceapi.DeviceRequestAllocationResult + for deviceIndex, device := range devices { + results = append( + results, + resourceapi.DeviceRequestAllocationResult{ + Request: fmt.Sprintf("request-%d", deviceIndex), + Driver: driverName, + Pool: poolName, + Device: device, + }, + ) + } + claimName := fmt.Sprintf("%s-claim", pod.Name) + claims = append(claims, &resourceapi.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: claimName, UID: types.UID(claimName)}, + Status: resourceapi.ResourceClaimStatus{ + Allocation: &resourceapi.AllocationResult{ + Devices: resourceapi.DeviceAllocationResult{ + Results: results, + }, + }, + }, + }) + deviceIndex++ + result = append(result, framework.NewPodInfo(pod, claims)) + return result +} + func mergeLists[T any](sliceLists ...[]T) []T { var result []T for _, sliceList := range sliceLists { From d0f230eed8541d259aee8c48bf8ca4554dd4eddf Mon Sep 17 00:00:00 2001 From: MenD32 Date: Fri, 23 May 2025 16:12:29 +0300 Subject: [PATCH 04/12] tests: added test to check partionable devices are calculated correctly Signed-off-by: MenD32 --- .../simulator/dynamicresources/utils/utilization_test.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/cluster-autoscaler/simulator/dynamicresources/utils/utilization_test.go b/cluster-autoscaler/simulator/dynamicresources/utils/utilization_test.go index 35a19a752e9a..630338018c14 100644 --- a/cluster-autoscaler/simulator/dynamicresources/utils/utilization_test.go +++ b/cluster-autoscaler/simulator/dynamicresources/utils/utilization_test.go @@ -159,9 +159,6 @@ func TestDynamicResourceUtilization(t *testing.T) { wantHighestUtilizationName: apiv1.ResourceName(fmt.Sprintf("%s/%s", fooDriver, "pool1")), }, } { - if tc.testName != "" { - continue - } t.Run(tc.testName, func(t *testing.T) { utilization, err := CalculateDynamicResourceUtilization(tc.nodeInfo) if diff := cmp.Diff(tc.wantErr, err, cmpopts.EquateErrors()); diff != "" { From e92825e874f3e2fdfe16f1b365dcd807fa01231c Mon Sep 17 00:00:00 2001 From: MenD32 Date: Fri, 23 May 2025 16:30:17 +0300 Subject: [PATCH 05/12] tests: added test to check partionable devices are calculated correctly Signed-off-by: MenD32 --- .../dynamicresources/utils/utilization_test.go | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/cluster-autoscaler/simulator/dynamicresources/utils/utilization_test.go b/cluster-autoscaler/simulator/dynamicresources/utils/utilization_test.go index 630338018c14..631b1d8764fb 100644 --- a/cluster-autoscaler/simulator/dynamicresources/utils/utilization_test.go +++ b/cluster-autoscaler/simulator/dynamicresources/utils/utilization_test.go @@ -146,9 +146,11 @@ func 
TestDynamicResourceUtilization(t *testing.T) { testName: "", nodeInfo: framework.NewNodeInfo(node, mergeLists( - testResourceSlicesWithPartionableDevices(fooDriver, "pool1", "node", 4), + testResourceSlicesWithPartionableDevices(fooDriver, "pool1", "node", 2, 4), ), - testPodsWithCustomClaims(fooDriver, "pool1", "node", []string{"gpu-0-partition-0", "gpu-0-partition-1"})..., + mergeLists( + testPodsWithCustomClaims(fooDriver, "pool1", "node", []string{"gpu-0-partition-0", "gpu-0-partition-1"}), + )..., ), wantUtilization: map[string]map[string]float64{ fooDriver: { @@ -159,6 +161,9 @@ func TestDynamicResourceUtilization(t *testing.T) { wantHighestUtilizationName: apiv1.ResourceName(fmt.Sprintf("%s/%s", fooDriver, "pool1")), }, } { + if tc.testName != "" { + continue + } t.Run(tc.testName, func(t *testing.T) { utilization, err := CalculateDynamicResourceUtilization(tc.nodeInfo) if diff := cmp.Diff(tc.wantErr, err, cmpopts.EquateErrors()); diff != "" { @@ -207,7 +212,7 @@ func testResourceSlices(driverName, poolName, nodeName string, poolGen, deviceCo return result } -func testResourceSlicesWithPartionableDevices(driverName, poolName, nodeName string, partitionCount int) []*resourceapi.ResourceSlice { +func testResourceSlicesWithPartionableDevices(driverName, poolName, nodeName string, poolGen, partitionCount int) []*resourceapi.ResourceSlice { sliceName := fmt.Sprintf("%s-%s-slice", driverName, poolName) var devices []resourceapi.Device for i := 0; i < partitionCount; i++ { @@ -262,7 +267,7 @@ func testResourceSlicesWithPartionableDevices(driverName, poolName, nodeName str Spec: resourceapi.ResourceSliceSpec{ Driver: driverName, NodeName: nodeName, - Pool: resourceapi.ResourcePool{Name: poolName, Generation: 0, ResourceSliceCount: 1}, + Pool: resourceapi.ResourcePool{Name: poolName, Generation: int64(poolGen), ResourceSliceCount: 1}, Devices: devices, SharedCounters: []resourceapi.CounterSet{ { From bb5c8d9dd1e02a076240a50c931044c3e180527a Mon Sep 17 00:00:00 2001 From: MenD32 Date: Tue, 10 Jun 2025 13:03:42 +0300 Subject: [PATCH 06/12] fix: bumped crypto and net Signed-off-by: MenD32 --- cluster-autoscaler/go.mod | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cluster-autoscaler/go.mod b/cluster-autoscaler/go.mod index acaa215e3afe..6ad1ee0d3a3d 100644 --- a/cluster-autoscaler/go.mod +++ b/cluster-autoscaler/go.mod @@ -32,8 +32,8 @@ require ( github.com/stretchr/testify v1.10.0 github.com/vburenin/ifacemaker v1.2.1 go.uber.org/mock v0.4.0 - golang.org/x/crypto v0.35.0 - golang.org/x/net v0.33.0 + golang.org/x/crypto v0.36.0 + golang.org/x/net v0.38.0 golang.org/x/oauth2 v0.27.0 golang.org/x/sys v0.31.0 google.golang.org/api v0.151.0 From 4453bf266035d135e7ab24fcd2c60d63a51d8fbc Mon Sep 17 00:00:00 2001 From: Omran Date: Thu, 11 Sep 2025 10:50:42 +0000 Subject: [PATCH 07/12] Add Capacity Buffer controller logic --- .../autoscaling.x-k8s.io/v1/types.go | 12 +- .../autoscaling.x-k8s.io_capacitybuffers.yaml | 6 +- .../capacitybuffer/common/common.go | 49 ++++++ .../capacitybuffer/controller/controller.go | 143 ++++++++++++++++++ .../capacitybuffer/filters/filter.go | 60 ++++++++ .../capacitybuffer/filters/status_filter.go | 62 ++++++++ .../filters/status_filter_test.go | 81 ++++++++++ .../capacitybuffer/filters/strategy_filter.go | 71 +++++++++ .../filters/strategy_filter_test.go | 94 ++++++++++++ .../capacitybuffer/testutil/testutil.go | 109 +++++++++++++ .../translators/pod_template_translator.go | 65 ++++++++ .../pod_template_translator_test.go | 89 +++++++++++ 
.../capacitybuffer/translators/translator.go | 94 ++++++++++++ .../capacitybuffer/updater/status_updater.go | 51 +++++++ .../updater/status_updater_test.go | 94 ++++++++++++ 15 files changed, 1068 insertions(+), 12 deletions(-) create mode 100644 cluster-autoscaler/capacitybuffer/common/common.go create mode 100644 cluster-autoscaler/capacitybuffer/controller/controller.go create mode 100644 cluster-autoscaler/capacitybuffer/filters/filter.go create mode 100644 cluster-autoscaler/capacitybuffer/filters/status_filter.go create mode 100644 cluster-autoscaler/capacitybuffer/filters/status_filter_test.go create mode 100644 cluster-autoscaler/capacitybuffer/filters/strategy_filter.go create mode 100644 cluster-autoscaler/capacitybuffer/filters/strategy_filter_test.go create mode 100644 cluster-autoscaler/capacitybuffer/testutil/testutil.go create mode 100644 cluster-autoscaler/capacitybuffer/translators/pod_template_translator.go create mode 100644 cluster-autoscaler/capacitybuffer/translators/pod_template_translator_test.go create mode 100644 cluster-autoscaler/capacitybuffer/translators/translator.go create mode 100644 cluster-autoscaler/capacitybuffer/updater/status_updater.go create mode 100644 cluster-autoscaler/capacitybuffer/updater/status_updater_test.go diff --git a/cluster-autoscaler/apis/capacitybuffer/autoscaling.x-k8s.io/v1/types.go b/cluster-autoscaler/apis/capacitybuffer/autoscaling.x-k8s.io/v1/types.go index 6ddd10ece0c4..949c41a94a38 100644 --- a/cluster-autoscaler/apis/capacitybuffer/autoscaling.x-k8s.io/v1/types.go +++ b/cluster-autoscaler/apis/capacitybuffer/autoscaling.x-k8s.io/v1/types.go @@ -97,9 +97,9 @@ type ResourceList map[ResourceName]resource.Quantity // CapacityBufferSpec defines the desired state of CapacityBuffer. type CapacityBufferSpec struct { // ProvisioningStrategy defines how the buffer is utilized. - // "active-capacity" is the default strategy, where the buffer actively scales up the cluster by creating placeholder pods. - // +kubebuilder:validation:Enum=active-capacity - // +kubebuilder:default="active-capacity" + // "buffer.x-k8s.io/active-capacity" is the default strategy, where the buffer actively scales up the cluster by creating placeholder pods. + // +kubebuilder:validation:Enum=buffer.x-k8s.io/active-capacity + // +kubebuilder:default="buffer.x-k8s.io/active-capacity" // +optional ProvisioningStrategy *string `json:"provisioningStrategy,omitempty" protobuf:"bytes,1,opt,name=provisioningStrategy"` @@ -123,24 +123,18 @@ type CapacityBufferSpec struct { // If neither `replicas` nor `percentage` is set, as many chunks as fit within // defined resource limits (if any) will be created. If both are set, the maximum // of the two will be used. - // This field is mutually exclusive with `percentage` when `scalableRef` is set. // +optional // +kubebuilder:validation:Minimum=0 // +kubebuilder:validation:ExclusiveMinimum=false - // +kubebuilder:validation:Xor=replicas,percentage Replicas *int32 `json:"replicas,omitempty" protobuf:"varint,4,opt,name=replicas"` // Percentage defines the desired buffer capacity as a percentage of the // `scalableRef`'s current replicas. This is only applicable if `scalableRef` is set. // The absolute number of replicas is calculated from the percentage by rounding up to a minimum of 1. // For example, if `scalableRef` has 10 replicas and `percentage` is 20, 2 buffer chunks will be created. - // This field is mutually exclusive with `replicas`. 
// +optional // +kubebuilder:validation:Minimum=0 - // +kubebuilder:validation:Maximum=100 - // +kubebuilder:validation:ExclusiveMaximum=false // +kubebuilder:validation:ExclusiveMinimum=false - // +kubebuilder:validation:Xor=replicas,percentage Percentage *int32 `json:"percentage,omitempty" protobuf:"varint,5,opt,name=percentage"` // Limits, if specified, will limit the number of chunks created for this buffer diff --git a/cluster-autoscaler/apis/config/crd/autoscaling.x-k8s.io_capacitybuffers.yaml b/cluster-autoscaler/apis/config/crd/autoscaling.x-k8s.io_capacitybuffers.yaml index 3a6b51d94d7f..c990dc95a2f4 100644 --- a/cluster-autoscaler/apis/config/crd/autoscaling.x-k8s.io_capacitybuffers.yaml +++ b/cluster-autoscaler/apis/config/crd/autoscaling.x-k8s.io_capacitybuffers.yaml @@ -100,12 +100,12 @@ spec: - name type: object provisioningStrategy: - default: active-capacity + default: buffer.x-k8s.io/active-capacity description: |- ProvisioningStrategy defines how the buffer is utilized. - "active-capacity" is the default strategy, where the buffer actively scales up the cluster by creating placeholder pods. + "buffer.x-k8s.io/active-capacity" is the default strategy, where the buffer actively scales up the cluster by creating placeholder pods. enum: - - active-capacity + - buffer.x-k8s.io/active-capacity type: string replicas: description: |- diff --git a/cluster-autoscaler/capacitybuffer/common/common.go b/cluster-autoscaler/capacitybuffer/common/common.go new file mode 100644 index 000000000000..b55a314376fa --- /dev/null +++ b/cluster-autoscaler/capacitybuffer/common/common.go @@ -0,0 +1,49 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package common + +import ( + "context" + + v1 "k8s.io/autoscaler/cluster-autoscaler/apis/capacitybuffer/autoscaling.x-k8s.io/v1" + client "k8s.io/autoscaler/cluster-autoscaler/apis/capacitybuffer/client/clientset/versioned" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" +) + +// Constants to use in Capacity Buffers objects +const ( + ActiveProvisioningStrategy = "buffer.x-k8s.io/active-capacity" + ReadyForProvisioningCondition = "ReadyForProvisioning" + ProvisioningCondition = "Provisioning" + ConditionTrue = "True" + ConditionFalse = "False" + DefaultNamespace = "default" +) + +// CreatePodTemplate creates a pod template object by calling API server +func CreatePodTemplate(client *kubernetes.Clientset, podTemplate *corev1.PodTemplate) (*corev1.PodTemplate, error) { + return client.CoreV1().PodTemplates(DefaultNamespace).Create(context.TODO(), podTemplate, metav1.CreateOptions{}) +} + +// UpdateBufferStatus updates the passed buffer object with its defined status +func UpdateBufferStatus(buffersClient client.Interface, buffer *v1.CapacityBuffer) error { + _, err := buffersClient.AutoscalingV1().CapacityBuffers(DefaultNamespace).UpdateStatus(context.TODO(), buffer, metav1.UpdateOptions{}) + return err +} diff --git a/cluster-autoscaler/capacitybuffer/controller/controller.go b/cluster-autoscaler/capacitybuffer/controller/controller.go new file mode 100644 index 000000000000..bde33597a770 --- /dev/null +++ b/cluster-autoscaler/capacitybuffer/controller/controller.go @@ -0,0 +1,143 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package controller + +import ( + "time" + + "k8s.io/klog/v2" + + "k8s.io/apimachinery/pkg/labels" + buffersclient "k8s.io/autoscaler/cluster-autoscaler/apis/capacitybuffer/client/clientset/versioned" + + v1 "k8s.io/autoscaler/cluster-autoscaler/apis/capacitybuffer/client/listers/autoscaling.x-k8s.io/v1" + "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes" + + common "k8s.io/autoscaler/cluster-autoscaler/capacitybuffer/common" + filters "k8s.io/autoscaler/cluster-autoscaler/capacitybuffer/filters" + translators "k8s.io/autoscaler/cluster-autoscaler/capacitybuffer/translators" + updater "k8s.io/autoscaler/cluster-autoscaler/capacitybuffer/updater" + + client "k8s.io/client-go/kubernetes" +) + +const loopInterval = time.Second * 5 + +// BufferController performs updates on Buffers and convert them to pods to be injected +type BufferController interface { + // Run to run the reconciliation loop frequently every x seconds + Run(stopCh <-chan struct{}) +} + +type bufferController struct { + buffersLister v1.CapacityBufferLister + strategyFilter filters.Filter + statusFilter filters.Filter + translator translators.Translator + updater updater.StatusUpdater + loopInterval time.Duration +} + +// NewBufferController creates new bufferController object +func NewBufferController( + buffersLister v1.CapacityBufferLister, + strategyFilter filters.Filter, + statusFilter filters.Filter, + translator translators.Translator, + updater updater.StatusUpdater, + loopInterval time.Duration, +) BufferController { + return &bufferController{ + buffersLister: buffersLister, + strategyFilter: strategyFilter, + statusFilter: statusFilter, + translator: translator, + updater: updater, + loopInterval: loopInterval, + } +} + +// NewDefaultBufferController creates bufferController with default configs +func NewDefaultBufferController( + listerRegistry kubernetes.ListerRegistry, + capacityBufferClinet buffersclient.Clientset, + nodeBufferListener v1.CapacityBufferLister, + kubeClient client.Clientset, +) BufferController { + return &bufferController{ + buffersLister: nodeBufferListener, + // Accepting empty string as it represents nil value for ProvisioningStrategy + strategyFilter: filters.NewStrategyFilter([]string{common.ActiveProvisioningStrategy, ""}), + statusFilter: filters.NewStatusFilter(map[string]string{ + common.ReadyForProvisioningCondition: common.ConditionTrue, + common.ProvisioningCondition: common.ConditionTrue, + }), + translator: translators.NewCombinedTranslator( + []translators.Translator{ + translators.NewPodTemplateBufferTranslator(), + }, + ), + updater: *updater.NewStatusUpdater(&capacityBufferClinet), + loopInterval: loopInterval, + } +} + +// Run to run the controller reconcile loop +func (c *bufferController) Run(stopCh <-chan struct{}) { + for { + select { + case <-stopCh: + return + case <-time.After(c.loopInterval): + c.reconcile() + } + } +} + +// Reconcile represents single iteration in the main-loop of Updater +func (c *bufferController) reconcile() { + + // List all capacity buffers objects + buffers, err := c.buffersLister.List(labels.Everything()) + if err != nil { + klog.Errorf("Capacity buffer controller failed to list buffers with error: %v", err.Error()) + return + } + klog.V(2).Infof("Capacity buffer controller listed [%v] buffers", len(buffers)) + + // Filter the desired provisioning strategy + filteredBuffers, _ := c.strategyFilter.Filter(buffers) + klog.V(2).Infof("Capacity buffer controller filtered %v buffers with buffers strategy filter", len(filteredBuffers)) + 
+ // Filter the desired status + toBeTranslatedBuffers, _ := c.statusFilter.Filter(filteredBuffers) + klog.V(2).Infof("Capacity buffer controller filtered %v buffers with buffers status filter", len(filteredBuffers)) + + // Extract pod specs and number of replicas from filtered buffers + errors := c.translator.Translate(toBeTranslatedBuffers) + logErrors(errors) + + // Update buffer status by calling API server + errors = c.updater.Update(toBeTranslatedBuffers) + logErrors(errors) +} + +func logErrors(errors []error) { + for _, error := range errors { + klog.Errorf("Capacity buffer controller error: %v", error.Error()) + } +} diff --git a/cluster-autoscaler/capacitybuffer/filters/filter.go b/cluster-autoscaler/capacitybuffer/filters/filter.go new file mode 100644 index 000000000000..392916ddd743 --- /dev/null +++ b/cluster-autoscaler/capacitybuffer/filters/filter.go @@ -0,0 +1,60 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package filter + +import ( + v1 "k8s.io/autoscaler/cluster-autoscaler/apis/capacitybuffer/autoscaling.x-k8s.io/v1" +) + +// Filter filters CapacityBuffer based on some criteria. +type Filter interface { + Filter(buffers []*v1.CapacityBuffer) ([]*v1.CapacityBuffer, []*v1.CapacityBuffer) + CleanUp() +} + +// combinedFilter is a list of Filter +type combinedFilter struct { + filters []Filter +} + +// NewCombinedFilter construct combinedFilter. +func NewCombinedFilter(filters []Filter) *combinedFilter { + return &combinedFilter{filters} +} + +// AddFilter append a filter to the list. +func (f *combinedFilter) AddFilter(filter Filter) { + f.filters = append(f.filters, filter) +} + +// Filter runs sub-filters sequentially +func (f *combinedFilter) Filter(buffers []*v1.CapacityBuffer) ([]*v1.CapacityBuffer, []*v1.CapacityBuffer) { + var totalFilteredOutBuffers []*v1.CapacityBuffer + for _, buffersFilter := range f.filters { + updatedBuffersList, filteredOutBuffers := buffersFilter.Filter(buffers) + buffers = updatedBuffersList + totalFilteredOutBuffers = append(totalFilteredOutBuffers, filteredOutBuffers...) + } + return buffers, totalFilteredOutBuffers +} + +// CleanUp cleans up the filter's internal structures. +func (f *combinedFilter) CleanUp() { + for _, filter := range f.filters { + filter.CleanUp() + } +} diff --git a/cluster-autoscaler/capacitybuffer/filters/status_filter.go b/cluster-autoscaler/capacitybuffer/filters/status_filter.go new file mode 100644 index 000000000000..a4837bf77107 --- /dev/null +++ b/cluster-autoscaler/capacitybuffer/filters/status_filter.go @@ -0,0 +1,62 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package filter + +import ( + v1 "k8s.io/autoscaler/cluster-autoscaler/apis/capacitybuffer/autoscaling.x-k8s.io/v1" +) + +// statusFilter filters out buffers with the defined conditions +type statusFilter struct { + conditions map[string]string +} + +// NewStatusFilter creates an instance of statusFilter that filters out the buffers with condition in passed conditions. +func NewStatusFilter(conditions map[string]string) *statusFilter { + return &statusFilter{ + conditions: conditions, + } +} + +// Filter filters the passed buffers based on buffer status conditions +func (f *statusFilter) Filter(buffersToFilter []*v1.CapacityBuffer) ([]*v1.CapacityBuffer, []*v1.CapacityBuffer) { + var buffers []*v1.CapacityBuffer + var filteredOutBuffers []*v1.CapacityBuffer + + for _, buffer := range buffersToFilter { + if !f.hasCondition(buffer) { + buffers = append(buffers, buffer) + } else { + filteredOutBuffers = append(filteredOutBuffers, buffer) + } + } + return buffers, filteredOutBuffers +} + +func (f *statusFilter) hasCondition(buffer *v1.CapacityBuffer) bool { + bufferConditions := buffer.Status.Conditions + for _, condition := range bufferConditions { + if val, found := f.conditions[condition.Type]; found && val == string(condition.Status) { + return true + } + } + return false +} + +// CleanUp cleans up the filter's internal structures. +func (f *statusFilter) CleanUp() { +} diff --git a/cluster-autoscaler/capacitybuffer/filters/status_filter_test.go b/cluster-autoscaler/capacitybuffer/filters/status_filter_test.go new file mode 100644 index 000000000000..ef41f0ed3a73 --- /dev/null +++ b/cluster-autoscaler/capacitybuffer/filters/status_filter_test.go @@ -0,0 +1,81 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package filter + +import ( + "testing" + + "github.com/stretchr/testify/assert" + v1 "k8s.io/autoscaler/cluster-autoscaler/apis/capacitybuffer/autoscaling.x-k8s.io/v1" + "k8s.io/autoscaler/cluster-autoscaler/capacitybuffer/common" + "k8s.io/autoscaler/cluster-autoscaler/capacitybuffer/testutil" +) + +func TestStatusFilter(t *testing.T) { + tests := []struct { + name string + conditions map[string]string + buffers []*v1.CapacityBuffer + expectedFilteredBuffers []*v1.CapacityBuffer + expectedFilteredOutBuffers []*v1.CapacityBuffer + }{ + { + name: "Empty conditions, filter none", + conditions: map[string]string{}, + buffers: []*v1.CapacityBuffer{ + testutil.GetPodTemplateRefBuffer(&v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, nil), + }, + expectedFilteredBuffers: []*v1.CapacityBuffer{ + testutil.GetPodTemplateRefBuffer(&v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, nil), + }, + expectedFilteredOutBuffers: []*v1.CapacityBuffer{}, + }, + { + name: "Some condition, filter one", + conditions: map[string]string{common.ReadyForProvisioningCondition: common.ConditionTrue}, + buffers: []*v1.CapacityBuffer{ + testutil.GetBuffer(&testutil.ProvisioningStrategy, &v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, nil, nil, nil, testutil.GetConditionReady()), + }, + expectedFilteredBuffers: []*v1.CapacityBuffer{}, + expectedFilteredOutBuffers: []*v1.CapacityBuffer{ + testutil.GetBuffer(&testutil.ProvisioningStrategy, &v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, nil, nil, nil, testutil.GetConditionReady()), + }, + }, + { + name: "Some condition, filter one in and one out", + conditions: map[string]string{common.ReadyForProvisioningCondition: common.ConditionTrue}, + buffers: []*v1.CapacityBuffer{ + testutil.GetBuffer(&testutil.ProvisioningStrategy, &v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, nil, nil, nil, testutil.GetConditionReady()), + testutil.GetBuffer(&testutil.ProvisioningStrategy, &v1.LocalObjectRef{Name: testutil.AnotherPodTemplateRefName}, nil, nil, nil, testutil.GetConditionNotReady()), + }, + expectedFilteredBuffers: []*v1.CapacityBuffer{ + testutil.GetBuffer(&testutil.ProvisioningStrategy, &v1.LocalObjectRef{Name: testutil.AnotherPodTemplateRefName}, nil, nil, nil, testutil.GetConditionNotReady()), + }, + expectedFilteredOutBuffers: []*v1.CapacityBuffer{ + testutil.GetBuffer(&testutil.ProvisioningStrategy, &v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, nil, nil, nil, testutil.GetConditionReady()), + }, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + statusFilter := NewStatusFilter(test.conditions) + filtered, filteredOut := statusFilter.Filter(test.buffers) + assert.ElementsMatch(t, test.expectedFilteredBuffers, filtered) + assert.ElementsMatch(t, test.expectedFilteredOutBuffers, filteredOut) + }) + } +} diff --git a/cluster-autoscaler/capacitybuffer/filters/strategy_filter.go b/cluster-autoscaler/capacitybuffer/filters/strategy_filter.go new file mode 100644 index 000000000000..9d189ca34a1c --- /dev/null +++ b/cluster-autoscaler/capacitybuffer/filters/strategy_filter.go @@ -0,0 +1,71 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package filter + +import ( + v1 "k8s.io/autoscaler/cluster-autoscaler/apis/capacitybuffer/autoscaling.x-k8s.io/v1" +) + +// strategyFilter filters out buffers with provisioning strategies not defined in strategiesToUse +// and defaults nil values of provisioningStrategy to empty string +type strategyFilter struct { + strategiesToUse map[string]bool +} + +// NewStrategyFilter creates an instance of strategyFilter. +func NewStrategyFilter(strategiesToUse []string) *strategyFilter { + strategiesToUseMap := map[string]bool{} + for _, strategy := range strategiesToUse { + strategiesToUseMap[strategy] = true + } + return &strategyFilter{ + strategiesToUse: strategiesToUseMap, + } +} + +// Filter filters out buffers with provisioning strategies not defined in strategiesToUseMap +func (f *strategyFilter) Filter(buffers []*v1.CapacityBuffer) ([]*v1.CapacityBuffer, []*v1.CapacityBuffer) { + + var filteredBuffers []*v1.CapacityBuffer + var filteredOutBuffers []*v1.CapacityBuffer + + for _, buffer := range buffers { + if f.isAllowedProvisioningStrategy(buffer) { + filteredBuffers = append(filteredBuffers, buffer) + } else { + filteredOutBuffers = append(filteredOutBuffers, buffer) + } + } + return filteredBuffers, filteredOutBuffers +} + +func (f *strategyFilter) isAllowedProvisioningStrategy(buffer *v1.CapacityBuffer) bool { + provisioningStrategy := "" + if buffer.Spec.ProvisioningStrategy != nil { + provisioningStrategy = *buffer.Spec.ProvisioningStrategy + } + + if useStrategy, found := f.strategiesToUse[provisioningStrategy]; found && useStrategy { + return true + } + + return false +} + +// CleanUp cleans up the filter's internal structures. +func (f *strategyFilter) CleanUp() { +} diff --git a/cluster-autoscaler/capacitybuffer/filters/strategy_filter_test.go b/cluster-autoscaler/capacitybuffer/filters/strategy_filter_test.go new file mode 100644 index 000000000000..cc73289e67a4 --- /dev/null +++ b/cluster-autoscaler/capacitybuffer/filters/strategy_filter_test.go @@ -0,0 +1,94 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package filter + +import ( + "testing" + + "github.com/stretchr/testify/assert" + v1 "k8s.io/autoscaler/cluster-autoscaler/apis/capacitybuffer/autoscaling.x-k8s.io/v1" + "k8s.io/autoscaler/cluster-autoscaler/capacitybuffer/testutil" +) + +func TestStrategyFilter(t *testing.T) { + someRandomStrategy := "someStrategy" + tests := []struct { + name string + buffers []*v1.CapacityBuffer + strategiesToConsider []string + expectedFilteredBuffers []*v1.CapacityBuffer + expectedFilteredOutBuffers []*v1.CapacityBuffer + }{ + { + name: "Single buffer with accepted strategy", + buffers: []*v1.CapacityBuffer{ + testutil.GetBuffer(&testutil.ProvisioningStrategy, &v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, nil, nil, nil, nil), + }, + strategiesToConsider: []string{testutil.ProvisioningStrategy}, + expectedFilteredBuffers: []*v1.CapacityBuffer{ + testutil.GetBuffer(&testutil.ProvisioningStrategy, &v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, nil, nil, nil, nil), + }, + expectedFilteredOutBuffers: []*v1.CapacityBuffer{}, + }, + { + name: "Nil strategy defaulting to empty", + buffers: []*v1.CapacityBuffer{ + testutil.GetBuffer(nil, &v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, nil, nil, nil, nil), + }, + strategiesToConsider: []string{""}, + expectedFilteredBuffers: []*v1.CapacityBuffer{ + testutil.GetBuffer(nil, &v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, nil, nil, nil, nil), + }, + expectedFilteredOutBuffers: []*v1.CapacityBuffer{}, + }, + { + name: "Single buffer with rejected strategy", + buffers: []*v1.CapacityBuffer{ + testutil.GetBuffer(&someRandomStrategy, &v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, nil, nil, nil, nil), + }, + strategiesToConsider: []string{testutil.ProvisioningStrategy}, + expectedFilteredBuffers: []*v1.CapacityBuffer{}, + expectedFilteredOutBuffers: []*v1.CapacityBuffer{ + testutil.GetBuffer(&someRandomStrategy, &v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, nil, nil, nil, nil), + }, + }, + { + name: "Multiple buffers different strategies", + buffers: []*v1.CapacityBuffer{ + testutil.GetBuffer(&someRandomStrategy, &v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, nil, nil, nil, nil), + testutil.GetBuffer(&testutil.ProvisioningStrategy, &v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, nil, nil, nil, nil), + testutil.GetBuffer(nil, &v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, nil, nil, nil, nil), + }, + strategiesToConsider: []string{testutil.ProvisioningStrategy, ""}, + expectedFilteredBuffers: []*v1.CapacityBuffer{ + testutil.GetBuffer(&testutil.ProvisioningStrategy, &v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, nil, nil, nil, nil), + testutil.GetBuffer(nil, &v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, nil, nil, nil, nil), + }, + expectedFilteredOutBuffers: []*v1.CapacityBuffer{ + testutil.GetBuffer(&someRandomStrategy, &v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, nil, nil, nil, nil), + }, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + strategyFilter := NewStrategyFilter(test.strategiesToConsider) + filtered, filteredOut := strategyFilter.Filter(test.buffers) + assert.ElementsMatch(t, test.expectedFilteredBuffers, filtered) + assert.ElementsMatch(t, test.expectedFilteredOutBuffers, filteredOut) + }) + } +} diff --git a/cluster-autoscaler/capacitybuffer/testutil/testutil.go b/cluster-autoscaler/capacitybuffer/testutil/testutil.go new file mode 100644 index 000000000000..72d42ebd1966 --- 
/dev/null +++ b/cluster-autoscaler/capacitybuffer/testutil/testutil.go @@ -0,0 +1,109 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package testutil + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + v1 "k8s.io/autoscaler/cluster-autoscaler/apis/capacitybuffer/autoscaling.x-k8s.io/v1" + "k8s.io/autoscaler/cluster-autoscaler/capacitybuffer/common" +) + +// To use their pointers in creating testing capacity buffer objects +var ( + ProvisioningStrategy = common.ActiveProvisioningStrategy + SomeNumberOfReplicas = int32(3) + AnotherNumberOfReplicas = int32(5) + SomePodTemplateRefName = "some-pod-template" + AnotherPodTemplateRefName = "another-pod-template" +) + +// SanitizeBuffersStatus returns a list of the status objects of the passed buffers after sanitizing them for testing comparison +func SanitizeBuffersStatus(buffers []*v1.CapacityBuffer) []*v1.CapacityBufferStatus { + resultedStatus := []*v1.CapacityBufferStatus{} + for _, buffer := range buffers { + for i := range buffer.Status.Conditions { + buffer.Status.Conditions[i].LastTransitionTime = metav1.Time{} + buffer.Status.Conditions[i].Message = "" + } + resultedStatus = append(resultedStatus, &buffer.Status) + } + return resultedStatus +} + +// GetPodTemplateRefBuffer returns a buffer with podTemplateRef with the passed attributes and empty status, should be used for testing purposes only +func GetPodTemplateRefBuffer(podTemplateRef *v1.LocalObjectRef, replicas *int32) *v1.CapacityBuffer { + return &v1.CapacityBuffer{ + Spec: v1.CapacityBufferSpec{ + ProvisioningStrategy: &ProvisioningStrategy, + PodTemplateRef: podTemplateRef, + ScalableRef: nil, + Replicas: replicas, + Percentage: nil, + Limits: nil, + }, + Status: *GetBufferStatus(nil, nil, nil), + } +} + +// GetBuffer returns a capacity buffer with the passed attributes, should be used for testing purposes only +func GetBuffer(strategy *string, podTemplateRef *v1.LocalObjectRef, replicas *int32, podTempRef *v1.LocalObjectRef, statusReplicas *int32, conditions []metav1.Condition) *v1.CapacityBuffer { + return &v1.CapacityBuffer{ + Spec: v1.CapacityBufferSpec{ + ProvisioningStrategy: strategy, + PodTemplateRef: podTemplateRef, + ScalableRef: nil, + Replicas: replicas, + Percentage: nil, + Limits: nil, + }, + Status: *GetBufferStatus(podTempRef, statusReplicas, conditions), + } +} + +// GetBufferStatus returns a buffer status with the passed attributes, should be used for testing purposes only +func GetBufferStatus(podTempRef *v1.LocalObjectRef, replicas *int32, conditions []metav1.Condition) *v1.CapacityBufferStatus { + return &v1.CapacityBufferStatus{ + PodTemplateRef: podTempRef, + Replicas: replicas, + PodTemplateGeneration: nil, + Conditions: conditions, + } +} + +// GetConditionReady returns a list of conditions with a condition ready and empty message, should be used for testing purposes only +func GetConditionReady() []metav1.Condition { + readyCondition := metav1.Condition{ + Type: common.ReadyForProvisioningCondition, + Status: 
common.ConditionTrue,
+ Message: "",
+ Reason: "attributesSetSuccessfully",
+ LastTransitionTime: metav1.Time{},
+ }
+ return []metav1.Condition{readyCondition}
+}
+
+// GetConditionNotReady returns a list of conditions with a condition not ready and empty message, should be used for testing purposes only
+func GetConditionNotReady() []metav1.Condition {
+ notReadyCondition := metav1.Condition{
+ Type: common.ReadyForProvisioningCondition,
+ Status: common.ConditionFalse,
+ Message: "",
+ Reason: "error",
+ LastTransitionTime: metav1.Time{},
+ }
+ return []metav1.Condition{notReadyCondition}
+}
diff --git a/cluster-autoscaler/capacitybuffer/translators/pod_template_translator.go b/cluster-autoscaler/capacitybuffer/translators/pod_template_translator.go
new file mode 100644
index 000000000000..26d90d554f39
--- /dev/null
+++ b/cluster-autoscaler/capacitybuffer/translators/pod_template_translator.go
@@ -0,0 +1,65 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package translator
+
+import (
+ "fmt"
+
+ v1 "k8s.io/autoscaler/cluster-autoscaler/apis/capacitybuffer/autoscaling.x-k8s.io/v1"
+)
+
+// podTemplateBufferTranslator translates podTemplateRef buffer specs to fill their status.
+type podTemplateBufferTranslator struct {
+}
+
+// NewPodTemplateBufferTranslator creates an instance of podTemplateBufferTranslator.
+func NewPodTemplateBufferTranslator() *podTemplateBufferTranslator {
+ return &podTemplateBufferTranslator{}
+}
+
+// Translate translates buffer specs into pod capacity.
+func (t *podTemplateBufferTranslator) Translate(buffers []*v1.CapacityBuffer) []error {
+ errors := []error{}
+ for _, buffer := range buffers {
+ if isPodTemplateBasedBuffer(buffer) {
+ podTemplateRef, numberOfPods, err := t.translate(buffer)
+ if err != nil {
+ setBufferAsNotReadyForProvisioning(buffer, err.Error())
+ errors = append(errors, err)
+ } else {
+ setBufferAsReadyForProvisioning(buffer, podTemplateRef.Name, numberOfPods)
+ }
+ }
+ }
+ return errors
+}
+
+func (t *podTemplateBufferTranslator) translate(buffer *v1.CapacityBuffer) (*v1.LocalObjectRef, int32, error) {
+ // Fixed Replicas will be used if both Replicas and Percent are defined
+ if buffer.Spec.Replicas != nil {
+ return buffer.Spec.PodTemplateRef, max(1, int32(*buffer.Spec.Replicas)), nil
+ }
+ return nil, 0, fmt.Errorf("Failed to translate buffer %v, Replicas should have a value when PodTemplateRef is set", buffer.Name)
+}
+
+func isPodTemplateBasedBuffer(buffer *v1.CapacityBuffer) bool {
+ return buffer.Spec.PodTemplateRef != nil
+}
+
+// CleanUp cleans up the translator's internal structures.
+func (t *podTemplateBufferTranslator) CleanUp() { +} diff --git a/cluster-autoscaler/capacitybuffer/translators/pod_template_translator_test.go b/cluster-autoscaler/capacitybuffer/translators/pod_template_translator_test.go new file mode 100644 index 000000000000..a37999426384 --- /dev/null +++ b/cluster-autoscaler/capacitybuffer/translators/pod_template_translator_test.go @@ -0,0 +1,89 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package translator + +import ( + "testing" + + "github.com/stretchr/testify/assert" + v1 "k8s.io/autoscaler/cluster-autoscaler/apis/capacitybuffer/autoscaling.x-k8s.io/v1" + "k8s.io/autoscaler/cluster-autoscaler/capacitybuffer/testutil" +) + +func TestPodTemplateBufferTranslator(t *testing.T) { + podTemplateBufferTranslator := NewPodTemplateBufferTranslator() + tests := []struct { + name string + buffers []*v1.CapacityBuffer + expectedStatus []*v1.CapacityBufferStatus + expectedNumberOfErrors int + }{ + { + name: "Test 1 buffer with pod template ref", + buffers: []*v1.CapacityBuffer{ + testutil.GetPodTemplateRefBuffer(&v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, &testutil.SomeNumberOfReplicas), + }, + expectedStatus: []*v1.CapacityBufferStatus{ + testutil.GetBufferStatus(&v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, &testutil.SomeNumberOfReplicas, testutil.GetConditionReady()), + }, + expectedNumberOfErrors: 0, + }, + { + name: "Test 2 buffers with pod template ref", + buffers: []*v1.CapacityBuffer{ + testutil.GetPodTemplateRefBuffer(&v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, &testutil.SomeNumberOfReplicas), + testutil.GetPodTemplateRefBuffer(&v1.LocalObjectRef{Name: testutil.AnotherPodTemplateRefName}, &testutil.AnotherNumberOfReplicas), + }, + expectedStatus: []*v1.CapacityBufferStatus{ + testutil.GetBufferStatus(&v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, &testutil.SomeNumberOfReplicas, testutil.GetConditionReady()), + testutil.GetBufferStatus(&v1.LocalObjectRef{Name: testutil.AnotherPodTemplateRefName}, &testutil.AnotherNumberOfReplicas, testutil.GetConditionReady()), + }, + expectedNumberOfErrors: 0, + }, + { + name: "Test 2 buffers, one with no replicas", + buffers: []*v1.CapacityBuffer{ + testutil.GetPodTemplateRefBuffer(&v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, &testutil.SomeNumberOfReplicas), + testutil.GetPodTemplateRefBuffer(&v1.LocalObjectRef{Name: testutil.AnotherPodTemplateRefName}, nil), + }, + expectedStatus: []*v1.CapacityBufferStatus{ + testutil.GetBufferStatus(&v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, &testutil.SomeNumberOfReplicas, testutil.GetConditionReady()), + testutil.GetBufferStatus(nil, nil, testutil.GetConditionNotReady()), + }, + expectedNumberOfErrors: 1, + }, + { + name: "Test 2 buffers, one with no pod template ref", + buffers: []*v1.CapacityBuffer{ + testutil.GetPodTemplateRefBuffer(&v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, &testutil.SomeNumberOfReplicas), + 
testutil.GetPodTemplateRefBuffer(nil, &testutil.AnotherNumberOfReplicas), + }, + expectedStatus: []*v1.CapacityBufferStatus{ + testutil.GetBufferStatus(&v1.LocalObjectRef{Name: testutil.SomePodTemplateRefName}, &testutil.SomeNumberOfReplicas, testutil.GetConditionReady()), + testutil.GetBufferStatus(nil, nil, nil), + }, + expectedNumberOfErrors: 0, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + errors := podTemplateBufferTranslator.Translate(test.buffers) + assert.Equal(t, len(errors), test.expectedNumberOfErrors) + assert.ElementsMatch(t, test.expectedStatus, testutil.SanitizeBuffersStatus(test.buffers)) + }) + } +} diff --git a/cluster-autoscaler/capacitybuffer/translators/translator.go b/cluster-autoscaler/capacitybuffer/translators/translator.go new file mode 100644 index 000000000000..2edf3401f234 --- /dev/null +++ b/cluster-autoscaler/capacitybuffer/translators/translator.go @@ -0,0 +1,94 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package translator + +import ( + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + v1 "k8s.io/autoscaler/cluster-autoscaler/apis/capacitybuffer/autoscaling.x-k8s.io/v1" + "k8s.io/autoscaler/cluster-autoscaler/capacitybuffer/common" +) + +// Translator translates the passed buffers to pod template and number of replicas +type Translator interface { + Translate(buffers []*v1.CapacityBuffer) []error + CleanUp() +} + +// combinedTranslator is a list of Translator +type combinedTranslator struct { + translators []Translator +} + +// NewCombinedTranslator construct combinedTranslator. +func NewCombinedTranslator(Translators []Translator) *combinedTranslator { + return &combinedTranslator{Translators} +} + +// AddTranslator append translator to the list. +func (b *combinedTranslator) AddTranslator(translator Translator) { + b.translators = append(b.translators, translator) +} + +// Translate runs sub-translate sequentially, in case more than one translator acted on same buffer +// last translator overrides the others +func (b *combinedTranslator) Translate(buffers []*v1.CapacityBuffer) []error { + var errors []error + for _, translator := range b.translators { + bufferErrors := translator.Translate(buffers) + errors = append(errors, bufferErrors...) + } + return errors +} + +// CleanUp cleans up the translator's internal structures. 
+func (b *combinedTranslator) CleanUp() {
+ for _, translator := range b.translators {
+ translator.CleanUp()
+ }
+}
+
+func setBufferAsReadyForProvisioning(buffer *v1.CapacityBuffer, podTemplateName string, replicas int32) {
+ buffer.Status.PodTemplateRef = &v1.LocalObjectRef{
+ Name: podTemplateName,
+ }
+ buffer.Status.Replicas = &replicas
+ buffer.Status.PodTemplateGeneration = nil
+ readyCondition := metav1.Condition{
+ Type: common.ReadyForProvisioningCondition,
+ Status: common.ConditionTrue,
+ Message: "ready",
+ Reason: "attributesSetSuccessfully",
+ LastTransitionTime: metav1.Time{Time: time.Now()},
+ }
+ buffer.Status.Conditions = []metav1.Condition{readyCondition}
+}
+
+func setBufferAsNotReadyForProvisioning(buffer *v1.CapacityBuffer, errorMessage string) {
+ buffer.Status.PodTemplateRef = nil
+ buffer.Status.Replicas = nil
+ buffer.Status.PodTemplateGeneration = nil
+ notReadyCondition := metav1.Condition{
+ Type: common.ReadyForProvisioningCondition,
+ Status: common.ConditionFalse,
+ Message: errorMessage,
+ Reason: "error",
+ LastTransitionTime: metav1.Time{Time: time.Now()},
+ }
+ buffer.Status.Conditions = []metav1.Condition{notReadyCondition}
+}
diff --git a/cluster-autoscaler/capacitybuffer/updater/status_updater.go b/cluster-autoscaler/capacitybuffer/updater/status_updater.go
new file mode 100644
index 000000000000..fb363ceea65b
--- /dev/null
+++ b/cluster-autoscaler/capacitybuffer/updater/status_updater.go
@@ -0,0 +1,51 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package updater
+
+import (
+ v1 "k8s.io/autoscaler/cluster-autoscaler/apis/capacitybuffer/autoscaling.x-k8s.io/v1"
+ client "k8s.io/autoscaler/cluster-autoscaler/apis/capacitybuffer/client/clientset/versioned"
+ common "k8s.io/autoscaler/cluster-autoscaler/capacitybuffer/common"
+)
+
+// StatusUpdater updates the buffer status using the capacity buffer client.
+type StatusUpdater struct {
+ client client.Interface
+}
+
+// NewStatusUpdater creates an instance of StatusUpdater.
+func NewStatusUpdater(client client.Interface) *StatusUpdater {
+ return &StatusUpdater{
+ client: client,
+ }
+}
+
+// Update updates the buffer status with pod capacity
+func (u *StatusUpdater) Update(buffers []*v1.CapacityBuffer) []error {
+ var errors []error
+ for _, buffer := range buffers {
+ err := common.UpdateBufferStatus(u.client, buffer)
+ if err != nil {
+ errors = append(errors, err)
+ }
+ }
+ return errors
+}
+
+// CleanUp cleans up the updater's internal structures.
+func (u *StatusUpdater) CleanUp() {
+}
diff --git a/cluster-autoscaler/capacitybuffer/updater/status_updater_test.go b/cluster-autoscaler/capacitybuffer/updater/status_updater_test.go
new file mode 100644
index 000000000000..f6bacce5f504
--- /dev/null
+++ b/cluster-autoscaler/capacitybuffer/updater/status_updater_test.go
@@ -0,0 +1,94 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package updater + +import ( + "testing" + + ctesting "k8s.io/client-go/testing" + + "github.com/stretchr/testify/assert" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + v1 "k8s.io/autoscaler/cluster-autoscaler/apis/capacitybuffer/autoscaling.x-k8s.io/v1" + fakeclientset "k8s.io/autoscaler/cluster-autoscaler/apis/capacitybuffer/client/clientset/versioned/fake" +) + +func TestStatusUpdater(t *testing.T) { + exitingBuffer := &v1.CapacityBuffer{ + ObjectMeta: metav1.ObjectMeta{ + Name: "buffer1", + Namespace: "default", + }, + Spec: v1.CapacityBufferSpec{}, + } + notExistingBuffer := &v1.CapacityBuffer{ + ObjectMeta: metav1.ObjectMeta{ + Name: "buffer2", + Namespace: "default", + }, + Spec: v1.CapacityBufferSpec{}, + } + fakeClient := fakeclientset.NewSimpleClientset(exitingBuffer) + tests := []struct { + name string + buffers []*v1.CapacityBuffer + expectedNumberOfCalls int + expectedNumberOfErrors int + }{ + { + name: "Update one buffer", + buffers: []*v1.CapacityBuffer{ + exitingBuffer, + }, + expectedNumberOfCalls: 1, + expectedNumberOfErrors: 0, + }, + { + name: "Update one buffer not existing", + buffers: []*v1.CapacityBuffer{ + notExistingBuffer, + }, + expectedNumberOfCalls: 1, + expectedNumberOfErrors: 1, + }, + { + name: "Update multiple buffers", + buffers: []*v1.CapacityBuffer{ + exitingBuffer, + notExistingBuffer, + }, + expectedNumberOfCalls: 2, + expectedNumberOfErrors: 1, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + updateCallsCount := 0 + fakeClient.Fake.PrependReactor("update", "capacitybuffers", + func(action ctesting.Action) (handled bool, ret runtime.Object, err error) { + updateCallsCount++ + return false, nil, nil + }, + ) + buffersUpdater := NewStatusUpdater(fakeClient) + errors := buffersUpdater.Update(test.buffers) + assert.Equal(t, test.expectedNumberOfErrors, len(errors)) + assert.Equal(t, test.expectedNumberOfCalls, updateCallsCount) + }) + } +} From 6e4f48b38b39b0fafc217430314a47df8d13a598 Mon Sep 17 00:00:00 2001 From: MenD32 Date: Fri, 12 Sep 2025 17:00:50 +0300 Subject: [PATCH 08/12] feat: added flag to set deletion candidate taint TTL Signed-off-by: MenD32 --- .../config/autoscaling_options.go | 3 + cluster-autoscaler/config/flags/flags.go | 2 + .../core/scaledown/planner/planner.go | 8 +- .../core/scaledown/planner/planner_test.go | 136 ++++++++++- .../core/scaledown/unneeded/nodes.go | 78 ++++++- .../core/scaledown/unneeded/nodes_test.go | 97 ++++++++ cluster-autoscaler/core/static_autoscaler.go | 4 +- .../core/static_autoscaler_test.go | 212 +++++++++++++++++- .../utils/kubernetes/testlisters.go | 33 +++ cluster-autoscaler/utils/taints/taints.go | 80 +++++-- .../utils/taints/taints_test.go | 110 ++++++++- hack/verify-gofmt.sh | 2 +- 12 files changed, 723 insertions(+), 42 deletions(-) diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go index 36b47a5bfeb2..f99066ac151c 100644 --- a/cluster-autoscaler/config/autoscaling_options.go +++ b/cluster-autoscaler/config/autoscaling_options.go @@ -342,6 +342,9 @@ type 
AutoscalingOptions struct {
 ProactiveScaleupEnabled bool
 // PodInjectionLimit limits total number of pods while injecting fake pods.
 PodInjectionLimit int
+ // NodeDeletionCandidateTTL is the maximum time a node can be marked as removable without being deleted.
+ // This is used to prevent nodes from being stuck in the removable state if the CA deployment becomes inactive.
+ NodeDeletionCandidateTTL time.Duration
 }
 
 // KubeClientOptions specify options for kube client
diff --git a/cluster-autoscaler/config/flags/flags.go b/cluster-autoscaler/config/flags/flags.go
index 1ad17c404455..604748c76f68 100644
--- a/cluster-autoscaler/config/flags/flags.go
+++ b/cluster-autoscaler/config/flags/flags.go
@@ -227,6 +227,7 @@ var (
 enableDynamicResourceAllocation = flag.Bool("enable-dynamic-resource-allocation", false, "Whether logic for handling DRA (Dynamic Resource Allocation) objects is enabled.")
 clusterSnapshotParallelism = flag.Int("cluster-snapshot-parallelism", 16, "Maximum parallelism of cluster snapshot creation.")
 checkCapacityProcessorInstance = flag.String("check-capacity-processor-instance", "", "Name of the processor instance. Only ProvisioningRequests that define this name in their parameters with the key \"processorInstance\" will be processed by this CA instance. It only refers to check capacity ProvisioningRequests, but if not empty, best-effort atomic ProvisioningRequests processing is disabled in this instance. Not recommended: Until CA 1.35, ProvisioningRequests with this name as prefix in their class will be also processed.")
+ nodeDeletionCandidateTTL = flag.Duration("node-deletion-candidate-ttl", time.Duration(0), "Maximum time a node can be marked as removable before the marking becomes stale. This sets the TTL of Cluster-Autoscaler's state if the Cluster-Autoscaler deployment becomes inactive.")
 
 // Deprecated flags
 ignoreTaintsFlag = multiStringFlag("ignore-taint", "Specifies a taint to ignore in node templates when considering to scale a node group (Deprecated, use startup-taints instead)")
@@ -408,6 +409,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
 NodeInfoCacheExpireTime: *nodeInfoCacheExpireTime,
 ProactiveScaleupEnabled: *proactiveScaleupEnabled,
 PodInjectionLimit: *podInjectionLimit,
+ NodeDeletionCandidateTTL: *nodeDeletionCandidateTTL,
 }
 }
diff --git a/cluster-autoscaler/core/scaledown/planner/planner.go b/cluster-autoscaler/core/scaledown/planner/planner.go
index 32be506ca7ab..2e2263fe84e0 100644
--- a/cluster-autoscaler/core/scaledown/planner/planner.go
+++ b/cluster-autoscaler/core/scaledown/planner/planner.go
@@ -85,10 +85,16 @@ func New(context *context.AutoscalingContext, processors *processors.Autoscaling
 if minUpdateInterval == 0*time.Nanosecond {
 minUpdateInterval = 1 * time.Nanosecond
 }
+
+ unneededNodes := unneeded.NewNodes(processors.NodeGroupConfigProcessor, resourceLimitsFinder)
+ if context.AutoscalingOptions.NodeDeletionCandidateTTL != 0 {
+ unneededNodes.LoadFromExistingTaints(context.ListerRegistry, time.Now(), context.AutoscalingOptions.NodeDeletionCandidateTTL)
+ }
+
 return &Planner{
 context: context,
 unremovableNodes: unremovable.NewNodes(),
- unneededNodes: unneeded.NewNodes(processors.NodeGroupConfigProcessor, resourceLimitsFinder),
+ unneededNodes: unneededNodes,
 rs: simulator.NewRemovalSimulator(context.ListerRegistry, context.ClusterSnapshot, deleteOptions, drainabilityRules, true),
 actuationInjector: scheduling.NewHintingSimulator(),
 eligibilityChecker: eligibility.NewChecker(processors.NodeGroupConfigProcessor),
diff --git a/cluster-autoscaler/core/scaledown/planner/planner_test.go b/cluster-autoscaler/core/scaledown/planner/planner_test.go index 9fe08513f7b6..54b4a19c3fbe 100644 --- a/cluster-autoscaler/core/scaledown/planner/planner_test.go +++ b/cluster-autoscaler/core/scaledown/planner/planner_test.go @@ -36,11 +36,14 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/status" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/unremovable" . "k8s.io/autoscaler/cluster-autoscaler/core/test" + "k8s.io/autoscaler/cluster-autoscaler/estimator" processorstest "k8s.io/autoscaler/cluster-autoscaler/processors/test" "k8s.io/autoscaler/cluster-autoscaler/simulator" "k8s.io/autoscaler/cluster-autoscaler/simulator/clustersnapshot" "k8s.io/autoscaler/cluster-autoscaler/simulator/options" "k8s.io/autoscaler/cluster-autoscaler/simulator/utilization" + "k8s.io/autoscaler/cluster-autoscaler/utils/drain" + "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes" kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes" "k8s.io/autoscaler/cluster-autoscaler/utils/taints" . "k8s.io/autoscaler/cluster-autoscaler/utils/test" @@ -463,7 +466,7 @@ func TestUpdateClusterState(t *testing.T) { wantUnremovable: []string{"n1", "n2", "n3", "n4"}, }, { - name: "Simulation timeout is hitted", + name: "Simulation timeout is hit", nodes: []*apiv1.Node{ BuildTestNode("n1", 1000, 10), BuildTestNode("n2", 1000, 10), @@ -706,6 +709,137 @@ func TestUpdateClusterStatUnneededNodesLimit(t *testing.T) { } } +// TestNewPlannerWithExistingDeletionCandidateNodes tests that the newPlanner correctly handles existing deletion candidate taints on nodes. +func TestNewPlannerWithExistingDeletionCandidateNodes(t *testing.T) { + // Use a table-driven approach where each test case includes its own set of nodes and expected behavior + type testCase struct { + name string + allNodes []*apiv1.Node + expectedDeletionCandidateNodes []*apiv1.Node + nodeDeletionCandidateTTL time.Duration + } + + // Common test setup + deletionCandidateTaint := taints.DeletionCandidateTaint() + currentTime := time.Now() + + // Node that should be deleted + n1 := BuildTestNode("n1", 1000, 1000) + SetNodeReadyState(n1, true, currentTime) + nt1 := deletionCandidateTaint + ntt1 := currentTime.Add(-time.Minute * 2) + nt1.Value = fmt.Sprint(ntt1.Unix()) + n1.Spec.Taints = append(n1.Spec.Taints, nt1) + + // Node whose DeletionCandidateTaint has lapsed, shouldn't be deleted + n2 := BuildTestNode("n2", 1000, 1000) + SetNodeReadyState(n2, true, currentTime) + nt2 := deletionCandidateTaint + ntt2 := currentTime.Add(-time.Minute * 10) + nt2.Value = fmt.Sprint(ntt2.Unix()) + n2.Spec.Taints = append(n2.Spec.Taints, nt2) + + // Node that is marked for deletion, but should have that mark removed + n3 := BuildTestNode("n3", 1000, 1000) + SetNodeReadyState(n3, true, currentTime) + nt3 := deletionCandidateTaint + ntt3 := currentTime.Add(-time.Minute * 2) + nt3.Value = fmt.Sprint(ntt3.Unix()) + n3.Spec.Taints = append(n3.Spec.Taints, nt3) + + // Node with invalid DeletionCandidateTaint, taint should be deleted + n4 := BuildTestNode("n4", 1000, 1000) + SetNodeReadyState(n4, true, currentTime) + nt4 := deletionCandidateTaint + nt4.Value = "invalid-value" + n4.Spec.Taints = append(n4.Spec.Taints, nt4) + + // Node with no DeletionCandidateTaint, should not be deleted + n5 := BuildTestNode("n5", 1000, 1000) + SetNodeReadyState(n5, true, currentTime) + + // Pod that blocks eviction on node n3 + p1 := BuildTestPod("p1", 600, 100) + p1.Spec.NodeName = n3.Name + 
p1.SetAnnotations( + map[string]string{ + drain.PodSafeToEvictKey: "false", + }, + ) + + testCases := []testCase{ + { + name: "All deletion candidate nodes with standard TTL", + allNodes: []*apiv1.Node{n1, n2, n3}, + expectedDeletionCandidateNodes: []*apiv1.Node{n1}, + nodeDeletionCandidateTTL: time.Minute * 5, + }, + { + name: "Node without deletion candidate taint should not be deleted", + allNodes: []*apiv1.Node{n5}, + expectedDeletionCandidateNodes: []*apiv1.Node{}, + nodeDeletionCandidateTTL: time.Minute * 5, + }, + { + name: "Node with invalid deletion candidate taint should be deleted", + allNodes: []*apiv1.Node{n4}, + expectedDeletionCandidateNodes: []*apiv1.Node{}, + nodeDeletionCandidateTTL: time.Minute * 5, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + readyNodeLister := kubernetes.NewTestNodeLister(nil) + allNodeLister := kubernetes.NewTestNodeLister(nil) + + readyNodeLister.SetNodes(tc.allNodes) + allNodeLister.SetNodes(tc.allNodes) + + autoscalingOptions := config.AutoscalingOptions{ + NodeGroupDefaults: config.NodeGroupAutoscalingOptions{ + ScaleDownUnneededTime: time.Minute, + ScaleDownUnreadyTime: time.Minute, + ScaleDownUtilizationThreshold: 0.5, + MaxNodeProvisionTime: 10 * time.Second, + }, + EstimatorName: estimator.BinpackingEstimatorName, + EnforceNodeGroupMinSize: true, + ScaleDownEnabled: true, + MaxNodesTotal: 100, + MaxCoresTotal: 100, + MaxMemoryTotal: 100000, + NodeDeletionCandidateTTL: tc.nodeDeletionCandidateTTL, + } + + provider := testprovider.NewTestCloudProviderBuilder().Build() + for _, node := range tc.allNodes { + provider.AddNode("ng1", node) + } + + context, err := NewScaleTestAutoscalingContext( + autoscalingOptions, + &fake.Clientset{}, + kube_util.NewListerRegistry( + allNodeLister, + readyNodeLister, + nil, nil, nil, nil, nil, nil, nil, + ), + + provider, + nil, + nil, + ) + assert.NoError(t, err) + + deleteOptions := options.NodeDeleteOptions{} + p := New(&context, processorstest.NewTestProcessors(&context), deleteOptions, nil) + + p.unneededNodes.AsList() + }) + } +} + func TestNodesToDelete(t *testing.T) { testCases := []struct { name string diff --git a/cluster-autoscaler/core/scaledown/unneeded/nodes.go b/cluster-autoscaler/core/scaledown/unneeded/nodes.go index 9c29787608f9..ba1ad8e4d7cc 100644 --- a/cluster-autoscaler/core/scaledown/unneeded/nodes.go +++ b/cluster-autoscaler/core/scaledown/unneeded/nodes.go @@ -17,6 +17,7 @@ limitations under the License. package unneeded import ( + "fmt" "reflect" "time" @@ -30,6 +31,7 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/simulator" "k8s.io/autoscaler/cluster-autoscaler/utils" kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes" + "k8s.io/autoscaler/cluster-autoscaler/utils/taints" apiv1 "k8s.io/api/core/v1" klog "k8s.io/klog/v2" @@ -63,19 +65,85 @@ func NewNodes(sdtg scaleDownTimeGetter, limitsFinder *resource.LimitsFinder) *No } } +// LoadFromExistingTaints loads any existing DeletionCandidateTaint taints from the kubernetes cluster. 
Taints older than the given TTL are treated as stale and skipped.
+func (n *Nodes) LoadFromExistingTaints(listerRegistry kube_util.ListerRegistry, ts time.Time, DeletionCandidateStalenessTTL time.Duration) error {
+ allNodes, err := listerRegistry.AllNodeLister().List()
+ if err != nil {
+ return fmt.Errorf("failed to list nodes when initializing unneeded nodes: %v", err)
+ }
+
+ var nodesWithTaints []simulator.NodeToBeRemoved
+ for _, node := range allNodes {
+ if since, err := taints.GetDeletionCandidateTime(node); err == nil && since != nil {
+ if since.Add(DeletionCandidateStalenessTTL).Before(ts) {
+ klog.V(4).Infof("Skipping node %s with deletion candidate taint from %s, since it is older than TTL %s", node.Name, since.String(), DeletionCandidateStalenessTTL.String())
+ continue
+ }
+ nodeToBeRemoved := simulator.NodeToBeRemoved{
+ Node: node,
+ }
+ nodesWithTaints = append(nodesWithTaints, nodeToBeRemoved)
+ klog.V(4).Infof("Found node %s with deletion candidate taint from %s", node.Name, since.String())
+ }
+ }
+
+ if len(nodesWithTaints) > 0 {
+ klog.V(1).Infof("Initializing unneeded nodes with %d nodes that have deletion candidate taints", len(nodesWithTaints))
+ n.initialize(nodesWithTaints, ts)
+ }
+
+ return nil
+}
+
+// initialize initializes the Nodes object with the given node list.
+// It sets the initial state of unneeded nodes to reflect the taint status of nodes in the cluster.
+// This is done in order to avoid state loss between deployment restarts.
+func (n *Nodes) initialize(nodes []simulator.NodeToBeRemoved, ts time.Time) {
+ n.updateInternalState(nodes, ts, func(nn simulator.NodeToBeRemoved) *time.Time {
+ name := nn.Node.Name
+ if since, err := taints.GetDeletionCandidateTime(nn.Node); err != nil {
+ klog.Errorf("Failed to get deletion candidate taint time for node %s: %v", name, err)
+ return nil
+ } else if since != nil {
+ klog.V(4).Infof("Found node %s with deletion candidate taint from %s", name, since.String())
+ return since
+ }
+ klog.V(4).Infof("Found node %s with deletion candidate taint from now", name)
+ return nil
+ })
+}
+
 // Update stores nodes along with a time at which they were found to be
 // unneeded. Previously existing timestamps are preserved.
 func (n *Nodes) Update(nodes []simulator.NodeToBeRemoved, ts time.Time) {
+ n.updateInternalState(nodes, ts, func(nn simulator.NodeToBeRemoved) *time.Time {
+ return nil
+ })
+}
+
+func (n *Nodes) updateInternalState(nodes []simulator.NodeToBeRemoved, ts time.Time, timestampGetter func(simulator.NodeToBeRemoved) *time.Time) {
 updated := make(map[string]*node, len(nodes))
 for _, nn := range nodes {
 name := nn.Node.Name
- updated[name] = &node{
- ntbr: nn,
- }
 if val, found := n.byName[name]; found {
- updated[name].since = val.since
+ updated[name] = &node{
+ ntbr: nn,
+ since: val.since,
+ }
+ } else if existingts := timestampGetter(nn); existingts != nil {
+ updated[name] = &node{
+ ntbr: nn,
+ since: *existingts,
+ }
 } else {
- updated[name].since = ts
+ updated[name] = &node{
+ ntbr: nn,
+ since: ts,
+ }
 }
 }
 n.byName = updated
diff --git a/cluster-autoscaler/core/scaledown/unneeded/nodes_test.go b/cluster-autoscaler/core/scaledown/unneeded/nodes_test.go
index 035683dc9cdc..3cfa15651d82 100644
--- a/cluster-autoscaler/core/scaledown/unneeded/nodes_test.go
+++ b/cluster-autoscaler/core/scaledown/unneeded/nodes_test.go
@@ -32,7 +32,9 @@ import (
 .
"k8s.io/autoscaler/cluster-autoscaler/core/test" "k8s.io/autoscaler/cluster-autoscaler/processors/nodes" "k8s.io/autoscaler/cluster-autoscaler/simulator" + "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes" kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes" + "k8s.io/autoscaler/cluster-autoscaler/utils/taints" . "k8s.io/autoscaler/cluster-autoscaler/utils/test" "k8s.io/client-go/kubernetes/fake" ) @@ -215,6 +217,101 @@ func TestRemovableAt(t *testing.T) { } } +func TestNodeLoadFromExistingTaints(t *testing.T) { + + deletionCandidateTaint := taints.DeletionCandidateTaint() + currentTime := time.Now() + + n1 := BuildTestNode("n1", 1000, 1000) + SetNodeReadyState(n1, true, currentTime) + nt1 := deletionCandidateTaint + ntt1 := currentTime.Add(-time.Minute * 2) + nt1.Value = fmt.Sprint(ntt1.Unix()) + n1.Spec.Taints = append(n1.Spec.Taints, nt1) + + n2 := BuildTestNode("n2", 1000, 1000) + SetNodeReadyState(n2, true, currentTime) + + n3 := BuildTestNode("n3", 1000, 1000) + SetNodeReadyState(n3, true, currentTime) + nt3 := deletionCandidateTaint + ntt3 := currentTime.Add(-time.Minute * 20) + nt3.Value = fmt.Sprint(ntt3.Unix()) + n3.Spec.Taints = append(n3.Spec.Taints, nt3) + + n4 := BuildTestNode("n4", 1000, 1000) + SetNodeReadyState(n4, true, currentTime) + nt4 := deletionCandidateTaint + nt4.Value = "INVALID_VALUE" + n4.Spec.Taints = append(n4.Spec.Taints, nt4) + + testCases := []struct { + name string + allNodes []*apiv1.Node + expectedUnneededNodes []*apiv1.Node + nodeDeletionCandidateTTL time.Duration + }{ + { + name: "All deletion candidate nodes with standard TTL", + allNodes: []*apiv1.Node{n1, n2}, + expectedUnneededNodes: []*apiv1.Node{n1}, + nodeDeletionCandidateTTL: time.Minute * 5, + }, + { + name: "Nodes with expired deletion candidate taint", + allNodes: []*apiv1.Node{n1, n2, n3}, + expectedUnneededNodes: []*apiv1.Node{n1}, + nodeDeletionCandidateTTL: time.Minute * 5, + }, + { + name: "Nodes with invalid deletion candidate taint", + allNodes: []*apiv1.Node{n1, n2, n3, n4}, + expectedUnneededNodes: []*apiv1.Node{n1}, + nodeDeletionCandidateTTL: time.Minute * 5, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + currentTime = time.Now() + + nodes := NewNodes(nil, nil) + + allNodeLister := kubernetes.NewTestNodeLister(nil) + allNodeLister.SetNodes(tc.allNodes) + + readyNodeLister := kubernetes.NewTestNodeLister(nil) + readyNodeLister.SetNodes(tc.allNodes) + + listerRegistry := kube_util.NewListerRegistry(allNodeLister, readyNodeLister, + nil, nil, nil, nil, nil, nil, nil) + + nodes.LoadFromExistingTaints(listerRegistry, currentTime, tc.nodeDeletionCandidateTTL) + + unneededNodes := nodes.AsList() + + assert.Equal(t, len(tc.expectedUnneededNodes), len(unneededNodes), + "Expected %d unneeded nodes but got %d", len(tc.expectedUnneededNodes), len(unneededNodes)) + + expectedNodeNames := make(map[string]bool) + for _, node := range tc.expectedUnneededNodes { + expectedNodeNames[node.Name] = true + } + for _, node := range unneededNodes { + _, found := expectedNodeNames[node.Name] + assert.True(t, found, "Node %s was not expected to be unneeded", node.Name) + } + for _, expectedNode := range tc.expectedUnneededNodes { + assert.True(t, nodes.Contains(expectedNode.Name), + "Expected node %s to be in unneeded nodes but wasn't found", expectedNode.Name) + } + }) + } + +} + type fakeActuationStatus struct { recentEvictions []*apiv1.Pod deletionCount map[string]int diff --git a/cluster-autoscaler/core/static_autoscaler.go 
b/cluster-autoscaler/core/static_autoscaler.go
index e912a02f2b9b..4196f204d942 100644
--- a/cluster-autoscaler/core/static_autoscaler.go
+++ b/cluster-autoscaler/core/static_autoscaler.go
@@ -239,8 +239,8 @@ func (a *StaticAutoscaler) cleanUpIfRequired() {
 a.AutoscalingContext.ClientSet, a.Recorder, a.CordonNodeBeforeTerminate)
 if a.AutoscalingContext.AutoscalingOptions.MaxBulkSoftTaintCount == 0 {
 // Clean old taints if soft taints handling is disabled
- taints.CleanAllDeletionCandidates(allNodes,
- a.AutoscalingContext.ClientSet, a.Recorder)
+ taints.CleanStaleDeletionCandidates(allNodes,
+ a.AutoscalingContext.ClientSet, a.Recorder, a.NodeDeletionCandidateTTL)
 }
 }
 a.initialized = true
diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go
index f78acf386f2e..29ca016988a7 100644
--- a/cluster-autoscaler/core/static_autoscaler_test.go
+++ b/cluster-autoscaler/core/static_autoscaler_test.go
@@ -1390,14 +1390,14 @@ func TestStaticAutoscalerRunOnceWithFilteringOnUpcomingNodesEnabledNoScaleUp(t *
 func TestStaticAutoscalerRunOnceWithUnselectedNodeGroups(t *testing.T) {
 n1 := BuildTestNode("n1", 1000, 1000)
 n1.Spec.Taints = append(n1.Spec.Taints, apiv1.Taint{
- Key: taints.DeletionCandidateTaint,
+ Key: taints.DeletionCandidateTaintKey,
 Value: fmt.Sprint(time.Now().Unix()),
 Effect: apiv1.TaintEffectPreferNoSchedule,
 })
 SetNodeReadyState(n1, true, time.Now())
 n2 := BuildTestNode("n2", 1000, 1000)
 n2.Spec.Taints = append(n2.Spec.Taints, apiv1.Taint{
- Key: taints.DeletionCandidateTaint,
+ Key: taints.DeletionCandidateTaintKey,
 Value: fmt.Sprint(time.Now().Unix()),
 Effect: apiv1.TaintEffectPreferNoSchedule,
 })
@@ -1552,6 +1552,212 @@ func TestStaticAutoscalerRunOnceWithBypassedSchedulers(t *testing.T) {
 }
 
+// TestStaticAutoscalerRunOnceWithExistingDeletionCandidateNodes tests that the static autoscaler
+// behavior is correct when there are existing nodes with deletion candidate taints
+// at static autoscaler startup.
+func TestStaticAutoscalerRunOnceWithExistingDeletionCandidateNodes(t *testing.T) { + // Use a table-driven approach where each test case includes its own set of nodes and expected behavior + + // Common test setup + deletionCandidateTaint := taints.DeletionCandidateTaint() + currentTime := time.Now() + + // Node that should be deleted + n1 := BuildTestNode("n1", 1000, 1000) + SetNodeReadyState(n1, true, currentTime) + nt1 := deletionCandidateTaint + ntt1 := currentTime.Add(-time.Minute * 2) + nt1.Value = fmt.Sprint(ntt1.Unix()) + n1.Spec.Taints = append(n1.Spec.Taints, nt1) + + // Node whose DeletionCandidateTaint has lapsed, shouldn't be deleted + n2 := BuildTestNode("n2", 1000, 1000) + SetNodeReadyState(n2, true, currentTime) + nt2 := deletionCandidateTaint + ntt2 := currentTime.Add(-time.Minute * 10) + nt2.Value = fmt.Sprint(ntt2.Unix()) + n2.Spec.Taints = append(n2.Spec.Taints, nt2) + + // Node that is marked for deletion, but should have that mark removed + n3 := BuildTestNode("n3", 1000, 1000) + SetNodeReadyState(n3, true, currentTime) + nt3 := deletionCandidateTaint + ntt3 := currentTime.Add(-time.Minute * 2) + nt3.Value = fmt.Sprint(ntt3.Unix()) + n3.Spec.Taints = append(n3.Spec.Taints, nt3) + + // Node with invalid DeletionCandidateTaint, taint should be deleted + n4 := BuildTestNode("n4", 1000, 1000) + SetNodeReadyState(n4, true, currentTime) + nt4 := deletionCandidateTaint + nt4.Value = "invalid-value" + n4.Spec.Taints = append(n4.Spec.Taints, nt4) + + // Node with no DeletionCandidateTaint, should not be deleted + n5 := BuildTestNode("n5", 1000, 1000) + SetNodeReadyState(n5, true, currentTime) + + // Pod that blocks eviction on node n3 + p1 := BuildTestPod("p1", 600, 100) + p1.Spec.NodeName = n3.Name + p1.SetAnnotations( + map[string]string{ + drain.PodSafeToEvictKey: "false", + }, + ) + + testCases := []struct { + name string + allNodes []*apiv1.Node + expectedDeletionCandidateNodes []*apiv1.Node + deletionCandidateStalenessTTL time.Duration + }{ + { + name: "All deletion candidate nodes with standard TTL", + allNodes: []*apiv1.Node{n1, n2, n3}, + expectedDeletionCandidateNodes: []*apiv1.Node{n1}, + deletionCandidateStalenessTTL: time.Minute * 5, + }, + { + name: "Node without deletion candidate taint should not be deleted", + allNodes: []*apiv1.Node{n5}, + expectedDeletionCandidateNodes: []*apiv1.Node{}, + deletionCandidateStalenessTTL: time.Minute * 5, + }, + { + name: "Node with invalid deletion candidate taint should be deleted", + allNodes: []*apiv1.Node{n4}, + expectedDeletionCandidateNodes: []*apiv1.Node{}, + deletionCandidateStalenessTTL: time.Minute * 5, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + // Setup mocks for this test case + allPodListerMock := &podListerMock{} + podDisruptionBudgetListerMock := &podDisruptionBudgetListerMock{} + daemonSetListerMock := &daemonSetListerMock{} + onScaleUpMock := &onScaleUpMock{} + onScaleDownMock := &onScaleDownMock{} + deleteFinished := make(chan bool, len(tc.expectedDeletionCandidateNodes)) + + tn := BuildTestNode("tn", 1000, 1000) + tni := framework.NewTestNodeInfo(tn) + + provider := testprovider.NewTestCloudProviderBuilder().WithOnScaleUp(func(id string, delta int) error { + return onScaleUpMock.ScaleUp(id, delta) + }).WithOnScaleDown(func(id string, name string) error { + ret := onScaleDownMock.ScaleDown(id, name) + deleteFinished <- true + return ret + }).WithMachineTemplates(map[string]*framework.NodeInfo{"ng1": tni, "ng2": tni, "ng3": tni}).Build() + + 
provider.AddNodeGroup("ng1", 1, 10, len(tc.allNodes)) + for _, node := range tc.allNodes { + provider.AddNode("ng1", node) + } + + ng1 := reflect.ValueOf(provider.GetNodeGroup("ng1")).Interface().(*testprovider.TestNodeGroup) + assert.NotNil(t, ng1) + assert.NotNil(t, provider) + + options := config.AutoscalingOptions{ + NodeGroupDefaults: config.NodeGroupAutoscalingOptions{ + ScaleDownUnneededTime: time.Minute, + ScaleDownUnreadyTime: time.Minute, + ScaleDownUtilizationThreshold: 0.5, + MaxNodeProvisionTime: 10 * time.Second, + }, + EstimatorName: estimator.BinpackingEstimatorName, + EnforceNodeGroupMinSize: true, + ScaleDownEnabled: true, + MaxNodesTotal: 100, + MaxCoresTotal: 100, + MaxMemoryTotal: 100000, + NodeDeletionCandidateTTL: tc.deletionCandidateStalenessTTL, + } + + processorCallbacks := newStaticAutoscalerProcessorCallbacks() + + clientset := buildFakeClient(t, tc.allNodes...) + + readyNodeLister := kubernetes.NewDynamicTestNodeLister(clientset) + allNodeLister := kubernetes.NewDynamicTestNodeLister(clientset) + + context, err := NewScaleTestAutoscalingContext( + options, + clientset, + nil, + provider, + processorCallbacks, + nil, + ) + assert.NoError(t, err) + + setUpScaleDownActuator(&context, options) + + listerRegistry := kube_util.NewListerRegistry(allNodeLister, readyNodeLister, allPodListerMock, podDisruptionBudgetListerMock, daemonSetListerMock, + nil, nil, nil, nil) + context.ListerRegistry = listerRegistry + + clusterStateConfig := clusterstate.ClusterStateRegistryConfig{ + OkTotalUnreadyCount: 1, + } + processors := processorstest.NewTestProcessors(&context) + clusterState := clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(options.NodeGroupDefaults), processors.AsyncNodeGroupStateChecker) + sdPlanner, sdActuator := newScaleDownPlannerAndActuator(&context, processors, clusterState, nil) + suOrchestrator := orchestrator.New() + suOrchestrator.Initialize(&context, processors, clusterState, newEstimatorBuilder(), taints.TaintConfig{}) + + autoscaler := &StaticAutoscaler{ + AutoscalingContext: &context, + clusterStateRegistry: clusterState, + lastScaleUpTime: currentTime, + lastScaleDownFailTime: currentTime, + scaleDownPlanner: sdPlanner, + scaleDownActuator: sdActuator, + scaleUpOrchestrator: suOrchestrator, + processors: processors, + loopStartNotifier: loopstart.NewObserversList(nil), + processorCallbacks: processorCallbacks, + initialized: false, + } + + allPodListerMock.On("List").Return([]*apiv1.Pod{p1}, nil).Twice() + daemonSetListerMock.On("List", labels.Everything()).Return([]*appsv1.DaemonSet{}, nil).Once() + podDisruptionBudgetListerMock.On("List").Return([]*policyv1.PodDisruptionBudget{}, nil).Once() + + for _, node := range tc.expectedDeletionCandidateNodes { + onScaleDownMock.On("ScaleDown", "ng1", node.Name).Return(nil).Once() + } + + err = autoscaler.RunOnce(currentTime) + assert.NoError(t, err) + for range tc.expectedDeletionCandidateNodes { + waitForDeleteToFinish(t, deleteFinished) + } + + for _, node := range tc.expectedDeletionCandidateNodes { + onScaleDownMock.AssertCalled(t, "ScaleDown", "ng1", node.Name) + } + + for _, node := range tc.allNodes { + shouldBeDeleted := false + for _, expectedDeletedNode := range tc.expectedDeletionCandidateNodes { + if node.Name == expectedDeletedNode.Name { + shouldBeDeleted = true + break + } + } + if !shouldBeDeleted { + onScaleDownMock.AssertNotCalled(t, "ScaleDown", "ng1", node.Name) + } + } + }) + } +} + 
func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { testCases := []struct { forceDeleteEnabled bool @@ -3091,7 +3297,7 @@ func createNodeGroupWithSoftTaintedNodes(provider *testprovider.TestCloudProvide node := BuildTestNode(fmt.Sprintf("%s-node-%d", name, i), 2000, 1000) node.CreationTimestamp = metav1.NewTime(nodesCreationTime) node.Spec.Taints = []apiv1.Taint{{ - Key: taints.DeletionCandidateTaint, + Key: taints.DeletionCandidateTaintKey, Value: "1", Effect: apiv1.TaintEffectNoSchedule, }} diff --git a/cluster-autoscaler/utils/kubernetes/testlisters.go b/cluster-autoscaler/utils/kubernetes/testlisters.go index 571298484a6c..3c1539422254 100644 --- a/cluster-autoscaler/utils/kubernetes/testlisters.go +++ b/cluster-autoscaler/utils/kubernetes/testlisters.go @@ -17,12 +17,15 @@ limitations under the License. package kubernetes import ( + "context" "fmt" appsv1 "k8s.io/api/apps/v1" batchv1 "k8s.io/api/batch/v1" apiv1 "k8s.io/api/core/v1" policyv1 "k8s.io/api/policy/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" v1appslister "k8s.io/client-go/listers/apps/v1" v1batchlister "k8s.io/client-go/listers/batch/v1" v1lister "k8s.io/client-go/listers/core/v1" @@ -89,6 +92,36 @@ func NewTestNodeLister(nodes []*apiv1.Node) *TestNodeLister { return &TestNodeLister{nodes: nodes} } +// DynamicTestNodeLister is used in tests involving listers where nodes might change over the test run. +type DynamicTestNodeLister struct { + clientset *fake.Clientset +} + +// List returns all nodes in test lister. +func (l *DynamicTestNodeLister) List() ([]*apiv1.Node, error) { + nodes, err := l.clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) + if err != nil { + return nil, err + } + var nodePointers []*apiv1.Node + for i := range nodes.Items { + nodePointers = append(nodePointers, &nodes.Items[i]) + } + return nodePointers, err +} + +// Get returns node from test lister. +func (l *DynamicTestNodeLister) Get(name string) (*apiv1.Node, error) { + return l.clientset.CoreV1().Nodes().Get(context.TODO(), name, metav1.GetOptions{}) +} + +// NewDynamicTestNodeLister is used in tests involving listers where nodes might change over the test run. +func NewDynamicTestNodeLister(clientset *fake.Clientset) *DynamicTestNodeLister { + return &DynamicTestNodeLister{ + clientset: clientset, + } +} + // NewTestDaemonSetLister returns a lister that returns provided DaemonSets func NewTestDaemonSetLister(dss []*appsv1.DaemonSet) (v1appslister.DaemonSetLister, error) { store := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}) diff --git a/cluster-autoscaler/utils/taints/taints.go b/cluster-autoscaler/utils/taints/taints.go index 3a6262ae4517..acae356089d0 100644 --- a/cluster-autoscaler/utils/taints/taints.go +++ b/cluster-autoscaler/utils/taints/taints.go @@ -39,8 +39,8 @@ import ( const ( // ToBeDeletedTaint is a taint used to make the node unschedulable. ToBeDeletedTaint = "ToBeDeletedByClusterAutoscaler" - // DeletionCandidateTaint is a taint used to mark unneeded node as preferably unschedulable. - DeletionCandidateTaint = "DeletionCandidateOfClusterAutoscaler" + // DeletionCandidateTaintKey is a taint used to mark unneeded node as preferably unschedulable. + DeletionCandidateTaintKey = "DeletionCandidateOfClusterAutoscaler" // IgnoreTaintPrefix any taint starting with it will be filtered out from autoscaler template node. 
IgnoreTaintPrefix = "ignore-taint.cluster-autoscaler.kubernetes.io/" @@ -114,8 +114,8 @@ func NewTaintConfig(opts config.AutoscalingOptions) TaintConfig { } explicitlyReportedTaints := TaintKeySet{ - ToBeDeletedTaint: true, - DeletionCandidateTaint: true, + ToBeDeletedTaint: true, + DeletionCandidateTaintKey: true, } for k, v := range NodeConditionTaints { @@ -170,13 +170,18 @@ func MarkToBeDeleted(node *apiv1.Node, client kube_client.Interface, cordonNode return AddTaints(node, client, []apiv1.Taint{taint}, cordonNode) } -// MarkDeletionCandidate sets a soft taint that makes the node preferably unschedulable. -func MarkDeletionCandidate(node *apiv1.Node, client kube_client.Interface) (*apiv1.Node, error) { - taint := apiv1.Taint{ - Key: DeletionCandidateTaint, +// DeletionCandidateTaint returns a taint that marks the node as a DeletionCandidate for Cluster Autoscaler. +func DeletionCandidateTaint() apiv1.Taint { + return apiv1.Taint{ + Key: DeletionCandidateTaintKey, Value: fmt.Sprint(time.Now().Unix()), Effect: apiv1.TaintEffectPreferNoSchedule, } +} + +// MarkDeletionCandidate sets a soft taint that makes the node preferably unschedulable. +func MarkDeletionCandidate(node *apiv1.Node, client kube_client.Interface) (*apiv1.Node, error) { + taint := DeletionCandidateTaint() return AddTaints(node, client, []apiv1.Taint{taint}, false) } @@ -247,7 +252,7 @@ func HasToBeDeletedTaint(node *apiv1.Node) bool { // HasDeletionCandidateTaint returns true if DeletionCandidate taint is applied on the node. func HasDeletionCandidateTaint(node *apiv1.Node) bool { - return HasTaint(node, DeletionCandidateTaint) + return HasTaint(node, DeletionCandidateTaintKey) } // HasTaint returns true if the specified taint is applied on the node. @@ -267,7 +272,7 @@ func GetToBeDeletedTime(node *apiv1.Node) (*time.Time, error) { // GetDeletionCandidateTime returns the date when the node was marked by CA as for delete. func GetDeletionCandidateTime(node *apiv1.Node) (*time.Time, error) { - return GetTaintTime(node, DeletionCandidateTaint) + return GetTaintTime(node, DeletionCandidateTaintKey) } // GetTaintTime returns the date when the node was marked by CA with the specified taint. @@ -292,7 +297,7 @@ func CleanToBeDeleted(node *apiv1.Node, client kube_client.Interface, cordonNode // CleanDeletionCandidate cleans CA's soft NoSchedule taint from a node. func CleanDeletionCandidate(node *apiv1.Node, client kube_client.Interface) (*apiv1.Node, error) { - return CleanTaints(node, client, []string{DeletionCandidateTaint}, false) + return CleanTaints(node, client, []string{DeletionCandidateTaintKey}, false) } // CleanTaints cleans the specified taints from a node and returns an updated copy of the node. @@ -355,33 +360,60 @@ func CleanTaints(node *apiv1.Node, client kube_client.Interface, taintKeys []str } } +// getDeletionCandidateTTLCondition returns a function that checks if a node's deletion candidate time has reached the specified TTL. 
+func getDeletionCandidateTTLCondition(deletionCandidateTTL time.Duration) func(*apiv1.Node) bool {
+ return func(node *apiv1.Node) bool {
+ if deletionCandidateTTL == 0 {
+ return true
+ }
+ markedForDeletionTime, err := GetDeletionCandidateTime(node)
+ if err != nil {
+ klog.Warningf("Error while getting DeletionCandidate time for node %v: %v", node.Name, err)
+ return true
+ }
+ if markedForDeletionTime == nil {
+ return true
+ }
+ if time.Since(*markedForDeletionTime) < deletionCandidateTTL {
+ klog.V(4).Infof("Node %v has a %v taint that is not yet stale: it was set at %v (%v ago)", node.Name, DeletionCandidateTaintKey, markedForDeletionTime, time.Since(*markedForDeletionTime))
+ return false
+ }
+ return true
+ }
+}
+
 // CleanAllToBeDeleted cleans ToBeDeleted taints from given nodes.
 func CleanAllToBeDeleted(nodes []*apiv1.Node, client kube_client.Interface, recorder kube_record.EventRecorder, cordonNode bool) {
- CleanAllTaints(nodes, client, recorder, []string{ToBeDeletedTaint}, cordonNode)
+ CleanAllTaints(nodes, client, recorder, ToBeDeletedTaint, cordonNode)
 }
 
-// CleanAllDeletionCandidates cleans DeletionCandidate taints from given nodes.
-func CleanAllDeletionCandidates(nodes []*apiv1.Node, client kube_client.Interface, recorder kube_record.EventRecorder) {
- CleanAllTaints(nodes, client, recorder, []string{DeletionCandidateTaint}, false)
+// CleanStaleDeletionCandidates cleans stale DeletionCandidate taints from given nodes.
+func CleanStaleDeletionCandidates(nodes []*apiv1.Node, client kube_client.Interface, recorder kube_record.EventRecorder, deletionCandidateTTL time.Duration) {
+ CleanAllTaints(nodes, client, recorder, DeletionCandidateTaintKey, false, getDeletionCandidateTTLCondition(deletionCandidateTTL))
 }
 
 // CleanAllTaints cleans all specified taints from given nodes.
-func CleanAllTaints(nodes []*apiv1.Node, client kube_client.Interface, recorder kube_record.EventRecorder, taintKeys []string, cordonNode bool) { +func CleanAllTaints(nodes []*apiv1.Node, client kube_client.Interface, recorder kube_record.EventRecorder, taintKey string, cordonNode bool, conditions ...func(*apiv1.Node) bool) { for _, node := range nodes { - taintsPresent := false - for _, taintKey := range taintKeys { - taintsPresent = taintsPresent || HasTaint(node, taintKey) + skip := false + if !HasTaint(node, taintKey) { + continue + } + for _, condition := range conditions { + if !condition(node) { + skip = true + } } - if !taintsPresent { + if skip { continue } - updatedNode, err := CleanTaints(node, client, taintKeys, cordonNode) + updatedNode, err := CleanTaints(node, client, []string{taintKey}, cordonNode) if err != nil { recorder.Eventf(node, apiv1.EventTypeWarning, "ClusterAutoscalerCleanup", - "failed to clean %v on node %v: %v", strings.Join(taintKeys, ","), node.Name, err) + "failed to clean %v on node %v: %v", taintKey, node.Name, err) } else if node != nil && updatedNode != nil && !slices.Equal(updatedNode.Spec.Taints, node.Spec.Taints) { recorder.Eventf(node, apiv1.EventTypeNormal, "ClusterAutoscalerCleanup", - "removed %v taints from node %v", strings.Join(taintKeys, ","), node.Name) + "removed %v taint from node %v", taintKey, node.Name) } } } @@ -403,7 +435,7 @@ func SanitizeTaints(taints []apiv1.Taint, taintConfig TaintConfig) []apiv1.Taint case ToBeDeletedTaint: klog.V(4).Infof("Removing autoscaler taint when creating template from node") continue - case DeletionCandidateTaint: + case DeletionCandidateTaintKey: klog.V(4).Infof("Removing autoscaler soft taint when creating template from node") continue } diff --git a/cluster-autoscaler/utils/taints/taints_test.go b/cluster-autoscaler/utils/taints/taints_test.go index c8f803bf9a72..c8898cd72ba2 100644 --- a/cluster-autoscaler/utils/taints/taints_test.go +++ b/cluster-autoscaler/utils/taints/taints_test.go @@ -94,7 +94,7 @@ func TestSoftCheckNodes(t *testing.T) { node := BuildTestNode("node", 1000, 1000) taints := []apiv1.Taint{ { - Key: DeletionCandidateTaint, + Key: DeletionCandidateTaintKey, Value: fmt.Sprint(time.Now().Unix()), Effect: apiv1.TaintEffectPreferNoSchedule, }, @@ -254,7 +254,7 @@ func TestSoftCleanNodes(t *testing.T) { node := BuildTestNode("node", 1000, 1000) taints := []apiv1.Taint{ { - Key: DeletionCandidateTaint, + Key: DeletionCandidateTaintKey, Value: fmt.Sprint(time.Now().Unix()), Effect: apiv1.TaintEffectPreferNoSchedule, }, @@ -301,14 +301,14 @@ func TestCleanAllToBeDeleted(t *testing.T) { func TestCleanAllDeletionCandidates(t *testing.T) { n1 := BuildTestNode("n1", 1000, 10) n2 := BuildTestNode("n2", 1000, 10) - n2.Spec.Taints = []apiv1.Taint{{Key: DeletionCandidateTaint, Value: strconv.FormatInt(time.Now().Unix()-301, 10)}} + n2.Spec.Taints = []apiv1.Taint{{Key: DeletionCandidateTaintKey, Value: strconv.FormatInt(time.Now().Unix()-301, 10)}} fakeClient := buildFakeClient(t, n1, n2) fakeRecorder := kube_util.CreateEventRecorder(fakeClient, false) assert.Equal(t, 1, len(getNode(t, fakeClient, "n2").Spec.Taints)) - CleanAllDeletionCandidates([]*apiv1.Node{n1, n2}, fakeClient, fakeRecorder) + CleanStaleDeletionCandidates([]*apiv1.Node{n1, n2}, fakeClient, fakeRecorder, time.Duration(0)) assert.Equal(t, 0, len(getNode(t, fakeClient, "n1").Spec.Taints)) assert.Equal(t, 0, len(getNode(t, fakeClient, "n2").Spec.Taints)) @@ -331,7 +331,7 @@ func getNode(t *testing.T, client kube_client.Interface, name 
string) *apiv1.Nod func buildFakeClient(t *testing.T, nodes ...*apiv1.Node) *fake.Clientset { t.Helper() - fakeClient := fake.NewSimpleClientset() + fakeClient := fake.NewClientset() for _, node := range nodes { _, err := fakeClient.CoreV1().Nodes().Create(context.TODO(), node, metav1.CreateOptions{}) @@ -853,3 +853,103 @@ func TestCleanTaints(t *testing.T) { }) } } + +func TestCleanStaleDeletionCandidates(t *testing.T) { + + currentTime := time.Now() + deletionCandidateTaint := DeletionCandidateTaint() + + n1 := BuildTestNode("n1", 1000, 1000) + SetNodeReadyState(n1, true, currentTime) + nt1 := deletionCandidateTaint + ntt1 := currentTime.Add(-time.Minute * 2) + nt1.Value = fmt.Sprint(ntt1.Unix()) + n1.Spec.Taints = append(n1.Spec.Taints, nt1) + + // Node whose DeletionCandidateTaint has lapsed, shouldn't be deleted + n2 := BuildTestNode("n2", 1000, 1000) + SetNodeReadyState(n2, true, currentTime) + nt2 := deletionCandidateTaint + ntt2 := currentTime.Add(-time.Minute * 10) + nt2.Value = fmt.Sprint(ntt2.Unix()) + n2.Spec.Taints = append(n2.Spec.Taints, nt2) + + // Node that is marked for deletion, but should have that mark removed + n3 := BuildTestNode("n3", 1000, 1000) + SetNodeReadyState(n3, true, currentTime) + nt3 := deletionCandidateTaint + ntt3 := currentTime.Add(-time.Minute * 2) + nt3.Value = fmt.Sprint(ntt3.Unix()) + n3.Spec.Taints = append(n3.Spec.Taints, nt3) + + // Node with invalid DeletionCandidateTaint, taint should be deleted + n4 := BuildTestNode("n4", 1000, 1000) + SetNodeReadyState(n4, true, currentTime) + nt4 := deletionCandidateTaint + nt4.Value = "invalid-value" + n4.Spec.Taints = append(n4.Spec.Taints, nt4) + + // Node with no DeletionCandidateTaint, should not be deleted + n5 := BuildTestNode("n5", 1000, 1000) + SetNodeReadyState(n5, true, currentTime) + + testCases := []struct { + name string + allNodes []*apiv1.Node + unneededNodes []*apiv1.Node + nodeDeletionCandidateTTL time.Duration + }{ + { + name: "All deletion candidate nodes with standard TTL", + allNodes: []*apiv1.Node{n1, n2, n3}, + unneededNodes: []*apiv1.Node{n1, n3}, + nodeDeletionCandidateTTL: time.Minute * 5, + }, + { + name: "Node without deletion candidate taint should not be deleted", + allNodes: []*apiv1.Node{n5}, + unneededNodes: []*apiv1.Node{}, + nodeDeletionCandidateTTL: time.Minute * 5, + }, + { + name: "Node with invalid deletion candidate taint should be deleted", + allNodes: []*apiv1.Node{n4}, + unneededNodes: []*apiv1.Node{}, + nodeDeletionCandidateTTL: time.Minute * 5, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + fakeClient := buildFakeClient(t, tc.allNodes...) 
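+			// Clean DeletionCandidate taints older than the TTL (or with an
+			// unparsable timestamp); taints younger than the TTL are kept.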
+ CleanStaleDeletionCandidates( + tc.allNodes, + fakeClient, + kube_util.CreateEventRecorder(fakeClient, false), + tc.nodeDeletionCandidateTTL, + ) + + allNodes, err := fakeClient.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) + assert.NoError(t, err) + assert.NotNil(t, allNodes) + + for _, node := range allNodes.Items { + hasTaint := HasDeletionCandidateTaint(&node) + isUnneeded := false + for _, unneededNode := range tc.unneededNodes { + if unneededNode.Name == node.Name { + isUnneeded = true + break + } + } + + if isUnneeded { + assert.True(t, hasTaint, "Node %s should still have deletion candidate taint", node.Name) + } else { + assert.False(t, hasTaint, "Node %s should have had deletion candidate taint removed", node.Name) + } + } + + }) + } +} diff --git a/hack/verify-gofmt.sh b/hack/verify-gofmt.sh index 8917d92348fe..e3244b2c054a 100755 --- a/hack/verify-gofmt.sh +++ b/hack/verify-gofmt.sh @@ -47,7 +47,7 @@ find_files() { \) -name '*.go' } -DOCKER_IMAGE=`grep 'FROM golang' builder/Dockerfile | sed 's/FROM //'` +DOCKER_IMAGE=`grep --color=none 'FROM golang' builder/Dockerfile | sed 's/FROM //'` GOFMT="docker run -v $(pwd):/code -w /code $DOCKER_IMAGE gofmt -s" bad_files=$(find_files | xargs $GOFMT -l) From 85a0d9455a1129c8d82d33e7e58b2a28959f0fcf Mon Sep 17 00:00:00 2001 From: Luiz Oliveira Date: Wed, 17 Sep 2025 11:53:37 -0400 Subject: [PATCH 09/12] Add rapid release channel to GKE cluster creation command --- vertical-pod-autoscaler/RELEASE.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vertical-pod-autoscaler/RELEASE.md b/vertical-pod-autoscaler/RELEASE.md index e2d76988a4d0..1b84f6e4384c 100644 --- a/vertical-pod-autoscaler/RELEASE.md +++ b/vertical-pod-autoscaler/RELEASE.md @@ -105,7 +105,7 @@ for component in recommender updater admission-controller ; do TAG=`grep 'const 1. [ ] Create a Kubernetes cluster. If you're using GKE you can use the following command: ```shell - gcloud container clusters create e2e-test --machine-type=n1-standard-2 --image-type=COS_CONTAINERD --num-nodes=3 + gcloud container clusters create e2e-test --machine-type=n1-standard-2 --image-type=COS_CONTAINERD --num-nodes=3 --release-channel=rapid ``` 1. [ ] Create clusterrole. If you're using GKE you can use the following command: @@ -230,4 +230,4 @@ sure nothing we care about will break if we do. A member of the [autoscaler-admins](https://github.com/orgs/kubernetes/teams/autoscaler-admins) - can add you to add you as a collaborator. \ No newline at end of file + can add you to add you as a collaborator. 
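For reference, a minimal usage sketch of the TTL-gated cleanup introduced in the taints changes above. It assumes the `DeletionCandidateTaint` and `CleanStaleDeletionCandidates` signatures shown in that patch; the `main` wrapper, the `taintedNode` helper, the fake client/recorder, the node names, and the 5-minute TTL are illustrative placeholders, not part of the patch.

```go
// Sketch only: exercises the TTL-gated DeletionCandidate cleanup.
package main

import (
	"fmt"
	"time"

	apiv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes/fake"
	"k8s.io/client-go/tools/record"

	"k8s.io/autoscaler/cluster-autoscaler/utils/taints"
)

// taintedNode returns a node carrying a DeletionCandidate taint stamped at the given time.
func taintedNode(name string, markedAt time.Time) *apiv1.Node {
	t := taints.DeletionCandidateTaint()
	t.Value = fmt.Sprint(markedAt.Unix())
	return &apiv1.Node{
		ObjectMeta: metav1.ObjectMeta{Name: name},
		Spec:       apiv1.NodeSpec{Taints: []apiv1.Taint{t}},
	}
}

func main() {
	stale := taintedNode("stale-node", time.Now().Add(-10*time.Minute)) // older than the TTL, taint removed
	fresh := taintedNode("fresh-node", time.Now().Add(-1*time.Minute))  // younger than the TTL, taint kept

	client := fake.NewClientset(stale, fresh)
	recorder := record.NewFakeRecorder(10)

	taints.CleanStaleDeletionCandidates([]*apiv1.Node{stale, fresh}, client, recorder, 5*time.Minute)
}
```

The stale node loses its taint because its timestamp is older than the TTL, while the fresh node keeps it, mirroring what `TestCleanStaleDeletionCandidates` asserts.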
From b09676ca0623069db4211087cdad692dac36baa1 Mon Sep 17 00:00:00 2001 From: David Morrison Date: Wed, 17 Sep 2025 12:05:56 -0700 Subject: [PATCH 10/12] change kwok nodegroup annotation key --- .../cloudprovider/kwok/README.md | 12 ++++---- .../cloudprovider/kwok/kwok_config.go | 10 +++---- .../cloudprovider/kwok/kwok_config_test.go | 28 +++++++++---------- .../cloudprovider/kwok/kwok_types.go | 4 +-- .../samples/dynamic_nodegroups_config.yaml | 4 +-- .../samples/static_nodegroups_config.yaml | 4 +-- 6 files changed, 31 insertions(+), 31 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/kwok/README.md b/cluster-autoscaler/cloudprovider/kwok/README.md index 840192303590..fc229221e769 100644 --- a/cluster-autoscaler/cloudprovider/kwok/README.md +++ b/cluster-autoscaler/cloudprovider/kwok/README.md @@ -81,13 +81,13 @@ kubectl create configmap kwok-provider-templates --from-file=templates=template- ``` Replace `template-nodes.yaml` with the path to your template nodes file. -If you are using your template nodes in the `kwok-provider-templates` ConfigMap, make sure you have set the correct value for `nodegroups.fromNodeLabelKey`/`nodegroups.fromNodeAnnotation`. Not doing so will make CA not scale up nodes (it won't throw any error either). +If you are using your template nodes in the `kwok-provider-templates` ConfigMap, make sure you have set the correct value for `nodegroups.fromNodeLabelKey`/`nodegroups.fromNodeAnnotationKey`. Not doing so will make CA not scale up nodes (it won't throw any error either). If you want to use dynamic template nodes, Set `readNodesFrom` in `kwok-provider-config` ConfigMap to `cluster`. This tells the kwok provider to use live nodes from the cluster as template nodes. -If you are using live nodes from the cluster as template nodes in the `kwok-provider-templates` ConfigMap, make sure you have set the correct value for `nodegroups.fromNodeLabelKey`/`nodegroups.fromNodeAnnotation`. Not doing so will make CA not scale up nodes (it won't throw any error either). +If you are using live nodes from the cluster as template nodes in the `kwok-provider-templates` ConfigMap, make sure you have set the correct value for `nodegroups.fromNodeLabelKey`/`nodegroups.fromNodeAnnotationKey`. Not doing so will make CA not scale up nodes (it won't throw any error either). ### For local development 1. Point your kubeconfig to the cluster where you want to test your changes @@ -158,11 +158,11 @@ nodegroups: # nodegroup2: [node2] fromNodeLabelKey: "node.kubernetes.io/instance-type" - # fromNodeAnnotation's value is used to group nodes together into nodegroups + # fromNodeAnnotationKey's value is used to group nodes together into nodegroups # (basically same as `fromNodeLabelKey` except based on annotation) - # you can specify either of `fromNodeLabelKey` OR `fromNodeAnnotation` + # you can specify either of `fromNodeLabelKey` OR `fromNodeAnnotationKey` # (both are not allowed) - fromNodeAnnotation: "eks.amazonaws.com/nodegroup" + fromNodeAnnotationKey: "eks.amazonaws.com/nodegroup" # nodes specifies node level config nodes: # skipTaint is used to enable/disable adding kwok provider taint on the template nodes @@ -211,7 +211,7 @@ kwok provider config is a configuration to change the behavior of the kwok provi ### Gotchas 1. The kwok provider by default taints the template nodes with `kwok-provider: true` taint so that production workloads don't get scheduled on these nodes accidentally. 
You have to tolerate the taint to schedule your workload on the nodes created by the kwok provider. You can turn this off by setting `nodes.skipTaint: true` in the kwok provider config. -2. Make sure the label/annotation for `fromNodeLabelKey`/`fromNodeAnnotation` in the kwok provider config is actually present on the template nodes. If it isn't present on the template nodes, the kwok provider will not be able to create new nodes. +2. Make sure the label/annotation for `fromNodeLabelKey`/`fromNodeAnnotationKey` in the kwok provider config is actually present on the template nodes. If it isn't present on the template nodes, the kwok provider will not be able to create new nodes. 3. Note that the kwok provider makes the following changes to all the template nodes: (pseudocode) ``` diff --git a/cluster-autoscaler/cloudprovider/kwok/kwok_config.go b/cluster-autoscaler/cloudprovider/kwok/kwok_config.go index 5b5ab7037f74..2d291316eb1c 100644 --- a/cluster-autoscaler/cloudprovider/kwok/kwok_config.go +++ b/cluster-autoscaler/cloudprovider/kwok/kwok_config.go @@ -111,12 +111,12 @@ func LoadConfigFile(kubeClient kubeclient.Interface) (*KwokProviderConfig, error } if strings.TrimSpace(kwokConfig.Nodegroups.FromNodeLabelKey) == "" && - strings.TrimSpace(kwokConfig.Nodegroups.FromNodeLabelAnnotation) == "" { - return nil, fmt.Errorf("please specify either 'nodegroups.fromNodeLabelKey' or 'nodegroups.fromNodeAnnotation' in kwok provider config (currently empty or undefined)") + strings.TrimSpace(kwokConfig.Nodegroups.FromNodeAnnotationKey) == "" { + return nil, fmt.Errorf("please specify either 'nodegroups.fromNodeLabelKey' or 'nodegroups.fromNodeAnnotationKey' in kwok provider config (currently empty or undefined)") } if strings.TrimSpace(kwokConfig.Nodegroups.FromNodeLabelKey) != "" && - strings.TrimSpace(kwokConfig.Nodegroups.FromNodeLabelAnnotation) != "" { - return nil, fmt.Errorf("please specify either 'nodegroups.fromNodeLabelKey' or 'nodegroups.fromNodeAnnotation' in kwok provider config (you can't use both)") + strings.TrimSpace(kwokConfig.Nodegroups.FromNodeAnnotationKey) != "" { + return nil, fmt.Errorf("please specify either 'nodegroups.fromNodeLabelKey' or 'nodegroups.fromNodeAnnotationKey' in kwok provider config (you can't use both)") } if strings.TrimSpace(kwokConfig.Nodegroups.FromNodeLabelKey) != "" { @@ -124,7 +124,7 @@ func LoadConfigFile(kubeClient kubeclient.Interface) (*KwokProviderConfig, error kwokConfig.status.key = kwokConfig.Nodegroups.FromNodeLabelKey } else { kwokConfig.status.groupNodesBy = groupNodesByAnnotation - kwokConfig.status.key = kwokConfig.Nodegroups.FromNodeLabelAnnotation + kwokConfig.status.key = kwokConfig.Nodegroups.FromNodeAnnotationKey } if kwokConfig.Nodes == nil { diff --git a/cluster-autoscaler/cloudprovider/kwok/kwok_config_test.go b/cluster-autoscaler/cloudprovider/kwok/kwok_config_test.go index 8029ab6bb811..431d1a09d141 100644 --- a/cluster-autoscaler/cloudprovider/kwok/kwok_config_test.go +++ b/cluster-autoscaler/cloudprovider/kwok/kwok_config_test.go @@ -49,9 +49,9 @@ nodegroups: # nodegroup1: [node1,node3] # nodegroup2: [node2] fromNodeLabelKey: "kwok-nodegroup" - # you can either specify fromNodeLabelKey OR fromNodeAnnotation + # you can either specify fromNodeLabelKey OR fromNodeAnnotationKey # (both are not allowed) - # fromNodeAnnotation: "eks.amazonaws.com/nodegroup" + # fromNodeAnnotationKey: "eks.amazonaws.com/nodegroup" nodes: gpuConfig: # to tell kwok provider what label should be considered as GPU label @@ -77,9 +77,9 @@ nodegroups: # 
nodegroup1: [node1,node3] # nodegroup2: [node2] fromNodeLabelKey: "kwok-nodegroup" - # you can either specify fromNodeLabelKey OR fromNodeAnnotation + # you can either specify fromNodeLabelKey OR fromNodeAnnotationKey # (both are not allowed) - # fromNodeAnnotation: "eks.amazonaws.com/nodegroup" + # fromNodeAnnotationKey: "eks.amazonaws.com/nodegroup" nodes: skipTaint: true gpuConfig: @@ -104,9 +104,9 @@ nodegroups: # nodegroup1: [node1,node3] # nodegroup2: [node2] fromNodeLabelKey: "kwok-nodegroup" - # you can either specify fromNodeLabelKey OR fromNodeAnnotation + # you can either specify fromNodeLabelKey OR fromNodeAnnotationKey # (both are not allowed) - # fromNodeAnnotation: "eks.amazonaws.com/nodegroup" + # fromNodeAnnotationKey: "eks.amazonaws.com/nodegroup" nodes: gpuConfig: # to tell kwok provider what label should be considered as GPU label @@ -131,9 +131,9 @@ nodegroups: # nodegroup1: [node1,node3] # nodegroup2: [node2] fromNodeLabelKey: "kwok-nodegroup" - # you can either specify fromNodeLabelKey OR fromNodeAnnotation + # you can either specify fromNodeLabelKey OR fromNodeAnnotationKey # (both are not allowed) - # fromNodeAnnotation: "eks.amazonaws.com/nodegroup" + # fromNodeAnnotationKey: "eks.amazonaws.com/nodegroup" nodes: skipTaint: true gpuConfig: @@ -159,9 +159,9 @@ nodegroups: # nodegroup1: [node1,node3] # nodegroup2: [node2] fromNodeLabelKey: "node.kubernetes.io/instance-type" - # you can either specify fromNodeLabelKey OR fromNodeAnnotation + # you can either specify fromNodeLabelKey OR fromNodeAnnotationKey # (both are not allowed) - # fromNodeAnnotation: "eks.amazonaws.com/nodegroup" + # fromNodeAnnotationKey: "eks.amazonaws.com/nodegroup" nodes: gpuConfig: # to tell kwok provider what label should be considered as GPU label @@ -185,9 +185,9 @@ nodegroups: # nodegroup1: [node1,node3] # nodegroup2: [node2] fromNodeLabelKey: "node.kubernetes.io/instance-type" - # you can either specify fromNodeLabelKey OR fromNodeAnnotation + # you can either specify fromNodeLabelKey OR fromNodeAnnotationKey # (both are not allowed) - # fromNodeAnnotation: "eks.amazonaws.com/nodegroup" + # fromNodeAnnotationKey: "eks.amazonaws.com/nodegroup" nodes: gpuConfig: # to tell kwok provider what label should be considered as GPU label @@ -213,9 +213,9 @@ nodegroups: # nodegroup1: [node1,node3] # nodegroup2: [node2] fromNodeLabelKey: "node.kubernetes.io/instance-type" - # you can either specify fromNodeLabelKey OR fromNodeAnnotation + # you can either specify fromNodeLabelKey OR fromNodeAnnotationKey # (both are not allowed) - # fromNodeAnnotation: "eks.amazonaws.com/nodegroup" + # fromNodeAnnotationKey: "eks.amazonaws.com/nodegroup" nodes: gpuConfig: # to tell kwok provider what label should be considered as GPU label diff --git a/cluster-autoscaler/cloudprovider/kwok/kwok_types.go b/cluster-autoscaler/cloudprovider/kwok/kwok_types.go index 538162df57bf..6937c189d81b 100644 --- a/cluster-autoscaler/cloudprovider/kwok/kwok_types.go +++ b/cluster-autoscaler/cloudprovider/kwok/kwok_types.go @@ -63,8 +63,8 @@ type NodeGroup struct { // NodegroupsConfig defines options for creating nodegroups type NodegroupsConfig struct { - FromNodeLabelKey string `json:"fromNodeLabelKey" yaml:"fromNodeLabelKey"` - FromNodeLabelAnnotation string `json:"fromNodeLabelAnnotation" yaml:"fromNodeLabelAnnotation"` + FromNodeLabelKey string `json:"fromNodeLabelKey" yaml:"fromNodeLabelKey"` + FromNodeAnnotationKey string `json:"fromNodeAnnotationKey" yaml:"fromNodeAnnotationKey"` } // NodeConfig defines config options for 
the nodes diff --git a/cluster-autoscaler/cloudprovider/kwok/samples/dynamic_nodegroups_config.yaml b/cluster-autoscaler/cloudprovider/kwok/samples/dynamic_nodegroups_config.yaml index 4af16add09ff..832b07e31141 100644 --- a/cluster-autoscaler/cloudprovider/kwok/samples/dynamic_nodegroups_config.yaml +++ b/cluster-autoscaler/cloudprovider/kwok/samples/dynamic_nodegroups_config.yaml @@ -9,9 +9,9 @@ nodegroups: # nodegroup1: [node1,node3] # nodegroup2: [node2] fromNodeLabelKey: "node.kubernetes.io/instance-type" - # you can either specify fromNodeLabelKey OR fromNodeAnnotation + # you can either specify fromNodeLabelKey OR fromNodeAnnotationKey # (both are not allowed) - # fromNodeAnnotation: "eks.amazonaws.com/nodegroup" + # fromNodeAnnotationKey: "eks.amazonaws.com/nodegroup" nodes: # kwok provider adds a taint on the template nodes diff --git a/cluster-autoscaler/cloudprovider/kwok/samples/static_nodegroups_config.yaml b/cluster-autoscaler/cloudprovider/kwok/samples/static_nodegroups_config.yaml index eba6cde17c58..c685d9c08510 100644 --- a/cluster-autoscaler/cloudprovider/kwok/samples/static_nodegroups_config.yaml +++ b/cluster-autoscaler/cloudprovider/kwok/samples/static_nodegroups_config.yaml @@ -9,9 +9,9 @@ nodegroups: # nodegroup1: [node1,node3] # nodegroup2: [node2] fromNodeLabelKey: "node.kubernetes.io/instance-type" - # you can either specify fromNodeLabelKey OR fromNodeAnnotation + # you can either specify fromNodeLabelKey OR fromNodeAnnotationKey # (both are not allowed) - # fromNodeAnnotation: "eks.amazonaws.com/nodegroup" + # fromNodeAnnotationKey: "eks.amazonaws.com/nodegroup" nodes: # kwok provider adds a taint on the template nodes # so that even if you run the provider in a production cluster From 4fa5202e2b0d6ae9ef0988cb4535de07ae0049ec Mon Sep 17 00:00:00 2001 From: MenD32 Date: Fri, 12 Sep 2025 17:00:50 +0300 Subject: [PATCH 11/12] feat: added flag to set deletion candidate taint TTL Signed-off-by: MenD32 From 39564434a165c0b0e0b9d648c207fd7d3b3f86a5 Mon Sep 17 00:00:00 2001 From: MenD32 Date: Thu, 2 Oct 2025 12:29:32 +0300 Subject: [PATCH 12/12] fix: updated resourceapi to v1 Signed-off-by: MenD32 --- .../dynamicresources/utils/utilization.go | 13 ++--- .../utils/utilization_test.go | 48 +++++++++---------- 2 files changed, 27 insertions(+), 34 deletions(-) diff --git a/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go b/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go index bc884a6e6bd8..c717fdfd605f 100644 --- a/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go +++ b/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go @@ -20,7 +20,7 @@ import ( "fmt" v1 "k8s.io/api/core/v1" - resourceapi "k8s.io/api/resource/v1beta1" + resourceapi "k8s.io/api/resource/v1" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/autoscaler/cluster-autoscaler/simulator/framework" ) @@ -89,13 +89,13 @@ func calculatePoolUtil(unallocated, allocated []resourceapi.Device, resourceSlic devicesWithoutCounters := 0 for _, device := range allocated { - if device.Basic == nil || device.Basic.ConsumesCounters == nil { + if device.ConsumesCounters == nil { devicesWithoutCounters++ allocatedDevicesWithoutCounters++ } } for _, device := range unallocated { - if device.Basic == nil || device.Basic.ConsumesCounters == nil { + if device.ConsumesCounters == nil { devicesWithoutCounters++ } } @@ -126,13 +126,10 @@ func calculatePoolUtil(unallocated, allocated []resourceapi.Device, resourceSlic func calculateConsumedCounters(devices 
[]resourceapi.Device) map[string]map[string]resource.Quantity { countersConsumed := map[string]map[string]resource.Quantity{} for _, device := range devices { - if device.Basic == nil { + if device.ConsumesCounters == nil { continue } - if device.Basic.ConsumesCounters == nil { - continue - } - for _, consumedCounter := range device.Basic.ConsumesCounters { + for _, consumedCounter := range device.ConsumesCounters { if _, ok := countersConsumed[consumedCounter.CounterSet]; !ok { countersConsumed[consumedCounter.CounterSet] = map[string]resource.Quantity{} } diff --git a/cluster-autoscaler/simulator/dynamicresources/utils/utilization_test.go b/cluster-autoscaler/simulator/dynamicresources/utils/utilization_test.go index 8231c2eb6648..5f94152a718b 100644 --- a/cluster-autoscaler/simulator/dynamicresources/utils/utilization_test.go +++ b/cluster-autoscaler/simulator/dynamicresources/utils/utilization_test.go @@ -24,7 +24,7 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" apiv1 "k8s.io/api/core/v1" - resourceapi "k8s.io/api/resource/v1beta1" + resourceapi "k8s.io/api/resource/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -220,19 +220,17 @@ func testResourceSlicesWithPartionableDevices(driverName, poolName, nodeName str devices, resourceapi.Device{ Name: fmt.Sprintf("gpu-0-partition-%d", i), - Basic: &resourceapi.BasicDevice{ - Capacity: map[resourceapi.QualifiedName]resourceapi.DeviceCapacity{ - "memory": { - Value: resource.MustParse("10Gi"), - }, + Capacity: map[resourceapi.QualifiedName]resourceapi.DeviceCapacity{ + "memory": { + Value: resource.MustParse("10Gi"), }, - ConsumesCounters: []resourceapi.DeviceCounterConsumption{ - { - CounterSet: "gpu-0-counter-set", - Counters: map[string]resourceapi.Counter{ - "memory": { - Value: resource.MustParse("10Gi"), - }, + }, + ConsumesCounters: []resourceapi.DeviceCounterConsumption{ + { + CounterSet: "gpu-0-counter-set", + Counters: map[string]resourceapi.Counter{ + "memory": { + Value: resource.MustParse("10Gi"), }, }, }, @@ -243,19 +241,17 @@ func testResourceSlicesWithPartionableDevices(driverName, poolName, nodeName str devices = append(devices, resourceapi.Device{ Name: "gpu-0", - Basic: &resourceapi.BasicDevice{ - Capacity: map[resourceapi.QualifiedName]resourceapi.DeviceCapacity{ - "memory": { - Value: resource.MustParse(fmt.Sprintf("%dGi", 10*partitionCount)), - }, + Capacity: map[resourceapi.QualifiedName]resourceapi.DeviceCapacity{ + "memory": { + Value: resource.MustParse(fmt.Sprintf("%dGi", 10*partitionCount)), }, - ConsumesCounters: []resourceapi.DeviceCounterConsumption{ - { - CounterSet: "gpu-0-counter-set", - Counters: map[string]resourceapi.Counter{ - "memory": { - Value: resource.MustParse(fmt.Sprintf("%dGi", 10*partitionCount)), - }, + }, + ConsumesCounters: []resourceapi.DeviceCounterConsumption{ + { + CounterSet: "gpu-0-counter-set", + Counters: map[string]resourceapi.Counter{ + "memory": { + Value: resource.MustParse(fmt.Sprintf("%dGi", 10*partitionCount)), }, }, }, @@ -266,7 +262,7 @@ func testResourceSlicesWithPartionableDevices(driverName, poolName, nodeName str ObjectMeta: metav1.ObjectMeta{Name: sliceName, UID: types.UID(sliceName)}, Spec: resourceapi.ResourceSliceSpec{ Driver: driverName, - NodeName: nodeName, + NodeName: &nodeName, Pool: resourceapi.ResourcePool{Name: poolName, Generation: int64(poolGen), ResourceSliceCount: 1}, Devices: devices, SharedCounters: []resourceapi.CounterSet{