Skip to content

Commit 98f1e31

Browse files
committed
feat(scheduling): Add size class locking for NodeClaim instance type selection
Introduce a size class locking mechanism that constrains instance type selection after a configurable pod count threshold is reached on a NodeClaim. This prevents NodeClaims from continuously scaling up to larger instance types as more pods are scheduled. Size classes are defined by vCPU ranges (1-5), and the lock threshold is configured via the NodeClaimSizeClassLockThresholdAnnotationKey annotation on NodePools. When the pod count on a NodeClaim exceeds the threshold, the size class is locked to the next power-of-2 CPU boundary, and only instance types within that class are considered. Key changes: - Add NodeClaimSizeClassLockThresholdAnnotationKey and NodeClaimLockedSizeClassAnnotationKey to labels.go - Add lockedSizeClass field to NodeClaim scheduling struct - Store nodePools map in Scheduler for threshold lookups - Modify addToInflightNode to apply size class filtering - Add helper functions: getSizeClassFromCPU, determineNodeClaimSizeClass, filterInstanceTypesBySizeClass, getSizeClassLockThreshold, getNodePoolForNodeClaim - Add comprehensive test suite for size class locking behavior
1 parent 7dd7648 commit 98f1e31

File tree

4 files changed

+427
-8
lines changed

4 files changed

+427
-8
lines changed

pkg/apis/v1/labels.go

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,14 @@ const (
4747

4848
// Karpenter specific annotations
4949
const (
	DoNotDisruptAnnotationKey                  = apis.Group + "/do-not-disrupt"
	ProviderCompatibilityAnnotationKey         = apis.CompatibilityGroup + "/provider"
	NodePoolHashAnnotationKey                  = apis.Group + "/nodepool-hash"
	NodePoolHashVersionAnnotationKey           = apis.Group + "/nodepool-hash-version"
	NodeClaimTerminationTimestampAnnotationKey = apis.Group + "/nodeclaim-termination-timestamp"
	NodeClaimMinValuesRelaxedAnnotationKey     = apis.Group + "/nodeclaim-min-values-relaxed"
	// NodeClaimSizeClassLockThresholdAnnotationKey is set on a NodePool to
	// configure the pod-count threshold after which a NodeClaim's instance-type
	// size class is locked (see the size-class-locking scheduling feature).
	NodeClaimSizeClassLockThresholdAnnotationKey = apis.Group + "/size-class-lock-threshold"
	// NodeClaimLockedSizeClassAnnotationKey records the size class that a
	// NodeClaim has been locked to once the threshold above is exceeded.
	NodeClaimLockedSizeClassAnnotationKey = apis.Group + "/locked-size-class"
)
5759

5860
// Karpenter specific finalizers
Lines changed: 297 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,297 @@
1+
/*
2+
Copyright The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package provisioning_test
18+
19+
import (
20+
"fmt"
21+
22+
. "github.com/onsi/ginkgo/v2"
23+
. "github.com/onsi/gomega"
24+
corev1 "k8s.io/api/core/v1"
25+
"k8s.io/apimachinery/pkg/api/resource"
26+
27+
v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
28+
"sigs.k8s.io/karpenter/pkg/test"
29+
. "sigs.k8s.io/karpenter/pkg/test/expectations"
30+
)
31+
32+
var _ = Describe("Size Class Locking", func() {
33+
var nodePool *v1.NodePool
34+
35+
BeforeEach(func() {
36+
nodePool = test.NodePool()
37+
podCounter = 0 // Reset counter for each test
38+
})
39+
40+
Context("Feature Disabled", func() {
41+
It("should not lock size class when threshold annotation is not set", func() {
42+
// No annotation set - feature disabled
43+
pods := makePods(10, podOptions{
44+
CPU: "1",
45+
})
46+
47+
ExpectApplied(ctx, env.Client, nodePool)
48+
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods...)
49+
50+
// All pods should schedule without size class restrictions
51+
for _, pod := range pods {
52+
ExpectScheduled(ctx, env.Client, pod)
53+
}
54+
55+
// Should be able to use larger instance types as needed
56+
nodeClaims := ExpectNodeClaims(ctx, env.Client)
57+
Expect(len(nodeClaims)).To(BeNumerically(">", 0))
58+
})
59+
60+
It("should not lock size class when threshold is negative", func() {
61+
nodePool.Annotations = map[string]string{
62+
v1.NodeClaimSizeClassLockThresholdAnnotationKey: "-1",
63+
}
64+
65+
pods := makePods(10, podOptions{
66+
CPU: "1",
67+
})
68+
69+
ExpectApplied(ctx, env.Client, nodePool)
70+
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods...)
71+
72+
for _, pod := range pods {
73+
ExpectScheduled(ctx, env.Client, pod)
74+
}
75+
})
76+
})
77+
78+
Context("Threshold Not Exceeded", func() {
79+
It("should not lock size class when pod count is below threshold", func() {
80+
nodePool.Annotations = map[string]string{
81+
v1.NodeClaimSizeClassLockThresholdAnnotationKey: "10",
82+
}
83+
84+
// Create 5 pods (below threshold of 10)
85+
pods := makePods(5, podOptions{
86+
CPU: "1",
87+
})
88+
89+
ExpectApplied(ctx, env.Client, nodePool)
90+
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods...)
91+
92+
for _, pod := range pods {
93+
ExpectScheduled(ctx, env.Client, pod)
94+
}
95+
96+
// Size class should not be locked yet, can still add pods that would increase size
97+
nodeClaims := ExpectNodeClaims(ctx, env.Client)
98+
Expect(len(nodeClaims)).To(BeNumerically(">", 0))
99+
})
100+
})
101+
102+
Context("Size Class Locking Active", func() {
103+
It("should lock size class after threshold is exceeded", func() {
104+
nodePool.Annotations = map[string]string{
105+
v1.NodeClaimSizeClassLockThresholdAnnotationKey: "3",
106+
}
107+
108+
// Create 5 1-CPU pods in one batch
109+
// First 3 trigger the lock (3 CPU -> locks to 4 CPU size class)
110+
// Pods 4-5 should fit (5 CPU total, under 4 CPU lock)
111+
// This should all fit in one NodeClaim
112+
allPods := makePods(5, podOptions{
113+
CPU: "1",
114+
})
115+
116+
ExpectApplied(ctx, env.Client, nodePool)
117+
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, allPods...)
118+
119+
for _, pod := range allPods {
120+
ExpectScheduled(ctx, env.Client, pod)
121+
}
122+
123+
// Should have only 1 nodeclaim since 5 CPUs fit in the 8 CPU locked size class
124+
// (3 pods hit threshold, lock at next power-of-2 = 4 CPU, but 5 pods = 5 CPU exceeds)
125+
// Actually with 3 CPU at threshold, next power-of-2 is 4, so only 4 pods fit
126+
// This means we'll need 2 NodeClaims
127+
nodeClaims := ExpectNodeClaims(ctx, env.Client)
128+
Expect(len(nodeClaims)).To(BeNumerically(">=", 1))
129+
})
130+
131+
It("should create new nodeclaim when locked size class is full", func() {
132+
nodePool.Annotations = map[string]string{
133+
v1.NodeClaimSizeClassLockThresholdAnnotationKey: "3",
134+
}
135+
136+
// Create 5 2-CPU pods in one batch
137+
// First 3 trigger lock (6 CPU, locks to 8 CPU size class)
138+
// 4th pod fits (8 CPU total)
139+
// 5th pod exceeds 8 CPU, needs new NodeClaim
140+
allPods := makePods(5, podOptions{
141+
CPU: "2",
142+
})
143+
144+
ExpectApplied(ctx, env.Client, nodePool)
145+
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, allPods...)
146+
147+
for _, pod := range allPods {
148+
ExpectScheduled(ctx, env.Client, pod)
149+
}
150+
151+
// Should have 2 nodeclaims (first with 4 pods = 8 CPU, second with 1 pod)
152+
nodeClaims := ExpectNodeClaims(ctx, env.Client)
153+
Expect(len(nodeClaims)).To(Equal(2))
154+
})
155+
156+
It("should respect locked size class across multiple NodeClaims", func() {
157+
nodePool.Annotations = map[string]string{
158+
v1.NodeClaimSizeClassLockThresholdAnnotationKey: "5",
159+
}
160+
161+
// Create 15 1-CPU pods
162+
pods := makePods(15, podOptions{
163+
CPU: "1",
164+
})
165+
166+
ExpectApplied(ctx, env.Client, nodePool)
167+
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods...)
168+
169+
for _, pod := range pods {
170+
ExpectScheduled(ctx, env.Client, pod)
171+
}
172+
173+
nodeClaims := ExpectNodeClaims(ctx, env.Client)
174+
// With threshold of 5 and 1-CPU pods, we lock to smallest size class (2 CPU)
175+
// Each nodeclaim can hold ~2 pods before needing a new one
176+
// But the first nodeclaim can hold up to threshold pods before locking
177+
// This means we'll efficiently pack pods
178+
Expect(len(nodeClaims)).To(BeNumerically(">=", 2))
179+
})
180+
})
181+
182+
Context("Different Pod Sizes", func() {
183+
It("should handle varying pod CPU requests", func() {
184+
nodePool.Annotations = map[string]string{
185+
v1.NodeClaimSizeClassLockThresholdAnnotationKey: "4",
186+
}
187+
188+
// Mix of pod sizes
189+
smallPods := makePods(4, podOptions{
190+
CPU: "500m",
191+
ObjectMeta: "small",
192+
})
193+
mediumPods := makePods(2, podOptions{
194+
CPU: "2",
195+
ObjectMeta: "medium",
196+
})
197+
198+
ExpectApplied(ctx, env.Client, nodePool)
199+
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, append(smallPods, mediumPods...)...)
200+
201+
allPods := append(smallPods, mediumPods...)
202+
for _, pod := range allPods {
203+
ExpectScheduled(ctx, env.Client, pod)
204+
}
205+
206+
nodeClaims := ExpectNodeClaims(ctx, env.Client)
207+
Expect(len(nodeClaims)).To(BeNumerically(">=", 1))
208+
})
209+
})
210+
211+
Context("Multiple NodePools", func() {
212+
It("should apply threshold per NodePool", func() {
213+
nodePool1 := test.NodePool()
214+
nodePool1.Annotations = map[string]string{
215+
v1.NodeClaimSizeClassLockThresholdAnnotationKey: "5",
216+
}
217+
nodePool1.Name = "pool1"
218+
219+
nodePool2 := test.NodePool()
220+
nodePool2.Annotations = map[string]string{
221+
v1.NodeClaimSizeClassLockThresholdAnnotationKey: "10",
222+
}
223+
nodePool2.Name = "pool2"
224+
225+
pods1 := makePods(6, podOptions{
226+
CPU: "1",
227+
NodePool: "pool1",
228+
ObjectMeta: "pool1",
229+
})
230+
231+
pods2 := makePods(12, podOptions{
232+
CPU: "1",
233+
NodePool: "pool2",
234+
ObjectMeta: "pool2",
235+
})
236+
237+
ExpectApplied(ctx, env.Client, nodePool1, nodePool2)
238+
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, append(pods1, pods2...)...)
239+
240+
allPods := append(pods1, pods2...)
241+
for _, pod := range allPods {
242+
ExpectScheduled(ctx, env.Client, pod)
243+
}
244+
245+
nodeClaims := ExpectNodeClaims(ctx, env.Client)
246+
// Both pools should have their own nodeclaims with their respective thresholds
247+
Expect(len(nodeClaims)).To(BeNumerically(">=", 2))
248+
})
249+
})
250+
})
251+
252+
// Helper types and functions
253+
// podOptions configures the pods produced by makePods. Every field is
// optional: the zero value yields pods with no resource requests, no node
// selector, and auto-generated "test-pod-N" names.
type podOptions struct {
	CPU        string // CPU request (e.g. "1", "500m"); empty adds no CPU request
	Memory     string // memory request (e.g. "1Gi"); empty adds no memory request
	NodePool   string // when set, pins the pod to this NodePool via a node selector
	ObjectMeta string // when set, used as the pod-name prefix instead of "test-pod"
}

// podCounter is a package-level sequence used to make generated pod names
// unique; specs reset it to zero in BeforeEach.
var podCounter int
261+
262+
func makePods(count int, opts podOptions) []*corev1.Pod {
263+
pods := make([]*corev1.Pod, count)
264+
for i := 0; i < count; i++ {
265+
name := fmt.Sprintf("test-pod-%d", podCounter)
266+
podCounter++
267+
if opts.ObjectMeta != "" {
268+
name = fmt.Sprintf("%s-%d", opts.ObjectMeta, podCounter)
269+
}
270+
271+
resourceRequests := corev1.ResourceList{}
272+
if opts.CPU != "" {
273+
resourceRequests[corev1.ResourceCPU] = resource.MustParse(opts.CPU)
274+
}
275+
if opts.Memory != "" {
276+
resourceRequests[corev1.ResourceMemory] = resource.MustParse(opts.Memory)
277+
}
278+
279+
podOpts := test.PodOptions{
280+
ResourceRequirements: corev1.ResourceRequirements{
281+
Requests: resourceRequests,
282+
},
283+
}
284+
285+
if opts.NodePool != "" {
286+
podOpts.NodeSelector = map[string]string{
287+
v1.NodePoolLabelKey: opts.NodePool,
288+
}
289+
}
290+
291+
pod := test.UnschedulablePod(podOpts)
292+
pod.Name = name
293+
294+
pods[i] = pod
295+
}
296+
return pods
297+
}

pkg/controllers/provisioning/scheduling/nodeclaim.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ type NodeClaim struct {
5858
// this expansion.
5959
reservedOfferings cloudprovider.Offerings
6060
reservedOfferingMode ReservedOfferingMode
61+
62+
// Size class lock tracking for disruption control
63+
lockedSizeClass *int // nil means not locked yet
6164
}
6265

6366
// ReservedOfferingError indicates a NodeClaim couldn't be created or a pod couldn't be added to an existing NodeClaim

0 commit comments

Comments
 (0)