Skip to content

Commit e8dc9ef

Browse files
fix create nodeclaim issue (#290)
* Add namespace level vGPU resource quotas * Add namespace level vGPU resource quotas * add webhook and fix type convert issue * fix make lint issue * fix webhook validate * rollback webhook validate * remove duplicated webhook * #207 in memory check gpuquota and allocate/deallocate/init * quota design * refactor code * rollback webhook suit-test * rollback test suit * Karpenter node claim design * design refactor * remove design * fix load karpenter scheme issue * fix rbac create node claim no permission and gpuresourcename null * fix zone/region nodeclaim requirement key bug and add the role * Mapping capacity type for capacity type * fix nodeclaim capacity type value issue --------- Co-authored-by: 0x5457 <[email protected]>
1 parent 019495e commit e8dc9ef

File tree

4 files changed

+175
-26
lines changed

4 files changed

+175
-26
lines changed

config/rbac/role.yaml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,3 +164,23 @@ rules:
164164
- get
165165
- patch
166166
- update
167+
- apiGroups:
168+
- karpenter.sh
169+
resources:
170+
- nodeclaims
171+
verbs:
172+
- get
173+
- list
174+
- watch
175+
- update
176+
- delete
177+
- create
178+
- patch
179+
- apiGroups:
180+
- karpenter.k8s.aws
181+
resources:
182+
- ec2nodeclasses
183+
verbs:
184+
- get
185+
- list
186+
- watch

internal/cloudprovider/karpenter/nodeclaim.go

Lines changed: 59 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,22 @@ import (
2525
)
2626

2727
var (
28-
KarpenterGroup = "karpenter.sh"
29-
KarpenterVersion = "v1"
30-
kindNodeClaim = "NodeClaim"
28+
KarpenterGroup = "karpenter.sh"
29+
KarpenterVersion = "v1"
30+
kindNodeClaim = "NodeClaim"
31+
DefaultGPUResourceName = "nvidia.com/gpu"
32+
EC2NodeClassGroup = "karpenter.k8s.aws"
33+
// AWS capacity type: https://karpenter.sh/docs/concepts/scheduling/#well-known-labels
34+
AWSOnDemandType = "on-demand"
35+
AWSReservedType = "reserved"
36+
AWSSpotType = "spot"
37+
38+
// CapacityTypeMapping maps the CapacityTypeEnum to the corresponding Karpenter capacity type
39+
CapacityTypeMapping = map[tfv1.CapacityTypeEnum]string{
40+
tfv1.CapacityTypeOnDemand: AWSOnDemandType,
41+
tfv1.CapacityTypeSpot: AWSReservedType,
42+
tfv1.CapacityTypeReserved: AWSSpotType,
43+
}
3144
)
3245

3346
// KarpenterExtraConfig holds Karpenter-specific configuration parsed from ExtraParams
@@ -83,20 +96,12 @@ func (p KarpenterGPUNodeProvider) TestConnection() error {
8396
if p.client == nil {
8497
return fmt.Errorf("kubernetes client is not initialized")
8598
}
86-
exist := true
8799
// check if NodeClaim CRD exists
88100
if err := p.client.List(context.Background(), &karpv1.NodeClaimList{}); err != nil {
89101
log.FromContext(p.ctx).Error(err, "karpenter NodeClaim CRD not found.")
90-
exist = false
91-
}
92-
if exist {
93-
return nil
94-
}
95-
// check if NodePool CRD exists
96-
if err := p.client.List(context.Background(), &karpv1.NodePoolList{}); err != nil {
97-
log.FromContext(p.ctx).Error(err, "karpenter NodePool CRD not found.")
98-
return fmt.Errorf("karpenter CRD not found or not accessible")
102+
return fmt.Errorf("karpenter nodeclaim CRD not found or not accessible")
99103
}
104+
log.FromContext(p.ctx).Info("Test connection to Karpenter nodeclaim succeeded.")
100105
return nil
101106
}
102107

@@ -224,7 +229,9 @@ func (p KarpenterGPUNodeProvider) GetGPUNodeInstanceTypeInfo(region string) []ty
224229

225230
// parseKarpenterConfig extracts Karpenter-specific configuration from ExtraParams
226231
func (p KarpenterGPUNodeProvider) parseKarpenterConfig(param *tfv1.GPUNodeClaimSpec) *KarpenterExtraConfig {
227-
karpenterConfig := &KarpenterExtraConfig{}
232+
karpenterConfig := &KarpenterExtraConfig{
233+
GPUResourceName: corev1.ResourceName(DefaultGPUResourceName),
234+
}
228235
extraParams := param.ExtraParams
229236
if extraParams == nil {
230237
return karpenterConfig
@@ -261,7 +268,7 @@ func (p KarpenterGPUNodeProvider) parseKarpenterConfig(param *tfv1.GPUNodeClaimS
261268
}
262269

263270
if karpenterConfig.GPUResourceName == "" {
264-
karpenterConfig.GPUResourceName = "nvidia.com/gpu"
271+
karpenterConfig.GPUResourceName = corev1.ResourceName(DefaultGPUResourceName)
265272
}
266273

267274
return karpenterConfig
@@ -375,44 +382,77 @@ func (p KarpenterGPUNodeProvider) queryAndBuildNodeClassRef(ctx context.Context,
375382
func (p KarpenterGPUNodeProvider) buildRequirements(nodeClaim *karpv1.NodeClaim, param *tfv1.GPUNodeClaimSpec) {
376383
// Build node selector requirements using Karpenter's NodeSelectorRequirement
377384
requirements := []karpv1.NodeSelectorRequirementWithMinValues{}
385+
seen := make(map[string]struct{})
378386
// 1. instance type
379387
if param.InstanceType != "" {
388+
key := string(tfv1.NodeRequirementKeyInstanceType)
380389
requirements = append(requirements, karpv1.NodeSelectorRequirementWithMinValues{
381390
NodeSelectorRequirement: corev1.NodeSelectorRequirement{
382-
Key: string(tfv1.NodeRequirementKeyInstanceType),
391+
Key: key,
383392
Operator: corev1.NodeSelectorOpIn,
384393
Values: []string{param.InstanceType},
385394
},
386395
})
396+
seen[key] = struct{}{}
387397
}
388398
// 2. zone
389399
if param.Zone != "" {
400+
key := string(tfv1.NodeRequirementKeyZone)
390401
requirements = append(requirements, karpv1.NodeSelectorRequirementWithMinValues{
391402
NodeSelectorRequirement: corev1.NodeSelectorRequirement{
392-
Key: string(tfv1.NodeRequirementKeyZone),
403+
Key: key,
393404
Operator: corev1.NodeSelectorOpIn,
394405
Values: []string{param.Zone},
395406
},
396407
})
408+
seen[key] = struct{}{}
397409
}
398410

399411
// 3. region
400412
if param.Region != "" {
413+
key := string(tfv1.NodeRequirementKeyRegion)
401414
requirements = append(requirements, karpv1.NodeSelectorRequirementWithMinValues{
402415
NodeSelectorRequirement: corev1.NodeSelectorRequirement{
403-
Key: string(tfv1.NodeRequirementKeyRegion),
416+
Key: key,
404417
Operator: corev1.NodeSelectorOpIn,
405418
Values: []string{param.Region},
406419
},
407420
})
421+
seen[key] = struct{}{}
422+
}
423+
// 4. capacity type
424+
if param.CapacityType != "" {
425+
key := string(tfv1.NodeRequirementKeyCapacityType)
426+
value := string(param.CapacityType)
427+
// if capacity type is not in the mapping, use the original value
428+
// otherwise, use the mapping value
429+
// https://github.com/kubernetes-sigs/karpenter/blob/f8da711d7e72b678e77f4758bc73a34ba34286d2/pkg/apis/v1/labels.go#L35
430+
if _, exists := CapacityTypeMapping[param.CapacityType]; exists {
431+
value = CapacityTypeMapping[param.CapacityType]
432+
}
433+
434+
requirements = append(requirements, karpv1.NodeSelectorRequirementWithMinValues{
435+
NodeSelectorRequirement: corev1.NodeSelectorRequirement{
436+
Key: key,
437+
Operator: corev1.NodeSelectorOpIn,
438+
Values: []string{value},
439+
},
440+
})
441+
seen[key] = struct{}{}
408442
}
409443

410444
// 4. custom GPU requirements
411445
if p.nodeManagerConfig.NodeProvisioner.GPURequirements != nil {
412446
for _, requirement := range p.nodeManagerConfig.NodeProvisioner.GPURequirements {
447+
key := string(requirement.Key)
448+
// remove duplicate requirements
449+
if _, exists := seen[key]; exists {
450+
continue
451+
}
452+
seen[key] = struct{}{}
413453
requirements = append(requirements, karpv1.NodeSelectorRequirementWithMinValues{
414454
NodeSelectorRequirement: corev1.NodeSelectorRequirement{
415-
Key: string(requirement.Key),
455+
Key: key,
416456
Operator: requirement.Operator,
417457
Values: requirement.Values,
418458
},

internal/cloudprovider/karpenter/nodeclaim_test.go

Lines changed: 90 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,12 @@ func TestKarpenterGPUNodeProvider_CreateNode(t *testing.T) {
121121
{
122122
Key: "karpenter.sh/capacity-type",
123123
Operator: corev1.NodeSelectorOpIn,
124-
Values: []string{"on-demand"},
124+
Values: []string{string(tfv1.CapacityTypeOnDemand)},
125+
},
126+
{
127+
Key: "node.kubernetes.io/instance-type",
128+
Operator: corev1.NodeSelectorOpIn,
129+
Values: []string{"x6g.large"},
125130
},
126131
},
127132
GPULabels: map[string]string{
@@ -167,10 +172,70 @@ func TestKarpenterGPUNodeProvider_CreateNode(t *testing.T) {
167172
TFlopsOffered: resource.MustParse("125"),
168173
VRAMOffered: resource.MustParse("64Gi"),
169174
GPUDeviceOffered: 4,
170-
ExtraParams: map[string]string{
171-
"karpenter.nodeClaim.terminationGracePeriod": "30s",
172-
"karpenter.gpuResource": "nvidia.com/gpu",
175+
ExtraParams: map[string]string{},
176+
},
177+
expectError: false,
178+
},
179+
{
180+
name: "successful node creation with GPU",
181+
param: &tfv1.GPUNodeClaimSpec{
182+
NodeName: "test-gpu-node",
183+
Region: "us-west-2",
184+
Zone: "us-west-2a",
185+
InstanceType: "p3.8xlarge",
186+
CapacityType: tfv1.CapacityTypeReserved,
187+
NodeClassRef: tfv1.GroupKindName{
188+
Name: "test-ec2-node-class",
189+
Group: "karpenter.k8s.aws",
190+
Kind: "EC2NodeClass",
191+
Version: "v1",
192+
},
193+
TFlopsOffered: resource.MustParse("125"),
194+
VRAMOffered: resource.MustParse("64Gi"),
195+
GPUDeviceOffered: 4,
196+
ExtraParams: map[string]string{},
197+
},
198+
expectError: false,
199+
},
200+
{
201+
name: "successful node creation with GPU",
202+
param: &tfv1.GPUNodeClaimSpec{
203+
NodeName: "test-gpu-node",
204+
Region: "us-west-2",
205+
Zone: "us-west-2a",
206+
InstanceType: "p3.8xlarge",
207+
CapacityType: tfv1.CapacityTypeSpot,
208+
NodeClassRef: tfv1.GroupKindName{
209+
Name: "test-ec2-node-class",
210+
Group: "karpenter.k8s.aws",
211+
Kind: "EC2NodeClass",
212+
Version: "v1",
173213
},
214+
TFlopsOffered: resource.MustParse("125"),
215+
VRAMOffered: resource.MustParse("64Gi"),
216+
GPUDeviceOffered: 4,
217+
ExtraParams: map[string]string{},
218+
},
219+
expectError: false,
220+
},
221+
{
222+
name: "successful node creation with GPU",
223+
param: &tfv1.GPUNodeClaimSpec{
224+
NodeName: "test-gpu-node",
225+
Region: "us-west-2",
226+
Zone: "us-west-2a",
227+
InstanceType: "p3.8xlarge",
228+
CapacityType: "unexpect-capacity-type",
229+
NodeClassRef: tfv1.GroupKindName{
230+
Name: "test-ec2-node-class",
231+
Group: "karpenter.k8s.aws",
232+
Kind: "EC2NodeClass",
233+
Version: "v1",
234+
},
235+
TFlopsOffered: resource.MustParse("125"),
236+
VRAMOffered: resource.MustParse("64Gi"),
237+
GPUDeviceOffered: 4,
238+
ExtraParams: map[string]string{},
174239
},
175240
expectError: false,
176241
},
@@ -317,18 +382,29 @@ func TestKarpenterGPUNodeProvider_parseKarpenterConfig(t *testing.T) {
317382
param: &tfv1.GPUNodeClaimSpec{
318383
ExtraParams: nil,
319384
},
320-
expected: "", // Early return when extraParams is nil
385+
expected: "nvidia.com/gpu", // Early return when extraParams is nil
321386
},
322387
{
323388
name: "empty extra params",
324389
param: &tfv1.GPUNodeClaimSpec{
325390
ExtraParams: map[string]string{
326-
"karpenter.nodeClassRef.kind": "EC2NodeClass",
327391
"karpenter.nodeClaim.terminationGracePeriod": "30s",
328392
},
329393
},
330394
expected: "30s", // Default value should be set
331395
},
396+
{
397+
name: "empty extra params",
398+
param: &tfv1.GPUNodeClaimSpec{
399+
ExtraParams: map[string]string{},
400+
},
401+
expected: "nvidia.com/gpu", // Default value should be set
402+
},
403+
{
404+
name: "empty extra params",
405+
param: &tfv1.GPUNodeClaimSpec{},
406+
expected: "nvidia.com/gpu", // Default value should be set
407+
},
332408
}
333409

334410
t.Run(tests[0].name, func(t *testing.T) {
@@ -341,6 +417,14 @@ func TestKarpenterGPUNodeProvider_parseKarpenterConfig(t *testing.T) {
341417
assert.Equal(t, tests[1].expected, result.NodeClaim.TerminationGracePeriod)
342418
})
343419

420+
t.Run(tests[2].name, func(t *testing.T) {
421+
result := provider.parseKarpenterConfig(tests[2].param)
422+
assert.Equal(t, tests[2].expected, string(result.GPUResourceName))
423+
})
424+
t.Run(tests[3].name, func(t *testing.T) {
425+
result := provider.parseKarpenterConfig(tests[3].param)
426+
assert.Equal(t, tests[3].expected, string(result.GPUResourceName))
427+
})
344428
}
345429

346430
func TestSetNestedValue(t *testing.T) {

internal/controller/fake_node_claim_contoller_test.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,12 @@ var _ = Describe("FakeNodeClaimController", func() {
5656
{
5757
Key: "karpenter.sh/capacity-type",
5858
Operator: corev1.NodeSelectorOpIn,
59-
Values: []string{"on-demand"},
59+
Values: []string{"OnDemand"},
60+
},
61+
{
62+
Key: "node.kubernetes.io/instance-type",
63+
Operator: corev1.NodeSelectorOpIn,
64+
Values: []string{"x6g.large"},
6065
},
6166
},
6267
GPULabels: map[string]string{

0 commit comments

Comments
 (0)