Skip to content

Commit a862b0a

Browse files
authored
fix: fix resourcesPerNode override not applied with Volcano scheduler (#2982)
* Volcano integration for resourcesPerNode override in TrainJob Signed-off-by: sksingh2005 <shashanksgh3@gmail.com> * Fixed the issue where PodGroup resources weren't scaling with spec.trainer.resourcesPerNode when specified on a TrainJob Signed-off-by: sksingh2005 <shashanksgh3@gmail.com> * limits removed Signed-off-by: sksingh2005 <shashanksgh3@gmail.com> * maintains consistency across all MLPolicy plugins Signed-off-by: sksingh2005 <shashanksgh3@gmail.com> * Fix pre-commit formatting Signed-off-by: sksingh2005 <shashanksgh3@gmail.com> * Unit test updates for mpi and torch plugins Signed-off-by: sksingh2005 <shashanksgh3@gmail.com> * framework unit test problem solved Signed-off-by: sksingh2005 <shashanksgh3@gmail.com> * fix: All MLPolicy Plugins update SinglePodRequests from TrainJob.spec.trainer.resourcesPerNode Signed-off-by: sksingh2005 <shashanksgh3@gmail.com> * Refactor ResourcesPerNode handling to use PodRequest helper for correct scaling with init/sidecar containers Signed-off-by: sksingh2005 <shashanksgh3@gmail.com> * refactor: simplify ResourcesPerNode by modifying jobSetTemplateSpec directly Signed-off-by: sksingh2005 <shashanksgh3@gmail.com> * test: modify TrainJob resources in coscheduling test to verify node container override Signed-off-by: sksingh2005 <shashanksgh3@gmail.com> * Refactor: remove duplicate resource logic from plugins Signed-off-by: sksingh2005 <shashanksgh3@gmail.com> --------- Signed-off-by: sksingh2005 <shashanksgh3@gmail.com>
1 parent f921483 commit a862b0a

File tree

2 files changed

+38
-12
lines changed

2 files changed

+38
-12
lines changed

pkg/runtime/core/trainingruntime.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,16 @@ func (r *TrainingRuntime) newRuntimeInfo(
171171
if labelAncestor, ok := metadata.Labels[constants.LabelTrainJobAncestor]; ok {
172172
if labelAncestor == constants.AncestorTrainer && mlPolicy != nil {
173173
count = ptr.Deref(mlPolicy.NumNodes, 1)
174+
175+
// Apply resourcesPerNode from TrainJob to the template spec
176+
if trainJob.Spec.Trainer != nil && trainJob.Spec.Trainer.ResourcesPerNode != nil {
177+
for j := range jobSetTemplateSpec.Spec.ReplicatedJobs[i].Template.Spec.Template.Spec.Containers {
178+
if jobSetTemplateSpec.Spec.ReplicatedJobs[i].Template.Spec.Template.Spec.Containers[j].Name == constants.Node {
179+
jobSetTemplateSpec.Spec.ReplicatedJobs[i].Template.Spec.Template.Spec.Containers[j].Resources = *trainJob.Spec.Trainer.ResourcesPerNode.DeepCopy()
180+
break
181+
}
182+
}
183+
}
174184
}
175185
ancestor = &labelAncestor
176186
}

pkg/runtime/framework/core/framework_test.go

Lines changed: 28 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -552,6 +552,10 @@ func TestRunComponentBuilderPlugins(t *testing.T) {
552552
Endpoints: func(yield func(string) bool) {
553553
yield("test-job-launcher-0-0.test-job")
554554
},
555+
SinglePodRequests: corev1.ResourceList{
556+
corev1.ResourceCPU: resource.MustParse("1"),
557+
corev1.ResourceMemory: resource.MustParse("4Gi"),
558+
},
555559
Containers: []runtime.Container{{
556560
Name: constants.Node,
557561
VolumeMounts: []corev1ac.VolumeMountApplyConfiguration{
@@ -575,6 +579,10 @@ func TestRunComponentBuilderPlugins(t *testing.T) {
575579
yield("test-job-node-0-0.test-job")
576580
yield("test-job-node-0-1.test-job")
577581
},
582+
SinglePodRequests: corev1.ResourceList{
583+
corev1.ResourceCPU: resource.MustParse("1"),
584+
corev1.ResourceMemory: resource.MustParse("4Gi"),
585+
},
578586
Containers: []runtime.Container{{
579587
Name: constants.Node,
580588
VolumeMounts: []corev1ac.VolumeMountApplyConfiguration{
@@ -1019,6 +1027,10 @@ func TestRunComponentBuilderPlugins(t *testing.T) {
10191027
Endpoints: func(yield func(string) bool) {
10201028
yield("test-job-launcher-0-0.test-job")
10211029
},
1030+
SinglePodRequests: corev1.ResourceList{
1031+
corev1.ResourceCPU: resource.MustParse("1"),
1032+
corev1.ResourceMemory: resource.MustParse("4Gi"),
1033+
},
10221034
Containers: []runtime.Container{{
10231035
Name: constants.Node,
10241036
VolumeMounts: []corev1ac.VolumeMountApplyConfiguration{
@@ -1089,6 +1101,10 @@ func TestRunComponentBuilderPlugins(t *testing.T) {
10891101
yield("test-job-node-0-0.test-job")
10901102
yield("test-job-node-0-1.test-job")
10911103
},
1104+
SinglePodRequests: corev1.ResourceList{
1105+
corev1.ResourceCPU: resource.MustParse("1"),
1106+
corev1.ResourceMemory: resource.MustParse("4Gi"),
1107+
},
10921108
Containers: []runtime.Container{{
10931109
Name: constants.Node,
10941110
VolumeMounts: []corev1ac.VolumeMountApplyConfiguration{
@@ -1355,8 +1371,8 @@ test-job-node-0-1.test-job slots=1
13551371
Ancestor: ptr.To(constants.AncestorTrainer),
13561372
Count: ptr.To[int32](1),
13571373
SinglePodRequests: corev1.ResourceList{
1358-
corev1.ResourceCPU: resource.MustParse("1"),
1359-
corev1.ResourceMemory: resource.MustParse("4Gi"),
1374+
corev1.ResourceCPU: resource.MustParse("2"),
1375+
corev1.ResourceMemory: resource.MustParse("8Gi"),
13601376
},
13611377
Containers: []runtime.Container{{
13621378
VolumeMounts: []corev1ac.VolumeMountApplyConfiguration{
@@ -1473,8 +1489,8 @@ test-job-node-0-1.test-job slots=1
14731489
testingutil.MakeTrainJobTrainerWrapper().
14741490
NumNodes(100).
14751491
Container("test:trainjob", []string{"trainjob"}, []string{"trainjob"}, corev1.ResourceList{
1476-
corev1.ResourceCPU: resource.MustParse("1"),
1477-
corev1.ResourceMemory: resource.MustParse("4Gi"),
1492+
corev1.ResourceCPU: resource.MustParse("2"),
1493+
corev1.ResourceMemory: resource.MustParse("8Gi"),
14781494
}).
14791495
Obj(),
14801496
).
@@ -1586,8 +1602,8 @@ test-job-node-0-1.test-job slots=1
15861602
WithArgs("trainjob").
15871603
WithResources(corev1ac.ResourceRequirements().
15881604
WithRequests(corev1.ResourceList{
1589-
corev1.ResourceCPU: resource.MustParse("1"),
1590-
corev1.ResourceMemory: resource.MustParse("4Gi"),
1605+
corev1.ResourceCPU: resource.MustParse("2"),
1606+
corev1.ResourceMemory: resource.MustParse("8Gi"),
15911607
})).
15921608
WithVolumeMounts(
15931609
corev1ac.VolumeMount().
@@ -1665,8 +1681,8 @@ test-job-node-0-1.test-job slots=1
16651681
Ancestor: ptr.To(constants.AncestorTrainer),
16661682
Count: ptr.To[int32](100),
16671683
SinglePodRequests: corev1.ResourceList{
1668-
corev1.ResourceCPU: resource.MustParse("1"),
1669-
corev1.ResourceMemory: resource.MustParse("4Gi"),
1684+
corev1.ResourceCPU: resource.MustParse("2"),
1685+
corev1.ResourceMemory: resource.MustParse("8Gi"),
16701686
},
16711687
Containers: []runtime.Container{{
16721688
VolumeMounts: []corev1ac.VolumeMountApplyConfiguration{
@@ -1689,8 +1705,8 @@ test-job-node-0-1.test-job slots=1
16891705
SchedulingTimeout(300).
16901706
MinMember(102). // 102 replicas = 100 Trainer nodes + 2 Initializer.
16911707
MinResources(corev1.ResourceList{
1692-
corev1.ResourceCPU: resource.MustParse("102"), // 1 CPU and 4Gi per replica.
1693-
corev1.ResourceMemory: resource.MustParse("408Gi"),
1708+
corev1.ResourceCPU: resource.MustParse("202"), // 2 CPU and 8Gi per trainer replica, 1 CPU and 4Gi per initializer.
1709+
corev1.ResourceMemory: resource.MustParse("808Gi"),
16941710
}).
16951711
ControllerReference(trainer.SchemeGroupVersion.WithKind("TrainJob"), "test-job", "uid").
16961712
Obj(),
@@ -1702,8 +1718,8 @@ test-job-node-0-1.test-job slots=1
17021718
Completions(1, constants.DatasetInitializer, constants.ModelInitializer).
17031719
NumNodes(100).
17041720
Container(constants.Node, constants.Node, "test:trainjob", []string{"trainjob"}, []string{"trainjob"}, corev1.ResourceList{
1705-
corev1.ResourceCPU: resource.MustParse("1"),
1706-
corev1.ResourceMemory: resource.MustParse("4Gi"),
1721+
corev1.ResourceCPU: resource.MustParse("2"),
1722+
corev1.ResourceMemory: resource.MustParse("8Gi"),
17071723
}).
17081724
Obj(),
17091725
},

0 commit comments

Comments
 (0)