@@ -196,7 +196,7 @@ func createAlpacaTrainJob(test Test, namespace, runtimeName string, config corev
196196 PodTemplateOverrides : []trainerv1alpha1.PodTemplateOverride {
197197 {
198198 TargetJobs : []trainerv1alpha1.PodTemplateOverrideTargetJob {
199- {Name : "trainer " },
199+ {Name : "node " },
200200 },
201201 Spec : & trainerv1alpha1.PodTemplateSpecOverride {
202202 Tolerations : []corev1.Toleration {
@@ -342,10 +342,10 @@ var mountModelVolumeIntoTrainer = ErrorOption[*trainerv1alpha1.TrainJob](func(to
342342 MountPath : "/mnt/model" ,
343343 }
344344
345- // Find the trainer pod template override and add the volume and volume mount
345+ // Find the trainer/node pod template override and add the volume and volume mount
346346 for i := range to .Spec .PodTemplateOverrides {
347347 for _ , target := range to .Spec .PodTemplateOverrides [i ].TargetJobs {
348- if target .Name == "trainer " && to .Spec .PodTemplateOverrides [i ].Spec != nil {
348+ if target .Name == "node " && to .Spec .PodTemplateOverrides [i ].Spec != nil {
349349 to .Spec .PodTemplateOverrides [i ].Spec .Volumes = append (to .Spec .PodTemplateOverrides [i ].Spec .Volumes , modelVolume )
350350
351351 // Find the node container and add the volume mount
@@ -387,15 +387,15 @@ func createMultiGpuTrainingRuntime(test Test, namespace string, numberOfGpus int
387387 Spec : jobsetv1alpha2.JobSetSpec {
388388 ReplicatedJobs : []jobsetv1alpha2.ReplicatedJob {
389389 {
390- Name : "trainer " ,
390+ Name : "node " ,
391391 Template : batchv1.JobTemplateSpec {
392+ ObjectMeta : metav1.ObjectMeta {
393+ Labels : map [string ]string {
394+ "trainer.kubeflow.org/trainjob-ancestor-step" : "trainer" ,
395+ },
396+ },
392397 Spec : batchv1.JobSpec {
393398 Template : corev1.PodTemplateSpec {
394- ObjectMeta : metav1.ObjectMeta {
395- Labels : map [string ]string {
396- "trainer.kubeflow.org/trainjob-ancestor-step" : "trainer" ,
397- },
398- },
399399 Spec : corev1.PodSpec {
400400 Tolerations : []corev1.Toleration {
401401 {
0 commit comments