@@ -191,15 +191,7 @@ func (r *PyTorchJobReconciler) Reconcile(ctx context.Context, req ctrl.Request)
191191 return ctrl.Result {}, err
192192 }
193193
194- jobIsRunning := false
195- for _ , condition := range pytorchjob .Status .Conditions {
196- if condition .Type == kubeflowv1 .JobRunning && condition .Status == corev1 .ConditionTrue {
197- jobIsRunning = true
198- break
199- }
200- }
201-
202- if jobIsRunning {
194+ if commonutil .IsRunning (pytorchjob .Status ) {
203195 if content , err := r .readCompletionPercentageFromPod (pytorchjob ); err == nil {
204196 if percentage , parseErr := r .parseCompletionPercentage (content ); parseErr == nil {
205197 // Assuming your PyTorchJobStatus has a CompletionPercentage field.
@@ -219,12 +211,12 @@ func (r *PyTorchJobReconciler) Reconcile(ctx context.Context, req ctrl.Request)
219211 } else {
220212 logrus .Debugf ("Failed to read completion percentage from rank-0 pod for PyTorchJob %s: %v" , pytorchjob .Name , err )
221213 }
222-
214+
223215 // Return a short requeue interval for running jobs
224216 // TODO instead of hard coding the requeue interval we could make this configurable
225217 return ctrl.Result {RequeueAfter : 30 * time .Second }, nil
226218 }
227-
219+
228220 t , err := util .DurationUntilExpireTime (& pytorchjob .Spec .RunPolicy , pytorchjob .Status )
229221 if err != nil {
230222 logrus .Warnf ("Reconcile PyTorchJob error %v" , err )
@@ -527,7 +519,9 @@ func (r *PyTorchJobReconciler) execInPod(pod *corev1.Pod, containerName string,
527519 }
528520
529521 var stdout , stderr bytes.Buffer
530- err = executor .StreamWithContext (context .Background (), remotecommand.StreamOptions {
522+ ctx , cancel := context .WithTimeoutCause (context .Background (), 10 * time .Second , fmt .Errorf ("pod execution timed out" ))
523+ defer cancel ()
524+ err = executor .StreamWithContext (ctx , remotecommand.StreamOptions {
531525 Stdout : & stdout ,
532526 Stderr : & stderr ,
533527 })
@@ -576,16 +570,13 @@ func (r *PyTorchJobReconciler) readCompletionPercentageFromPod(pytorchjob *kubef
576570 return "" , fmt .Errorf ("rank-0 pod %s is not in running state: %s" , rankZeroPod .Name , rankZeroPod .Status .Phase )
577571 }
578572
579- // Get the container name (use default PyTorch container name)
580- containerName := kubeflowv1 .PyTorchJobDefaultContainerName
581- if len (rankZeroPod .Spec .Containers ) > 0 {
582- containerName = rankZeroPod .Spec .Containers [0 ].Name
583- }
573+ // Get the container name
574+ containerName := rankZeroPod .Spec .Containers [0 ].Name
584575
585576 // Read the progress.json file from /mnt/checkpoints - /var/run is not accessible by non-root user
586577 // TODO we could have the user add the file path in an annotation instead of hardcoding it here
587578 // later we could update the CRD spec to allow for checkpoint config
588- progressFilePath := "/mnt/checkpoints/progress.json"
579+ progressFilePath := GetProgressFilePath ( pytorchjob )
589580 catCommand := []string {"cat" , progressFilePath }
590581 content , err := r .execInPod (rankZeroPod , containerName , catCommand )
591582 if err != nil {
@@ -600,7 +591,7 @@ func (r *PyTorchJobReconciler) parseCompletionPercentage(content string) (string
600591 var progress ProgressData
601592
602593 if err := json .Unmarshal ([]byte (content ), & progress ); err != nil {
603- return "" , fmt .Errorf ("failed to parse JSON: %v" , err )
594+ return "" , fmt .Errorf ("failed to parse JSON from content '%s' : %v" , content , err )
604595 }
605596
606597 // Extract current and total steps
0 commit comments