Skip to content

Commit d926bc0

Browse files
committed
CARRY: Use temp folder to store progress file
1 parent d07ec3b commit d926bc0

File tree

3 files changed

+46
-19
lines changed

3 files changed

+46
-19
lines changed

pkg/controller.v1/pytorch/envvar.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,12 @@ func setPodEnv(obj interface{}, podTemplateSpec *corev1.PodTemplateSpec, rtype,
121121
Value: strconv.Itoa(int(totalReplicas)),
122122
})
123123
}
124+
125+
// Set the training progress file path.
126+
podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{
127+
Name: EnvTrainingProgressFilePath,
128+
Value: GetProgressFilePath(pytorchjob),
129+
})
124130
}
125131

126132
return nil

pkg/controller.v1/pytorch/pytorchjob_controller.go

Lines changed: 10 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -191,15 +191,7 @@ func (r *PyTorchJobReconciler) Reconcile(ctx context.Context, req ctrl.Request)
191191
return ctrl.Result{}, err
192192
}
193193

194-
jobIsRunning := false
195-
for _, condition := range pytorchjob.Status.Conditions {
196-
if condition.Type == kubeflowv1.JobRunning && condition.Status == corev1.ConditionTrue {
197-
jobIsRunning = true
198-
break
199-
}
200-
}
201-
202-
if jobIsRunning {
194+
if commonutil.IsRunning(pytorchjob.Status) {
203195
if content, err := r.readCompletionPercentageFromPod(pytorchjob); err == nil {
204196
if percentage, parseErr := r.parseCompletionPercentage(content); parseErr == nil {
205197
// Assuming your PyTorchJobStatus has a CompletionPercentage field.
@@ -219,12 +211,12 @@ func (r *PyTorchJobReconciler) Reconcile(ctx context.Context, req ctrl.Request)
219211
} else {
220212
logrus.Debugf("Failed to read completion percentage from rank-0 pod for PyTorchJob %s: %v", pytorchjob.Name, err)
221213
}
222-
214+
223215
// Return a short requeue interval for running jobs
224216
// TODO instead of hard coding the requeue interval we could make this configurable
225217
return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
226218
}
227-
219+
228220
t, err := util.DurationUntilExpireTime(&pytorchjob.Spec.RunPolicy, pytorchjob.Status)
229221
if err != nil {
230222
logrus.Warnf("Reconcile PyTorchJob error %v", err)
@@ -527,7 +519,9 @@ func (r *PyTorchJobReconciler) execInPod(pod *corev1.Pod, containerName string,
527519
}
528520

529521
var stdout, stderr bytes.Buffer
530-
err = executor.StreamWithContext(context.Background(), remotecommand.StreamOptions{
522+
ctx, cancel := context.WithTimeoutCause(context.Background(), 10*time.Second, fmt.Errorf("pod execution timed out"))
523+
defer cancel()
524+
err = executor.StreamWithContext(ctx, remotecommand.StreamOptions{
531525
Stdout: &stdout,
532526
Stderr: &stderr,
533527
})
@@ -576,16 +570,13 @@ func (r *PyTorchJobReconciler) readCompletionPercentageFromPod(pytorchjob *kubef
576570
return "", fmt.Errorf("rank-0 pod %s is not in running state: %s", rankZeroPod.Name, rankZeroPod.Status.Phase)
577571
}
578572

579-
// Get the container name (use default PyTorch container name)
580-
containerName := kubeflowv1.PyTorchJobDefaultContainerName
581-
if len(rankZeroPod.Spec.Containers) > 0 {
582-
containerName = rankZeroPod.Spec.Containers[0].Name
583-
}
573+
// Get the container name
574+
containerName := rankZeroPod.Spec.Containers[0].Name
584575

585576
// Read the progress file from the per-job temp folder (see GetProgressFilePath) - /var/run is not accessible by non-root user
586577
// TODO we could have the user add the file path in an annotation instead of hardcoding it here
587578
// later we could update the CRD spec to allow for checkpoint config
588-
progressFilePath := "/mnt/checkpoints/progress.json"
579+
progressFilePath := GetProgressFilePath(pytorchjob)
589580
catCommand := []string{"cat", progressFilePath}
590581
content, err := r.execInPod(rankZeroPod, containerName, catCommand)
591582
if err != nil {
@@ -600,7 +591,7 @@ func (r *PyTorchJobReconciler) parseCompletionPercentage(content string) (string
600591
var progress ProgressData
601592

602593
if err := json.Unmarshal([]byte(content), &progress); err != nil {
603-
return "", fmt.Errorf("failed to parse JSON: %v", err)
594+
return "", fmt.Errorf("failed to parse JSON from content '%s': %v", content, err)
604595
}
605596

606597
// Extract current and total steps
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
// Copyright 2025 The Kubeflow Authors
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License
14+
15+
package pytorch
16+
17+
import (
18+
"fmt"
19+
20+
kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
21+
)
22+
23+
const (
24+
// EnvTrainingProgressFilePath is the environment variable name for the training progress file path.
25+
EnvTrainingProgressFilePath = "TRAINING_PROGRESS_FILE_PATH"
26+
)
27+
28+
func GetProgressFilePath(job *kubeflowv1.PyTorchJob) string {
29+
return fmt.Sprintf("/tmp/training_data/%s/progress.json", job.Name)
30+
}

0 commit comments

Comments
 (0)