Skip to content

Commit 1da7891

Browse files
committed
CARRY: Use temp folder to store progress file
1 parent d07ec3b commit 1da7891

File tree

11 files changed

+95
-27
lines changed

11 files changed

+95
-27
lines changed

manifests/base/crds/kubeflow.org_jaxjobs.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7761,6 +7761,11 @@ spec:
77617761
Most recently observed status of the JAXJob.
77627762
Read-only (modified by the system).
77637763
properties:
7764+
completionPercentage:
7765+
description: |-
7766+
CompletionPercentage is the percentage of the job that has completed from 0.0 to 100.0,
7767+
formatted as a string with one decimal place (e.g., "45.2").
7768+
type: string
77647769
completionTime:
77657770
description: |-
77667771
Represents time when the job was completed. It is not guaranteed to

manifests/base/crds/kubeflow.org_mpijobs.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7771,6 +7771,11 @@ spec:
77717771
description: JobStatus represents the current observed state of the training
77727772
Job.
77737773
properties:
7774+
completionPercentage:
7775+
description: |-
7776+
CompletionPercentage is the percentage of the job that has completed from 0.0 to 100.0,
7777+
formatted as a string with one decimal place (e.g., "45.2").
7778+
type: string
77747779
completionTime:
77757780
description: |-
77767781
Represents time when the job was completed. It is not guaranteed to

manifests/base/crds/kubeflow.org_paddlejobs.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8254,6 +8254,11 @@ spec:
82548254
Most recently observed status of the PaddleJob.
82558255
Read-only (modified by the system).
82568256
properties:
8257+
completionPercentage:
8258+
description: |-
8259+
CompletionPercentage is the percentage of the job that has completed from 0.0 to 100.0,
8260+
formatted as a string with one decimal place (e.g., "45.2").
8261+
type: string
82578262
completionTime:
82588263
description: |-
82598264
Represents time when the job was completed. It is not guaranteed to

manifests/base/crds/kubeflow.org_pytorchjobs.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8292,9 +8292,9 @@ spec:
82928292
Read-only (modified by the system).
82938293
properties:
82948294
completionPercentage:
8295-
description: CompletionPercentage is the percentage of the job that
8296-
has completed from 0.0 to 100.0, formatted as a string with one
8297-
decimal place (e.g., "45.2").
8295+
description: |-
8296+
CompletionPercentage is the percentage of the job that has completed from 0.0 to 100.0,
8297+
formatted as a string with one decimal place (e.g., "45.2").
82988298
type: string
82998299
completionTime:
83008300
description: |-

manifests/base/crds/kubeflow.org_tfjobs.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7771,6 +7771,11 @@ spec:
77717771
Populated by the system.
77727772
Read-only.
77737773
properties:
7774+
completionPercentage:
7775+
description: |-
7776+
CompletionPercentage is the percentage of the job that has completed from 0.0 to 100.0,
7777+
formatted as a string with one decimal place (e.g., "45.2").
7778+
type: string
77747779
completionTime:
77757780
description: |-
77767781
Represents time when the job was completed. It is not guaranteed to

manifests/base/crds/kubeflow.org_xgboostjobs.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7753,6 +7753,11 @@ spec:
77537753
description: JobStatus represents the current observed state of the training
77547754
Job.
77557755
properties:
7756+
completionPercentage:
7757+
description: |-
7758+
CompletionPercentage is the percentage of the job that has completed from 0.0 to 100.0,
7759+
formatted as a string with one decimal place (e.g., "45.2").
7760+
type: string
77567761
completionTime:
77577762
description: |-
77587763
Represents time when the job was completed. It is not guaranteed to

pkg/apis/kubeflow.org/v1/zz_generated.openapi.go

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/client/applyconfiguration/kubeflow.org/v1/jobstatus.go

Lines changed: 14 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/controller.v1/pytorch/envvar.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,12 @@ func setPodEnv(obj interface{}, podTemplateSpec *corev1.PodTemplateSpec, rtype,
121121
Value: strconv.Itoa(int(totalReplicas)),
122122
})
123123
}
124+
125+
// Set the training progress file path.
126+
podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{
127+
Name: EnvTrainingProgressFilePath,
128+
Value: GetProgressFilePath(pytorchjob),
129+
})
124130
}
125131

126132
return nil

pkg/controller.v1/pytorch/pytorchjob_controller.go

Lines changed: 10 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -191,15 +191,7 @@ func (r *PyTorchJobReconciler) Reconcile(ctx context.Context, req ctrl.Request)
191191
return ctrl.Result{}, err
192192
}
193193

194-
jobIsRunning := false
195-
for _, condition := range pytorchjob.Status.Conditions {
196-
if condition.Type == kubeflowv1.JobRunning && condition.Status == corev1.ConditionTrue {
197-
jobIsRunning = true
198-
break
199-
}
200-
}
201-
202-
if jobIsRunning {
194+
if commonutil.IsRunning(pytorchjob.Status) {
203195
if content, err := r.readCompletionPercentageFromPod(pytorchjob); err == nil {
204196
if percentage, parseErr := r.parseCompletionPercentage(content); parseErr == nil {
205197
// Assuming your PyTorchJobStatus has a CompletionPercentage field.
@@ -219,12 +211,12 @@ func (r *PyTorchJobReconciler) Reconcile(ctx context.Context, req ctrl.Request)
219211
} else {
220212
logrus.Debugf("Failed to read completion percentage from rank-0 pod for PyTorchJob %s: %v", pytorchjob.Name, err)
221213
}
222-
214+
223215
// Return a short requeue interval for running jobs
224216
// TODO instead of hard coding the requeue interval we could make this configurable
225217
return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
226218
}
227-
219+
228220
t, err := util.DurationUntilExpireTime(&pytorchjob.Spec.RunPolicy, pytorchjob.Status)
229221
if err != nil {
230222
logrus.Warnf("Reconcile PyTorchJob error %v", err)
@@ -527,7 +519,9 @@ func (r *PyTorchJobReconciler) execInPod(pod *corev1.Pod, containerName string,
527519
}
528520

529521
var stdout, stderr bytes.Buffer
530-
err = executor.StreamWithContext(context.Background(), remotecommand.StreamOptions{
522+
ctx, cancel := context.WithTimeoutCause(context.Background(), 10*time.Second, fmt.Errorf("pod execution timed out"))
523+
defer cancel()
524+
err = executor.StreamWithContext(ctx, remotecommand.StreamOptions{
531525
Stdout: &stdout,
532526
Stderr: &stderr,
533527
})
@@ -576,16 +570,13 @@ func (r *PyTorchJobReconciler) readCompletionPercentageFromPod(pytorchjob *kubef
576570
return "", fmt.Errorf("rank-0 pod %s is not in running state: %s", rankZeroPod.Name, rankZeroPod.Status.Phase)
577571
}
578572

579-
// Get the container name (use default PyTorch container name)
580-
containerName := kubeflowv1.PyTorchJobDefaultContainerName
581-
if len(rankZeroPod.Spec.Containers) > 0 {
582-
containerName = rankZeroPod.Spec.Containers[0].Name
583-
}
573+
// Get the container name
574+
containerName := rankZeroPod.Spec.Containers[0].Name
584575

585576
// Read the progress.json file from /mnt/checkpoints - /var/run is not accessible by non-root user
586577
// TODO we could have the user add the file path in an annotation instead of hardcoding it here
587578
// later we could update the CRD spec to allow for checkpoint config
588-
progressFilePath := "/mnt/checkpoints/progress.json"
579+
progressFilePath := GetProgressFilePath(pytorchjob)
589580
catCommand := []string{"cat", progressFilePath}
590581
content, err := r.execInPod(rankZeroPod, containerName, catCommand)
591582
if err != nil {
@@ -600,7 +591,7 @@ func (r *PyTorchJobReconciler) parseCompletionPercentage(content string) (string
600591
var progress ProgressData
601592

602593
if err := json.Unmarshal([]byte(content), &progress); err != nil {
603-
return "", fmt.Errorf("failed to parse JSON: %v", err)
594+
return "", fmt.Errorf("failed to parse JSON from content '%s': %v", content, err)
604595
}
605596

606597
// Extract current and total steps

0 commit comments

Comments
 (0)