Skip to content

Commit 7be599e

Browse files
committed
Merge remote-tracking branch 'upstream/main'
2 parents 96285d6 + 540c42e commit 7be599e

File tree

5 files changed

+80
-8
lines changed

5 files changed

+80
-8
lines changed

tests/common/support/defaults.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,10 @@ const (
2222
RayROCmImage = "quay.io/modh/ray:2.35.0-py311-rocm62"
2323
RayTorchCudaImage = "quay.io/rhoai/ray:2.35.0-py311-cu121-torch24-fa26"
2424
RayTorchROCmImage = "quay.io/rhoai/ray:2.35.0-py311-rocm61-torch24-fa26"
25-
TrainingCudaPyTorch241Image = "quay.io/modh/training:py311-cuda121-torch241"
26-
TrainingCudaPyTorch251Image = "quay.io/modh/training:py311-cuda124-torch251"
27-
TrainingRocmPyTorch241Image = "quay.io/modh/training:py311-rocm62-torch241"
28-
TrainingRocmPyTorch251Image = "quay.io/modh/training:py311-rocm62-torch251"
25+
TrainingCudaPyTorch241Image = "quay.io/rhoai/odh-training-cuda121-torch24-py311-rhel9:rhoai-3.0"
26+
TrainingCudaPyTorch251Image = "quay.io/rhoai/odh-training-cuda124-torch25-py311-rhel9:rhoai-3.0"
27+
TrainingCudaPyTorch28Image = "quay.io/rhoai/odh-training-cuda128-torch28-py312-rhel9:rhoai-3.0"
28+
TrainingRocmPyTorch241Image = "quay.io/rhoai/odh-training-rocm62-torch24-py311-rhel9:rhoai-3.0"
29+
TrainingRocmPyTorch251Image = "quay.io/rhoai/odh-training-rocm62-torch25-py311-rhel9:rhoai-3.0"
30+
TrainingRocmPyTorch28Image = "quay.io/rhoai/odh-training-rocm64-torch28-py312-rhel9:rhoai-3.0"
2931
)

tests/common/support/environment.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,10 @@ const (
2929
TestPyTorchImage = "TEST_PYTORCH_IMAGE"
3030
TestTrainingCudaPyTorch241Image = "TEST_TRAINING_CUDA_PYTORCH_241_IMAGE"
3131
TestTrainingCudaPyTorch251Image = "TEST_TRAINING_CUDA_PYTORCH_251_IMAGE"
32+
TestTrainingCudaPyTorch28Image = "TEST_TRAINING_CUDA_PYTORCH_28_IMAGE"
3233
TestTrainingRocmPyTorch241Image = "TEST_TRAINING_ROCM_PYTORCH_241_IMAGE"
3334
TestTrainingRocmPyTorch251Image = "TEST_TRAINING_ROCM_PYTORCH_251_IMAGE"
35+
TestTrainingRocmPyTorch28Image = "TEST_TRAINING_ROCM_PYTORCH_28_IMAGE"
3436

3537
// The testing output directory, to write output files into.
3638
TestOutputDir = "TEST_OUTPUT_DIR"
@@ -98,6 +100,10 @@ func GetTrainingCudaPyTorch251Image() string {
98100
return lookupEnvOrDefault(TestTrainingCudaPyTorch251Image, TrainingCudaPyTorch251Image)
99101
}
100102

103+
// GetTrainingCudaPyTorch28Image returns the image reference used by the
// CUDA PyTorch "28" training tests. It delegates to lookupEnvOrDefault with
// the TestTrainingCudaPyTorch28Image environment variable name and the
// TrainingCudaPyTorch28Image default.
// NOTE(review): lookupEnvOrDefault is defined outside this view; presumably it
// returns the environment variable's value when set and the default otherwise.
func GetTrainingCudaPyTorch28Image() string {
104+
return lookupEnvOrDefault(TestTrainingCudaPyTorch28Image, TrainingCudaPyTorch28Image)
105+
}
106+
101107
// GetTrainingROCmPyTorch241Image returns the image reference used by the
// ROCm PyTorch "241" training tests. It delegates to lookupEnvOrDefault with
// the TestTrainingRocmPyTorch241Image environment variable name and the
// TrainingRocmPyTorch241Image default.
// NOTE(review): lookupEnvOrDefault is defined outside this view; presumably it
// returns the environment variable's value when set and the default otherwise.
func GetTrainingROCmPyTorch241Image() string {
102108
return lookupEnvOrDefault(TestTrainingRocmPyTorch241Image, TrainingRocmPyTorch241Image)
103109
}
@@ -106,6 +112,10 @@ func GetTrainingROCmPyTorch251Image() string {
106112
return lookupEnvOrDefault(TestTrainingRocmPyTorch251Image, TrainingRocmPyTorch251Image)
107113
}
108114

115+
// GetTrainingRocmPyTorch28Image returns the image reference used by the
// ROCm PyTorch "28" training tests. It delegates to lookupEnvOrDefault with
// the TestTrainingRocmPyTorch28Image environment variable name and the
// TrainingRocmPyTorch28Image default.
// NOTE(review): this getter is spelled "Rocm" while the sibling getters
// (GetTrainingROCmPyTorch241Image, GetTrainingROCmPyTorch251Image) use "ROCm";
// consider aligning the names in a follow-up that also updates the callers.
// NOTE(review): lookupEnvOrDefault is defined outside this view; presumably it
// returns the environment variable's value when set and the default otherwise.
func GetTrainingRocmPyTorch28Image() string {
116+
return lookupEnvOrDefault(TestTrainingRocmPyTorch28Image, TrainingRocmPyTorch28Image)
117+
}
118+
109119
func GetClusterType(t Test) ClusterType {
110120
clusterType, ok := os.LookupEnv(ClusterTypeEnvVar)
111121
if !ok {

tests/kfto/kfto_mnist_sdk_test.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,11 @@ func TestMnistSDKPyTorch251(t *testing.T) {
4545
runMnistSDK(t, GetTrainingCudaPyTorch251Image())
4646
}
4747

48+
// TestMnistSDKPyTorch28 tags the test with Tier1 and runs the shared
// runMnistSDK scenario, passing the image returned by
// GetTrainingCudaPyTorch28Image as the training image.
func TestMnistSDKPyTorch28(t *testing.T) {
49+
Tags(t, Tier1)
50+
runMnistSDK(t, GetTrainingCudaPyTorch28Image())
51+
}
52+
4853
func runMnistSDK(t *testing.T, trainingImage string) {
4954
test := With(t)
5055
// Create a namespace

tests/kfto/kfto_mnist_training_test.go

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,11 @@ func TestPyTorchJobMnistMultiNodeSingleGpuWithCudaPyTorch251(t *testing.T) {
5353
runKFTOPyTorchMnistJob(t, NVIDIA, GetTrainingCudaPyTorch251Image(), "resources/requirements.txt", 1, 1)
5454
}
5555

56+
// TestPyTorchJobMnistMultiNodeSingleGpuWithCudaPyTorch28 tags the test with
// KftoCuda and runs the shared MNIST PyTorchJob scenario on the NVIDIA
// accelerator with the CUDA PyTorch "28" image and resources/requirements.txt.
// The trailing arguments map to workerReplicas=1 and numProcPerNode=1.
func TestPyTorchJobMnistMultiNodeSingleGpuWithCudaPyTorch28(t *testing.T) {
57+
Tags(t, KftoCuda)
58+
runKFTOPyTorchMnistJob(t, NVIDIA, GetTrainingCudaPyTorch28Image(), "resources/requirements.txt", 1, 1)
59+
}
60+
5661
func TestPyTorchJobMnistMultiNodeMultiGpuWithCudaPyTorch241(t *testing.T) {
5762
Tags(t, KftoCuda)
5863
runKFTOPyTorchMnistJob(t, NVIDIA, GetTrainingCudaPyTorch241Image(), "resources/requirements.txt", 1, 2)
@@ -63,6 +68,11 @@ func TestPyTorchJobMnistMultiNodeMultiGpuWithCudaPyTorch251(t *testing.T) {
6368
runKFTOPyTorchMnistJob(t, NVIDIA, GetTrainingCudaPyTorch251Image(), "resources/requirements.txt", 1, 2)
6469
}
6570

71+
// TestPyTorchJobMnistMultiNodeMultiGpuWithCudaPyTorch28 tags the test with
// KftoCuda and runs the shared MNIST PyTorchJob scenario on the NVIDIA
// accelerator with the CUDA PyTorch "28" image and resources/requirements.txt.
// The trailing arguments map to workerReplicas=1 and numProcPerNode=2.
func TestPyTorchJobMnistMultiNodeMultiGpuWithCudaPyTorch28(t *testing.T) {
72+
Tags(t, KftoCuda)
73+
runKFTOPyTorchMnistJob(t, NVIDIA, GetTrainingCudaPyTorch28Image(), "resources/requirements.txt", 1, 2)
74+
}
75+
6676
func TestPyTorchJobMnistMultiNodeSingleGpuWithROCmPyTorch241(t *testing.T) {
6777
Tags(t, KftoRocm)
6878
runKFTOPyTorchMnistJob(t, AMD, GetTrainingROCmPyTorch241Image(), "resources/requirements-rocm.txt", 1, 1)
@@ -83,6 +93,11 @@ func TestPyTorchJobMnistMultiNodeMultiGpuWithROCmPyTorch251(t *testing.T) {
8393
runKFTOPyTorchMnistJob(t, AMD, GetTrainingROCmPyTorch251Image(), "resources/requirements-rocm.txt", 1, 2)
8494
}
8595

96+
// TestPyTorchJobMnistMultiNodeMultiGpuWithROCmPyTorch28 tags the test with
// KftoRocm and runs the shared MNIST PyTorchJob scenario on the AMD
// accelerator with the ROCm PyTorch "28" image and
// resources/requirements-rocm.txt. The trailing arguments map to
// workerReplicas=1 and numProcPerNode=2.
// NOTE(review): no SingleGpu ROCm PyTorch "28" MNIST counterpart is visible in
// this change, unlike the CUDA variants — confirm whether that is intentional.
func TestPyTorchJobMnistMultiNodeMultiGpuWithROCmPyTorch28(t *testing.T) {
97+
Tags(t, KftoRocm)
98+
runKFTOPyTorchMnistJob(t, AMD, GetTrainingRocmPyTorch28Image(), "resources/requirements-rocm.txt", 1, 2)
99+
}
100+
86101
func runKFTOPyTorchMnistJob(t *testing.T, accelerator Accelerator, image string, requirementsFile string, workerReplicas, numProcPerNode int) {
87102
test := With(t)
88103

@@ -229,7 +244,7 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
229244
{
230245
Name: "pytorch",
231246
Image: baseImage,
232-
ImagePullPolicy: corev1.PullIfNotPresent,
247+
ImagePullPolicy: corev1.PullAlways,
233248
Command: []string{
234249
"/bin/bash", "-c",
235250
fmt.Sprintf(`mkdir -p /tmp/lib /tmp/datasets/mnist && export PYTHONPATH=$PYTHONPATH:/tmp/lib && \
@@ -325,7 +340,7 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
325340
{
326341
Name: "pytorch",
327342
Image: baseImage,
328-
ImagePullPolicy: corev1.PullIfNotPresent,
343+
ImagePullPolicy: corev1.PullAlways,
329344
Command: []string{
330345
"/bin/bash", "-c",
331346
fmt.Sprintf(`mkdir -p /tmp/lib /tmp/datasets/mnist && export PYTHONPATH=$PYTHONPATH:/tmp/lib && \

tests/kfto/kfto_training_test.go

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@ func TestPyTorchJobSingleNodeSingleGpuWithCudaPyTorch251(t *testing.T) {
4242
runKFTOPyTorchJob(t, GetTrainingCudaPyTorch251Image(), NVIDIA, 1, 0)
4343
}
4444

45+
// TestPyTorchJobSingleNodeSingleGpuWithCudaPyTorch28 tags the test with Tier1
// and Gpu(NVIDIA), then runs the shared PyTorchJob scenario with the CUDA
// PyTorch "28" image on the NVIDIA accelerator. The trailing arguments map to
// numGpus=1 and numberOfWorkerNodes=0.
func TestPyTorchJobSingleNodeSingleGpuWithCudaPyTorch28(t *testing.T) {
46+
Tags(t, Tier1, Gpu(NVIDIA))
47+
runKFTOPyTorchJob(t, GetTrainingCudaPyTorch28Image(), NVIDIA, 1, 0)
48+
}
49+
4550
func TestPyTorchJobSingleNodeMultiGpuWithCudaPyTorch241(t *testing.T) {
4651
Tags(t, KftoCuda)
4752
runKFTOPyTorchJob(t, GetTrainingCudaPyTorch241Image(), NVIDIA, 2, 0)
@@ -52,6 +57,11 @@ func TestPyTorchJobSingleNodeMultiGpuWithCudaPyTorch251(t *testing.T) {
5257
runKFTOPyTorchJob(t, GetTrainingCudaPyTorch251Image(), NVIDIA, 2, 0)
5358
}
5459

60+
// TestPyTorchJobSingleNodeMultiGpuWithCudaPyTorch28 tags the test with
// KftoCuda and runs the shared PyTorchJob scenario with the CUDA PyTorch "28"
// image on the NVIDIA accelerator. The trailing arguments map to numGpus=2
// and numberOfWorkerNodes=0.
func TestPyTorchJobSingleNodeMultiGpuWithCudaPyTorch28(t *testing.T) {
61+
Tags(t, KftoCuda)
62+
runKFTOPyTorchJob(t, GetTrainingCudaPyTorch28Image(), NVIDIA, 2, 0)
63+
}
64+
5565
func TestPyTorchJobMultiNodeSingleGpuWithCudaPyTorch241(t *testing.T) {
5666
Tags(t, KftoCuda)
5767
runKFTOPyTorchJob(t, GetTrainingCudaPyTorch241Image(), NVIDIA, 1, 1)
@@ -62,6 +72,11 @@ func TestPyTorchJobMultiNodeSingleGpuWithCudaPyTorch251(t *testing.T) {
6272
runKFTOPyTorchJob(t, GetTrainingCudaPyTorch251Image(), NVIDIA, 1, 1)
6373
}
6474

75+
// TestPyTorchJobMultiNodeSingleGpuWithCudaPyTorch28 tags the test with
// KftoCuda and runs the shared PyTorchJob scenario with the CUDA PyTorch "28"
// image on the NVIDIA accelerator. The trailing arguments map to numGpus=1
// and numberOfWorkerNodes=1.
func TestPyTorchJobMultiNodeSingleGpuWithCudaPyTorch28(t *testing.T) {
76+
Tags(t, KftoCuda)
77+
runKFTOPyTorchJob(t, GetTrainingCudaPyTorch28Image(), NVIDIA, 1, 1)
78+
}
79+
6580
func TestPyTorchJobMultiNodeMultiGpuWithCudaPyTorch241(t *testing.T) {
6681
Tags(t, KftoCuda)
6782
runKFTOPyTorchJob(t, GetTrainingCudaPyTorch241Image(), NVIDIA, 2, 1)
@@ -72,6 +87,11 @@ func TestPyTorchJobMultiNodeMultiGpuWithCudaPyTorch251(t *testing.T) {
7287
runKFTOPyTorchJob(t, GetTrainingCudaPyTorch251Image(), NVIDIA, 2, 1)
7388
}
7489

90+
// TestPyTorchJobMultiNodeMultiGpuWithCudaPyTorch28 tags the test with
// KftoCuda and runs the shared PyTorchJob scenario with the CUDA PyTorch "28"
// image on the NVIDIA accelerator. The trailing arguments map to numGpus=2
// and numberOfWorkerNodes=1.
func TestPyTorchJobMultiNodeMultiGpuWithCudaPyTorch28(t *testing.T) {
91+
Tags(t, KftoCuda)
92+
runKFTOPyTorchJob(t, GetTrainingCudaPyTorch28Image(), NVIDIA, 2, 1)
93+
}
94+
7595
func TestPyTorchJobSingleNodeSingleGpuWithROCmPyTorch241(t *testing.T) {
7696
Tags(t, Tier1, Gpu(AMD))
7797
runKFTOPyTorchJob(t, GetTrainingROCmPyTorch241Image(), AMD, 1, 0)
@@ -82,6 +102,11 @@ func TestPyTorchJobSingleNodeSingleGpuWithROCmPyTorch251(t *testing.T) {
82102
runKFTOPyTorchJob(t, GetTrainingROCmPyTorch251Image(), AMD, 1, 0)
83103
}
84104

105+
// TestPyTorchJobSingleNodeSingleGpuWithROCmPyTorch28 tags the test with Tier1
// and Gpu(AMD), then runs the shared PyTorchJob scenario with the ROCm
// PyTorch "28" image on the AMD accelerator. The trailing arguments map to
// numGpus=1 and numberOfWorkerNodes=0.
func TestPyTorchJobSingleNodeSingleGpuWithROCmPyTorch28(t *testing.T) {
106+
Tags(t, Tier1, Gpu(AMD))
107+
runKFTOPyTorchJob(t, GetTrainingRocmPyTorch28Image(), AMD, 1, 0)
108+
}
109+
85110
func TestPyTorchJobSingleNodeMultiGpuWithROCmPyTorch241(t *testing.T) {
86111
Tags(t, KftoRocm)
87112
runKFTOPyTorchJob(t, GetTrainingROCmPyTorch241Image(), AMD, 2, 0)
@@ -92,6 +117,11 @@ func TestPyTorchJobSingleNodeMultiGpuWithROCmPyTorch251(t *testing.T) {
92117
runKFTOPyTorchJob(t, GetTrainingROCmPyTorch251Image(), AMD, 2, 0)
93118
}
94119

120+
// TestPyTorchJobSingleNodeMultiGpuWithROCmPyTorch28 tags the test with
// KftoRocm and runs the shared PyTorchJob scenario with the ROCm PyTorch "28"
// image on the AMD accelerator. The trailing arguments map to numGpus=2 and
// numberOfWorkerNodes=0.
func TestPyTorchJobSingleNodeMultiGpuWithROCmPyTorch28(t *testing.T) {
121+
Tags(t, KftoRocm)
122+
runKFTOPyTorchJob(t, GetTrainingRocmPyTorch28Image(), AMD, 2, 0)
123+
}
124+
95125
func TestPyTorchJobMultiNodeSingleGpuWithROCmPyTorch241(t *testing.T) {
96126
Tags(t, KftoRocm)
97127
runKFTOPyTorchJob(t, GetTrainingROCmPyTorch241Image(), AMD, 1, 1)
@@ -102,6 +132,11 @@ func TestPyTorchJobMultiNodeSingleGpuWithROCmPyTorch251(t *testing.T) {
102132
runKFTOPyTorchJob(t, GetTrainingROCmPyTorch251Image(), AMD, 1, 1)
103133
}
104134

135+
// TestPyTorchJobMultiNodeSingleGpuWithROCmPyTorch28 tags the test with
// KftoRocm and runs the shared PyTorchJob scenario with the ROCm PyTorch "28"
// image on the AMD accelerator. The trailing arguments map to numGpus=1 and
// numberOfWorkerNodes=1.
func TestPyTorchJobMultiNodeSingleGpuWithROCmPyTorch28(t *testing.T) {
136+
Tags(t, KftoRocm)
137+
runKFTOPyTorchJob(t, GetTrainingRocmPyTorch28Image(), AMD, 1, 1)
138+
}
139+
105140
func TestPyTorchJobMultiNodeMultiGpuWithROCmPyTorch241(t *testing.T) {
106141
Tags(t, KftoRocm)
107142
runKFTOPyTorchJob(t, GetTrainingROCmPyTorch241Image(), AMD, 2, 1)
@@ -112,6 +147,11 @@ func TestPyTorchJobMultiNodeMultiGpuWithROCmPyTorch251(t *testing.T) {
112147
runKFTOPyTorchJob(t, GetTrainingROCmPyTorch251Image(), AMD, 2, 1)
113148
}
114149

150+
// TestPyTorchJobMultiNodeMultiGpuWithROCmPyTorch28 tags the test with
// KftoRocm and runs the shared PyTorchJob scenario with the ROCm PyTorch "28"
// image on the AMD accelerator. The trailing arguments map to numGpus=2 and
// numberOfWorkerNodes=1.
func TestPyTorchJobMultiNodeMultiGpuWithROCmPyTorch28(t *testing.T) {
151+
Tags(t, KftoRocm)
152+
runKFTOPyTorchJob(t, GetTrainingRocmPyTorch28Image(), AMD, 2, 1)
153+
}
154+
115155
func runKFTOPyTorchJob(t *testing.T, image string, gpu Accelerator, numGpus, numberOfWorkerNodes int) {
116156
test := With(t)
117157

@@ -263,7 +303,7 @@ func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap,
263303
{
264304
Name: "pytorch",
265305
Image: baseImage,
266-
ImagePullPolicy: corev1.PullIfNotPresent,
306+
ImagePullPolicy: corev1.PullAlways,
267307
Command: []string{
268308
"/bin/bash", "-c",
269309
`torchrun /etc/config/hf_llm_training.py \
@@ -432,7 +472,7 @@ func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap,
432472
{
433473
Name: "pytorch",
434474
Image: baseImage,
435-
ImagePullPolicy: corev1.PullIfNotPresent,
475+
ImagePullPolicy: corev1.PullAlways,
436476
Command: []string{
437477
"/bin/bash", "-c",
438478
`torchrun /etc/config/hf_llm_training.py \

0 commit comments

Comments
 (0)