Skip to content

Commit a1f7a72

Browse files
committed
Merge remote-tracking branch 'upstream/main'
2 parents c117324 + 14af2e2 commit a1f7a72

14 files changed

+421
-143
lines changed

.github/workflows/build-and-push-test-images.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ on:
99
- 'go.mod'
1010
- 'go.sum'
1111
- 'tests/**'
12+
- 'images/tests/**'
1213
workflow_dispatch:
1314

1415
jobs:

images/tests/Dockerfile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,7 @@ COPY tests/ .
2020

2121
# Command to run the tests
2222
ENTRYPOINT [ "gotestsum"]
23+
24+
# Configure images using environment variables
25+
ENV FMS_HF_TUNING_IMAGE=quay.io/modh/fms-hf-tuning:release
26+
ENV NOTEBOOK_IMAGE=quay.io/modh/odh-generic-data-science-notebook:v3-20250519

tests/common/support/defaults.go

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,13 @@ limitations under the License.
1717
package support
1818

1919
const (
20-
RayVersion = "2.35.0"
21-
RayImage = "quay.io/modh/ray:2.35.0-py311-cu121"
22-
RayROCmImage = "quay.io/modh/ray:2.35.0-py311-rocm62"
23-
RayTorchCudaImage = "quay.io/rhoai/ray:2.35.0-py311-cu121-torch24-fa26"
24-
RayTorchROCmImage = "quay.io/rhoai/ray:2.35.0-py311-rocm61-torch24-fa26"
25-
TrainingCudaImage = "quay.io/modh/training:py311-cuda124-torch251"
26-
TrainingROCmImage = "quay.io/modh/training:py311-rocm62-torch251"
20+
RayVersion = "2.35.0"
21+
RayImage = "quay.io/modh/ray:2.35.0-py311-cu121"
22+
RayROCmImage = "quay.io/modh/ray:2.35.0-py311-rocm62"
23+
RayTorchCudaImage = "quay.io/rhoai/ray:2.35.0-py311-cu121-torch24-fa26"
24+
RayTorchROCmImage = "quay.io/rhoai/ray:2.35.0-py311-rocm61-torch24-fa26"
25+
TrainingCudaPyTorch241Image = "quay.io/modh/training:py311-cuda121-torch241"
26+
TrainingCudaPyTorch251Image = "quay.io/modh/training:py311-cuda124-torch251"
27+
TrainingRocmPyTorch241Image = "quay.io/modh/training:py311-rocm62-torch241"
28+
TrainingRocmPyTorch251Image = "quay.io/modh/training:py311-rocm62-torch251"
2729
)

tests/common/support/environment.go

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,16 @@ const (
2424
// The environment variables hereafter can be used to change the components
2525
// used for testing.
2626

27-
CodeFlareTestRayVersion = "TEST_RAY_VERSION"
28-
CodeFlareTestRayImage = "TEST_RAY_IMAGE"
29-
CodeFlareTestPyTorchImage = "TEST_PYTORCH_IMAGE"
30-
CodeFlareTestTrainingImage = "TEST_TRAINING_IMAGE"
27+
TestRayVersion = "TEST_RAY_VERSION"
28+
TestRayImage = "TEST_RAY_IMAGE"
29+
TestPyTorchImage = "TEST_PYTORCH_IMAGE"
30+
TestTrainingCudaPyTorch241Image = "TEST_TRAINING_CUDA_PYTORCH_241_IMAGE"
31+
TestTrainingCudaPyTorch251Image = "TEST_TRAINING_CUDA_PYTORCH_251_IMAGE"
32+
TestTrainingRocmPyTorch241Image = "TEST_TRAINING_ROCM_PYTORCH_241_IMAGE"
33+
TestTrainingRocmPyTorch251Image = "TEST_TRAINING_ROCM_PYTORCH_251_IMAGE"
3134

3235
// The testing output directory, to write output files into.
33-
CodeFlareTestOutputDir = "TEST_OUTPUT_DIR"
36+
TestOutputDir = "TEST_OUTPUT_DIR"
3437

3538
// Type of cluster the test is run on
3639
ClusterTypeEnvVar = "CLUSTER_TYPE"
@@ -68,31 +71,39 @@ const (
6871
)
6972

7073
func GetRayVersion() string {
71-
return lookupEnvOrDefault(CodeFlareTestRayVersion, RayVersion)
74+
return lookupEnvOrDefault(TestRayVersion, RayVersion)
7275
}
7376

7477
func GetRayImage() string {
75-
return lookupEnvOrDefault(CodeFlareTestRayImage, RayImage)
78+
return lookupEnvOrDefault(TestRayImage, RayImage)
7679
}
7780

7881
func GetRayROCmImage() string {
79-
return lookupEnvOrDefault(CodeFlareTestRayImage, RayROCmImage)
82+
return lookupEnvOrDefault(TestRayImage, RayROCmImage)
8083
}
8184

8285
func GetRayTorchCudaImage() string {
83-
return lookupEnvOrDefault(CodeFlareTestRayImage, RayTorchCudaImage)
86+
return lookupEnvOrDefault(TestRayImage, RayTorchCudaImage)
8487
}
8588

8689
func GetRayTorchROCmImage() string {
87-
return lookupEnvOrDefault(CodeFlareTestRayImage, RayTorchROCmImage)
90+
return lookupEnvOrDefault(TestRayImage, RayTorchROCmImage)
8891
}
8992

90-
func GetCudaTrainingImage() string {
91-
return lookupEnvOrDefault(CodeFlareTestTrainingImage, TrainingCudaImage)
93+
func GetTrainingCudaPyTorch241Image() string {
94+
return lookupEnvOrDefault(TestTrainingCudaPyTorch241Image, TrainingCudaPyTorch241Image)
9295
}
9396

94-
func GetROCmTrainingImage() string {
95-
return lookupEnvOrDefault(CodeFlareTestTrainingImage, TrainingROCmImage)
97+
func GetTrainingCudaPyTorch251Image() string {
98+
return lookupEnvOrDefault(TestTrainingCudaPyTorch251Image, TrainingCudaPyTorch251Image)
99+
}
100+
101+
func GetTrainingROCmPyTorch241Image() string {
102+
return lookupEnvOrDefault(TestTrainingRocmPyTorch241Image, TrainingRocmPyTorch241Image)
103+
}
104+
105+
func GetTrainingROCmPyTorch251Image() string {
106+
return lookupEnvOrDefault(TestTrainingRocmPyTorch251Image, TrainingRocmPyTorch251Image)
96107
}
97108

98109
func GetClusterType(t Test) ClusterType {

tests/common/support/environment_test.go

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ func TestGetRayVersion(t *testing.T) {
2727

2828
g := gomega.NewGomegaWithT(t)
2929
// Set the environment variable.
30-
os.Setenv(CodeFlareTestRayVersion, "1.4.5")
30+
os.Setenv(TestRayVersion, "1.4.5")
3131

3232
// Get the version.
3333
version := GetRayVersion()
@@ -42,7 +42,7 @@ func TestGetRayImage(t *testing.T) {
4242

4343
g := gomega.NewGomegaWithT(t)
4444
// Set the environment variable.
45-
os.Setenv(CodeFlareTestRayImage, "ray/ray:latest")
45+
os.Setenv(TestRayImage, "ray/ray:latest")
4646

4747
// Get the image.
4848
image := GetRayImage()
@@ -57,11 +57,10 @@ func TestGetTrainingImage(t *testing.T) {
5757

5858
g := gomega.NewGomegaWithT(t)
5959
// Set the environment variable.
60-
os.Setenv(CodeFlareTestTrainingImage, "training/training:latest")
60+
os.Setenv(TestTrainingCudaPyTorch251Image, "training/training:latest")
6161

6262
// Get the image.
63-
image := GetCudaTrainingImage()
64-
63+
image := GetTrainingCudaPyTorch251Image()
6564
// Assert that the image is correct.
6665

6766
g.Expect(image).To(gomega.Equal("training/training:latest"), "Expected image training/training:latest, but got %s", image)

tests/common/support/test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ func (t *T) Config() *rest.Config {
134134
func (t *T) OutputDir() string {
135135
t.T().Helper()
136136
t.once.outputDir.Do(func() {
137-
if parent, ok := os.LookupEnv(CodeFlareTestOutputDir); ok {
137+
if parent, ok := os.LookupEnv(TestOutputDir); ok {
138138
if !path.IsAbs(parent) {
139139
if cwd, err := os.Getwd(); err == nil {
140140
// best effort to output the parent absolute path
@@ -148,7 +148,7 @@ func (t *T) OutputDir() string {
148148
}
149149
t.outputDir = dir
150150
} else {
151-
t.T().Logf("Creating ephemeral output directory as %s env variable is unset", CodeFlareTestOutputDir)
151+
t.T().Logf("Creating ephemeral output directory as %s env variable is unset", TestOutputDir)
152152
t.outputDir = t.T().TempDir()
153153
}
154154
t.T().Logf("Output directory has been created at: %s", t.outputDir)

tests/kfto/kfto_kueue_mnist_upgrade_training_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ func createUpgradePyTorchJob(test Test, namespace, localQueueName string, config
187187
Containers: []corev1.Container{
188188
{
189189
Name: "pytorch",
190-
Image: GetCudaTrainingImage(),
190+
Image: GetTrainingCudaPyTorch251Image(),
191191
ImagePullPolicy: corev1.PullIfNotPresent,
192192
Command: []string{
193193
"/bin/bash", "-c",

tests/kfto/kfto_mnist_sdk_test.go

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ import (
2525
. "github.com/onsi/gomega"
2626

2727
corev1 "k8s.io/api/core/v1"
28-
v1 "k8s.io/api/core/v1"
2928
"k8s.io/apimachinery/pkg/api/resource"
3029
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3130
"sigs.k8s.io/kueue/apis/kueue/v1beta1"
@@ -34,8 +33,17 @@ import (
3433
. "github.com/opendatahub-io/distributed-workloads/tests/common/support"
3534
)
3635

37-
func TestMnistSDK(t *testing.T) {
36+
func TestMnistSDKPyTorch241(t *testing.T) {
3837
Tags(t, Tier1)
38+
runMnistSDK(t, GetTrainingCudaPyTorch241Image())
39+
}
40+
41+
func TestMnistSDKPyTorch251(t *testing.T) {
42+
Tags(t, Tier1)
43+
runMnistSDK(t, GetTrainingCudaPyTorch251Image())
44+
}
45+
46+
func runMnistSDK(t *testing.T, trainingImage string) {
3947
test := With(t)
4048
// Create a namespace
4149
namespace := test.NewTestNamespace()
@@ -87,7 +95,7 @@ func TestMnistSDK(t *testing.T) {
8795
"${password}": userToken,
8896
"${num_gpus}": "0",
8997
"${namespace}": namespace.Name,
90-
"${training_image}": GetCudaTrainingImage(),
98+
"${training_image}": trainingImage,
9199
}
92100

93101
jupyterNotebook := string(readFile(test, "resources/mnist_kfto.ipynb"))
@@ -121,11 +129,11 @@ func TestMnistSDK(t *testing.T) {
121129

122130
// Make sure pytorch job is created
123131
test.Eventually(PyTorchJob(test, namespace.Name, "pytorch-ddp"), TestTimeoutDouble).
124-
Should(WithTransform(PyTorchJobConditionRunning, Equal(v1.ConditionTrue)))
132+
Should(WithTransform(PyTorchJobConditionRunning, Equal(corev1.ConditionTrue)))
125133

126134
// Make sure that the job eventually succeeds
127135
test.Eventually(PyTorchJob(test, namespace.Name, "pytorch-ddp"), TestTimeoutLong, 1*time.Second).
128-
Should(WithTransform(PyTorchJobConditionSucceeded, Equal(v1.ConditionTrue)))
136+
Should(WithTransform(PyTorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
129137

130138
// TODO: write torch job logs?
131139
// time.Sleep(60 * time.Second)

tests/kfto/kfto_mnist_training_test.go

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -35,31 +35,52 @@ import (
3535

3636
func TestPyTorchJobMnistMultiNodeSingleCpu(t *testing.T) {
3737
Tags(t, Sanity, MultiNode(3))
38-
runKFTOPyTorchMnistJob(t, CPU, GetCudaTrainingImage(), "resources/requirements.txt", 2, 1)
38+
runKFTOPyTorchMnistJob(t, CPU, GetTrainingCudaPyTorch251Image(), "resources/requirements.txt", 2, 1)
3939
}
40+
4041
func TestPyTorchJobMnistMultiNodeMultiCpu(t *testing.T) {
4142
Tags(t, Tier1, MultiNode(3))
42-
runKFTOPyTorchMnistJob(t, CPU, GetCudaTrainingImage(), "resources/requirements.txt", 2, 2)
43+
runKFTOPyTorchMnistJob(t, CPU, GetTrainingCudaPyTorch251Image(), "resources/requirements.txt", 2, 2)
44+
}
45+
46+
func TestPyTorchJobMnistMultiNodeSingleGpuWithCudaPyTorch241(t *testing.T) {
47+
Tags(t, KftoCuda)
48+
runKFTOPyTorchMnistJob(t, NVIDIA, GetTrainingCudaPyTorch241Image(), "resources/requirements.txt", 1, 1)
49+
}
50+
51+
func TestPyTorchJobMnistMultiNodeSingleGpuWithCudaPyTorch251(t *testing.T) {
52+
Tags(t, KftoCuda)
53+
runKFTOPyTorchMnistJob(t, NVIDIA, GetTrainingCudaPyTorch251Image(), "resources/requirements.txt", 1, 1)
4354
}
4455

45-
func TestPyTorchJobMnistMultiNodeSingleGpuWithCuda(t *testing.T) {
56+
func TestPyTorchJobMnistMultiNodeMultiGpuWithCudaPyTorch241(t *testing.T) {
4657
Tags(t, KftoCuda)
47-
runKFTOPyTorchMnistJob(t, NVIDIA, GetCudaTrainingImage(), "resources/requirements.txt", 1, 1)
58+
runKFTOPyTorchMnistJob(t, NVIDIA, GetTrainingCudaPyTorch241Image(), "resources/requirements.txt", 1, 2)
4859
}
4960

50-
func TestPyTorchJobMnistMultiNodeMultiGpuWithCuda(t *testing.T) {
61+
func TestPyTorchJobMnistMultiNodeMultiGpuWithCudaPyTorch251(t *testing.T) {
5162
Tags(t, KftoCuda)
52-
runKFTOPyTorchMnistJob(t, NVIDIA, GetCudaTrainingImage(), "resources/requirements.txt", 1, 2)
63+
runKFTOPyTorchMnistJob(t, NVIDIA, GetTrainingCudaPyTorch251Image(), "resources/requirements.txt", 1, 2)
64+
}
65+
66+
func TestPyTorchJobMnistMultiNodeSingleGpuWithROCmPyTorch241(t *testing.T) {
67+
Tags(t, KftoRocm)
68+
runKFTOPyTorchMnistJob(t, AMD, GetTrainingROCmPyTorch241Image(), "resources/requirements-rocm.txt", 1, 1)
69+
}
70+
71+
func TestPyTorchJobMnistMultiNodeSingleGpuWithROCmPyTorch251(t *testing.T) {
72+
Tags(t, KftoRocm)
73+
runKFTOPyTorchMnistJob(t, AMD, GetTrainingROCmPyTorch251Image(), "resources/requirements-rocm.txt", 1, 1)
5374
}
5475

55-
func TestPyTorchJobMnistMultiNodeSingleGpuWithROCm(t *testing.T) {
76+
func TestPyTorchJobMnistMultiNodeMultiGpuWithROCmPyTorch241(t *testing.T) {
5677
Tags(t, KftoRocm)
57-
runKFTOPyTorchMnistJob(t, AMD, GetROCmTrainingImage(), "resources/requirements-rocm.txt", 1, 1)
78+
runKFTOPyTorchMnistJob(t, AMD, GetTrainingROCmPyTorch241Image(), "resources/requirements-rocm.txt", 1, 2)
5879
}
5980

60-
func TestPyTorchJobMnistMultiNodeMultiGpuWithROCm(t *testing.T) {
81+
func TestPyTorchJobMnistMultiNodeMultiGpuWithROCmPyTorch251(t *testing.T) {
6182
Tags(t, KftoRocm)
62-
runKFTOPyTorchMnistJob(t, AMD, GetROCmTrainingImage(), "resources/requirements-rocm.txt", 1, 2)
83+
runKFTOPyTorchMnistJob(t, AMD, GetTrainingROCmPyTorch251Image(), "resources/requirements-rocm.txt", 1, 2)
6384
}
6485

6586
func runKFTOPyTorchMnistJob(t *testing.T, accelerator Accelerator, image string, requirementsFile string, workerReplicas, numProcPerNode int) {

tests/kfto/kfto_pytorchjob_failed_test.go

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,24 @@ import (
1515
. "github.com/opendatahub-io/distributed-workloads/tests/common/support"
1616
)
1717

18-
func TestPyTorchJobFailureWithCuda(t *testing.T) {
18+
func TestPyTorchJobFailureWithCudaPyTorch241(t *testing.T) {
1919
Tags(t, Tier1)
20-
runFailedPyTorchJobTest(t, GetCudaTrainingImage())
20+
runFailedPyTorchJobTest(t, GetTrainingCudaPyTorch241Image())
2121
}
2222

23-
func TestPyTorchJobFailureWithROCm(t *testing.T) {
23+
func TestPyTorchJobFailureWithCudaPyTorch251(t *testing.T) {
2424
Tags(t, Tier1)
25-
runFailedPyTorchJobTest(t, GetROCmTrainingImage())
25+
runFailedPyTorchJobTest(t, GetTrainingCudaPyTorch251Image())
26+
}
27+
28+
func TestPyTorchJobFailureWithROCmPyTorch241(t *testing.T) {
29+
Tags(t, Tier1)
30+
runFailedPyTorchJobTest(t, GetTrainingROCmPyTorch241Image())
31+
}
32+
33+
func TestPyTorchJobFailureWithROCmPyTorch251(t *testing.T) {
34+
Tags(t, Tier1)
35+
runFailedPyTorchJobTest(t, GetTrainingROCmPyTorch251Image())
2636
}
2737

2838
func runFailedPyTorchJobTest(t *testing.T, image string) {

0 commit comments

Comments
 (0)