Skip to content

Commit 8e0e6cb

Browse files
committed
Merge remote-tracking branch 'upstream/main'
2 parents f140a6e + 1aece45 commit 8e0e6cb

35 files changed

+144
-135
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,11 @@
2727

2828
NOTE: `quay.io/modh/ray:2.35.0-py311-cu121` is the default image used for creating a RayCluster resource. If you have your own custom Ray image which suits your purposes, specify it in the `CODEFLARE_TEST_RAY_IMAGE` environment variable.
2929

30-
### Environment variables for Training operator test suite
30+
### Environment variables for fms-hf-tuning test suite
3131

3232
* `FMS_HF_TUNING_IMAGE` - Image tag used in PyTorchJob CR for model training
3333

34-
### Environment variables for Training operator GPU test suite
34+
### Environment variables for fms-hf-tuning GPU test suite
3535

3636
* `TEST_NAMESPACE_NAME` (Optional) - Existing namespace where the Training operator GPU tests will be executed
3737
* `HF_TOKEN` - HuggingFace token used to pull models which have limited access

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ toolchain go1.21.5
77
require (
88
github.com/kubeflow/training-operator v1.7.0
99
github.com/onsi/gomega v1.31.1
10-
github.com/project-codeflare/codeflare-common v0.0.0-20241203135025-af256802fc2d
10+
github.com/project-codeflare/codeflare-common v0.0.0-20241211130338-efe4f3e6f904
1111
github.com/prometheus/client_golang v1.20.4
1212
github.com/prometheus/common v0.57.0
1313
github.com/ray-project/kuberay/ray-operator v1.1.0-alpha.0

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -365,8 +365,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
365365
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
366366
github.com/project-codeflare/appwrapper v0.8.0 h1:vWHNtXUtHutN2EzYb6rryLdESnb8iDXsCokXOuNYXvg=
367367
github.com/project-codeflare/appwrapper v0.8.0/go.mod h1:FMQ2lI3fz6LakUVXgN1FTdpsc3BBkNIZZgtMmM9J5UM=
368-
github.com/project-codeflare/codeflare-common v0.0.0-20241203135025-af256802fc2d h1:WN/cN/giLiicdGjnztRYgfR7K7biaGmPO98WdWMppos=
369-
github.com/project-codeflare/codeflare-common v0.0.0-20241203135025-af256802fc2d/go.mod h1:v7XKwaDoCspsHQlWJNarO7gOpR+iumSS+c1bWs3kJOI=
368+
github.com/project-codeflare/codeflare-common v0.0.0-20241211130338-efe4f3e6f904 h1:brU4j1V4o+z/sw0TGi360Wdjk1TEQ313ynBRGqSTaNU=
369+
github.com/project-codeflare/codeflare-common v0.0.0-20241211130338-efe4f3e6f904/go.mod h1:v7XKwaDoCspsHQlWJNarO7gOpR+iumSS+c1bWs3kJOI=
370370
github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
371371
github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
372372
github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M=

tests/kfto/core/environment.go renamed to tests/fms/environment.go

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
1414
limitations under the License.
1515
*/
1616

17-
package core
17+
package fms
1818

1919
import (
2020
"fmt"
@@ -26,10 +26,6 @@ import (
2626
const (
2727
// The environment variable for FMS HF Tuning image to be tested
2828
fmsHfTuningImageEnvVar = "FMS_HF_TUNING_IMAGE"
29-
// The environment variable referring to image containing bloom-560m model
30-
bloomModelImageEnvVar = "BLOOM_MODEL_IMAGE"
31-
// The environment variable referring to image containing Stanford Alpaca dataset
32-
alpacaDatasetImageEnvVar = "ALPACA_DATASET_IMAGE"
3329
// The environment variable referring to image containing minio CLI
3430
minioCliImageEnvVar = "MINIO_CLI_IMAGE"
3531
// The environment variable for HuggingFace token to download models which require authentication
@@ -51,14 +47,6 @@ func GetFmsHfTuningImage(t Test) string {
5147
return image
5248
}
5349

54-
func GetBloomModelImage() string {
55-
return lookupEnvOrDefault(bloomModelImageEnvVar, "quay.io/ksuta/bloom-560m@sha256:f6db02bb7b5d09a8d698c04994d747bfb9e581bbb4c07d00290244d207623733")
56-
}
57-
58-
func GetAlpacaDatasetImage() string {
59-
return lookupEnvOrDefault(alpacaDatasetImageEnvVar, "quay.io/ksuta/alpaca-dataset@sha256:2e90f631180c7b2c916f9569b914b336b612e8ae86efad82546adc5c9fcbbb8d")
60-
}
61-
6250
func GetMinioCliImage() string {
6351
return lookupEnvOrDefault(minioCliImageEnvVar, "quay.io/ksuta/mc@sha256:e128ce4caee276bcbfe3bd32ebb01c814f6b2eb2fd52d08ef0d4684f68c1e3d6")
6452
}

tests/kfto/core/kfto_kueue_sft_GPU_test.go renamed to tests/fms/kfto_kueue_sft_GPU_test.go

Lines changed: 22 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,15 @@ See the License for the specific language governing permissions and
1414
limitations under the License.
1515
*/
1616

17-
package core
17+
package fms
1818

1919
import (
2020
"fmt"
2121
"testing"
2222
"time"
2323

2424
. "github.com/onsi/gomega"
25+
"github.com/opendatahub-io/distributed-workloads/tests/kfto"
2526
. "github.com/project-codeflare/codeflare-common/support"
2627

2728
corev1 "k8s.io/api/core/v1"
@@ -34,75 +35,76 @@ import (
3435
)
3536

3637
func TestMultiGpuPytorchjobAllamBeta13bChatGptq(t *testing.T) {
37-
runMultiGpuPytorchjob(t, "config_allam_beta_13b_chat_gptq.json", 2, mountModelVolumeIntoMaster)
38+
runMultiGpuPytorchjob(t, "resources/config_allam_beta_13b_chat_gptq.json", 2, mountModelVolumeIntoMaster)
3839
}
3940

4041
func TestMultiGpuPytorchjobGranite8bCodeInstructGptq(t *testing.T) {
41-
runMultiGpuPytorchjob(t, "config_granite_8b_code_instruct_gptq.json", 2, mountModelVolumeIntoMaster)
42+
runMultiGpuPytorchjob(t, "resources/config_granite_8b_code_instruct_gptq.json", 2, mountModelVolumeIntoMaster)
4243
}
4344

4445
func TestMultiGpuPytorchjobGranite20bCodeInstruct(t *testing.T) {
45-
runMultiGpuPytorchjob(t, "config_granite_20b_code_instruct.json", 4)
46+
runMultiGpuPytorchjob(t, "resources/config_granite_20b_code_instruct.json", 4)
4647
}
4748

4849
func TestMultiGpuPytorchjobGranite34bCodeBaseGptq(t *testing.T) {
49-
runMultiGpuPytorchjob(t, "config_granite_34b_code_base_gptq.json", 2, mountModelVolumeIntoMaster)
50+
runMultiGpuPytorchjob(t, "resources/config_granite_34b_code_base_gptq.json", 2, mountModelVolumeIntoMaster)
5051
}
5152

5253
func TestMultiGpuPytorchjobGranite34bCodeInstructLoRa(t *testing.T) {
53-
runMultiGpuPytorchjob(t, "config_granite_34b_code_instruct_lora.json", 4)
54+
runMultiGpuPytorchjob(t, "resources/config_granite_34b_code_instruct_lora.json", 4)
5455
}
5556

5657
func TestMultiGpuPytorchjobMetaLlama318b(t *testing.T) {
57-
runMultiGpuPytorchjob(t, "config_meta_llama3_1_8b.json", 2)
58+
runMultiGpuPytorchjob(t, "resources/config_meta_llama3_1_8b.json", 2)
5859
}
5960

6061
func TestMultiGpuPytorchjobMetaLlama38bInstruct(t *testing.T) {
61-
runMultiGpuPytorchjob(t, "config_meta_llama3_8b_instruct.json", 2)
62+
runMultiGpuPytorchjob(t, "resources/config_meta_llama3_8b_instruct.json", 2)
6263
}
6364

6465
func TestMultiGpuPytorchjobMetaLlama370bInstructGptqBlue(t *testing.T) {
65-
runMultiGpuPytorchjob(t, "config_meta_llama3_70b_instruct_gptq_blue.json", 2, mountModelVolumeIntoMaster)
66+
runMultiGpuPytorchjob(t, "resources/config_meta_llama3_70b_instruct_gptq_blue.json", 2, mountModelVolumeIntoMaster)
6667
}
6768

6869
func TestMultiGpuPytorchjobMetaLlama31405bGptq(t *testing.T) {
69-
runMultiGpuPytorchjob(t, "config_meta_llama3_1_405b_gptq.json", 8, mountModelVolumeIntoMaster)
70+
runMultiGpuPytorchjob(t, "resources/config_meta_llama3_1_405b_gptq.json", 8, mountModelVolumeIntoMaster)
7071
}
7172

7273
func TestMultiGpuPytorchjobMetaLlama3170bLoRa(t *testing.T) {
73-
runMultiGpuPytorchjob(t, "config_meta_llama3_1_70b_lora.json", 4)
74+
runMultiGpuPytorchjob(t, "resources/config_meta_llama3_1_70b_lora.json", 4)
7475
}
7576

7677
func TestMultiGpuPytorchjobMetaLlama370bInstructLoRa(t *testing.T) {
77-
runMultiGpuPytorchjob(t, "config_meta_llama3_70b_instruct_lora.json", 4)
78+
runMultiGpuPytorchjob(t, "resources/config_meta_llama3_70b_instruct_lora.json", 4)
7879
}
7980

8081
func TestMultiGpuPytorchjobMistral7bv03Gptq(t *testing.T) {
81-
runMultiGpuPytorchjob(t, "config_mistral_7b_v03_gptq.json", 2, mountModelVolumeIntoMaster)
82+
runMultiGpuPytorchjob(t, "resources/config_mistral_7b_v03_gptq.json", 2, mountModelVolumeIntoMaster)
8283
}
8384
func TestMultiGpuPytorchjobMistral7bv03(t *testing.T) {
84-
runMultiGpuPytorchjob(t, "config_mistral_7b_v03.json", 2)
85+
runMultiGpuPytorchjob(t, "resources/config_mistral_7b_v03.json", 2)
8586
}
8687

8788
func TestMultiGpuPytorchjobMixtral8x7bv01(t *testing.T) {
88-
runMultiGpuPytorchjob(t, "config_mixtral_8x7b_v01.json", 8)
89+
runMultiGpuPytorchjob(t, "resources/config_mixtral_8x7b_v01.json", 8)
8990
}
9091

9192
func TestMultiGpuPytorchjobMixtral8x7bInstructv01Gptq(t *testing.T) {
92-
runMultiGpuPytorchjob(t, "config_mixtral_8x7b_instruct_v01_gptq.json", 2, mountModelVolumeIntoMaster)
93+
runMultiGpuPytorchjob(t, "resources/config_mixtral_8x7b_instruct_v01_gptq.json", 2, mountModelVolumeIntoMaster)
9394
}
95+
9496
func TestMultiGpuPytorchjobMixtral8x7bInstructv01LoRa(t *testing.T) {
95-
runMultiGpuPytorchjob(t, "config_mixtral_8x7b_instruct_v01_lora.json", 4)
97+
runMultiGpuPytorchjob(t, "resources/config_mixtral_8x7b_instruct_v01_lora.json", 4)
9698
}
9799

98100
func TestMultiGpuPytorchjobMerlinite7b(t *testing.T) {
99-
runMultiGpuPytorchjob(t, "config_merlinite_7b.json", 2)
101+
runMultiGpuPytorchjob(t, "resources/config_merlinite_7b.json", 2)
100102
}
101103

102104
func runMultiGpuPytorchjob(t *testing.T, modelConfigFile string, numberOfGpus int, options ...Option[*kftov1.PyTorchJob]) {
103105
test := With(t)
104106

105-
namespace := GetOrCreateTestNamespace(test)
107+
namespace := test.CreateOrGetTestNamespace().Name
106108

107109
// Create a ConfigMap with configuration
108110
configData := map[string][]byte{
@@ -173,7 +175,7 @@ func createAlpacaPyTorchJob(test Test, namespace string, config corev1.ConfigMap
173175
InitContainers: []corev1.Container{
174176
{
175177
Name: "copy-dataset",
176-
Image: GetAlpacaDatasetImage(),
178+
Image: kfto.GetAlpacaDatasetImage(),
177179
ImagePullPolicy: corev1.PullIfNotPresent,
178180
VolumeMounts: []corev1.VolumeMount{
179181
{

tests/kfto/core/kfto_kueue_sft_test.go renamed to tests/fms/kfto_kueue_sft_test.go

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,14 @@ See the License for the specific language governing permissions and
1414
limitations under the License.
1515
*/
1616

17-
package core
17+
package fms
1818

1919
import (
2020
"fmt"
2121
"testing"
2222

2323
. "github.com/onsi/gomega"
24+
"github.com/opendatahub-io/distributed-workloads/tests/kfto"
2425
. "github.com/project-codeflare/codeflare-common/support"
2526
kueuev1beta1 "sigs.k8s.io/kueue/apis/kueue/v1beta1"
2627

@@ -32,26 +33,26 @@ import (
3233
)
3334

3435
func TestPytorchjobWithSFTtrainerFinetuning(t *testing.T) {
35-
runPytorchjobWithSFTtrainer(t, "config.json", 0)
36+
runPytorchjobWithSFTtrainer(t, "resources/config.json", 0)
3637
}
3738

3839
func TestPytorchjobWithSFTtrainerLoRa(t *testing.T) {
39-
runPytorchjobWithSFTtrainer(t, "config_lora.json", 0)
40+
runPytorchjobWithSFTtrainer(t, "resources/config_lora.json", 0)
4041
}
4142
func TestPytorchjobWithSFTtrainerQLoRa(t *testing.T) {
42-
runPytorchjobWithSFTtrainer(t, "config_qlora.json", 1)
43+
runPytorchjobWithSFTtrainer(t, "resources/config_qlora.json", 1)
4344
}
4445

4546
func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string, numGpus int) {
4647
test := With(t)
4748

4849
// Create a namespace
49-
namespace := GetOrCreateTestNamespace(test)
50+
namespace := test.CreateOrGetTestNamespace().Name
5051

5152
// Create a ConfigMap with training dataset and configuration
5253
configData := map[string][]byte{
5354
"config.json": ReadFile(test, modelConfigFile),
54-
"twitter_complaints_small.json": ReadFile(test, "twitter_complaints_small.json"),
55+
"twitter_complaints_small.json": ReadFile(test, "resources/twitter_complaints_small.json"),
5556
}
5657
config := CreateConfigMap(test, namespace, configData)
5758

@@ -125,12 +126,12 @@ func TestPytorchjobUsingKueueQuota(t *testing.T) {
125126
test := With(t)
126127

127128
// Create a namespace
128-
namespace := GetOrCreateTestNamespace(test)
129+
namespace := test.CreateOrGetTestNamespace().Name
129130

130131
// Create a ConfigMap with training dataset and configuration
131132
configData := map[string][]byte{
132-
"config.json": ReadFile(test, "config.json"),
133-
"twitter_complaints_small.json": ReadFile(test, "twitter_complaints_small.json"),
133+
"config.json": ReadFile(test, "resources/config.json"),
134+
"twitter_complaints_small.json": ReadFile(test, "resources/twitter_complaints_small.json"),
134135
}
135136
config := CreateConfigMap(test, namespace, configData)
136137

@@ -231,7 +232,7 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1
231232
InitContainers: []corev1.Container{
232233
{
233234
Name: "copy-model",
234-
Image: GetBloomModelImage(),
235+
Image: kfto.GetBloomModelImage(),
235236
ImagePullPolicy: corev1.PullIfNotPresent,
236237
VolumeMounts: []corev1.VolumeMount{
237238
{

tests/kfto/upgrade/kfto_kueue_sft_upgrade_training_test.go renamed to tests/fms/kfto_kueue_sft_upgrade_training_test.go

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,13 @@ See the License for the specific language governing permissions and
1414
limitations under the License.
1515
*/
1616

17-
package kfto
17+
package fms
1818

1919
import (
2020
"testing"
2121

2222
. "github.com/onsi/gomega"
23-
kftocore "github.com/opendatahub-io/distributed-workloads/tests/kfto/core"
23+
"github.com/opendatahub-io/distributed-workloads/tests/kfto"
2424
. "github.com/project-codeflare/codeflare-common/support"
2525
kueuev1beta1 "sigs.k8s.io/kueue/apis/kueue/v1beta1"
2626
kueueacv1beta1 "sigs.k8s.io/kueue/client-go/applyconfiguration/kueue/v1beta1"
@@ -48,8 +48,8 @@ func TestSetupPytorchjob(t *testing.T) {
4848

4949
// Create a ConfigMap with training dataset and configuration
5050
configData := map[string][]byte{
51-
"config.json": kftocore.ReadFile(test, "config.json"),
52-
"twitter_complaints_small.json": kftocore.ReadFile(test, "twitter_complaints_small.json"),
51+
"config.json": ReadFile(test, "resources/config.json"),
52+
"twitter_complaints_small.json": ReadFile(test, "resources/twitter_complaints_small.json"),
5353
}
5454
config := CreateConfigMap(test, namespaceName, configData)
5555

@@ -90,11 +90,11 @@ func TestSetupPytorchjob(t *testing.T) {
9090
test.T().Logf("Applied Kueue LocalQueue %s/%s successfully", appliedLocalQueue.Namespace, appliedLocalQueue.Name)
9191

9292
// Create training PyTorch job
93-
tuningJob := createPyTorchJob(test, namespaceName, appliedLocalQueue.Name, *config)
93+
tuningJob := createUpgradePyTorchJob(test, namespaceName, appliedLocalQueue.Name, *config)
9494

9595
// Make sure the PyTorch job is suspended, waiting for ClusterQueue to be enabled
96-
test.Eventually(kftocore.PyTorchJob(test, tuningJob.Namespace, pyTorchJobName), TestTimeoutShort).
97-
Should(WithTransform(kftocore.PyTorchJobConditionSuspended, Equal(corev1.ConditionTrue)))
96+
test.Eventually(PyTorchJob(test, tuningJob.Namespace, pyTorchJobName), TestTimeoutShort).
97+
Should(WithTransform(PyTorchJobConditionSuspended, Equal(corev1.ConditionTrue)))
9898
}
9999

100100
func TestRunPytorchjob(t *testing.T) {
@@ -112,22 +112,22 @@ func TestRunPytorchjob(t *testing.T) {
112112
test.Expect(err).NotTo(HaveOccurred())
113113

114114
// PyTorch job should be started now
115-
test.Eventually(kftocore.PyTorchJob(test, namespaceName, pyTorchJobName), TestTimeoutLong).
116-
Should(WithTransform(kftocore.PyTorchJobConditionRunning, Equal(corev1.ConditionTrue)))
115+
test.Eventually(PyTorchJob(test, namespaceName, pyTorchJobName), TestTimeoutLong).
116+
Should(WithTransform(PyTorchJobConditionRunning, Equal(corev1.ConditionTrue)))
117117

118118
// Make sure the PyTorch job succeeds
119-
test.Eventually(kftocore.PyTorchJob(test, namespaceName, pyTorchJobName), TestTimeoutLong).
120-
Should(WithTransform(kftocore.PyTorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
119+
test.Eventually(PyTorchJob(test, namespaceName, pyTorchJobName), TestTimeoutLong).
120+
Should(WithTransform(PyTorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
121121
}
122122

123-
func createPyTorchJob(test Test, namespace, localQueueName string, config corev1.ConfigMap) *kftov1.PyTorchJob {
123+
func createUpgradePyTorchJob(test Test, namespace, localQueueName string, config corev1.ConfigMap) *kftov1.PyTorchJob {
124124
// Does PyTorchJob already exist?
125125
_, err := test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Get(test.Ctx(), pyTorchJobName, metav1.GetOptions{})
126126
if err == nil {
127127
// If yes then delete it and wait until there are no PyTorchJobs in the namespace
128128
err := test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Delete(test.Ctx(), pyTorchJobName, metav1.DeleteOptions{})
129129
test.Expect(err).NotTo(HaveOccurred())
130-
test.Eventually(kftocore.PyTorchJobs(test, namespace), TestTimeoutShort).Should(BeEmpty())
130+
test.Eventually(PyTorchJobs(test, namespace), TestTimeoutShort).Should(BeEmpty())
131131
} else if !errors.IsNotFound(err) {
132132
test.T().Fatalf("Error retrieving PyTorchJob with name `%s`: %v", pyTorchJobName, err)
133133
}
@@ -149,7 +149,7 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1
149149
InitContainers: []corev1.Container{
150150
{
151151
Name: "copy-model",
152-
Image: kftocore.GetBloomModelImage(),
152+
Image: kfto.GetBloomModelImage(),
153153
ImagePullPolicy: corev1.PullIfNotPresent,
154154
VolumeMounts: []corev1.VolumeMount{
155155
{
@@ -164,7 +164,7 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1
164164
Containers: []corev1.Container{
165165
{
166166
Name: "pytorch",
167-
Image: kftocore.GetFmsHfTuningImage(test),
167+
Image: GetFmsHfTuningImage(test),
168168
ImagePullPolicy: corev1.PullIfNotPresent,
169169
Env: []corev1.EnvVar{
170170
{

0 commit comments

Comments
 (0)