Skip to content

Commit 586e074

Browse files
committed
Merge remote-tracking branch 'upstream/main'
2 parents 2330ef9 + 538c465 commit 586e074

File tree

10 files changed

+1835
-2148
lines changed

10 files changed

+1835
-2148
lines changed

images/runtime/training/cuda/Dockerfile

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,13 @@ RUN pip install --no-cache-dir -U "micropipenv[toml]"
9494
# Install Python dependencies from Pipfile.lock file
9595
COPY Pipfile.lock ./
9696

97-
RUN micropipenv install && rm -f ./Pipfile.lock
97+
RUN micropipenv install && \
98+
rm -f ./Pipfile.lock && \
99+
# Fix permissions to support pip in OpenShift environments \
100+
chmod -R g+w /opt/app-root/lib/python3.11/site-packages && \
101+
fix-permissions /opt/app-root -P
98102

99103
# Restore user workspace
100104
USER 1001
105+
101106
WORKDIR /opt/app-root/src

images/runtime/training/cuda/Pipfile

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,16 @@ verify_ssl = true
44
name = "pypi"
55

66
[packages]
7-
peft = "*"
7+
peft = ">=0.14.0"
88
datasets = ">=2.15.0"
9-
transformers = ">=4.41.2"
9+
transformers = ">=4.49.0"
1010
numpy = "<2.0.0,>=1.23.5"
11-
accelerate = ">=0.34.2"
11+
accelerate = ">=1.4.0"
1212
torch = "==2.4.1"
1313
sentencepiece = "<0.3,>=0.1.99"
1414
tokenizers = "<1.0,>=0.13.3"
1515
tqdm = "<5.0,>=4.66.2"
16-
trl = ">=0.9.4"
16+
trl = ">=0.15.1"
1717
protobuf = "<6.0.0,>=5.28.0"
1818
simpleeval = "<1.0,>=0.9.13"
1919
safetensors = "*"

images/runtime/training/cuda/Pipfile.lock

Lines changed: 973 additions & 1193 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

images/runtime/training/rocm/Dockerfile

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,13 @@ RUN pip install --no-cache-dir -U "micropipenv[toml]"
5858
# Install Python dependencies from Pipfile.lock file
5959
COPY Pipfile.lock ./
6060

61-
RUN micropipenv install && rm -f ./Pipfile.lock
61+
RUN micropipenv install && \
62+
rm -f ./Pipfile.lock && \
63+
# Fix permissions to support pip in OpenShift environments \
64+
chmod -R g+w /opt/app-root/lib/python3.11/site-packages && \
65+
fix-permissions /opt/app-root -P
6266

6367
# Restore user workspace
6468
USER 1001
69+
6570
WORKDIR /opt/app-root/src

images/runtime/training/rocm/Pipfile

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,17 @@ verify_ssl = true
99
name = "pytorch"
1010

1111
[packages]
12-
peft = "*"
12+
peft = ">=0.14.0"
1313
datasets = ">=2.15.0"
14-
transformers = ">=4.41.2"
14+
transformers = ">=4.49.0"
1515
numpy = "<2.0.0,>=1.23.5"
16-
accelerate = ">=0.34.2"
16+
accelerate = ">=1.4.0"
1717
torch = {version = "==2.4.1+rocm6.1", index = "pytorch"}
1818
pytorch-triton-rocm = {version = "*", index = "pytorch"}
1919
sentencepiece = "<0.3,>=0.1.99"
2020
tokenizers = "<1.0,>=0.13.3"
2121
tqdm = "<5.0,>=4.66.2"
22-
trl = ">=0.9.4"
22+
trl = ">=0.15.1"
2323
protobuf = "<6.0.0,>=5.28.0"
2424
simpleeval = "<1.0,>=0.9.13"
2525
safetensors = "*"

images/runtime/training/rocm/Pipfile.lock

Lines changed: 709 additions & 890 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/fms/environment.go

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,6 @@ const (
3232
huggingfaceTokenEnvVar = "HF_TOKEN"
3333
// The environment variable specifying the name of the PersistentVolumeClaim containing GPTQ models
3434
gptqModelPvcNameEnvVar = "GPTQ_MODEL_PVC_NAME"
35-
// The environment variable referring to image simulating sleep condition in container
36-
sleepImageEnvVar = "SLEEP_IMAGE"
3735
// The environment variable specifying s3 bucket folder path used to store model
3836
storageBucketModelPath = "AWS_STORAGE_BUCKET_MODEL_PATH"
3937
)
@@ -68,10 +66,6 @@ func GetGptqModelPvcName() (string, error) {
6866
return image, nil
6967
}
7068

71-
func GetSleepImage() string {
72-
return lookupEnvOrDefault(sleepImageEnvVar, "gcr.io/k8s-staging-perf-tests/sleep@sha256:8d91ddf9f145b66475efda1a1b52269be542292891b5de2a7fad944052bab6ea")
73-
}
74-
7569
func GetStorageBucketModelPath() string {
7670
storageBucketModelPath := lookupEnvOrDefault(storageBucketModelPath, "")
7771
return storageBucketModelPath

tests/kfto/environment.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ const (
2525
bloomModelImageEnvVar = "BLOOM_MODEL_IMAGE"
2626
// The environment variable referring to image containing Stanford Alpaca dataset
2727
alpacaDatasetImageEnvVar = "ALPACA_DATASET_IMAGE"
28+
// The environment variable referring to image simulating sleep condition in container
29+
sleepImageEnvVar = "SLEEP_IMAGE"
2830
)
2931

3032
func GetBloomModelImage() string {
@@ -35,6 +37,10 @@ func GetAlpacaDatasetImage() string {
3537
return lookupEnvOrDefault(alpacaDatasetImageEnvVar, "quay.io/ksuta/alpaca-dataset@sha256:2e90f631180c7b2c916f9569b914b336b612e8ae86efad82546adc5c9fcbbb8d")
3638
}
3739

40+
func GetSleepImage() string {
41+
return lookupEnvOrDefault(sleepImageEnvVar, "gcr.io/k8s-staging-perf-tests/sleep@sha256:8d91ddf9f145b66475efda1a1b52269be542292891b5de2a7fad944052bab6ea")
42+
}
43+
3844
func lookupEnvOrDefault(key, value string) string {
3945
if v, ok := os.LookupEnv(key); ok {
4046
return v

tests/fms/kfto_kueue_sft_upgrade_training_test.go renamed to tests/kfto/kfto_kueue_mnist_upgrade_training_test.go

Lines changed: 126 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
1414
limitations under the License.
1515
*/
1616

17-
package fms
17+
package kfto
1818

1919
import (
2020
"testing"
@@ -29,8 +29,6 @@ import (
2929
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3030
kueuev1beta1 "sigs.k8s.io/kueue/apis/kueue/v1beta1"
3131
kueueacv1beta1 "sigs.k8s.io/kueue/client-go/applyconfiguration/kueue/v1beta1"
32-
33-
"github.com/opendatahub-io/distributed-workloads/tests/kfto"
3432
)
3533

3634
var (
@@ -47,10 +45,16 @@ func TestSetupPytorchjob(t *testing.T) {
4745
createOrGetUpgradeTestNamespace(test, namespaceName)
4846

4947
// Create a ConfigMap with the MNIST training script, the dataset download script and the requirements file
48+
mnist := readFile(test, "resources/mnist.py")
49+
download_mnist_dataset := readFile(test, "resources/download_mnist_datasets.py")
50+
requirementsFileName := readFile(test, "resources/requirements.txt")
51+
5052
configData := map[string][]byte{
51-
"config.json": ReadFile(test, "resources/config.json"),
52-
"twitter_complaints_small.json": ReadFile(test, "resources/twitter_complaints_small.json"),
53+
"mnist.py": mnist,
54+
"download_mnist_datasets.py": download_mnist_dataset,
55+
"requirements.txt": requirementsFileName,
5356
}
57+
5458
config := CreateConfigMap(test, namespaceName, configData)
5559

5660
// Create Kueue resources
@@ -70,7 +74,7 @@ func TestSetupPytorchjob(t *testing.T) {
7074
WithName(kueuev1beta1.ResourceFlavorReference(resourceFlavorName)).
7175
WithResources(
7276
kueueacv1beta1.ResourceQuota().WithName(corev1.ResourceCPU).WithNominalQuota(resource.MustParse("8")),
73-
kueueacv1beta1.ResourceQuota().WithName(corev1.ResourceMemory).WithNominalQuota(resource.MustParse("12Gi")),
77+
kueueacv1beta1.ResourceQuota().WithName(corev1.ResourceMemory).WithNominalQuota(resource.MustParse("18Gi")),
7478
),
7579
),
7680
).
@@ -133,6 +137,10 @@ func createUpgradePyTorchJob(test Test, namespace, localQueueName string, config
133137
}
134138

135139
tuningJob := &kftov1.PyTorchJob{
140+
TypeMeta: metav1.TypeMeta{
141+
APIVersion: corev1.SchemeGroupVersion.String(),
142+
Kind: "PyTorchJob",
143+
},
136144
ObjectMeta: metav1.ObjectMeta{
137145
Name: pyTorchJobName,
138146
Labels: map[string]string{
@@ -141,85 +149,76 @@ func createUpgradePyTorchJob(test Test, namespace, localQueueName string, config
141149
},
142150
Spec: kftov1.PyTorchJobSpec{
143151
PyTorchReplicaSpecs: map[kftov1.ReplicaType]*kftov1.ReplicaSpec{
144-
"Master": {
152+
kftov1.PyTorchJobReplicaTypeMaster: {
145153
Replicas: Ptr(int32(1)),
146-
RestartPolicy: "OnFailure",
154+
RestartPolicy: kftov1.RestartPolicyOnFailure,
147155
Template: corev1.PodTemplateSpec{
156+
ObjectMeta: metav1.ObjectMeta{
157+
Labels: map[string]string{
158+
"app": "kfto-mnist",
159+
"role": "master",
160+
},
161+
},
148162
Spec: corev1.PodSpec{
149-
InitContainers: []corev1.Container{
150-
{
151-
Name: "copy-model",
152-
Image: kfto.GetBloomModelImage(),
153-
ImagePullPolicy: corev1.PullIfNotPresent,
154-
VolumeMounts: []corev1.VolumeMount{
163+
Affinity: &corev1.Affinity{
164+
PodAntiAffinity: &corev1.PodAntiAffinity{
165+
RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{
155166
{
156-
Name: "tmp-volume",
157-
MountPath: "/tmp",
167+
LabelSelector: &metav1.LabelSelector{
168+
MatchLabels: map[string]string{
169+
"app": "kfto-mnist",
170+
},
171+
},
172+
TopologyKey: "kubernetes.io/hostname",
158173
},
159174
},
160-
Command: []string{"/bin/sh", "-c"},
161-
Args: []string{"mkdir /tmp/model; cp -r /models/bloom-560m /tmp/model"},
162175
},
163176
},
164177
Containers: []corev1.Container{
165178
{
166179
Name: "pytorch",
167-
Image: GetFmsHfTuningImage(test),
180+
Image: GetCudaTrainingImage(),
168181
ImagePullPolicy: corev1.PullIfNotPresent,
169-
Env: []corev1.EnvVar{
170-
{
171-
Name: "SFT_TRAINER_CONFIG_JSON_PATH",
172-
Value: "/etc/config/config.json",
173-
},
174-
{
175-
Name: "HF_HOME",
176-
Value: "/tmp/huggingface",
177-
},
182+
Command: []string{
183+
"/bin/bash", "-c",
184+
(`mkdir -p /tmp/lib /tmp/datasets/mnist && export PYTHONPATH=$PYTHONPATH:/tmp/lib && \
185+
pip install --no-cache-dir -r /mnt/files/requirements.txt --target=/tmp/lib --verbose && \
186+
echo "Downloading MNIST dataset..." && \
187+
python3 /mnt/files/download_mnist_datasets.py --dataset_path "/tmp/datasets/mnist" && \
188+
echo -e "\n\n Dataset downloaded to /tmp/datasets/mnist" && ls -R /tmp/datasets/mnist && \
189+
echo -e "\n\n Starting training..." && \
190+
torchrun --nproc_per_node 2 /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 7 --save_every 2 --batch_size 128 --lr 0.001 --snapshot_path "mnist_snapshot.pt" --backend "gloo"`),
178191
},
179192
VolumeMounts: []corev1.VolumeMount{
180193
{
181-
Name: "config-volume",
182-
MountPath: "/etc/config",
194+
Name: config.Name,
195+
MountPath: "/mnt/files",
183196
},
184197
{
185198
Name: "tmp-volume",
186199
MountPath: "/tmp",
187200
},
188-
{
189-
Name: "output-volume",
190-
MountPath: "/mnt/output",
191-
},
192201
},
193202
Resources: corev1.ResourceRequirements{
194203
Requests: corev1.ResourceList{
195204
corev1.ResourceCPU: resource.MustParse("2"),
196-
corev1.ResourceMemory: resource.MustParse("7Gi"),
205+
corev1.ResourceMemory: resource.MustParse("6Gi"),
197206
},
198207
Limits: corev1.ResourceList{
199208
corev1.ResourceCPU: resource.MustParse("2"),
200-
corev1.ResourceMemory: resource.MustParse("7Gi"),
209+
corev1.ResourceMemory: resource.MustParse("6Gi"),
201210
},
202211
},
203212
},
204213
},
205214
Volumes: []corev1.Volume{
206215
{
207-
Name: "config-volume",
216+
Name: config.Name,
208217
VolumeSource: corev1.VolumeSource{
209218
ConfigMap: &corev1.ConfigMapVolumeSource{
210219
LocalObjectReference: corev1.LocalObjectReference{
211220
Name: config.Name,
212221
},
213-
Items: []corev1.KeyToPath{
214-
{
215-
Key: "config.json",
216-
Path: "config.json",
217-
},
218-
{
219-
Key: "twitter_complaints_small.json",
220-
Path: "twitter_complaints_small.json",
221-
},
222-
},
223222
},
224223
},
225224
},
@@ -229,13 +228,92 @@ func createUpgradePyTorchJob(test Test, namespace, localQueueName string, config
229228
EmptyDir: &corev1.EmptyDirVolumeSource{},
230229
},
231230
},
231+
},
232+
RestartPolicy: corev1.RestartPolicyOnFailure,
233+
},
234+
},
235+
},
236+
kftov1.PyTorchJobReplicaTypeWorker: {
237+
Replicas: Ptr(int32(2)),
238+
RestartPolicy: kftov1.RestartPolicyOnFailure,
239+
Template: corev1.PodTemplateSpec{
240+
ObjectMeta: metav1.ObjectMeta{
241+
Labels: map[string]string{
242+
"app": "kfto-mnist",
243+
"role": "worker",
244+
},
245+
},
246+
Spec: corev1.PodSpec{
247+
Affinity: &corev1.Affinity{
248+
PodAntiAffinity: &corev1.PodAntiAffinity{
249+
RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{
250+
{
251+
LabelSelector: &metav1.LabelSelector{
252+
MatchLabels: map[string]string{
253+
"app": "kfto-mnist",
254+
},
255+
},
256+
TopologyKey: "kubernetes.io/hostname",
257+
},
258+
},
259+
},
260+
},
261+
Containers: []corev1.Container{
232262
{
233-
Name: "output-volume",
263+
Name: "pytorch",
264+
Image: GetCudaTrainingImage(),
265+
ImagePullPolicy: corev1.PullIfNotPresent,
266+
Command: []string{
267+
"/bin/bash", "-c",
268+
(`mkdir -p /tmp/lib /tmp/datasets/mnist && export PYTHONPATH=$PYTHONPATH:/tmp/lib && \
269+
pip install --no-cache-dir -r /mnt/files/requirements.txt --target=/tmp/lib --verbose && \
270+
echo "Downloading MNIST dataset..." && \
271+
python3 /mnt/files/download_mnist_datasets.py --dataset_path "/tmp/datasets/mnist" && \
272+
echo -e "\n\n Dataset downloaded to /tmp/datasets/mnist" && ls -R /tmp/datasets/mnist && \
273+
echo -e "\n\n Starting training..." && \
274+
torchrun --nproc_per_node 2 /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 7 --save_every 2 --batch_size 128 --lr 0.001 --snapshot_path "mnist_snapshot.pt" --backend "gloo"`),
275+
},
276+
VolumeMounts: []corev1.VolumeMount{
277+
{
278+
Name: config.Name,
279+
MountPath: "/mnt/files",
280+
},
281+
{
282+
Name: "tmp-volume",
283+
MountPath: "/tmp",
284+
},
285+
},
286+
Resources: corev1.ResourceRequirements{
287+
Requests: corev1.ResourceList{
288+
corev1.ResourceCPU: resource.MustParse("2"),
289+
corev1.ResourceMemory: resource.MustParse("6Gi"),
290+
},
291+
Limits: corev1.ResourceList{
292+
corev1.ResourceCPU: resource.MustParse("2"),
293+
corev1.ResourceMemory: resource.MustParse("6Gi"),
294+
},
295+
},
296+
},
297+
},
298+
Volumes: []corev1.Volume{
299+
{
300+
Name: config.Name,
301+
VolumeSource: corev1.VolumeSource{
302+
ConfigMap: &corev1.ConfigMapVolumeSource{
303+
LocalObjectReference: corev1.LocalObjectReference{
304+
Name: config.Name,
305+
},
306+
},
307+
},
308+
},
309+
{
310+
Name: "tmp-volume",
234311
VolumeSource: corev1.VolumeSource{
235312
EmptyDir: &corev1.EmptyDirVolumeSource{},
236313
},
237314
},
238315
},
316+
RestartPolicy: corev1.RestartPolicyOnFailure,
239317
},
240318
},
241319
},
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
1414
limitations under the License.
1515
*/
1616

17-
package fms
17+
package kfto
1818

1919
import (
2020
"testing"

0 commit comments

Comments
 (0)