Skip to content

Commit 354b002

Browse files
committed
Merge remote-tracking branch 'upstream/main'
2 parents cccc6e0 + 074f9bf commit 354b002

9 files changed

+73
-7
lines changed

images/runtime/training/cuda/Dockerfile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,10 @@ RUN micropipenv install && \
100100
chmod -R g+w /opt/app-root/lib/python3.11/site-packages && \
101101
fix-permissions /opt/app-root -P
102102

103+
# Install Flash Attention
104+
RUN pip install wheel
105+
RUN pip install flash-attn==2.7.4.post1 --no-build-isolation
106+
103107
# Restore user workspace
104108
USER 1001
105109

tests/fms/resources/config_allam_beta_13b_chat_gptq.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"model_name_or_path": "/mnt/model/model/allam-beta-13b-chat-gptq/allam-beta-13b-chat-gptq-20241001T185023",
2+
"model_name_or_path": "/mnt/model/model/allam-beta-13b-chat-gptq-20241001T185023",
33
"training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
44
"output_dir": "/mnt/output/model",
55
"save_model_dir": "/mnt/output/model",

tests/fms/resources/config_granite_34b_code_base_gptq.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"model_name_or_path": "/mnt/model/model/granite-34b-code-base-gptq/granite-34b-code-base-gptq-20241001T150701",
2+
"model_name_or_path": "/mnt/model/model/granite-34b-code-base-gptq-20241001T150701",
33
"training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
44
"output_dir": "/mnt/output/model",
55
"save_model_dir": "/mnt/output/model",

tests/fms/resources/config_granite_8b_code_instruct_gptq.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"model_name_or_path": "/mnt/model/model/granite-8b-code-instruct-gptq/granite-8b-code-instruct-gptq-20241001T153432",
2+
"model_name_or_path": "/mnt/model/model/granite-8b-code-instruct-gptq-20241001T153432",
33
"training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
44
"output_dir": "/mnt/output/model",
55
"save_model_dir": "/mnt/output/model",

tests/fms/resources/config_meta_llama3_1_405b_gptq.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"model_name_or_path": "/mnt/model/model/llama-3.1-405b-gptq/llama-3.1-405b-gptq-20241001T160356",
2+
"model_name_or_path": "/mnt/model/model/llama-3.1-405b-gptq-20241001T160356",
33
"training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
44
"output_dir": "/mnt/output/model",
55
"save_model_dir": "/mnt/output/model",

tests/fms/resources/config_meta_llama3_70b_instruct_gptq_blue.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"model_name_or_path": "/mnt/model/model/llama3-70b-instruct-gptq-blue/llama3-70b-instruct-gptq-blue-20240620223000",
2+
"model_name_or_path": "/mnt/model/model/llama3-70b-instruct-gptq-blue-20240620223000",
33
"training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
44
"output_dir": "/mnt/output/model",
55
"save_model_dir": "/mnt/output/model",

tests/fms/resources/config_mistral_7b_v03_gptq.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"model_name_or_path": "/mnt/model/model/mistral-7b-v0.3-gptq/mistral-7b-v0.3-gptq-20241001T174851",
2+
"model_name_or_path": "/mnt/model/model/mistral-7b-v0.3-gptq-20241001T174851",
33
"training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
44
"output_dir": "/mnt/output/model",
55
"save_model_dir": "/mnt/output/model",

tests/fms/resources/config_mixtral_8x7b_instruct_v01_gptq.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"model_name_or_path": "/mnt/model/model/mixtral-8x7b-instruct-v0.1-gptq/mixtral-8x7b-instruct-v0.1-gptq-20241001T175603",
2+
"model_name_or_path": "/mnt/model/model/mixtral-8x7b-instruct-v0.1-gptq-20241001T175603",
33
"training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
44
"output_dir": "/mnt/output/model",
55
"save_model_dir": "/mnt/output/model",

tests/kfto/kfto_kueue_mnist_upgrade_training_test.go

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,12 @@ func createUpgradePyTorchJob(test Test, namespace, localQueueName string, config
136136
test.T().Fatalf("Error retrieving PyTorchJob with name `%s`: %v", pyTorchJobName, err)
137137
}
138138

139+
storage_bucket_endpoint, storage_bucket_endpoint_exists := GetStorageBucketDefaultEndpoint()
140+
storage_bucket_access_key_id, storage_bucket_access_key_id_exists := GetStorageBucketAccessKeyId()
141+
storage_bucket_secret_key, storage_bucket_secret_key_exists := GetStorageBucketSecretKey()
142+
storage_bucket_name, storage_bucket_name_exists := GetStorageBucketName()
143+
storage_bucket_mnist_dir, storage_bucket_mnist_dir_exists := GetStorageBucketMnistDir()
144+
139145
tuningJob := &kftov1.PyTorchJob{
140146
TypeMeta: metav1.TypeMeta{
141147
APIVersion: corev1.SchemeGroupVersion.String(),
@@ -321,6 +327,62 @@ func createUpgradePyTorchJob(test Test, namespace, localQueueName string, config
321327
},
322328
}
323329

330+
// Add PIP index to download Python packages; use the provided custom PyPI mirror index URL in case of a disconnected environment
331+
tuningJob.Spec.PyTorchReplicaSpecs[kftov1.PyTorchJobReplicaTypeMaster].Template.Spec.Containers[0].Env = []corev1.EnvVar{
332+
{
333+
Name: "PIP_INDEX_URL",
334+
Value: GetPipIndexURL(),
335+
},
336+
{
337+
Name: "PIP_TRUSTED_HOST",
338+
Value: GetPipTrustedHost(),
339+
},
340+
}
341+
tuningJob.Spec.PyTorchReplicaSpecs[kftov1.PyTorchJobReplicaTypeWorker].Template.Spec.Containers[0].Env = []corev1.EnvVar{
342+
{
343+
Name: "PIP_INDEX_URL",
344+
Value: GetPipIndexURL(),
345+
},
346+
{
347+
Name: "PIP_TRUSTED_HOST",
348+
Value: GetPipTrustedHost(),
349+
},
350+
}
351+
352+
// Use storage bucket to download the MNIST datasets if required environment variables are provided, else use default MNIST mirror references as the fallback
353+
if storage_bucket_endpoint_exists && storage_bucket_access_key_id_exists && storage_bucket_secret_key_exists && storage_bucket_name_exists && storage_bucket_mnist_dir_exists {
354+
storage_bucket_env_vars := []corev1.EnvVar{
355+
{
356+
Name: "AWS_DEFAULT_ENDPOINT",
357+
Value: storage_bucket_endpoint,
358+
},
359+
{
360+
Name: "AWS_ACCESS_KEY_ID",
361+
Value: storage_bucket_access_key_id,
362+
},
363+
{
364+
Name: "AWS_SECRET_ACCESS_KEY",
365+
Value: storage_bucket_secret_key,
366+
},
367+
{
368+
Name: "AWS_STORAGE_BUCKET",
369+
Value: storage_bucket_name,
370+
},
371+
{
372+
Name: "AWS_STORAGE_BUCKET_MNIST_DIR",
373+
Value: storage_bucket_mnist_dir,
374+
},
375+
}
376+
377+
// Append the list of environment variables to both the master and worker containers
378+
for _, envVar := range storage_bucket_env_vars {
379+
tuningJob.Spec.PyTorchReplicaSpecs[kftov1.PyTorchJobReplicaTypeMaster].Template.Spec.Containers[0].Env = upsert(tuningJob.Spec.PyTorchReplicaSpecs[kftov1.PyTorchJobReplicaTypeMaster].Template.Spec.Containers[0].Env, envVar, withEnvVarName(envVar.Name))
380+
tuningJob.Spec.PyTorchReplicaSpecs[kftov1.PyTorchJobReplicaTypeWorker].Template.Spec.Containers[0].Env = upsert(tuningJob.Spec.PyTorchReplicaSpecs[kftov1.PyTorchJobReplicaTypeWorker].Template.Spec.Containers[0].Env, envVar, withEnvVarName(envVar.Name))
381+
}
382+
} else {
383+
test.T().Logf("Skipped usage of S3 storage bucket, because required environment variables aren't provided!\nRequired environment variables : AWS_DEFAULT_ENDPOINT, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_STORAGE_BUCKET, AWS_STORAGE_BUCKET_MNIST_DIR")
384+
}
385+
324386
tuningJob, err = test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Create(test.Ctx(), tuningJob, metav1.CreateOptions{})
325387
test.Expect(err).NotTo(HaveOccurred())
326388
test.T().Logf("Created PytorchJob %s/%s successfully", tuningJob.Namespace, tuningJob.Name)

0 commit comments

Comments
 (0)