Skip to content

Commit 354b002

Browse files
committed
Merge remote-tracking branch 'upstream/main'
2 parents cccc6e0 + 074f9bf commit 354b002

9 files changed

+73
-7
lines changed

images/runtime/training/cuda/Dockerfile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,10 @@ RUN micropipenv install && \
100100
chmod -R g+w /opt/app-root/lib/python3.11/site-packages && \
101101
fix-permissions /opt/app-root -P
102102

103+
# Install Flash Attention
104+
RUN pip install wheel
105+
RUN pip install flash-attn==2.7.4.post1 --no-build-isolation
106+
103107
# Restore user workspace
104108
USER 1001
105109

tests/fms/resources/config_allam_beta_13b_chat_gptq.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"model_name_or_path": "/mnt/model/model/allam-beta-13b-chat-gptq/allam-beta-13b-chat-gptq-20241001T185023",
2+
"model_name_or_path": "/mnt/model/model/allam-beta-13b-chat-gptq-20241001T185023",
33
"training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
44
"output_dir": "/mnt/output/model",
55
"save_model_dir": "/mnt/output/model",

tests/fms/resources/config_granite_34b_code_base_gptq.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"model_name_or_path": "/mnt/model/model/granite-34b-code-base-gptq/granite-34b-code-base-gptq-20241001T150701",
2+
"model_name_or_path": "/mnt/model/model/granite-34b-code-base-gptq-20241001T150701",
33
"training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
44
"output_dir": "/mnt/output/model",
55
"save_model_dir": "/mnt/output/model",

tests/fms/resources/config_granite_8b_code_instruct_gptq.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"model_name_or_path": "/mnt/model/model/granite-8b-code-instruct-gptq/granite-8b-code-instruct-gptq-20241001T153432",
2+
"model_name_or_path": "/mnt/model/model/granite-8b-code-instruct-gptq-20241001T153432",
33
"training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
44
"output_dir": "/mnt/output/model",
55
"save_model_dir": "/mnt/output/model",

tests/fms/resources/config_meta_llama3_1_405b_gptq.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"model_name_or_path": "/mnt/model/model/llama-3.1-405b-gptq/llama-3.1-405b-gptq-20241001T160356",
2+
"model_name_or_path": "/mnt/model/model/llama-3.1-405b-gptq-20241001T160356",
33
"training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
44
"output_dir": "/mnt/output/model",
55
"save_model_dir": "/mnt/output/model",

tests/fms/resources/config_meta_llama3_70b_instruct_gptq_blue.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"model_name_or_path": "/mnt/model/model/llama3-70b-instruct-gptq-blue/llama3-70b-instruct-gptq-blue-20240620223000",
2+
"model_name_or_path": "/mnt/model/model/llama3-70b-instruct-gptq-blue-20240620223000",
33
"training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
44
"output_dir": "/mnt/output/model",
55
"save_model_dir": "/mnt/output/model",

tests/fms/resources/config_mistral_7b_v03_gptq.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"model_name_or_path": "/mnt/model/model/mistral-7b-v0.3-gptq/mistral-7b-v0.3-gptq-20241001T174851",
2+
"model_name_or_path": "/mnt/model/model/mistral-7b-v0.3-gptq-20241001T174851",
33
"training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
44
"output_dir": "/mnt/output/model",
55
"save_model_dir": "/mnt/output/model",

tests/fms/resources/config_mixtral_8x7b_instruct_v01_gptq.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"model_name_or_path": "/mnt/model/model/mixtral-8x7b-instruct-v0.1-gptq/mixtral-8x7b-instruct-v0.1-gptq-20241001T175603",
2+
"model_name_or_path": "/mnt/model/model/mixtral-8x7b-instruct-v0.1-gptq-20241001T175603",
33
"training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
44
"output_dir": "/mnt/output/model",
55
"save_model_dir": "/mnt/output/model",

tests/kfto/kfto_kueue_mnist_upgrade_training_test.go

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,12 @@ func createUpgradePyTorchJob(test Test, namespace, localQueueName string, config
136136
test.T().Fatalf("Error retrieving PyTorchJob with name `%s`: %v", pyTorchJobName, err)
137137
}
138138

139+
storage_bucket_endpoint, storage_bucket_endpoint_exists := GetStorageBucketDefaultEndpoint()
140+
storage_bucket_access_key_id, storage_bucket_access_key_id_exists := GetStorageBucketAccessKeyId()
141+
storage_bucket_secret_key, storage_bucket_secret_key_exists := GetStorageBucketSecretKey()
142+
storage_bucket_name, storage_bucket_name_exists := GetStorageBucketName()
143+
storage_bucket_mnist_dir, storage_bucket_mnist_dir_exists := GetStorageBucketMnistDir()
144+
139145
tuningJob := &kftov1.PyTorchJob{
140146
TypeMeta: metav1.TypeMeta{
141147
APIVersion: corev1.SchemeGroupVersion.String(),
@@ -321,6 +327,62 @@ func createUpgradePyTorchJob(test Test, namespace, localQueueName string, config
321327
},
322328
}
323329

330+
// Add PIP index to download Python packages; use the provided custom PyPI mirror index URL in case of a disconnected environment
331+
tuningJob.Spec.PyTorchReplicaSpecs[kftov1.PyTorchJobReplicaTypeMaster].Template.Spec.Containers[0].Env = []corev1.EnvVar{
332+
{
333+
Name: "PIP_INDEX_URL",
334+
Value: GetPipIndexURL(),
335+
},
336+
{
337+
Name: "PIP_TRUSTED_HOST",
338+
Value: GetPipTrustedHost(),
339+
},
340+
}
341+
tuningJob.Spec.PyTorchReplicaSpecs[kftov1.PyTorchJobReplicaTypeWorker].Template.Spec.Containers[0].Env = []corev1.EnvVar{
342+
{
343+
Name: "PIP_INDEX_URL",
344+
Value: GetPipIndexURL(),
345+
},
346+
{
347+
Name: "PIP_TRUSTED_HOST",
348+
Value: GetPipTrustedHost(),
349+
},
350+
}
351+
352+
// Use storage bucket to download the MNIST datasets if required environment variables are provided, else use default MNIST mirror references as the fallback
353+
if storage_bucket_endpoint_exists && storage_bucket_access_key_id_exists && storage_bucket_secret_key_exists && storage_bucket_name_exists && storage_bucket_mnist_dir_exists {
354+
storage_bucket_env_vars := []corev1.EnvVar{
355+
{
356+
Name: "AWS_DEFAULT_ENDPOINT",
357+
Value: storage_bucket_endpoint,
358+
},
359+
{
360+
Name: "AWS_ACCESS_KEY_ID",
361+
Value: storage_bucket_access_key_id,
362+
},
363+
{
364+
Name: "AWS_SECRET_ACCESS_KEY",
365+
Value: storage_bucket_secret_key,
366+
},
367+
{
368+
Name: "AWS_STORAGE_BUCKET",
369+
Value: storage_bucket_name,
370+
},
371+
{
372+
Name: "AWS_STORAGE_BUCKET_MNIST_DIR",
373+
Value: storage_bucket_mnist_dir,
374+
},
375+
}
376+
377+
// Append the list of environment variables to both the master and worker containers
378+
for _, envVar := range storage_bucket_env_vars {
379+
tuningJob.Spec.PyTorchReplicaSpecs[kftov1.PyTorchJobReplicaTypeMaster].Template.Spec.Containers[0].Env = upsert(tuningJob.Spec.PyTorchReplicaSpecs[kftov1.PyTorchJobReplicaTypeMaster].Template.Spec.Containers[0].Env, envVar, withEnvVarName(envVar.Name))
380+
tuningJob.Spec.PyTorchReplicaSpecs[kftov1.PyTorchJobReplicaTypeWorker].Template.Spec.Containers[0].Env = upsert(tuningJob.Spec.PyTorchReplicaSpecs[kftov1.PyTorchJobReplicaTypeWorker].Template.Spec.Containers[0].Env, envVar, withEnvVarName(envVar.Name))
381+
}
382+
} else {
383+
test.T().Logf("Skipped usage of S3 storage bucket, because required environment variables aren't provided!\nRequired environment variables : AWS_DEFAULT_ENDPOINT, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_STORAGE_BUCKET, AWS_STORAGE_BUCKET_MNIST_DIR")
384+
}
385+
324386
tuningJob, err = test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Create(test.Ctx(), tuningJob, metav1.CreateOptions{})
325387
test.Expect(err).NotTo(HaveOccurred())
326388
test.T().Logf("Created PytorchJob %s/%s successfully", tuningJob.Namespace, tuningJob.Name)

0 commit comments

Comments
 (0)