@@ -61,6 +61,7 @@ func runKFTOPyTorchMnistJob(t *testing.T, accelerator Accelerator, image string,
6161 namespace := test .NewTestNamespace ()
6262
6363 mnist := ReadFile (test , "resources/mnist.py" )
64+ download_mnist_dataset := ReadFile (test , "resources/download_mnist_datasets.py" )
6465 requirementsFileName := ReadFile (test , requirementsFile )
6566
6667 if accelerator .isGpu () {
@@ -69,9 +70,9 @@ func runKFTOPyTorchMnistJob(t *testing.T, accelerator Accelerator, image string,
6970 mnist = bytes .Replace (mnist , []byte ("accelerator=\" has to be specified\" " ), []byte ("accelerator=\" cpu\" " ), 1 )
7071 }
7172 config := CreateConfigMap (test , namespace .Name , map [string ][]byte {
72- // MNIST Ray Notebook
73- "mnist .py" : mnist ,
74- "requirements.txt" : requirementsFileName ,
73+ "mnist.py" : mnist ,
74+ "download_mnist_datasets .py" : download_mnist_dataset ,
75+ "requirements.txt" : requirementsFileName ,
7576 })
7677
7778 // Create training PyTorch job
@@ -117,6 +118,12 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
117118 backend = "gloo"
118119 }
119120
121+ storage_bucket_endpoint , storage_bucket_endpoint_exists := GetStorageBucketDefaultEndpoint ()
122+ storage_bucket_access_key_id , storage_bucket_access_key_id_exists := GetStorageBucketAccessKeyId ()
123+ storage_bucket_secret_key , storage_bucket_secret_key_exists := GetStorageBucketSecretKey ()
124+ storage_bucket_name , storage_bucket_name_exists := GetStorageBucketName ()
125+ storage_bucket_mnist_dir , storage_bucket_mnist_dir_exists := GetStorageBucketMnistDir ()
126+
120127 tuningJob := & kftov1.PyTorchJob {
121128 TypeMeta : metav1.TypeMeta {
122129 APIVersion : corev1 .SchemeGroupVersion .String (),
@@ -162,8 +169,7 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
162169 fmt .Sprintf (`mkdir -p /tmp/lib /tmp/datasets/mnist && export PYTHONPATH=$PYTHONPATH:/tmp/lib && \
163170 pip install --no-cache-dir -r /mnt/files/requirements.txt --target=/tmp/lib && \
164171 echo "Downloading MNIST dataset..." && \
165- python3 -c "from torchvision.datasets import MNIST; from torchvision.transforms import Compose, ToTensor; \
166- MNIST('/tmp/datasets/mnist', train=False, download=True, transform=Compose([ToTensor()]))" && \
172+ python3 /mnt/files/download_mnist_datasets.py --dataset_path "/tmp/datasets/mnist" && \
167173 echo -e "\n\n Dataset downloaded to /tmp/datasets/mnist" && ls -R /tmp/datasets/mnist && \
168174 echo -e "\n\n Starting training..." && \
169175 torchrun --nproc_per_node=%d /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 7 --save_every 2 --batch_size 128 --lr 0.001 --snapshot_path "mnist_snapshot.pt" --backend %s` , numProcPerNode , backend ),
@@ -247,8 +253,7 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
247253 fmt .Sprintf (`mkdir -p /tmp/lib /tmp/datasets/mnist && export PYTHONPATH=$PYTHONPATH:/tmp/lib && \
248254 pip install --no-cache-dir -r /mnt/files/requirements.txt --target=/tmp/lib && \
249255 echo "Downloading MNIST dataset..." && \
250- python3 -c "from torchvision.datasets import MNIST; from torchvision.transforms import Compose, ToTensor; \
251- MNIST('/tmp/datasets/mnist', train=False, download=True, transform=Compose([ToTensor()]))" && \
256+ python3 /mnt/files/download_mnist_datasets.py --dataset_path "/tmp/datasets/mnist" && \
252257 echo -e "\n\n Dataset downloaded to /tmp/datasets/mnist" && ls -R /tmp/datasets/mnist && \
253258 echo -e "\n\n Starting training..." && \
254259 torchrun --nproc_per_node=%d /mnt/files/mnist.py --dataset_path "/tmp/datasets/mnist" --epochs 7 --save_every 2 --batch_size 128 --lr 0.001 --snapshot_path "mnist_snapshot.pt" --backend %s` , numProcPerNode , backend ),
@@ -344,6 +349,40 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
344349 }
345350 }
346351
352+ // Use storage bucket to download the MNIST datasets if required environment variables are provided, else use default MNIST mirror references as the fallback
353+ if storage_bucket_endpoint_exists && storage_bucket_access_key_id_exists && storage_bucket_secret_key_exists && storage_bucket_name_exists && storage_bucket_mnist_dir_exists {
354+ storage_bucket_env_vars := []corev1.EnvVar {
355+ {
356+ Name : "AWS_DEFAULT_ENDPOINT" ,
357+ Value : storage_bucket_endpoint ,
358+ },
359+ {
360+ Name : "AWS_ACCESS_KEY_ID" ,
361+ Value : storage_bucket_access_key_id ,
362+ },
363+ {
364+ Name : "AWS_SECRET_ACCESS_KEY" ,
365+ Value : storage_bucket_secret_key ,
366+ },
367+ {
368+ Name : "AWS_STORAGE_BUCKET" ,
369+ Value : storage_bucket_name ,
370+ },
371+ {
372+ Name : "AWS_STORAGE_BUCKET_MNIST_DIR" ,
373+ Value : storage_bucket_mnist_dir ,
374+ },
375+ }
376+
377+ // Append the storage bucket environment variables to the first container of the Master replica
378+ for _ , envVar := range storage_bucket_env_vars {
379+ tuningJob .Spec .PyTorchReplicaSpecs [kftov1 .PyTorchJobReplicaTypeMaster ].Template .Spec .Containers [0 ].Env = upsert (tuningJob .Spec .PyTorchReplicaSpecs [kftov1 .PyTorchJobReplicaTypeMaster ].Template .Spec .Containers [0 ].Env , envVar , withEnvVarName (envVar .Name ))
380+ }
381+
382+ } else {
383+ test .T ().Logf ("Skipped usage of S3 storage bucket, because required environment variables aren't provided!\n Required environment variables : AWS_DEFAULT_ENDPOINT, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_STORAGE_BUCKET, AWS_STORAGE_BUCKET_MNIST_DIR" )
384+ }
385+
347386 tuningJob , err := test .Client ().Kubeflow ().KubeflowV1 ().PyTorchJobs (namespace ).Create (test .Ctx (), tuningJob , metav1.CreateOptions {})
348387 test .Expect (err ).NotTo (HaveOccurred ())
349388 test .T ().Logf ("Created PytorchJob %s/%s successfully" , tuningJob .Namespace , tuningJob .Name )
0 commit comments