
Commit 65cc09e

Add test coverage for multi-node multi-GPU MNIST training

1 parent 4102ee1 · commit 65cc09e

File tree

7 files changed: +773 −6 lines changed

tests/common/support/environment.go

Lines changed: 12 additions & 6 deletions

@@ -51,12 +51,13 @@ const (
 	pipTrustedHost = "PIP_TRUSTED_HOST"
 
 	// Storage bucket credentials
-	storageDefaultEndpoint = "AWS_DEFAULT_ENDPOINT"
-	storageDefaultRegion   = "AWS_DEFAULT_REGION"
-	storageAccessKeyId     = "AWS_ACCESS_KEY_ID"
-	storageSecretKey       = "AWS_SECRET_ACCESS_KEY"
-	storageBucketName      = "AWS_STORAGE_BUCKET"
-	storageBucketMnistDir  = "AWS_STORAGE_BUCKET_MNIST_DIR"
+	storageDefaultEndpoint       = "AWS_DEFAULT_ENDPOINT"
+	storageDefaultRegion         = "AWS_DEFAULT_REGION"
+	storageAccessKeyId           = "AWS_ACCESS_KEY_ID"
+	storageSecretKey             = "AWS_SECRET_ACCESS_KEY"
+	storageBucketName            = "AWS_STORAGE_BUCKET"
+	storageBucketMnistDir        = "AWS_STORAGE_BUCKET_MNIST_DIR"
+	storageBucketFashionMnistDir = "AWS_STORAGE_BUCKET_FASHION_MNIST_DIR"
 
 	// Name of existing namespace to be used for test
 	testNamespaceNameEnvVar = "TEST_NAMESPACE_NAME"

@@ -179,6 +180,11 @@ func GetStorageBucketMnistDir() (string, bool) {
 	return storage_bucket_mnist_dir, exists
 }
 
+func GetStorageBucketFashionMnistDir() (string, bool) {
+	storage_bucket_fashion_mnist_dir, exists := os.LookupEnv(storageBucketFashionMnistDir)
+	return storage_bucket_fashion_mnist_dir, exists
+}
+
 func GetPipIndexURL() string {
 	return lookupEnvOrDefault(pipIndexURL, "https://pypi.python.org/simple")
 }
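The new accessor follows the existing GetStorageBucketMnistDir pattern: it returns the directory name plus a flag indicating whether the variable is exported, so callers can fall back or skip gracefully. A minimal sketch of how a test might consume it (the requireFashionMnistDir helper is an illustration, not part of this commit):

package support

import "testing"

// requireFashionMnistDir is a hypothetical helper (not part of this commit):
// it skips bucket-backed tests when the Fashion-MNIST directory variable is
// not exported, mirroring how the MNIST accessor is typically consumed.
func requireFashionMnistDir(t *testing.T) string {
	dir, found := GetStorageBucketFashionMnistDir()
	if !found {
		t.Skip("AWS_STORAGE_BUCKET_FASHION_MNIST_DIR is not set")
	}
	return dir
}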

tests/common/support/jobset.go

Lines changed: 4 additions & 0 deletions

@@ -68,6 +68,10 @@ func JobSetConditionFailed(jobset *jobsetv1alpha2.JobSet) metav1.ConditionStatus
 	return JobSetCondition(jobset, jobsetv1alpha2.JobSetFailed)
 }
 
+func JobSetConditionCompleted(jobset *jobsetv1alpha2.JobSet) metav1.ConditionStatus {
+	return JobSetCondition(jobset, jobsetv1alpha2.JobSetCompleted)
+}
+
 func JobSetFailureMessage(jobset *jobsetv1alpha2.JobSet) string {
 	if jobset == nil {
 		return ""
Lines changed: 91 additions & 0 deletions

@@ -0,0 +1,91 @@
+import os, gzip, shutil
+from minio import Minio
+from torchvision import datasets
+from torchvision.transforms import Compose, ToTensor
+
+def main(dataset_path):
+    # Download and load the Fashion-MNIST dataset
+    if all(var in os.environ for var in ["AWS_DEFAULT_ENDPOINT","AWS_ACCESS_KEY_ID","AWS_SECRET_ACCESS_KEY","AWS_STORAGE_BUCKET","AWS_STORAGE_BUCKET_FASHION_MNIST_DIR"]):
+        print("Using provided storage bucket to download Fashion-MNIST datasets...")
+        dataset_dir = os.path.join(dataset_path, "FashionMNIST/raw")
+        endpoint = os.environ.get("AWS_DEFAULT_ENDPOINT")
+        access_key = os.environ.get("AWS_ACCESS_KEY_ID")
+        secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
+        bucket_name = os.environ.get("AWS_STORAGE_BUCKET")
+        print(f"Storage bucket endpoint: {endpoint}")
+        print(f"Storage bucket name: {bucket_name}\n")
+
+        # Remove the scheme prefix, if present, from the storage bucket endpoint URL
+        secure = True
+        if endpoint.startswith("https://"):
+            endpoint = endpoint[len("https://"):]
+        elif endpoint.startswith("http://"):
+            endpoint = endpoint[len("http://"):]
+            secure = False
+
+        client = Minio(
+            endpoint,
+            access_key=access_key,
+            secret_key=secret_key,
+            cert_check=False,
+            secure=secure
+        )
+        if not os.path.exists(dataset_dir):
+            os.makedirs(dataset_dir)
+        else:
+            print(f"Directory '{dataset_dir}' already exists")
+
+        # To download datasets from a specific directory of the storage bucket, pass the directory name as the prefix
+        prefix = os.environ.get("AWS_STORAGE_BUCKET_FASHION_MNIST_DIR")
+        print(f"Storage bucket Fashion-MNIST directory prefix: {prefix}\n")
+
+        # Download all files from the prefix folder of the storage bucket recursively
+        for item in client.list_objects(
+            bucket_name, prefix=prefix, recursive=True
+        ):
+            file_name = item.object_name[len(prefix)+1:]
+            dataset_file_path = os.path.join(dataset_dir, file_name)
+            print(f"Downloading dataset file {file_name} to {dataset_file_path}..")
+            if not os.path.exists(dataset_file_path):
+                client.fget_object(
+                    bucket_name, item.object_name, dataset_file_path
+                )
+                # Unzip files
+                ## Sample zip file path: ../data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz
+                with gzip.open(dataset_file_path, "rb") as f_in:
+                    filename = file_name.split(".")[0]  # -> t10k-images-idx3-ubyte
+                    file_path = "/".join(dataset_file_path.split("/")[:-1])  # -> ../data/FashionMNIST/raw
+                    full_file_path = os.path.join(file_path, filename)  # -> ../data/FashionMNIST/raw/t10k-images-idx3-ubyte
+                    print(f"Extracting {dataset_file_path} to {file_path}..")
+
+                    with open(full_file_path, "wb") as f_out:
+                        shutil.copyfileobj(f_in, f_out)
+                    print(f"Dataset file downloaded : {full_file_path}\n")
+                # Delete the zip file
+                os.remove(dataset_file_path)
+            else:
+                print(f"File-path '{dataset_file_path}' already exists")
+        download_datasets = False
+    else:
+        print("Using default Fashion-MNIST mirror references to download datasets...")
+        print("Skipped usage of S3 storage bucket, because required environment variables aren't provided!\nRequired environment variables: AWS_DEFAULT_ENDPOINT, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_STORAGE_BUCKET, AWS_STORAGE_BUCKET_FASHION_MNIST_DIR")
+        download_datasets = True
+
+    datasets.FashionMNIST(
+        dataset_path,
+        train=True,
+        download=download_datasets,
+        transform=Compose([ToTensor()])
+    )
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description="Fashion-MNIST dataset download")
+    parser.add_argument('--dataset_path', type=str, default="./data", help='Path to Fashion-MNIST datasets (default: ./data)')
+
+    args = parser.parse_args()
+
+    main(
+        dataset_path=args.dataset_path,
+    )
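This script prefers an S3-compatible bucket when all five AWS_* variables are present and otherwise lets torchvision pull the dataset from its public mirrors. In the Go test harness, those variables have to be forwarded from the test runner's environment into the container that runs the script; a hypothetical sketch of that wiring (storageBucketEnv is an illustration, not part of this commit):

package trainer

import (
	"os"

	corev1 "k8s.io/api/core/v1"
)

// storageBucketEnv is a hypothetical helper (not part of this commit): it
// forwards the storage bucket credentials into the container that runs the
// dataset download script. Missing variables are simply omitted, so the
// script falls back to the public Fashion-MNIST mirrors.
func storageBucketEnv() []corev1.EnvVar {
	names := []string{
		"AWS_DEFAULT_ENDPOINT",
		"AWS_ACCESS_KEY_ID",
		"AWS_SECRET_ACCESS_KEY",
		"AWS_STORAGE_BUCKET",
		"AWS_STORAGE_BUCKET_FASHION_MNIST_DIR",
	}
	var envVars []corev1.EnvVar
	for _, name := range names {
		if value, ok := os.LookupEnv(name); ok {
			envVars = append(envVars, corev1.EnvVar{Name: name, Value: value})
		}
	}
	return envVars
}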
Lines changed: 139 additions & 0 deletions

@@ -0,0 +1,139 @@
+def train_pytorch():
+    import os
+    import logging
+
+    import torch
+    from torch import nn
+    import torch.nn.functional as F
+
+    from torchvision import datasets, transforms
+    import torch.distributed as dist
+    from torch.utils.data import DataLoader, DistributedSampler
+
+    # Configure logger (similar to KFTO mnist.py)
+    log_formatter = logging.Formatter(
+        "%(asctime)s %(levelname)-8s %(message)s", "%Y-%m-%dT%H:%M:%SZ"
+    )
+    logger = logging.getLogger(__file__)
+    console_handler = logging.StreamHandler()
+    console_handler.setFormatter(log_formatter)
+    logger.addHandler(console_handler)
+    logger.setLevel(logging.INFO)
+
+    # [1] Configure CPU/GPU device and distributed backend.
+    # Kubeflow Trainer will automatically configure the distributed environment.
+    device, backend = ("cuda", "nccl") if torch.cuda.is_available() else ("cpu", "gloo")
+    dist.init_process_group(backend=backend)
+
+    local_rank = int(os.getenv("LOCAL_RANK", 0))
+    logger.info(
+        "Distributed Training with WORLD_SIZE: {}, RANK: {}, LOCAL_RANK: {}.".format(
+            dist.get_world_size(),
+            dist.get_rank(),
+            local_rank,
+        )
+    )
+
+    # [2] Define the PyTorch CNN model to be trained.
+    class Net(nn.Module):
+        def __init__(self):
+            super(Net, self).__init__()
+            self.conv1 = nn.Conv2d(1, 32, 3, 1)
+            self.conv2 = nn.Conv2d(32, 64, 3, 1)
+            self.fc1 = nn.Linear(9216, 128)
+            self.fc2 = nn.Linear(128, 10)
+
+        def forward(self, x):
+            x = F.relu(self.conv1(x))
+            x = F.relu(self.conv2(x))
+            x = F.max_pool2d(x, 2)
+            x = x.view(-1, 9216)
+            x = F.relu(self.fc1(x))
+            x = self.fc2(x)
+            return F.log_softmax(x, dim=1)
+
+    # [3] Attach the model to the correct device.
+    device = torch.device(f"{device}:{local_rank}")
+    model = nn.parallel.DistributedDataParallel(Net().to(device))
+    model.train()
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
+
+    # [4] Get the Fashion-MNIST dataset.
+    # The dataset should be pre-downloaded to avoid network dependencies.
+    dataset_path = os.getenv("DATASET_PATH", "./data")
+
+    # Load the dataset (download=False assumes the dataset is already present)
+    dataset = datasets.FashionMNIST(
+        dataset_path,
+        train=True,
+        download=False,
+        transform=transforms.Compose([transforms.ToTensor()]),
+    )
+    # Batch size is configurable via env var (smaller = more iterations = longer training)
+    batch_size = int(os.getenv("BATCH_SIZE", "64"))
+    train_loader = DataLoader(
+        dataset,
+        batch_size=batch_size,
+        sampler=DistributedSampler(dataset),
+    )
+
+    # [5] Define the training loop.
+    num_epochs = int(os.getenv("NUM_EPOCHS", "1"))
+    global_rank = dist.get_rank()
+    world_size = dist.get_world_size()
+
+    for epoch in range(num_epochs):
+        # Log epoch start from ALL ranks
+        num_batches = len(train_loader)
+        device_type = "GPU" if torch.cuda.is_available() else "CPU"
+        logger.info(f"[{device_type}{global_rank}] Epoch {epoch} | Batchsize: {batch_size} | Steps: {num_batches} | World Size: {world_size}")
+
+        # Set the epoch for the DistributedSampler to ensure proper shuffling
+        if isinstance(train_loader.sampler, DistributedSampler):
+            train_loader.sampler.set_epoch(epoch)
+
+        epoch_loss = 0.0
+        num_batches_processed = 0
+
+        for batch_idx, (inputs, labels) in enumerate(train_loader):
+            # Attach tensors to the device.
+            inputs, labels = inputs.to(device), labels.to(device)
+
+            # Forward pass
+            outputs = model(inputs)
+            loss = F.nll_loss(outputs, labels)
+
+            # Backward pass
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+            # Track loss for the epoch summary
+            epoch_loss += loss.item()
+            num_batches_processed += 1
+
+            # Log detailed training progress from rank 0 only (to avoid log spam)
+            if batch_idx % 10 == 0 and global_rank == 0:
+                logger.info(
+                    "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
+                        epoch,
+                        batch_idx * len(inputs) * world_size,  # Adjust for distributed training
+                        len(train_loader.dataset),
+                        100.0 * batch_idx / num_batches,
+                        loss.item(),
+                    )
+                )
+
+        # End-of-epoch summary from ALL ranks
+        avg_loss = epoch_loss / num_batches_processed
+        logger.info(f"[{device_type}{global_rank}] Epoch {epoch} completed | Avg Loss: {avg_loss:.6f} | Batches: {num_batches_processed}")
+
+    # Wait for the training to complete and destroy the PyTorch distributed process group.
+    dist.barrier()
+    # All ranks report completion
+    logger.info(f"[{device_type}{global_rank}] Training is finished")
+    dist.destroy_process_group()
+
+
+if __name__ == "__main__":
+    train_pytorch()
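The training script is driven entirely by environment variables: Kubeflow Trainer injects RANK, WORLD_SIZE, and LOCAL_RANK for init_process_group, while DATASET_PATH, BATCH_SIZE, and NUM_EPOCHS are test-tunable knobs. A hypothetical sketch of how a Go test could set those knobs on the trainer container (trainingEnv is an illustration, not part of this commit):

package trainer

import (
	"strconv"

	corev1 "k8s.io/api/core/v1"
)

// trainingEnv is a hypothetical helper (not part of this commit): it builds
// the container environment variables the training script reads, letting a
// test stretch or shrink the run on multi-node multi-GPU clusters.
func trainingEnv(datasetPath string, batchSize, numEpochs int) []corev1.EnvVar {
	return []corev1.EnvVar{
		{Name: "DATASET_PATH", Value: datasetPath},
		{Name: "BATCH_SIZE", Value: strconv.Itoa(batchSize)},
		{Name: "NUM_EPOCHS", Value: strconv.Itoa(numEpochs)},
	}
}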
Lines changed: 6 additions & 0 deletions

@@ -0,0 +1,6 @@
+# Install only packages which are not present in the base image
+# Use the "# no-deps" marker for packages that should be installed without dependencies
+
+minio==7.2.13
+torchvision==0.23.0 # no-deps
+pillow==11.0.0 # no-deps
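The "# no-deps" marker is a convention for the test tooling rather than something pip understands natively. One plausible way a harness could interpret it, splitting packages into a normal install set and a pip --no-deps set (splitRequirements is an illustration under that assumption, not part of this commit):

package trainer

import "strings"

// splitRequirements is a hypothetical helper (not part of this commit): it
// partitions requirements lines into packages installed normally and packages
// installed with pip's --no-deps flag, based on the "# no-deps" marker.
func splitRequirements(lines []string) (withDeps, noDeps []string) {
	for _, line := range lines {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "#") {
			continue // skip blanks and pure comments
		}
		if spec, found := strings.CutSuffix(line, "# no-deps"); found {
			noDeps = append(noDeps, strings.TrimSpace(spec))
		} else {
			withDeps = append(withDeps, line)
		}
	}
	return withDeps, noDeps
}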

tests/trainer/support.go

Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
+/*
+Copyright 2025.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package trainer
+
+import (
+	"embed"
+
+	"github.com/onsi/gomega"
+
+	"github.com/opendatahub-io/distributed-workloads/tests/common/support"
+)
+
+//go:embed resources/*
+var files embed.FS
+
+func readFile(t support.Test, fileName string) []byte {
+	t.T().Helper()
+	file, err := files.ReadFile(fileName)
+	t.Expect(err).NotTo(gomega.HaveOccurred())
+	return file
+}
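readFile resolves names against the embedded resources/ tree and fails the test immediately if the file is missing, so the Python scripts above travel with the compiled test binary. A hypothetical usage sketch (the resources/train.py file name is an assumption, not part of this commit):

package trainer

import "github.com/opendatahub-io/distributed-workloads/tests/common/support"

// loadTrainingScript is a hypothetical helper (not part of this commit); the
// file name under resources/ is an assumption. The embedded bytes can then be
// mounted into the training job, e.g. via a ConfigMap.
func loadTrainingScript(test support.Test) []byte {
	return readFile(test, "resources/train.py")
}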
