
Commit 39f5ccd

address comments and make training CPU only
1 parent dcb6265 · commit 39f5ccd

3 files changed, 10 insertions(+), 13 deletions(-)

tests/trainer/kubeflow_sdk_test.go

Lines changed: 1 addition & 1 deletion

@@ -23,7 +23,7 @@ import (
 	sdktests "github.com/opendatahub-io/distributed-workloads/tests/trainer/sdk_tests"
 )
 
-func TestKubeflowSDK_Sanity(t *testing.T) {
+func TestKubeflowSdkSanity(t *testing.T) {
 	Tags(t, Sanity)
 	sdktests.RunFashionMnistCpuDistributedTraining(t)
 }

tests/trainer/resources/mnist.ipynb

Lines changed: 7 additions & 9 deletions

@@ -44,12 +44,13 @@
 " x = self.fc2(x)\n",
 " return F.log_softmax(x, dim=1)\n",
 "\n",
-" # Use NCCL if a GPU is available, otherwise use Gloo as communication backend.\n",
-" device, backend = (\"cuda\", \"nccl\") if torch.cuda.is_available() else (\"cpu\", \"gloo\")\n",
-" print(f\"Using Device: {device}, Backend: {backend}\")\n",
+" # Force CPU-only for this test to avoid accidental NCCL/GPU usage\n",
+" backend = \"gloo\"\n",
+" device = torch.device(\"cpu\")\n",
+" print(f\"Using Device: cpu, Backend: {backend}\")\n",
 "\n",
 " # Setup PyTorch distributed.\n",
-" local_rank = int(os.getenv(\"PET_NODE_RANK\", 0))\n",
+" local_rank = int(os.getenv(\"LOCAL_RANK\") or os.getenv(\"PET_NODE_RANK\") or 0)\n",
 " dist.init_process_group(backend=backend)\n",
 " print(\n",
 " \"Distributed Training for WORLD_SIZE: {}, RANK: {}, LOCAL_RANK: {}\".format(\n",
@@ -60,19 +61,16 @@
 " )\n",
 "\n",
 " # Create the model and load it into the device.\n",
-" device = torch.device(f\"{device}:{local_rank}\")\n",
 " model = nn.parallel.DistributedDataParallel(Net().to(device))\n",
 " optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)\n",
 "\n",
-" \n",
 " # Prefer shared PVC if present; else fallback to internet download (rank 0 only)\n",
 " from urllib.parse import urlparse\n",
 " import gzip, shutil\n",
-" \n",
+"\n",
 " pvc_root = \"/mnt/shared\"\n",
 " pvc_raw = os.path.join(pvc_root, \"FashionMNIST\", \"raw\")\n",
 "\n",
-"\n",
 " use_pvc = os.path.isdir(pvc_raw) and any(os.scandir(pvc_raw))\n",
 "\n",
 " if not use_pvc:\n",
@@ -126,7 +124,7 @@
 "\n",
 " # Iterate over mini-batches from the training set\n",
 " for batch_idx, (inputs, labels) in enumerate(train_loader):\n",
-" # Copy the data to the GPU device if available\n",
+" # Move the data to the selected device\n",
 " inputs, labels = inputs.to(device), labels.to(device)\n",
 " # Forward pass\n",
 " outputs = model(inputs)\n",

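For reference, below is a minimal standalone sketch of the CPU-only distributed setup this notebook change converges on. It is not the notebook itself: it assumes PyTorch is installed and the script is launched with torchrun (for example, torchrun --nproc_per_node=2 cpu_ddp_sketch.py), and the file name, the tiny nn.Linear model, and the random tensors are illustrative stand-ins for the FashionMNIST pieces.

import os

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F


def main():
    # Force CPU-only training: Gloo backend, no CUDA/NCCL involvement.
    backend = "gloo"
    device = torch.device("cpu")

    # torchrun exports LOCAL_RANK; the notebook additionally falls back to
    # PET_NODE_RANK and finally to 0, so the same chain is mirrored here.
    local_rank = int(os.getenv("LOCAL_RANK") or os.getenv("PET_NODE_RANK") or 0)

    dist.init_process_group(backend=backend)
    print(
        f"WORLD_SIZE={dist.get_world_size()} RANK={dist.get_rank()} "
        f"LOCAL_RANK={local_rank} backend={backend} device={device}"
    )

    # On CPU there is no per-rank device index, which is why the diff drops the
    # torch.device(f"{device}:{local_rank}") line; the model is wrapped as-is.
    model = nn.parallel.DistributedDataParallel(nn.Linear(10, 2).to(device))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

    # One dummy optimization step to show the loop shape; the notebook iterates
    # over FashionMNIST mini-batches from a DataLoader instead.
    inputs = torch.randn(8, 10, device=device)
    labels = torch.randint(0, 2, (8,), device=device)
    optimizer.zero_grad()
    loss = F.cross_entropy(model(inputs), labels)
    loss.backward()
    optimizer.step()
    print(f"rank {dist.get_rank()}: step done, loss={loss.item():.4f}")

    dist.destroy_process_group()


if __name__ == "__main__":
    main()

Pinning the backend to gloo keeps the sanity test on the CPU path even on clusters without GPUs, where initializing an NCCL process group would fail outright.
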
tests/trainer/sdk_tests/fashion_mnist_tests.go

Lines changed: 2 additions & 3 deletions

@@ -109,8 +109,7 @@ func RunFashionMnistCpuDistributedTraining(t *testing.T) {
 	podName, containerName := trainerutils.WaitForNotebookPodRunning(test, namespace.Name)
 
 	// Poll logs to check if the notebook execution completed successfully
-	if err := trainerutils.PollNotebookLogsForStatus(test, namespace.Name, podName, containerName, support.TestTimeoutDouble); err != nil {
-		test.Expect(err).To(Succeed(), "Notebook execution reported FAILURE")
-	}
+	err = trainerutils.PollNotebookLogsForStatus(test, namespace.Name, podName, containerName, support.TestTimeoutDouble)
+	test.Expect(err).ShouldNot(HaveOccurred(), "Notebook execution reported FAILURE")
 
 }
