|
44 | 44 | " x = self.fc2(x)\n", |
45 | 45 | " return F.log_softmax(x, dim=1)\n", |
46 | 46 | "\n", |
47 | | - " # Use NCCL if a GPU is available, otherwise use Gloo as communication backend.\n", |
48 | | - " device, backend = (\"cuda\", \"nccl\") if torch.cuda.is_available() else (\"cpu\", \"gloo\")\n", |
49 | | - " print(f\"Using Device: {device}, Backend: {backend}\")\n", |
| 47 | + " # Force CPU-only for this test to avoid accidental NCCL/GPU usage\n", |
| 48 | + " backend = \"gloo\"\n", |
| 49 | + " device = torch.device(\"cpu\")\n", |
| 50 | + " print(f\"Using Device: cpu, Backend: {backend}\")\n", |
50 | 51 | "\n", |
51 | 52 | " # Setup PyTorch distributed.\n", |
52 | | - " local_rank = int(os.getenv(\"PET_NODE_RANK\", 0))\n", |
| 53 | + " local_rank = int(os.getenv(\"LOCAL_RANK\") or os.getenv(\"PET_NODE_RANK\") or 0)\n", |
53 | 54 | " dist.init_process_group(backend=backend)\n", |
54 | 55 | " print(\n", |
55 | 56 | " \"Distributed Training for WORLD_SIZE: {}, RANK: {}, LOCAL_RANK: {}\".format(\n", |
|
60 | 61 | " )\n", |
61 | 62 | "\n", |
62 | 63 | " # Create the model and load it into the device.\n", |
63 | | - " device = torch.device(f\"{device}:{local_rank}\")\n", |
64 | 64 | " model = nn.parallel.DistributedDataParallel(Net().to(device))\n", |
65 | 65 | " optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)\n", |
66 | 66 | "\n", |
67 | | - " \n", |
68 | 67 | " # Prefer shared PVC if present; else fallback to internet download (rank 0 only)\n", |
69 | 68 | " from urllib.parse import urlparse\n", |
70 | 69 | " import gzip, shutil\n", |
71 | | - " \n", |
| 70 | + "\n", |
72 | 71 | " pvc_root = \"/mnt/shared\"\n", |
73 | 72 | " pvc_raw = os.path.join(pvc_root, \"FashionMNIST\", \"raw\")\n", |
74 | 73 | "\n", |
75 | | - "\n", |
76 | 74 | " use_pvc = os.path.isdir(pvc_raw) and any(os.scandir(pvc_raw))\n", |
77 | 75 | "\n", |
78 | 76 | " if not use_pvc:\n", |
|
126 | 124 | "\n", |
127 | 125 | " # Iterate over mini-batches from the training set\n", |
128 | 126 | " for batch_idx, (inputs, labels) in enumerate(train_loader):\n", |
129 | | - " # Copy the data to the GPU device if available\n", |
| 127 | + " # Move the data to the selected device\n", |
130 | 128 | " inputs, labels = inputs.to(device), labels.to(device)\n", |
131 | 129 | " # Forward pass\n", |
132 | 130 | " outputs = model(inputs)\n", |
|
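For context, a minimal standalone sketch of the CPU-only, env-driven initialization this diff switches to. It assumes a torchrun-style launcher that exports RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT; the PET_NODE_RANK fallback mirrors the notebook and is specific to torchelastic-based launchers such as the Kubeflow training operator:

    import os
    import torch.distributed as dist

    # torchrun exports LOCAL_RANK/RANK/WORLD_SIZE; torchelastic-style launchers
    # export PET_-prefixed variants such as PET_NODE_RANK, hence the chained lookup.
    local_rank = int(os.getenv("LOCAL_RANK") or os.getenv("PET_NODE_RANK") or 0)

    # "gloo" runs on CPU-only hosts; "nccl" requires CUDA. With no init_method given,
    # init_process_group reads MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE from the env.
    dist.init_process_group(backend="gloo")
    print(f"rank={dist.get_rank()} local_rank={local_rank} world_size={dist.get_world_size()}")
    dist.destroy_process_group()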
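The PVC-or-download branch follows the usual rank-0-downloads pattern; a hedged sketch under the same assumptions (the /mnt/shared mount and FashionMNIST layout come from the notebook, while the barrier placement is the standard way to keep other ranks from racing the download):

    import os
    import torch.distributed as dist
    from torchvision import datasets, transforms

    pvc_raw = "/mnt/shared/FashionMNIST/raw"
    use_pvc = os.path.isdir(pvc_raw) and any(os.scandir(pvc_raw))
    data_root = "/mnt/shared" if use_pvc else "./data"

    # Only rank 0 touches the network; every other rank waits at the barrier
    # until the dataset exists, then all ranks load it from disk.
    if not use_pvc and dist.get_rank() == 0:
        datasets.FashionMNIST(data_root, train=True, download=True)
    dist.barrier()

    train_ds = datasets.FashionMNIST(data_root, train=True, transform=transforms.ToTensor())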