Skip to content

Commit 112f217

Browse files
committed
Fix CI master port assignment so all distributed-test ranks share one MASTER_PORT
1 parent e020ccf commit 112f217

File tree

2 files changed

+18
-6
lines changed

2 files changed

+18
-6
lines changed

.github/workflows/build-test-linux-x86_64.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -584,6 +584,10 @@ jobs:
584584
echo "[CONFIG] Number of GPUs to use: ${NUM_GPUS}"
585585
echo "[AVAILABLE] GPUs detected by nvidia-smi: $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)"
586586
587+
# Set master port for distributed communication (must be same across all ranks)
588+
export MASTER_ADDR=127.0.0.1
589+
export MASTER_PORT=29500
590+
587591
# Use a wrapper script to ensure only rank 0 writes the JUnit XML
588592
# Each rank runs pytest, but only rank 0 saves results to avoid file conflicts
589593
RANK_0_XML="${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml"

tests/py/dynamo/distributed/distributed_utils.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,11 @@ def set_environment_variables_pytest_single_process():
2222
def set_environment_variables_pytest_multi_process(
2323
rank: int = 0, world_size: int = 1
2424
) -> None:
25-
port = 29500 + random.randint(1, 1000)
25+
# Use the existing MASTER_PORT if set, otherwise generate a random one
26+
if "MASTER_PORT" not in os.environ:
27+
port = 29500 + random.randint(1, 1000)
28+
os.environ["MASTER_PORT"] = str(port)
29+
2630
# these variables are set by mpirun -n 2
2731
local_rank = int(
2832
os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK", rank % torch.cuda.device_count())
@@ -32,11 +36,15 @@ def set_environment_variables_pytest_multi_process(
3236
# Set up environment variable to run with mpirun
3337
os.environ["RANK"] = str(local_rank)
3438
os.environ["WORLD_SIZE"] = str(world_size)
35-
os.environ["MASTER_ADDR"] = "127.0.0.1"
36-
os.environ["MASTER_PORT"] = str(port)
37-
38-
# Necessary to assign a device to each rank.
39-
torch.cuda.set_device(local_rank)
39+
os.environ["MASTER_ADDR"] = os.environ.get("MASTER_ADDR", "127.0.0.1")
40+
41+
# Map ranks onto available GPUs with modulo, so more ranks than GPUs is allowed (e.g. 2 processes sharing 1 GPU)
42+
num_gpus = torch.cuda.device_count()
43+
if num_gpus > 0:
44+
gpu_id = local_rank % num_gpus
45+
torch.cuda.set_device(gpu_id)
46+
else:
47+
raise RuntimeError("No CUDA devices available for distributed testing")
4048

4149
# We use nccl backend
4250
dist.init_process_group("nccl")

0 commit comments

Comments
 (0)