@@ -519,7 +519,7 @@ jobs:
 
   L2-dynamo-distributed-tests:
     name: L2 dynamo distributed tests
-    needs: [filter-matrix, build, L1-dynamo-core-tests, L1-dynamo-compile-tests, L1-torch-compile-tests, L1-torchscript-tests]
+    needs: [filter-matrix, build]
     strategy:
       fail-fast: false
       matrix:
@@ -568,17 +568,36 @@ jobs:
 
           # Install MPI (required for TensorRT-LLM plugins)
           echo "Installing MPI..."
-          dnf install -y mpich mpich-devel openmpi openmpi-devel
+          dnf install -y openmpi openmpi-devel
+
+          # Add OpenMPI to PATH (RHEL/AlmaLinux specific location)
+          export PATH="/usr/lib64/openmpi/bin:$PATH"
+          export LD_LIBRARY_PATH="/usr/lib64/openmpi/lib:$LD_LIBRARY_PATH"
+
+          # Verify mpirun is accessible
+          which mpirun
+          mpirun --version
 
           # Run distributed tests
           pushd .
           cd tests/py/dynamo
 
           echo "Running distributed tests with mpirun..."
+          echo "[CONFIG] Number of GPUs to use: ${{ matrix.num_gpus }}"
+          echo "[AVAILABLE] GPUs detected by nvidia-smi: $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)"
+
+          # Use a wrapper script to ensure only rank 0 writes the JUnit XML
+          # Each rank runs pytest, but only rank 0 saves results to avoid file conflicts
+          RANK_0_XML="${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml"
           mpirun --allow-run-as-root -n ${{ matrix.num_gpus }} \
-            python -m pytest -ra \
-              --junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml \
-              distributed/test_nccl_ops.py
+            bash -c '
+              echo "[MPI DEBUG] Rank: ${OMPI_COMM_WORLD_RANK:-0}, World Size: ${OMPI_COMM_WORLD_SIZE:-1}"
+              if [ "${OMPI_COMM_WORLD_RANK:-0}" -eq 0 ]; then
+                python -m pytest -ra --junitxml='"${RANK_0_XML}"' distributed/test_nccl_ops.py
+              else
+                python -m pytest -ra distributed/test_nccl_ops.py
+              fi
+            '
 
           popd
 
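Note on the quoting in the new mpirun invocation: it is a deliberate splice. The sequence '"${RANK_0_XML}"' closes the single-quoted bash -c body, inserts the path in double quotes so the outer shell expands it before mpirun launches any rank, then reopens the single quotes; everything else, including the OMPI_COMM_WORLD_* checks, is evaluated per rank inside the child shell. Below is a minimal sketch of the same pattern that runs without GPUs or pytest; the output path and rank count are placeholders, not values from this workflow:

    # Outer shell expands OUT_XML once; each rank then checks its own rank variable
    OUT_XML="/tmp/results/rank0.xml"
    mkdir -p "$(dirname "${OUT_XML}")"
    mpirun --allow-run-as-root -n 2 \
      bash -c '
        echo "rank ${OMPI_COMM_WORLD_RANK:-0} of ${OMPI_COMM_WORLD_SIZE:-1}"
        # Only rank 0 writes the report file, so concurrent ranks never race on it
        if [ "${OMPI_COMM_WORLD_RANK:-0}" -eq 0 ]; then
          echo "report" > '"${OUT_XML}"'
        fi
      '

Since OMPI_COMM_WORLD_RANK is unset outside mpirun, the :-0 default makes a plain bash invocation behave like rank 0, so the same command degrades gracefully when run without an MPI launcher.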