Skip to content

Commit 57abf82

Browse files
committed
debug prints
1 parent c658233 commit 57abf82

File tree

1 file changed

+24
-5
lines changed

1 file changed

+24
-5
lines changed

.github/workflows/build-test-linux-x86_64.yml

Lines changed: 24 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -519,7 +519,7 @@ jobs:
519519
520520
L2-dynamo-distributed-tests:
521521
name: L2 dynamo distributed tests
522-
needs: [filter-matrix, build, L1-dynamo-core-tests, L1-dynamo-compile-tests, L1-torch-compile-tests, L1-torchscript-tests]
522+
needs: [filter-matrix, build]
523523
strategy:
524524
fail-fast: false
525525
matrix:
@@ -568,17 +568,36 @@ jobs:
568568
569569
# Install MPI (required for TensorRT-LLM plugins)
570570
echo "Installing MPI..."
571-
dnf install -y mpich mpich-devel openmpi openmpi-devel
571+
dnf install -y openmpi openmpi-devel
572+
573+
# Add OpenMPI to PATH (RHEL/AlmaLinux specific location)
574+
export PATH="/usr/lib64/openmpi/bin:$PATH"
575+
export LD_LIBRARY_PATH="/usr/lib64/openmpi/lib:$LD_LIBRARY_PATH"
576+
577+
# Verify mpirun is accessible
578+
which mpirun
579+
mpirun --version
572580
573581
# Run distributed tests
574582
pushd .
575583
cd tests/py/dynamo
576584
577585
echo "Running distributed tests with mpirun..."
586+
echo "[CONFIG] Number of GPUs to use: ${{ matrix.num_gpus }}"
587+
echo "[AVAILABLE] GPUs detected by nvidia-smi: $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)"
588+
589+
# Wrap pytest in an inline `bash -c` script so that only rank 0 writes the JUnit XML
590+
# Every rank runs pytest, but only rank 0 passes --junitxml, which avoids conflicting writes to the same file
591+
RANK_0_XML="${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml"
578592
mpirun --allow-run-as-root -n ${{ matrix.num_gpus }} \
579-
python -m pytest -ra \
580-
--junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml \
581-
distributed/test_nccl_ops.py
593+
bash -c '
594+
echo "[MPI DEBUG] Rank: ${OMPI_COMM_WORLD_RANK:-0}, World Size: ${OMPI_COMM_WORLD_SIZE:-1}"
595+
if [ "${OMPI_COMM_WORLD_RANK:-0}" -eq 0 ]; then
596+
python -m pytest -ra --junitxml='"${RANK_0_XML}"' distributed/test_nccl_ops.py
597+
else
598+
python -m pytest -ra distributed/test_nccl_ops.py
599+
fi
600+
'
582601
583602
popd
584603

0 commit comments

Comments
 (0)