Skip to content

Commit 22d297a

Browse files
committed
debug prints
1 parent c658233 commit 22d297a

File tree

3 files changed

+31
-9
lines changed

3 files changed

+31
-9
lines changed

.github/scripts/filter-matrix.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,8 @@ def main(args: list[str]) -> None:
150150
options.limit_pr_builds == "true",
151151
):
152152
print(f"[DEBUG] passed filter - adding to build matrix", file=sys.stderr)
153+
# Add tensorrt version to all items (required by linux-test.yml)
154+
item["tensorrt"] = {"version": "10.13.3"}
153155
filtered_includes.append(item)
154156

155157
# NEW: Create distributed variant for specific configs

.github/workflows/build-test-linux-x86_64.yml

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -519,7 +519,7 @@ jobs:
519519
520520
L2-dynamo-distributed-tests:
521521
name: L2 dynamo distributed tests
522-
needs: [filter-matrix, build, L1-dynamo-core-tests, L1-dynamo-compile-tests, L1-torch-compile-tests, L1-torchscript-tests]
522+
needs: [filter-matrix, build]
523523
strategy:
524524
fail-fast: false
525525
matrix:
@@ -542,6 +542,8 @@ jobs:
542542
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).distributed_include) }}
543543
}
544544
pre-script: ${{ matrix.pre-script }}
545+
post-script: ${{ matrix.post-script }}
546+
smoke-test-script: ${{ matrix.smoke-test-script }}
545547
script: |
546548
set -euo pipefail
547549
@@ -551,9 +553,7 @@ jobs:
551553
echo "=========================================="
552554
echo "Python version: ${PYTHON_VERSION}"
553555
echo "CUDA version: ${CU_VERSION}"
554-
echo "Runner: ${{ matrix.validation_runner }}"
555-
echo "Num GPUs: ${{ matrix.num_gpus }}"
556-
echo "Config: ${{ matrix.config }}"
556+
echo "Num GPUs: ${NUM_GPUS}"
557557
echo "=========================================="
558558
559559
# Verify GPUs are available
@@ -568,17 +568,36 @@ jobs:
568568
569569
# Install MPI (required for TensorRT-LLM plugins)
570570
echo "Installing MPI..."
571-
dnf install -y mpich mpich-devel openmpi openmpi-devel
571+
dnf install -y openmpi openmpi-devel
572+
573+
# Add OpenMPI to PATH (RHEL/AlmaLinux-specific location)
574+
export PATH="/usr/lib64/openmpi/bin:$PATH"
575+
export LD_LIBRARY_PATH="/usr/lib64/openmpi/lib:$LD_LIBRARY_PATH"
576+
577+
# Verify mpirun is accessible
578+
which mpirun
579+
mpirun --version
572580
573581
# Run distributed tests
574582
pushd .
575583
cd tests/py/dynamo
576584
577585
echo "Running distributed tests with mpirun..."
578-
mpirun --allow-run-as-root -n ${{ matrix.num_gpus }} \
579-
python -m pytest -ra \
580-
--junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml \
581-
distributed/test_nccl_ops.py
586+
echo "[CONFIG] Number of GPUs to use: ${NUM_GPUS}"
587+
echo "[AVAILABLE] GPUs detected by nvidia-smi: $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)"
588+
589+
# Use an inline bash wrapper (bash -c) to ensure only rank 0 writes the JUnit XML
590+
# Each rank runs pytest, but only rank 0 saves results to avoid file conflicts
591+
RANK_0_XML="${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml"
592+
mpirun --allow-run-as-root -n ${NUM_GPUS} \
593+
bash -c '
594+
echo "[MPI DEBUG] Rank: ${OMPI_COMM_WORLD_RANK:-0}, World Size: ${OMPI_COMM_WORLD_SIZE:-1}"
595+
if [ "${OMPI_COMM_WORLD_RANK:-0}" -eq 0 ]; then
596+
python -m pytest -ra --junitxml='"${RANK_0_XML}"' distributed/test_nccl_ops.py
597+
else
598+
python -m pytest -ra distributed/test_nccl_ops.py
599+
fi
600+
'
582601
583602
popd
584603

.github/workflows/linux-test.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ jobs:
7474
RUNNER_TEST_RESULTS_DIR: /tmp/test_results
7575
ARCH: ${{ inputs.architecture }}
7676
USE_TRT_RTX: ${{ inputs.use-rtx }}
77+
NUM_GPUS: ${{ matrix.num_gpus || '' }}
7778
DOWNLOAD_ARTIFACT_NAME: pytorch_tensorrt_${{ matrix.tensorrt.version }}_${{ matrix.python_version }}_${{ matrix.desired_cuda }}_${{ inputs.architecture }}
7879
name: ${{ inputs.job-name }}-${{ matrix.tensorrt.version }}-${{ matrix.python_version }}-${{ matrix.desired_cuda }}
7980
runs-on: ${{ matrix.validation_runner }}

0 commit comments

Comments
 (0)