Skip to content

Commit 41e8582

Browse files
committed
multigpu ci test
1 parent f758a21 commit 41e8582

File tree

2 files changed

+91
-6
lines changed

2 files changed

+91
-6
lines changed

.github/scripts/filter-matrix.py

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,39 @@ def filter_matrix_item(
6060
return True
6161

6262

63+
def create_distributed_config(
    item: Dict[str, Any],
    *,
    runner: str = "linux.g4dn.12xlarge.nvidia.gpu",
    num_gpus: int = 2,
) -> Dict[str, Any]:
    """Create a distributed test configuration from a regular config.

    Returns a shallow copy of ``item`` with three fields set for
    distributed testing; ``item`` itself is not modified:

    - ``validation_runner`` is overridden with ``runner``
    - ``num_gpus`` is added
    - ``config`` is set to the marker ``"distributed"``

    Args:
        item: A single matrix entry. ``python_version``, ``desired_cuda``
            and ``validation_runner`` are read only for debug output and
            may be absent.
        runner: Runner label for the distributed job. Defaults to the
            multi-GPU instance this helper was originally hard-wired to.
        num_gpus: Value stored under ``num_gpus``.

    Returns:
        The new configuration dict. Only the top level is copied, so any
        nested values are still shared with ``item``.
    """
    # Kept function-local so the helper stays self-contained.
    import sys

    # Diagnostics go to stderr so that stdout carries nothing but the
    # JSON matrix this script prints for the workflow to consume.
    print("[DEBUG] Creating distributed config from:", file=sys.stderr)
    print(f"[DEBUG] Python: {item.get('python_version')}", file=sys.stderr)
    print(f"[DEBUG] CUDA: {item.get('desired_cuda')}", file=sys.stderr)
    print(
        f"[DEBUG] Original runner: {item.get('validation_runner')}",
        file=sys.stderr,
    )

    # Build a new dict instead of mutating the caller's entry; keys that
    # already exist in ``item`` keep their position, new ones are appended.
    dist_item = {
        **item,
        "validation_runner": runner,
        "num_gpus": num_gpus,
        "config": "distributed",
    }

    print(f"[DEBUG] New runner: {dist_item['validation_runner']}", file=sys.stderr)
    print(f"[DEBUG] GPUs: {dist_item['num_gpus']}", file=sys.stderr)

    return dist_item
94+
95+
6396
def main(args: list[str]) -> None:
6497
parser = argparse.ArgumentParser()
6598
parser.add_argument(
@@ -99,6 +132,9 @@ def main(args: list[str]) -> None:
99132

100133
includes = matrix_dict["include"]
101134
filtered_includes = []
135+
distributed_includes = [] # NEW: separate list for distributed configs
136+
137+
print(f"[DEBUG] Processing {len(includes)} input configs", file=sys.stderr)
102138

103139
for item in includes:
104140
if filter_matrix_item(
@@ -107,8 +143,25 @@ def main(args: list[str]) -> None:
107143
options.limit_pr_builds == "true",
108144
):
109145
filtered_includes.append(item)
110-
111-
filtered_matrix_dict = {"include": filtered_includes}
146+
147+
# NEW: Create distributed variant for specific configs
148+
# Only Python 3.10 + CUDA 13.0 for now
149+
if item["python_version"] == "3.10" and item["desired_cuda"] == "cu130":
150+
print(f"[DEBUG] Creating distributed config for py3.10+cu130", file=sys.stderr)
151+
distributed_includes.append(create_distributed_config(item))
152+
153+
# Debug: Show summary
154+
print(f"[DEBUG] Final counts:", file=sys.stderr)
155+
print(f"[DEBUG] Regular configs: {len(filtered_includes)}", file=sys.stderr)
156+
print(f"[DEBUG] Distributed configs: {len(distributed_includes)}", file=sys.stderr)
157+
158+
# NEW: Output both regular and distributed configs
159+
filtered_matrix_dict = {
160+
"include": filtered_includes,
161+
"distributed_include": distributed_includes # NEW field
162+
}
163+
164+
# Output to stdout (consumed by GitHub Actions)
112165
print(json.dumps(filtered_matrix_dict))
113166

114167

.github/workflows/build-test-linux-x86_64.yml

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -480,18 +480,50 @@ jobs:
480480
ref: ""
481481
test-infra-repository: pytorch/test-infra
482482
test-infra-ref: main
483-
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
483+
# Extract the distributed_include array from filter-matrix output
484+
build-matrix: |
485+
{
486+
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).distributed_include) }}
487+
}
484488
pre-script: ${{ matrix.pre-script }}
485489
script: |
486490
set -euo pipefail
491+
492+
# Debug: Show what config we're using
493+
echo "=========================================="
494+
echo "DISTRIBUTED TEST CONFIGURATION"
495+
echo "=========================================="
496+
echo "Python version: ${PYTHON_VERSION}"
497+
echo "CUDA version: ${CU_VERSION}"
498+
echo "Runner: ${{ matrix.validation_runner }}"
499+
echo "Num GPUs: ${{ matrix.num_gpus }}"
500+
echo "Config: ${{ matrix.config }}"
501+
echo "=========================================="
502+
503+
# Verify GPUs are available
504+
echo "Checking GPU availability:"
505+
nvidia-smi
506+
echo "GPU count: $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)"
507+
echo "=========================================="
508+
487509
export USE_HOST_DEPS=1
488510
export CI_BUILD=1
489511
export USE_TRTLLM_PLUGINS=1
512+
513+
# Install MPI (required for TensorRT-LLM plugins)
514+
echo "Installing MPI..."
490515
dnf install -y mpich mpich-devel openmpi openmpi-devel
516+
517+
# Run distributed tests
491518
pushd .
492-
cd tests/py
493-
cd dynamo
494-
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml distributed/test_nccl_ops.py
519+
cd tests/py/dynamo
520+
521+
echo "Running distributed tests with torchrun..."
522+
torchrun --nproc_per_node=${{ matrix.num_gpus }} \
523+
-m pytest -ra \
524+
--junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml \
525+
distributed/test_nccl_ops.py
526+
495527
popd
496528
497529
concurrency:

0 commit comments

Comments
 (0)