Skip to content

Commit a61153d

Browse files
committed
multigpu ci test
1 parent f758a21 commit a61153d

File tree

3 files changed

+141
-6
lines changed

3 files changed

+141
-6
lines changed

.github/scripts/filter-matrix.py

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,41 @@ def filter_matrix_item(
6060
return True
6161

6262

63+
def create_distributed_config(
    item: Dict[str, Any],
    *,
    runner: str = "linux.g4dn.12xlarge.nvidia.gpu",
    num_gpus: int = 2,
) -> Dict[str, Any]:
    """Create a distributed test configuration from a regular config.

    Takes a standard test config and derives a variant for distributed testing:
    - Changes ``validation_runner`` to a multi-GPU instance
    - Adds a ``num_gpus`` field
    - Adds a ``config`` marker set to ``"distributed"``

    Args:
        item: A single matrix entry. It is not modified; a shallow copy is
            returned, which is sufficient because only top-level keys are
            overridden.
        runner: Runner label to schedule the distributed job on.
        num_gpus: Number of GPUs the distributed job should use.

    Returns:
        A new dict holding every key of ``item`` plus the distributed overrides.

    Raises:
        ValueError: If ``num_gpus`` is smaller than 1.
    """
    import sys

    if num_gpus < 1:
        raise ValueError(f"num_gpus must be >= 1, got {num_gpus}")

    # Debug output goes to stderr so stdout stays clean for the JSON matrix.
    print("[DEBUG] Creating distributed config from:", file=sys.stderr)
    print(f"[DEBUG] Python: {item.get('python_version')}", file=sys.stderr)
    print(f"[DEBUG] CUDA: {item.get('desired_cuda')}", file=sys.stderr)
    print(
        f"[DEBUG] Original runner: {item.get('validation_runner')}",
        file=sys.stderr,
    )

    # Build a new dict instead of mutating the caller's entry, which is also
    # appended to the regular build matrix.
    dist_item = {
        **item,
        "validation_runner": runner,
        "num_gpus": num_gpus,
        "config": "distributed",
    }

    print(f"[DEBUG] New runner: {dist_item['validation_runner']}", file=sys.stderr)
    print(f"[DEBUG] GPUs: {dist_item['num_gpus']}", file=sys.stderr)

    return dist_item
96+
97+
6398
def main(args: list[str]) -> None:
6499
parser = argparse.ArgumentParser()
65100
parser.add_argument(
@@ -99,16 +134,69 @@ def main(args: list[str]) -> None:
99134

100135
includes = matrix_dict["include"]
101136
filtered_includes = []
137+
distributed_includes = [] # NEW: separate list for distributed configs
138+
139+
print(f"[DEBUG] Processing {len(includes)} input configs", file=sys.stderr)
102140

103141
for item in includes:
142+
py_ver = item.get("python_version", "unknown")
143+
cuda_ver = item.get("desired_cuda", "unknown")
144+
145+
print(f"[DEBUG] Checking config: py={py_ver}, cuda={cuda_ver}", file=sys.stderr)
146+
104147
if filter_matrix_item(
105148
item,
106149
options.jetpack == "true",
107150
options.limit_pr_builds == "true",
108151
):
152+
print(f"[DEBUG] passed filter - adding to build matrix", file=sys.stderr)
109153
filtered_includes.append(item)
110154

111-
filtered_matrix_dict = {"include": filtered_includes}
155+
# NEW: Create distributed variant for specific configs
156+
# Only Python 3.10 + CUDA 13.0 for now
157+
if item["python_version"] == "3.10" and item["desired_cuda"] == "cu130":
158+
print(
159+
f"[DEBUG] Creating distributed config for py3.10+cu130",
160+
file=sys.stderr,
161+
)
162+
distributed_includes.append(create_distributed_config(item))
163+
else:
164+
print(f"[DEBUG] FILTERED OUT", file=sys.stderr)
165+
166+
# Debug: Show summary
167+
print(f"[DEBUG] Final counts:", file=sys.stderr)
168+
print(f"[DEBUG] Regular configs: {len(filtered_includes)}", file=sys.stderr)
169+
print(
170+
f"[DEBUG] Distributed configs: {len(distributed_includes)}", file=sys.stderr
171+
)
172+
173+
# Debug: Show which configs will be built
174+
print(
175+
f"[DEBUG] Configs that will be BUILT (in filtered_includes):", file=sys.stderr
176+
)
177+
for item in filtered_includes:
178+
print(
179+
f"[DEBUG] - py={item.get('python_version')}, cuda={item.get('desired_cuda')}",
180+
file=sys.stderr,
181+
)
182+
183+
print(
184+
f"[DEBUG] Configs for DISTRIBUTED TESTS (in distributed_includes):",
185+
file=sys.stderr,
186+
)
187+
for item in distributed_includes:
188+
print(
189+
f"[DEBUG] - py={item.get('python_version')}, cuda={item.get('desired_cuda')}, gpus={item.get('num_gpus')}",
190+
file=sys.stderr,
191+
)
192+
193+
# NEW: Output both regular and distributed configs
194+
filtered_matrix_dict = {
195+
"include": filtered_includes,
196+
"distributed_include": distributed_includes, # NEW field
197+
}
198+
199+
# Output to stdout (consumed by GitHub Actions)
112200
print(json.dumps(filtered_matrix_dict))
113201

114202

.github/workflows/build-test-linux-x86_64.yml

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,11 @@ jobs:
6868
ref: ""
6969
test-infra-repository: pytorch/test-infra
7070
test-infra-ref: main
71-
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
71+
# Extract the include array from filter-matrix output
72+
build-matrix: |
73+
{
74+
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
75+
}
7276
pre-script: ${{ matrix.pre-script }}
7377
env-var-script: ${{ matrix.env-var-script }}
7478
post-script: ${{ matrix.post-script }}
@@ -480,18 +484,50 @@ jobs:
480484
ref: ""
481485
test-infra-repository: pytorch/test-infra
482486
test-infra-ref: main
483-
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
487+
# Extract the distributed_include array from filter-matrix output
488+
build-matrix: |
489+
{
490+
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).distributed_include) }}
491+
}
484492
pre-script: ${{ matrix.pre-script }}
485493
script: |
486494
set -euo pipefail
495+
496+
# Debug: Show what config we're using
497+
echo "=========================================="
498+
echo "DISTRIBUTED TEST CONFIGURATION"
499+
echo "=========================================="
500+
echo "Python version: ${PYTHON_VERSION}"
501+
echo "CUDA version: ${CU_VERSION}"
502+
echo "Runner: ${{ matrix.validation_runner }}"
503+
echo "Num GPUs: ${{ matrix.num_gpus }}"
504+
echo "Config: ${{ matrix.config }}"
505+
echo "=========================================="
506+
507+
# Verify GPUs are available
508+
echo "Checking GPU availability:"
509+
nvidia-smi
510+
echo "GPU count: $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)"
511+
echo "=========================================="
512+
487513
export USE_HOST_DEPS=1
488514
export CI_BUILD=1
489515
export USE_TRTLLM_PLUGINS=1
516+
517+
# Install MPI (required for TensorRT-LLM plugins)
518+
echo "Installing MPI..."
490519
dnf install -y mpich mpich-devel openmpi openmpi-devel
520+
521+
# Run distributed tests
491522
pushd .
492-
cd tests/py
493-
cd dynamo
494-
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml distributed/test_nccl_ops.py
523+
cd tests/py/dynamo
524+
525+
echo "Running distributed tests with mpirun..."
526+
mpirun --allow-run-as-root -n ${{ matrix.num_gpus }} \
527+
python -m pytest -ra \
528+
--junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml \
529+
distributed/test_nccl_ops.py
530+
495531
popd
496532
497533
concurrency:

.github/workflows/build_linux.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,17 @@ jobs:
160160
options: ${{ matrix.gpu_arch_type == 'cuda' && '--gpus all' || ' ' }}
161161
timeout-minutes: ${{ inputs.timeout }}
162162
steps:
163+
- name: Debug matrix configuration
164+
shell: bash
165+
run: |
166+
echo "=========================================="
167+
echo "BUILD MATRIX DEBUG"
168+
echo "=========================================="
169+
echo "Python version: ${{ matrix.python_version }}"
170+
echo "CUDA version: ${{ matrix.desired_cuda }}"
171+
echo "GPU arch type: ${{ matrix.gpu_arch_type }}"
172+
echo "Runner: ${{ matrix.validation_runner }}"
173+
echo "=========================================="
163174
- name: Clean workspace
164175
shell: bash -l {0}
165176
run: |

0 commit comments

Comments
 (0)