Add multi-GPU (distributed) CI test

apbose · apbose · commit 41e85820303c · 2025-11-18T18:00:34.000-08:00
diff --git a/.github/scripts/filter-matrix.py b/.github/scripts/filter-matrix.py
@@ -60,6 +60,39 @@ def filter_matrix_item(
         return True
 
 
+def create_distributed_config(
+    item: Dict[str, Any],
+    runner: str = "linux.g4dn.12xlarge.nvidia.gpu",
+    num_gpus: int = 2,
+) -> Dict[str, Any]:
+    """Return a distributed (multi-GPU) variant of a build-matrix entry.
+
+    ``item`` is shallow-copied, so the caller's dict is not modified. The copy
+    gets ``validation_runner`` set to ``runner``, ``num_gpus`` set to
+    ``num_gpus`` and ``config`` set to ``"distributed"``. Debug output goes to
+    stderr only.
+    """
+    import sys
+
+    def debug(message: str) -> None:
+        print(f"[DEBUG] {message}", file=sys.stderr)
+
+    debug("Creating distributed config from:")
+    debug(f"  Python: {item.get('python_version')}")
+    debug(f"  CUDA: {item.get('desired_cuda')}")
+    debug(f"  Original runner: {item.get('validation_runner')}")
+
+    # Copy before overriding so the regular matrix entry keeps its own runner.
+    dist_item = item.copy()
+    dist_item["validation_runner"] = runner
+    dist_item["num_gpus"] = num_gpus
+    dist_item["config"] = "distributed"
+
+    debug(f"  New runner: {dist_item['validation_runner']}")
+    debug(f"  GPUs: {dist_item['num_gpus']}")
+    return dist_item
+
+
 def main(args: list[str]) -> None:
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -99,6 +132,9 @@ def main(args: list[str]) -> None:
 
     includes = matrix_dict["include"]
     filtered_includes = []
+    distributed_includes = []  # NEW: separate list for distributed configs
+    
+    print(f"[DEBUG] Processing {len(includes)} input configs", file=sys.stderr)
 
     for item in includes:
         if filter_matrix_item(
@@ -107,8 +143,25 @@ def main(args: list[str]) -> None:
             options.limit_pr_builds == "true",
         ):
             filtered_includes.append(item)
-
-    filtered_matrix_dict = {"include": filtered_includes}
+            
+            # NEW: Create distributed variant for specific configs
+            # Only Python 3.10 + CUDA 13.0 for now
+            if item["python_version"] == "3.10" and item["desired_cuda"] == "cu130":
+                print(f"[DEBUG] Creating distributed config for py3.10+cu130", file=sys.stderr)
+                distributed_includes.append(create_distributed_config(item))
+    
+    # Debug: Show summary
+    print(f"[DEBUG] Final counts:", file=sys.stderr)
+    print(f"[DEBUG]   Regular configs: {len(filtered_includes)}", file=sys.stderr)
+    print(f"[DEBUG]   Distributed configs: {len(distributed_includes)}", file=sys.stderr)
+
+    # NEW: Output both regular and distributed configs
+    filtered_matrix_dict = {
+        "include": filtered_includes,
+        "distributed_include": distributed_includes  # NEW field
+    }
+    
+    # Output to stdout (consumed by GitHub Actions)
     print(json.dumps(filtered_matrix_dict))
 
 
diff --git a/.github/workflows/build-test-linux-x86_64.yml b/.github/workflows/build-test-linux-x86_64.yml
@@ -480,18 +480,50 @@ jobs:
       ref: ""
       test-infra-repository: pytorch/test-infra
       test-infra-ref: main
-      build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
+      # Extract the distributed_include array from filter-matrix output
+      build-matrix: |
+        {
+          "include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).distributed_include) }}
+        }
       pre-script: ${{ matrix.pre-script }}
       script: |
         set -euo pipefail
+        
+        # Debug: Show what config we're using
+        echo "=========================================="
+        echo "DISTRIBUTED TEST CONFIGURATION"
+        echo "=========================================="
+        echo "Python version: ${PYTHON_VERSION}"
+        echo "CUDA version: ${CU_VERSION}"
+        echo "Runner: ${{ matrix.validation_runner }}"
+        echo "Num GPUs: ${{ matrix.num_gpus }}"
+        echo "Config: ${{ matrix.config }}"
+        echo "=========================================="
+        
+        # Verify GPUs are available
+        echo "Checking GPU availability:"
+        nvidia-smi
+        echo "GPU count: $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)"
+        echo "=========================================="
+        
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         export USE_TRTLLM_PLUGINS=1
+        
+        # Install MPI (required for TensorRT-LLM plugins)
+        echo "Installing MPI..."
         dnf install -y mpich mpich-devel openmpi openmpi-devel
+        
+        # Run distributed tests
         pushd .
-        cd tests/py
-        cd dynamo
-        python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml distributed/test_nccl_ops.py
+        cd tests/py/dynamo
+        
+        echo "Running distributed tests with torchrun..."
+        torchrun --nproc_per_node=${{ matrix.num_gpus }} \
+          -m pytest -ra \
+          --junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml \
+          distributed/test_nccl_ops.py
+        
         popd
 
 concurrency: