|
1 | 1 | import concurrent |
2 | 2 | import contextlib |
| 3 | +import functools |
3 | 4 | import itertools |
4 | 5 | import json |
5 | 6 | import os |
| 7 | +import re |
| 8 | +import subprocess |
6 | 9 | import tempfile |
7 | 10 | import time |
8 | 11 | from collections import namedtuple |
@@ -49,6 +52,40 @@ def result(self): |
49 | 52 | DEFAULT_SERVER_WAITING_TIMEOUT = 1200 |
50 | 53 |
|
51 | 54 |
|
@functools.lru_cache(maxsize=1)
def has_nvlink():
    """
    Check if the system has NVLink connectivity between GPUs.

    The result is cached via ``lru_cache`` so the ``nvidia-smi``
    subprocess is spawned at most once per process.

    Returns:
        bool: True if NVLink is detected, False otherwise (including
        when ``nvidia-smi`` is missing, fails, or times out).
    """
    try:
        # Query NVLink status. check=False because a nonzero exit simply
        # means "no NVLink to report", not an exceptional condition.
        # The timeout guards against nvidia-smi hanging on a wedged
        # driver and stalling the whole test run.
        result = subprocess.run(['nvidia-smi', 'nvlink', '-s'],
                                capture_output=True,
                                text=True,
                                check=False,
                                timeout=30)

        # Nonzero exit code: nvidia-smi ran but could not report NVLink.
        if result.returncode != 0:
            return False

        # Active NVLink connections show up as bandwidth lines of the
        # form "Link <N>: <XX.XXX> GB/s".
        return bool(re.search(r'Link \d+:\s+[\d.]+\s+GB/s', result.stdout))

    except (FileNotFoundError, subprocess.SubprocessError):
        # nvidia-smi not installed, failed to execute, or timed out
        # (TimeoutExpired is a SubprocessError subclass).
        return False
    except Exception:
        # Best-effort probe: any other unexpected error means NVLink
        # cannot be confirmed, so report it as absent.
        return False
| 87 | + |
| 88 | + |
52 | 89 | class MyThreadPoolExecutor(ThreadPoolExecutor): |
53 | 90 |
|
54 | 91 | def __init__(self, *args, **kwargs) -> None: |
@@ -196,6 +233,8 @@ def _apply_perf_flags(cfg: Optional[Dict[str, Any]]): |
196 | 233 | gpu_range = range(current_gpu_offset, |
197 | 234 | current_gpu_offset + gen_total_gpus) |
198 | 235 | env["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_range)) |
| 236 | + if not has_nvlink(): |
| 237 | + env["UCX_TLS"] = "^cuda_ipc" |
199 | 238 | current_gpu_offset += gen_total_gpus |
200 | 239 |
|
201 | 240 | gen_server_args = gen_args + [ |
@@ -966,6 +1005,7 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness): |
966 | 1005 |
|
967 | 1006 | @pytest.mark.skip_less_device(2) |
968 | 1007 | @pytest.mark.parametrize("block_reuse", [False, True]) |
| 1008 | + @skip_pre_hopper |
969 | 1009 | def test_auto_dtype(self, block_reuse): |
970 | 1010 |
|
971 | 1011 | ctx_server_config = { |
|
0 commit comments