Commit 914dd39

[None][fix] disable cuda ipc on device without nvlink (L40s) for disagg test (#9735)
Signed-off-by: Chuang Zhu <[email protected]>
1 parent d274a4c commit 914dd39
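
The fix hinges on detecting NVLink at test time: the helper added to test_disaggregated_serving.py below parses "nvidia-smi nvlink -s" output and treats any per-link bandwidth line as evidence of active NVLink. A minimal, self-contained illustration of that check follows; the sample output is invented for illustration and is not captured from real hardware.

# Minimal illustration of the NVLink check this commit adds to the test
# harness. The sample output below is invented for illustration; it is not
# captured from a real GPU.
import re

sample_output = """\
GPU 0: NVIDIA H100 (UUID: GPU-xxxxxxxx)
         Link 0: 26.562 GB/s
         Link 1: 26.562 GB/s
"""

# The same pattern the new has_nvlink() helper uses: any "Link N: X GB/s"
# line counts as an active NVLink connection. A GPU without NVLink (such as
# the L40S named in the commit title) prints no such lines, so the check
# fails and the test disables CUDA IPC instead.
print(bool(re.search(r'Link \d+:\s+[\d.]+\s+GB/s', sample_output)))  # True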

File tree: 2 files changed (+40, -2 lines)

tests/integration/defs/accuracy/test_disaggregated_serving.py

Lines changed: 40 additions & 0 deletions
@@ -1,8 +1,11 @@
 import concurrent
 import contextlib
+import functools
 import itertools
 import json
 import os
+import re
+import subprocess
 import tempfile
 import time
 from collections import namedtuple
@@ -49,6 +52,40 @@ def result(self):
 DEFAULT_SERVER_WAITING_TIMEOUT = 1200
 
 
+@functools.lru_cache(maxsize=1)
+def has_nvlink():
+    """
+    Check if the system has NVLink connectivity between GPUs.
+
+    Returns:
+        bool: True if NVLink is detected, False otherwise.
+    """
+    try:
+        # Execute nvidia-smi nvlink command to query NVLink status
+        result = subprocess.run(['nvidia-smi', 'nvlink', '-s'],
+                                capture_output=True,
+                                text=True,
+                                check=False)
+
+        # Check if the command executed successfully
+        if result.returncode != 0:
+            return False
+
+        # Look for bandwidth information (Link X: XX.XXX GB/s pattern)
+        # which indicates active NVLink connections
+        if re.search(r'Link \d+:\s+[\d.]+\s+GB/s', result.stdout):
+            return True
+
+        return False
+
+    except (FileNotFoundError, subprocess.SubprocessError):
+        # nvidia-smi not found or execution failed
+        return False
+    except Exception:
+        # Any other unexpected error
+        return False
+
+
 class MyThreadPoolExecutor(ThreadPoolExecutor):
 
     def __init__(self, *args, **kwargs) -> None:
@@ -196,6 +233,8 @@ def _apply_perf_flags(cfg: Optional[Dict[str, Any]]):
         gpu_range = range(current_gpu_offset,
                           current_gpu_offset + gen_total_gpus)
         env["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_range))
+        if not has_nvlink():
+            env["UCX_TLS"] = "^cuda_ipc"
         current_gpu_offset += gen_total_gpus
 
         gen_server_args = gen_args + [
@@ -966,6 +1005,7 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
 
     @pytest.mark.skip_less_device(2)
     @pytest.mark.parametrize("block_reuse", [False, True])
+    @skip_pre_hopper
     def test_auto_dtype(self, block_reuse):
 
         ctx_server_config = {
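
The UCX_TLS line in the hunk above is the behavioral fix: a leading "^" in UCX_TLS excludes the listed transports, so "^cuda_ipc" keeps every UCX transport except CUDA IPC, which relies on GPU peer-to-peer access that L40S nodes do not provide. The sketch below shows how such an override could be applied when spawning a generation server; launch_gen_server and my_gen_server are hypothetical names for illustration, not the harness's real code.

# Illustrative sketch only: how the UCX_TLS override added above would reach
# a spawned generation server. "my_gen_server" is a placeholder command, not
# the test harness's real entry point.
import os
import subprocess


def launch_gen_server(gpu_ids, nvlink_available):
    env = os.environ.copy()
    env["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids))
    if not nvlink_available:
        # A leading "^" in UCX_TLS excludes the listed transports, so UCX
        # keeps everything except the cuda_ipc (GPU peer-to-peer) transport.
        env["UCX_TLS"] = "^cuda_ipc"
    return subprocess.Popen(["my_gen_server"], env=env)


# Example: generation server pinned to GPUs 2 and 3 on a machine without
# NVLink, e.g. an L40S node.
# proc = launch_gen_server([2, 3], nvlink_available=False)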

tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 2 deletions
@@ -307,8 +307,6 @@ full:RTX/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtyp
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto] SKIP (https://nvbugs/5596343)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto] SKIP (https://nvbugs/5596343)
 examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp16] SKIP (https://nvbugs/5612313)
-accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False] SKIP (https://nvbugs/5569696)
-accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True] SKIP (https://nvbugs/5569696)
 triton_server/test_triton.py::test_cpp_unit_tests[cpp-unit-tests] SKIP (https://nvbugs/5619359)
 triton_server/test_triton_rcca.py::test_rcca_bug_4934893[Temperature:0.5-TOP_P:0.95-TOP_K:10-False-1---False-True-False-0-2048-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble] SKIP (https://nvbugs/5619369)
 accuracy/test_cli_flow.py::TestMinitron4BBase::test_fp8 SKIP (https://nvbugs/5606233)
