
Commit fb05cd7

EmmaQiaoCh and chzblych authored
[None][infra] Enable single-gpu CI on spark (#9304)
Signed-off-by: qqiao <[email protected]>
Signed-off-by: Emma Qiao <[email protected]>
Signed-off-by: Jenny Liu <[email protected]>
Co-authored-by: Yanchao Lu <[email protected]>
1 parent cce7247 commit fb05cd7

4 files changed: +219 -53 lines changed

jenkins/L0_Test.groovy

Lines changed: 42 additions & 7 deletions
@@ -100,7 +100,7 @@ MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models"
 REQUIRED_OPEN_DRIVER_TYPES = ["b100-ts2", "rtx-5080", "rtx-5090", "rtx-pro-6000", "rtx-pro-6000d"]
 
 // GPU types that don't support dynamic driver flashing
-REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200"]
+REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200", "gb10x"]
 
 // ENABLE_NGC_DEVEL_IMAGE_TEST is currently disabled in the Jenkins BuildDockerImageSanityTest job config
 ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false
@@ -672,7 +672,9 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
 
         if (cluster.host.contains("dlcluster")) {
             dockerArgs += " " + sh(script: 'echo " -e NVIDIA_IMEX_CHANNELS=${NVIDIA_IMEX_CHANNELS:-0}"', returnStdout: true).trim()
-            dockerArgs += " --device=/dev/gdrdrv:/dev/gdrdrv"
+            if (fileExists('/dev/gdrdrv')) {
+                dockerArgs += " --device=/dev/gdrdrv:/dev/gdrdrv"
+            }
         }
     }
 
@@ -1562,7 +1564,7 @@ EOF_TIMEOUT_XML
 
 def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMode = false)
 {
-    def targetCould = "kubernetes-cpu"
+    def targetCloud = "kubernetes-cpu"
     def selectors = """
                     nvidia.com/node_type: builder
                     kubernetes.io/arch: ${arch}
@@ -1571,6 +1573,8 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
     def nodeLabelPrefix = ""
     def jobName = getShortenedJobName(env.JOB_NAME)
     def buildID = env.BUILD_ID
+    def tolerations = ""
+    def extraDeviceEnv = ""
 
     def archSuffix = arch == "arm64" ? "arm" : "amd"
     def jnlpImage = "urm.nvidia.com/sw-ipp-blossom-sre-docker-local/lambda/custom_jnlp_images_${archSuffix}_linux:jdk17"
@@ -1653,14 +1657,40 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
         def gpuType = KubernetesManager.selectGPU(type)
         nodeLabelPrefix = type
 
-        targetCould = "kubernetes"
+        targetCloud = "kubernetes"
+        // DGX Spark requires a special setting for accessing the device.
+        // It has 128GB unified memory as per spec. Use half of the memory at the CPU side.
+        if (type == "gb10x") {
+            targetCloud = "nvks-sparks-cloud"
+            memorySize = "64Gi"
+            tolerations = """
+                tolerations:
+                - key: "node_for_blossom_trt"
+                  operator: "Exists"
+                  effect: "NoSchedule"
+            """
+            extraDeviceEnv = """
+                - name: NVIDIA_VISIBLE_DEVICES
+                  value: "all"
+                - name: NVIDIA_DRIVER_CAPABILITIES
+                  value: "compute,utility"
+            """
+        }
 
         // The following GPU types doesn't support dynamic driver flashing.
         if (REQUIRED_NO_DRIVER_TYPES.any { type.contains(it) }) {
-            selectors = """
+            if (type == "gb10x") {
+                selectors = """
+                    kubernetes.io/arch: ${arch}
+                    kubernetes.io/os: linux
+                    nvidia.com/gpu.machine: NVIDIA_DGX_Spark
+                    nvidia.com/tenant: blossom_trt"""
+            } else {
+                selectors = """
                     kubernetes.io/arch: ${arch}
                     kubernetes.io/os: linux
                     nvidia.com/gpu_type: ${gpuType}"""
+            }
         } else if (perfMode && !hasMultipleGPUs) {
             // Use single GPU machine with "tensorrt/test_type: perf" for stable perf testing.
             // H100 / A100 single GPU machine has this unique label in TensorRT Blossom pool.
@@ -1744,7 +1774,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
     }
 
     def podConfig = [
-        cloud: targetCould,
+        cloud: targetCloud,
         namespace: "sw-tensorrt",
         label: nodeLabel,
         yaml: """
@@ -1771,6 +1801,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
                         valueFrom:
                           fieldRef:
                             fieldPath: spec.nodeName
+                    ${extraDeviceEnv}
                 - name: jnlp
                   image: ${jnlpImage}
                   args: ['\$(JENKINS_SECRET)', '\$(JENKINS_NAME)']
@@ -1790,6 +1821,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
                     medium: Memory
                 ${llmModelVolume}
                 ${pvcVolume}
+                ${tolerations}
             """.stripIndent(),
     ]
 
@@ -3202,16 +3234,19 @@ def launchTestJobs(pipeline, testFilter)
     parallelJobs += parallelSlurmJobs
 
     // Try to match what are being tested on x86 H100_PCIe.
-    // The total machine time is scaled proportionally according to the number of each GPU.
+    // SBSA machines from the Blossom machine pool
     SBSATestConfigs = [
         "GH200-TensorRT-Post-Merge-1": ["gh200", "l0_gh200", 1, 1],
+        // DGX Spark is also named as GB10 Grace Blackwell Superchip.
+        "GB10-PyTorch-1": ["gb10x", "l0_gb10", 1, 1],
     ]
     fullSet += SBSATestConfigs.keySet()
 
     SBSASlurmTestConfigs = [
         "GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 2, 4],
        "GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4],
        "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
+        "GB10-PyTorch-Post-Merge-1": ["gb10x-single", "l0_gb10", 1, 1],
        // Perf sanity post merge test
        "GB200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 1, 4],
        // Disable GB300 stages due to nodes will be offline temporarily.
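
For readers who don't work in the Groovy pipeline day to day, here is a minimal Python sketch of the two guards introduced above: mounting /dev/gdrdrv only when the device node actually exists on the host, and the substring match that routes gb10x (and derived labels such as gb10x-single) into the no-driver-flash path. The helper names are hypothetical; the Jenkins code above is the actual implementation.

import os

# Mirrors the updated list in jenkins/L0_Test.groovy.
REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200", "gb10x"]


def gdrdrv_docker_args():
    """Request the gdrdrv device only when the host actually exposes it."""
    if os.path.exists("/dev/gdrdrv"):
        return ["--device=/dev/gdrdrv:/dev/gdrdrv"]
    return []


def needs_static_driver(gpu_type):
    """Python equivalent of REQUIRED_NO_DRIVER_TYPES.any { type.contains(it) }."""
    return any(t in gpu_type for t in REQUIRED_NO_DRIVER_TYPES)


print(needs_static_driver("gb10x-single"))  # True: the substring match also covers derived labels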

tests/integration/defs/conftest.py

Lines changed: 128 additions & 45 deletions
@@ -16,6 +16,7 @@
 
 import datetime
 import gc
+import logging
 import os
 import platform
 import re
@@ -55,6 +56,9 @@
 except ImportError:
     trt_environment = None
 
+# Logger
+logger = logging.getLogger(__name__)
+
 # TODO: turn off this when the nightly storage issue is resolved.
 DEBUG_CI_STORAGE = os.environ.get("DEBUG_CI_STORAGE", False)
 GITLAB_API_USER = os.environ.get("GITLAB_API_USER")
@@ -2681,60 +2685,139 @@ def skip_by_host_memory(request):
 gpu_warning_threshold = 1024 * 1024 * 1024
 
 
+def get_gpu_memory_wo_pynvml():
+    import psutil
+
+    logger.warning(
+        f"\nWarning: pynvml not available, using fallback commands for memory monitoring"
+    )
+
+    gpu_memory = {}
+    system_total_mb = 0
+    system_used_mb = 0
+    try:
+        mem_output = check_output("free -m | awk '/^Mem:/ {print $3, $2}'",
+                                  shell=True)
+        parts = mem_output.strip().split()
+        system_used_mb = int(parts[0])
+        system_total_mb = int(parts[1])
+    except Exception:
+        pass
+
+    # Parse nvidia-smi pmon to get GPU memory usage
+    try:
+        gpu_output = check_output("nvidia-smi pmon -s m -c 1", shell=True)
+        lines = gpu_output.strip().split('\n')
+
+        for line in lines:
+            parts = line.split()
+            try:
+                gpu_idx = int(parts[0])
+
+                # Initialize GPU entry if not exists
+                if gpu_idx not in gpu_memory:
+                    gpu_memory[gpu_idx] = {
+                        "total_used": 0,
+                        "total": system_total_mb,
+                        "process": {}
+                    }
+
+                # Skip if no active process (pid is '-')
+                if parts[1] == '-':
+                    continue
+
+                pid = int(parts[1])
+                mem_mb = int(parts[3])
+                gpu_memory[gpu_idx]["total_used"] += mem_mb
+
+                # Get process info (same as pynvml version)
+                try:
+                    p = psutil.Process(pid)
+                    host_memory_in_mbs = p.memory_full_info(
+                    ).uss // 1024 // 1024
+                    gpu_memory[gpu_idx]["process"][pid] = (
+                        mem_mb,
+                        host_memory_in_mbs,
+                        p.cmdline(),
+                    )
+                except Exception:
+                    pass
+            except (ValueError, IndexError):
+                continue
+    except Exception as gpu_err:
+        logging.warning(f"nvidia-smi pmon error: {gpu_err}")
+
+    # Create default entry for GPU 0 if no GPUs detected
+    if not gpu_memory:
+        gpu_memory[0] = {
+            "total_used": system_used_mb,
+            "total": system_total_mb,
+            "process": {}
+        }
+    return gpu_memory
+
+
 def collect_status(item: pytest.Item):
     if not IS_UNDER_CI_ENV:
         return
 
     import psutil
-    import pynvml
-
-    pynvml.nvmlInit()
-
-    handles = {
-        idx: pynvml.nvmlDeviceGetHandleByIndex(idx)
-        for idx in range(pynvml.nvmlDeviceGetCount())
-    }
-
-    deadline = time.perf_counter() + 60  # 1 min
-    observed_used = 0
-    global gpu_warning_threshold
-
-    while time.perf_counter() < deadline:
-        observed_used = max(
-            pynvml.nvmlDeviceGetMemoryInfo(device).used
-            for device in handles.values())
-        if observed_used <= gpu_warning_threshold:
-            break
-        time.sleep(1)
-    else:
-        gpu_warning_threshold = max(observed_used, gpu_warning_threshold)
-        warnings.warn(
-            f"Test {item.name} does not free up GPU memory correctly!")
 
     gpu_memory = {}
-    for idx, device in handles.items():
-        total_used = pynvml.nvmlDeviceGetMemoryInfo(device).used // 1024 // 1024
-        total = pynvml.nvmlDeviceGetMemoryInfo(device).total // 1024 // 1024
-        detail = pynvml.nvmlDeviceGetComputeRunningProcesses(device)
-        process = {}
 
-        for entry in detail:
-            try:
-                p = psutil.Process(entry.pid)
-                host_memory_in_mbs = p.memory_full_info().uss // 1024 // 1024
-                process[entry.pid] = (
-                    entry.usedGpuMemory // 1024 // 1024,
-                    host_memory_in_mbs,
-                    p.cmdline(),
-                )
-            except Exception:
-                pass
-
-        gpu_memory[idx] = {
-            "total_used": total_used,
-            "total": total,
-            "process": process
+    try:
+        import pynvml
+        pynvml.nvmlInit()
+
+        handles = {
+            idx: pynvml.nvmlDeviceGetHandleByIndex(idx)
+            for idx in range(pynvml.nvmlDeviceGetCount())
         }
+
+        deadline = time.perf_counter() + 60  # 1 min
+        observed_used = 0
+        global gpu_warning_threshold
+
+        while time.perf_counter() < deadline:
+            observed_used = max(
+                pynvml.nvmlDeviceGetMemoryInfo(device).used
+                for device in handles.values())
+            if observed_used <= gpu_warning_threshold:
+                break
+            time.sleep(1)
+        else:
+            gpu_warning_threshold = max(observed_used, gpu_warning_threshold)
+            warnings.warn(
+                f"Test {item.name} does not free up GPU memory correctly!")
+
+        for idx, device in handles.items():
+            total_used = pynvml.nvmlDeviceGetMemoryInfo(
+                device).used // 1024 // 1024
+            total = pynvml.nvmlDeviceGetMemoryInfo(device).total // 1024 // 1024
+            detail = pynvml.nvmlDeviceGetComputeRunningProcesses(device)
+            process = {}
+
+            for entry in detail:
+                try:
+                    p = psutil.Process(entry.pid)
+                    host_memory_in_mbs = p.memory_full_info(
+                    ).uss // 1024 // 1024
+                    process[entry.pid] = (
+                        entry.usedGpuMemory // 1024 // 1024,
+                        host_memory_in_mbs,
+                        p.cmdline(),
+                    )
+                except Exception:
+                    pass
+
+            gpu_memory[idx] = {
+                "total_used": total_used,
+                "total": total,
+                "process": process
+            }
+    except Exception:
+        gpu_memory = get_gpu_memory_wo_pynvml()
+
     print("\nCurrent memory status:")
     print(gpu_memory)
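
To make the fallback path above easier to follow, the sketch below applies the same parsing idea to a canned `nvidia-smi pmon -s m -c 1` sample. The sample text and the column order (gpu, pid, type, fb) are assumptions for illustration only and can differ across driver versions; the psutil-based host-memory lookup from the real helper is omitted.

# Hypothetical pmon sample; real output and columns depend on the driver version.
SAMPLE_PMON = """\
# gpu        pid  type    fb   command
# Idx          #   C/G    MB   name
    0       4242     C  1536   python
    0          -     -     -   -
"""


def parse_pmon(text, system_total_mb=65536):
    gpu_memory = {}
    for line in text.splitlines():
        parts = line.split()
        try:
            gpu_idx = int(parts[0])  # header lines start with '#' -> ValueError -> skipped
        except (ValueError, IndexError):
            continue
        entry = gpu_memory.setdefault(
            gpu_idx, {"total_used": 0, "total": system_total_mb, "process": {}})
        if parts[1] == '-':  # no active process on this GPU
            continue
        pid, fb_mb = int(parts[1]), int(parts[3])
        entry["total_used"] += fb_mb
        entry["process"][pid] = (fb_mb, None, None)  # host RSS and cmdline omitted in this sketch
    return gpu_memory


print(parse_pmon(SAMPLE_PMON))
# {0: {'total_used': 1536, 'total': 65536, 'process': {4242: (1536, None, None)}}}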

tests/integration/defs/sysinfo/get_sysinfo.py

Lines changed: 7 additions & 1 deletion
@@ -191,7 +191,13 @@ def construct_gpu_properties(mako_opts, device_index=0):
     assert gpu_name != "", "device_product_name is empty after removing substring 'NVIDIA' and leading/trailing whitespaces."
 
     compute_capability = get_compute_capability(device_index)
-    gpu_memory = pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024**2)
+    try:
+        gpu_memory = pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024**
+                                                                     2)
+    except pynvml.NVMLError_NotSupported as e:
+        logger.warning("Unable to get GPU memory info: {}".format(e))
+        # Fallback to 8 GiB, expressed in MiB to match the nvml path above.
+        gpu_memory = 8 * 1024
     # Gather GPU information
     mako_opt_dict["gpu"] = gpu_name
     mako_opt_dict["gpu_memory"] = gpu_memory
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+version: 0.0.1
+# DGX Spark is also named as GB10 Grace Blackwell Superchip.
+l0_gb10:
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*gb10*'
+      linux_distribution_name: ubuntu*
+      cpu: aarch64
+  terms:
+    stage: post_merge
+    backend: pytorch
+  tests:
+  # ------------- PyTorch tests ---------------
+  - unittest/_torch/attention/test_attention_mla.py
+  - test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
+  - test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[latency]
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*gb10*'
+      linux_distribution_name: ubuntu*
+      cpu: aarch64
+  terms:
+    stage: pre_merge
+    backend: pytorch
+  tests:
+  # ------------- PyTorch tests ---------------
+  # Below cases which are commented out due to they failed on gb10
+  # - unittest/_torch/modeling -k "modeling_mllama"
+  - unittest/_torch/modeling -k "modeling_out_of_tree"
+  # - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[CUTLASS-dtype0]
+  # - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[CUTLASS-dtype1]
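
As a rough illustration of how a condition block in this test list constrains where the l0_gb10 stages can run, here is a hypothetical matcher in Python: the wildcards are treated as glob patterns and system_gpu_count must fall within the gte/lte range. This is not the actual test-db engine used by the test infra, just a sketch of the semantics under those assumptions.

from fnmatch import fnmatch

condition = {
    "ranges": {"system_gpu_count": {"gte": 1, "lte": 1}},
    "wildcards": {
        "gpu": ["*gb10*"],
        "linux_distribution_name": "ubuntu*",
        "cpu": "aarch64",
    },
}

# Hypothetical properties of a single-GPU DGX Spark node.
machine = {
    "system_gpu_count": 1,
    "gpu": "gb10",
    "linux_distribution_name": "ubuntu24.04",
    "cpu": "aarch64",
}


def matches(cond, props):
    # Numeric ranges: every property must fall between its gte/lte bounds.
    for key, rng in cond.get("ranges", {}).items():
        value = props[key]
        if value < rng.get("gte", value) or value > rng.get("lte", value):
            return False
    # Wildcards: each property must match at least one glob pattern.
    for key, pattern in cond.get("wildcards", {}).items():
        patterns = pattern if isinstance(pattern, list) else [pattern]
        if not any(fnmatch(str(props[key]), p) for p in patterns):
            return False
    return True


print(matches(condition, machine))  # True for a single-GPU DGX Spark running Ubuntu on aarch64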
