Commit ae68954

soft rebase off of emma spark ci enablement
Signed-off-by: list <[email protected]>
1 parent 24f9272 commit ae68954

File tree

4 files changed: +209 -53 lines changed

jenkins/L0_Test.groovy

Lines changed: 39 additions & 7 deletions
@@ -1,4 +1,4 @@
-@Library(['bloom-jenkins-shared-lib@main', 'trtllm-jenkins-shared-lib@main']) _
+@Library(['bloom-jenkins-shared-lib@emma/add_spark_for_slurm', 'trtllm-jenkins-shared-lib@main']) _
 
 import java.lang.InterruptedException
 import groovy.transform.Field
@@ -100,7 +100,7 @@ MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models"
 REQUIRED_OPEN_DRIVER_TYPES = ["b100-ts2", "rtx-5080", "rtx-5090", "rtx-pro-6000", "rtx-pro-6000d"]
 
 // GPU types that don't support dynamic driver flashing
-REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200"]
+REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200", "gb10x"]
 
 // ENABLE_NGC_DEVEL_IMAGE_TEST is currently disabled in the Jenkins BuildDockerImageSanityTest job config
 ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false
@@ -672,7 +672,9 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
 
         if (cluster.host.contains("dlcluster")) {
             dockerArgs += " " + sh(script: 'echo " -e NVIDIA_IMEX_CHANNELS=${NVIDIA_IMEX_CHANNELS:-0}"', returnStdout: true).trim()
-            dockerArgs += " --device=/dev/gdrdrv:/dev/gdrdrv"
+            if (fileExists('/dev/gdrdrv')) {
+                dockerArgs += " --device=/dev/gdrdrv:/dev/gdrdrv"
+            }
         }
     }
 
@@ -1355,7 +1357,7 @@ EOF_TIMEOUT_XML
 
 def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMode = false)
 {
-    def targetCould = "kubernetes-cpu"
+    def targetCloud = "kubernetes-cpu"
     def selectors = """
                   nvidia.com/node_type: builder
                   kubernetes.io/arch: ${arch}
@@ -1364,9 +1366,13 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
     def nodeLabelPrefix = ""
     def jobName = getShortenedJobName(env.JOB_NAME)
     def buildID = env.BUILD_ID
+    def tolerations = ""
 
     def archSuffix = arch == "arm64" ? "arm" : "amd"
     def jnlpImage = "urm.nvidia.com/sw-ipp-blossom-sre-docker-local/lambda/custom_jnlp_images_${archSuffix}_linux:jdk17"
+    if ( type == "gb10x" ) {
+        println "Using type: ${type} to create Kubernetes Pod config"
+    }
 
     switch(type)
     {
@@ -1446,14 +1452,30 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
         def gpuType = KubernetesManager.selectGPU(type)
         nodeLabelPrefix = type
 
-        targetCould = "kubernetes"
+        targetCloud = "kubernetes"
 
         // The following GPU types doesn't support dynamic driver flashing.
         if (REQUIRED_NO_DRIVER_TYPES.any { type.contains(it) }) {
-            selectors = """
+            if (type == "gb10x") {
+                targetCloud = "nvks-sparks-cloud"
+                selectors = """
+                    kubernetes.io/arch: ${arch}
+                    kubernetes.io/os: linux
+                    nvidia.com/gpu.machine: NVIDIA_DGX_Spark
+                    nvidia.com/tenant: blossom_trt"""
+                memorySize = "64Gi"
+                tolerations = """
+                tolerations:
+                - key: "node_for_blossom_trt"
+                  operator: "Exists"
+                  effect: "NoSchedule"
+                """
+            } else {
+                selectors = """
                     kubernetes.io/arch: ${arch}
                     kubernetes.io/os: linux
                     nvidia.com/gpu_type: ${gpuType}"""
+            }
         } else if (perfMode && !hasMultipleGPUs) {
             // Use single GPU machine with "tensorrt/test_type: perf" for stable perf testing.
             // H100 / A100 single GPU machine has this unique label in TensorRT Blossom pool.
@@ -1537,7 +1559,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
     }
 
     def podConfig = [
-        cloud: targetCould,
+        cloud: targetCloud,
         namespace: "sw-tensorrt",
         label: nodeLabel,
         yaml: """
@@ -1560,6 +1582,10 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
                 containers:
                 ${containerConfig}
                     env:
+                    - name: NVIDIA_VISIBLE_DEVICES
+                      value: "all"
+                    - name: NVIDIA_DRIVER_CAPABILITIES
+                      value: "compute,utility"
                     - name: HOST_NODE_NAME
                       valueFrom:
                         fieldRef:
@@ -1583,8 +1609,12 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
                         medium: Memory
                 ${llmModelVolume}
                 ${pvcVolume}
+                ${tolerations}
        """.stripIndent(),
    ]
+    if (type.contains("gb10x")) {
+        print(podConfig)
+    }
 
    return podConfig
 }
@@ -2971,13 +3001,15 @@ def launchTestJobs(pipeline, testFilter)
     // The total machine time is scaled proportionally according to the number of each GPU.
     SBSATestConfigs = [
         "GH200-TensorRT-Post-Merge-1": ["gh200", "l0_gh200", 1, 1],
+        "GB10-PyTorch-1": ["gb10x", "l0_gb10", 1, 1],
     ]
     fullSet += SBSATestConfigs.keySet()
 
     SBSASlurmTestConfigs = [
         "GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 2, 4],
         "GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4],
         "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
+        "GB10-PyTorch-Post-Merge-1": ["gb10x", "l0_gb10", 1, 1],
         // Perf sanity post merge test
         "GB200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 1, 4],
         // Disable GB300 stages due to nodes will be offline temporarily.

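The gb10x branch added to createKubernetesPodConfig above does four things for DGX Spark (GB10) jobs: it targets a separate cloud (nvks-sparks-cloud), replaces the generic gpu_type selector with DGX Spark machine/tenant labels, raises the pod memory request to 64Gi, and injects a toleration for the tainted Spark nodes. Below is a minimal Python sketch of that branching logic; the label values are copied from the diff, while the function name and the dict/list shapes are illustrative assumptions (the pipeline itself builds these as Groovy multiline strings and splices them into the pod YAML).

# Sketch only: mirrors the selector/toleration branching added for "gb10x".
REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200", "gb10x"]


def pod_targeting(gpu_type: str, arch: str = "arm64", gpu_label: str = "gb10"):
    # Defaults mirror the non-Spark path in the diff.
    cloud = "kubernetes"
    selectors = {"kubernetes.io/arch": arch, "kubernetes.io/os": "linux"}
    memory = None
    tolerations = []
    if any(t in gpu_type for t in REQUIRED_NO_DRIVER_TYPES):
        if gpu_type == "gb10x":
            # DGX Spark nodes live in a separate cloud and carry a taint.
            cloud = "nvks-sparks-cloud"
            selectors["nvidia.com/gpu.machine"] = "NVIDIA_DGX_Spark"
            selectors["nvidia.com/tenant"] = "blossom_trt"
            memory = "64Gi"
            tolerations = [{
                "key": "node_for_blossom_trt",
                "operator": "Exists",
                "effect": "NoSchedule",
            }]
        else:
            # Other no-driver types keep the plain gpu_type selector.
            selectors["nvidia.com/gpu_type"] = gpu_label
    return cloud, selectors, memory, tolerations


if __name__ == "__main__":
    print(pod_targeting("gb10x"))
    print(pod_targeting("gh200", gpu_label="GH200"))
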
tests/integration/defs/conftest.py

Lines changed: 124 additions & 45 deletions
@@ -2680,60 +2680,139 @@ def skip_by_host_memory(request):
 gpu_warning_threshold = 1024 * 1024 * 1024
 
 
+def get_gpu_memory_wo_pynvml():
+    import psutil
+
+    print(
+        f"\nWarning: pynvml not available, using fallback commands for memory monitoring"
+    )
+
+    gpu_memory = {}
+    system_total_mb = 0
+    system_used_mb = 0
+    try:
+        mem_output = check_output("free -m | awk '/^Mem:/ {print $3, $2}'",
+                                  shell=True)
+        parts = mem_output.strip().split()
+        system_used_mb = int(parts[0])
+        system_total_mb = int(parts[1])
+    except Exception:
+        pass
+
+    # Parse nvidia-smi pmon to get GPU memory usage
+    try:
+        gpu_output = check_output("nvidia-smi pmon -s m -c 1", shell=True)
+        lines = gpu_output.strip().split('\n')
+
+        for line in lines:
+            parts = line.split()
+            try:
+                gpu_idx = int(parts[0])
+
+                # Initialize GPU entry if not exists
+                if gpu_idx not in gpu_memory:
+                    gpu_memory[gpu_idx] = {
+                        "total_used": 0,
+                        "total": system_total_mb,
+                        "process": {}
+                    }
+
+                # Skip if no active process (pid is '-')
+                if parts[1] == '-':
+                    continue
+
+                pid = int(parts[1])
+                mem_mb = int(parts[3])
+                gpu_memory[gpu_idx]["total_used"] += mem_mb
+
+                # Get process info (same as pynvml version)
+                try:
+                    p = psutil.Process(pid)
+                    host_memory_in_mbs = p.memory_full_info(
+                    ).uss // 1024 // 1024
+                    gpu_memory[gpu_idx]["process"][pid] = (
+                        mem_mb,
+                        host_memory_in_mbs,
+                        p.cmdline(),
+                    )
+                except Exception:
+                    pass
+            except (ValueError, IndexError):
+                continue
+    except Exception as gpu_err:
+        print(f"nvidia-smi pmon error: {gpu_err}")
+
+    # Create default entry for GPU 0 if no GPUs detected
+    if not gpu_memory:
+        gpu_memory[0] = {
+            "total_used": system_used_mb,
+            "total": system_total_mb,
+            "process": {}
+        }
+    return gpu_memory
+
+
 def collect_status(item: pytest.Item):
     if not IS_UNDER_CI_ENV:
         return
 
     import psutil
-    import pynvml
-
-    pynvml.nvmlInit()
-
-    handles = {
-        idx: pynvml.nvmlDeviceGetHandleByIndex(idx)
-        for idx in range(pynvml.nvmlDeviceGetCount())
-    }
-
-    deadline = time.perf_counter() + 60  # 1 min
-    observed_used = 0
-    global gpu_warning_threshold
-
-    while time.perf_counter() < deadline:
-        observed_used = max(
-            pynvml.nvmlDeviceGetMemoryInfo(device).used
-            for device in handles.values())
-        if observed_used <= gpu_warning_threshold:
-            break
-        time.sleep(1)
-    else:
-        gpu_warning_threshold = max(observed_used, gpu_warning_threshold)
-        warnings.warn(
-            f"Test {item.name} does not free up GPU memory correctly!")
 
     gpu_memory = {}
-    for idx, device in handles.items():
-        total_used = pynvml.nvmlDeviceGetMemoryInfo(device).used // 1024 // 1024
-        total = pynvml.nvmlDeviceGetMemoryInfo(device).total // 1024 // 1024
-        detail = pynvml.nvmlDeviceGetComputeRunningProcesses(device)
-        process = {}
 
-        for entry in detail:
-            try:
-                p = psutil.Process(entry.pid)
-                host_memory_in_mbs = p.memory_full_info().uss // 1024 // 1024
-                process[entry.pid] = (
-                    entry.usedGpuMemory // 1024 // 1024,
-                    host_memory_in_mbs,
-                    p.cmdline(),
-                )
-            except Exception:
-                pass
-
-        gpu_memory[idx] = {
-            "total_used": total_used,
-            "total": total,
-            "process": process
+    try:
+        import pynvml
+        pynvml.nvmlInit()
+
+        handles = {
+            idx: pynvml.nvmlDeviceGetHandleByIndex(idx)
+            for idx in range(pynvml.nvmlDeviceGetCount())
         }
+
+        deadline = time.perf_counter() + 60  # 1 min
+        observed_used = 0
+        global gpu_warning_threshold
+
+        while time.perf_counter() < deadline:
+            observed_used = max(
+                pynvml.nvmlDeviceGetMemoryInfo(device).used
+                for device in handles.values())
+            if observed_used <= gpu_warning_threshold:
+                break
+            time.sleep(1)
+        else:
+            gpu_warning_threshold = max(observed_used, gpu_warning_threshold)
+            warnings.warn(
+                f"Test {item.name} does not free up GPU memory correctly!")
+
+        for idx, device in handles.items():
+            total_used = pynvml.nvmlDeviceGetMemoryInfo(
+                device).used // 1024 // 1024
+            total = pynvml.nvmlDeviceGetMemoryInfo(device).total // 1024 // 1024
+            detail = pynvml.nvmlDeviceGetComputeRunningProcesses(device)
+            process = {}
+
+            for entry in detail:
+                try:
+                    p = psutil.Process(entry.pid)
+                    host_memory_in_mbs = p.memory_full_info(
+                    ).uss // 1024 // 1024
+                    process[entry.pid] = (
+                        entry.usedGpuMemory // 1024 // 1024,
+                        host_memory_in_mbs,
+                        p.cmdline(),
+                    )
+                except Exception:
+                    pass
+
+            gpu_memory[idx] = {
+                "total_used": total_used,
+                "total": total,
+                "process": process
+            }
+    except Exception:
+        gpu_memory = get_gpu_memory_wo_pynvml()
+
     print("\nCurrent memory status:")
     print(gpu_memory)

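The new get_gpu_memory_wo_pynvml() fallback above replaces NVML calls with two shell commands: free -m for host memory and nvidia-smi pmon -s m -c 1 for per-process GPU framebuffer usage. A self-contained sketch of just the pmon parsing step is shown below; the sample output and column layout (gpu, pid, type, fb, ccpm, command) are assumptions based on typical pmon -s m output, and the real function additionally records system totals and per-process host memory via psutil.

# Sketch only: parse per-process GPU memory from `nvidia-smi pmon -s m -c 1`.
# SAMPLE is an assumed example of pmon output; real column widths may vary.
SAMPLE = """\
# gpu         pid  type    fb    ccpm  command
# Idx           #   C/G    MB      MB  name
    0        1234     C  2048       0  python
    0           -     -     -       -  -
"""


def parse_pmon(output: str) -> dict:
    gpu_memory = {}
    for line in output.strip().splitlines():
        parts = line.split()
        try:
            gpu_idx = int(parts[0])  # header lines ("# gpu ...") raise ValueError
        except (ValueError, IndexError):
            continue
        entry = gpu_memory.setdefault(gpu_idx, {"total_used": 0, "process": {}})
        if parts[1] == '-':          # GPU present but no active process
            continue
        pid, fb_mb = int(parts[1]), int(parts[3])
        entry["total_used"] += fb_mb
        entry["process"][pid] = fb_mb
    return gpu_memory


if __name__ == "__main__":
    print(parse_pmon(SAMPLE))  # {0: {'total_used': 2048, 'process': {1234: 2048}}}
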
tests/integration/defs/sysinfo/get_sysinfo.py

Lines changed: 6 additions & 1 deletion
@@ -191,7 +191,12 @@ def construct_gpu_properties(mako_opts, device_index=0):
     assert gpu_name != "", "device_product_name is empty after removing substring 'NVIDIA' and leading/trailing whitespaces."
 
     compute_capability = get_compute_capability(device_index)
-    gpu_memory = pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024**2)
+    try:
+        gpu_memory = pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024**
+                                                                     2)
+    except pynvml.NVMLError_NotSupported as e:
+        logger.warning("Unable to get GPU memory info: {}".format(e))
+        gpu_memory = 8 * 1024**3
     # Gather GPU information
     mako_opt_dict["gpu"] = gpu_name
     mako_opt_dict["gpu_memory"] = gpu_memory
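
The try/except added above matters on devices where NVML cannot report framebuffer totals and raises NVMLError_NotSupported, which is presumably the case for the unified-memory GB10 parts; the diff falls back to a fixed 8 * 1024**3 default. A minimal standalone sketch of the same probe-and-fallback pattern follows; note that this sketch normalizes the fallback to MiB to match the NVML path, whereas the committed line assigns the raw constant (a byte count).

# Sketch only: probe NVML for total GPU memory, falling back when unsupported.
import pynvml


def total_gpu_memory_mb(device_index: int = 0) -> float:
    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        try:
            return pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024**2)
        except pynvml.NVMLError_NotSupported:
            # Assumed default mirroring the diff (8 GiB), converted to MiB here.
            return 8 * 1024**3 / (1024**2)
    finally:
        pynvml.nvmlShutdown()
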
Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+version: 0.0.1
+l0_gb10:
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*gb10*'
+      linux_distribution_name: ubuntu*
+      cpu: aarch64
+    terms:
+      stage: post_merge
+      backend: pytorch
+  tests:
+  # ------------- PyTorch tests ---------------
+  - unittest/_torch/attention/test_attention_mla.py
+  - test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
+  - test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[latency]
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*gb10*'
+      linux_distribution_name: ubuntu*
+      cpu: aarch64
+    terms:
+      stage: pre_merge
+      backend: pytorch
+  tests:
+  # ------------- PyTorch tests ---------------
+  - unittest/_torch/modeling -k "modeling_mllama"
+  - unittest/_torch/modeling -k "modeling_out_of_tree"
+  - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[CUTLASS-dtype0]
+  - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[CUTLASS-dtype1]

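The new test list gates each block of tests on a condition: a GPU-count range, wildcards for GPU name, distro, and CPU architecture, and exact-match terms for stage and backend. Below is a rough Python sketch of how such a condition is presumably evaluated against the machine properties reported by get_sysinfo.py, using fnmatch for the wildcards; the actual test-db matcher may differ in detail.

# Sketch only: illustrative matcher for one l0_gb10 condition block.
# Field names follow the YAML above; the test-db semantics are assumed.
from fnmatch import fnmatch

condition = {
    "ranges": {"system_gpu_count": {"gte": 1, "lte": 1}},
    "wildcards": {
        "gpu": ["*gb10*"],
        "linux_distribution_name": "ubuntu*",
        "cpu": "aarch64",
    },
    "terms": {"stage": "post_merge", "backend": "pytorch"},
}

system = {
    "system_gpu_count": 1,
    "gpu": "gb10",                        # assumed lower-cased GPU name
    "linux_distribution_name": "ubuntu24.04",
    "cpu": "aarch64",
    "stage": "post_merge",
    "backend": "pytorch",
}


def matches(cond: dict, props: dict) -> bool:
    for key, rng in cond.get("ranges", {}).items():
        value = props[key]
        if value < rng.get("gte", value) or value > rng.get("lte", value):
            return False
    for key, pattern in cond.get("wildcards", {}).items():
        patterns = pattern if isinstance(pattern, list) else [pattern]
        if not any(fnmatch(str(props[key]), p) for p in patterns):
            return False
    return all(props.get(k) == v for k, v in cond.get("terms", {}).items())


print(matches(condition, system))  # True
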