Merged

39 commits
1481e89
Add a spark test stage
EmmaQiaoCh Nov 19, 2025
dc89ec7
Check path /dev/gdrdrv first before mount in dlcluster
EmmaQiaoCh Nov 20, 2025
6be1b77
Using a new pynvml package since the old one doesn't support spark
EmmaQiaoCh Nov 21, 2025
42a27c8
Set a default value for spark
EmmaQiaoCh Nov 24, 2025
b0617b5
No need to install nvidia-ml-py
EmmaQiaoCh Nov 24, 2025
a21e89d
Correct stage name
EmmaQiaoCh Nov 25, 2025
9a1ffbe
Correct test-db file path and name
EmmaQiaoCh Nov 25, 2025
16aec7d
Merge branch 'main' into emma/enable_spark_ci
EmmaQiaoCh Nov 25, 2025
58b92e9
Add blossom ci for spark
EmmaQiaoCh Nov 26, 2025
f966823
Fix typo
EmmaQiaoCh Nov 27, 2025
ad44142
Add some debug
EmmaQiaoCh Nov 27, 2025
fda6530
Add more debug info
EmmaQiaoCh Dec 1, 2025
efab04c
Fix yml format
EmmaQiaoCh Dec 1, 2025
d1f3f34
Add more info to podtemplate
EmmaQiaoCh Dec 2, 2025
eb0ec59
Merge branch 'main' into emma/enable_spark_ci
EmmaQiaoCh Dec 2, 2025
10b177a
Reduce memory
EmmaQiaoCh Dec 3, 2025
dc0c01a
Remove some properties to debug
EmmaQiaoCh Dec 3, 2025
eb25f1f
Reduce memory
EmmaQiaoCh Dec 3, 2025
9125c9a
Correct podTemplate for gb10
EmmaQiaoCh Dec 3, 2025
6060287
Add tolerations to template
EmmaQiaoCh Dec 3, 2025
1eb5751
Merge pull request #2 from JennyLiu-nv/dev-jenny-dgx-spark-gpu-mem
EmmaQiaoCh Dec 3, 2025
fa032ef
Merge branch 'main' into emma/enable_spark_ci
EmmaQiaoCh Dec 4, 2025
bb47ca9
Fix a typo
EmmaQiaoCh Dec 4, 2025
8c324db
Fix yml format
EmmaQiaoCh Dec 4, 2025
9b0e414
Update test list for gb10
EmmaQiaoCh Dec 4, 2025
0879ac4
Merge branch 'main' into emma/enable_spark_ci
EmmaQiaoCh Dec 18, 2025
41b9c01
Comment out failed cases on spark
EmmaQiaoCh Dec 19, 2025
4fdadaa
Fix some typos and test for flashing driver
EmmaQiaoCh Dec 22, 2025
bd5b92f
Update for the spark cloud env
EmmaQiaoCh Dec 22, 2025
f133942
Move back to not support flash driver
EmmaQiaoCh Dec 23, 2025
bfd660f
Merge branch 'main' into emma/enable_spark_ci
EmmaQiaoCh Dec 23, 2025
14563b7
Update key for slurm run
EmmaQiaoCh Dec 24, 2025
2f1f98a
Change to use BSL main branch since the change is merged
EmmaQiaoCh Dec 25, 2025
2246a1d
Update jenkins/L0_Test.groovy
EmmaQiaoCh Dec 25, 2025
3d4c0e8
Update jenkins/L0_Test.groovy
EmmaQiaoCh Dec 25, 2025
bd0b54d
Update jenkins/L0_Test.groovy
EmmaQiaoCh Dec 25, 2025
0d8c34d
Update tests/integration/test_lists/test-db/l0_gb10.yml
EmmaQiaoCh Dec 25, 2025
03df582
Fix for comments
EmmaQiaoCh Dec 25, 2025
f9d8dd0
Remove a debug info
EmmaQiaoCh Dec 30, 2025
49 changes: 42 additions & 7 deletions jenkins/L0_Test.groovy
@@ -100,7 +100,7 @@ MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models"
REQUIRED_OPEN_DRIVER_TYPES = ["b100-ts2", "rtx-5080", "rtx-5090", "rtx-pro-6000", "rtx-pro-6000d"]

// GPU types that don't support dynamic driver flashing
REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200"]
REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200", "gb10x"]

// ENABLE_NGC_DEVEL_IMAGE_TEST is currently disabled in the Jenkins BuildDockerImageSanityTest job config
ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false
@@ -672,7 +672,9 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,

if (cluster.host.contains("dlcluster")) {
dockerArgs += " " + sh(script: 'echo " -e NVIDIA_IMEX_CHANNELS=${NVIDIA_IMEX_CHANNELS:-0}"', returnStdout: true).trim()
dockerArgs += " --device=/dev/gdrdrv:/dev/gdrdrv"
if (fileExists('/dev/gdrdrv')) {
dockerArgs += " --device=/dev/gdrdrv:/dev/gdrdrv"
}
}
}
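For readers less used to the pipeline DSL, the added guard amounts to the following check (a minimal Python sketch of the same logic; the helper name is hypothetical and not part of this PR):

import os

def gdrdrv_device_args(dev_path="/dev/gdrdrv"):
    # Mirrors the Groovy guard above: mount the gdrcopy device into the
    # container only when the host actually exposes it (the node is absent
    # on machines such as DGX Spark).
    if os.path.exists(dev_path):
        return ["--device={}:{}".format(dev_path, dev_path)]
    return []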

@@ -1461,7 +1463,7 @@ EOF_TIMEOUT_XML

def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMode = false)
{
def targetCould = "kubernetes-cpu"
def targetCloud = "kubernetes-cpu"
def selectors = """
nvidia.com/node_type: builder
kubernetes.io/arch: ${arch}
@@ -1470,6 +1472,8 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
def nodeLabelPrefix = ""
def jobName = getShortenedJobName(env.JOB_NAME)
def buildID = env.BUILD_ID
def tolerations = ""
def extraDeviceEnv = ""

def archSuffix = arch == "arm64" ? "arm" : "amd"
def jnlpImage = "urm.nvidia.com/sw-ipp-blossom-sre-docker-local/lambda/custom_jnlp_images_${archSuffix}_linux:jdk17"
@@ -1552,14 +1556,40 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
def gpuType = KubernetesManager.selectGPU(type)
nodeLabelPrefix = type

targetCould = "kubernetes"
targetCloud = "kubernetes"
// DGX Spark requires special settings for accessing the device.
// It has 128GB of unified memory per spec; use half of it on the CPU side.
if (type == "gb10x") {
targetCloud = "nvks-sparks-cloud"
memorySize = "64Gi"
tolerations = """
tolerations:
- key: "node_for_blossom_trt"
operator: "Exists"
effect: "NoSchedule"
"""
extraDeviceEnv = """
- name: NVIDIA_VISIBLE_DEVICES
value: "all"
- name: NVIDIA_DRIVER_CAPABILITIES
value: "compute,utility"
"""
}

// The following GPU types don't support dynamic driver flashing.
if (REQUIRED_NO_DRIVER_TYPES.any { type.contains(it) }) {
selectors = """
if (type == "gb10x") {
selectors = """
kubernetes.io/arch: ${arch}
kubernetes.io/os: linux
nvidia.com/gpu.machine: NVIDIA_DGX_Spark
nvidia.com/tenant: blossom_trt"""
} else {
selectors = """
kubernetes.io/arch: ${arch}
kubernetes.io/os: linux
nvidia.com/gpu_type: ${gpuType}"""
}
} else if (perfMode && !hasMultipleGPUs) {
// Use single GPU machine with "tensorrt/test_type: perf" for stable perf testing.
// H100 / A100 single GPU machine has this unique label in TensorRT Blossom pool.
@@ -1643,7 +1673,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
}

def podConfig = [
cloud: targetCould,
cloud: targetCloud,
namespace: "sw-tensorrt",
label: nodeLabel,
yaml: """
@@ -1670,6 +1700,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
valueFrom:
fieldRef:
fieldPath: spec.nodeName
${extraDeviceEnv}
- name: jnlp
image: ${jnlpImage}
args: ['\$(JENKINS_SECRET)', '\$(JENKINS_NAME)']
@@ -1689,6 +1720,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
medium: Memory
${llmModelVolume}
${pvcVolume}
${tolerations}
""".stripIndent(),
]
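Taken together, the gb10x branch above contributes roughly the following pieces to the rendered pod spec. The values are copied from the Groovy strings in this file; the dict itself is only an illustrative summary (shown for arm64), not code from the PR:

# Illustrative summary of the type == "gb10x" overrides.
gb10_pod_overrides = {
    "cloud": "nvks-sparks-cloud",
    "memory": "64Gi",  # half of the 128GB unified memory, per the comment above
    "nodeSelector": {
        "kubernetes.io/arch": "arm64",
        "kubernetes.io/os": "linux",
        "nvidia.com/gpu.machine": "NVIDIA_DGX_Spark",
        "nvidia.com/tenant": "blossom_trt",
    },
    "tolerations": [{
        "key": "node_for_blossom_trt",
        "operator": "Exists",
        "effect": "NoSchedule",
    }],
    "env": {
        "NVIDIA_VISIBLE_DEVICES": "all",
        "NVIDIA_DRIVER_CAPABILITIES": "compute,utility",
    },
}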

@@ -3083,16 +3115,19 @@ def launchTestJobs(pipeline, testFilter)
parallelJobs += parallelSlurmJobs

// Try to match what are being tested on x86 H100_PCIe.
// The total machine time is scaled proportionally to the number of each GPU type.
// SBSA machines from the Blossom machine pool
SBSATestConfigs = [
"GH200-TensorRT-Post-Merge-1": ["gh200", "l0_gh200", 1, 1],
// DGX Spark is also known as the GB10 Grace Blackwell Superchip.
"GB10-PyTorch-1": ["gb10x", "l0_gb10", 1, 1],
]
fullSet += SBSATestConfigs.keySet()

SBSASlurmTestConfigs = [
"GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 2, 4],
"GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4],
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
"GB10-PyTorch-Post-Merge-1": ["gb10x-single", "l0_gb10", 1, 1],
// Perf sanity post merge test
"GB200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 1, 4],
// Disable GB300 stages due to nodes will be offline temporarily.
173 changes: 128 additions & 45 deletions tests/integration/defs/conftest.py
@@ -16,6 +16,7 @@

import datetime
import gc
import logging
import os
import platform
import re
@@ -56,6 +57,9 @@
except ImportError:
trt_environment = None

# Logger
logger = logging.getLogger(__name__)

# TODO: turn off this when the nightly storage issue is resolved.
DEBUG_CI_STORAGE = os.environ.get("DEBUG_CI_STORAGE", False)
GITLAB_API_USER = os.environ.get("GITLAB_API_USER")
@@ -2681,60 +2685,139 @@ def skip_by_host_memory(request):
gpu_warning_threshold = 1024 * 1024 * 1024


def get_gpu_memory_wo_pynvml():
import psutil

logger.warning(
    "pynvml not available, using fallback commands for memory monitoring")

gpu_memory = {}
system_total_mb = 0
system_used_mb = 0
try:
mem_output = check_output("free -m | awk '/^Mem:/ {print $3, $2}'",
shell=True)
parts = mem_output.strip().split()
system_used_mb = int(parts[0])
system_total_mb = int(parts[1])
except Exception:
pass

# Parse nvidia-smi pmon to get GPU memory usage
try:
gpu_output = check_output("nvidia-smi pmon -s m -c 1", shell=True)
lines = gpu_output.strip().split('\n')

for line in lines:
parts = line.split()
try:
gpu_idx = int(parts[0])

# Initialize GPU entry if not exists
if gpu_idx not in gpu_memory:
gpu_memory[gpu_idx] = {
"total_used": 0,
"total": system_total_mb,
"process": {}
}

# Skip if no active process (pid is '-')
if parts[1] == '-':
continue

pid = int(parts[1])
mem_mb = int(parts[3])
gpu_memory[gpu_idx]["total_used"] += mem_mb

# Get process info (same as pynvml version)
try:
p = psutil.Process(pid)
host_memory_in_mbs = p.memory_full_info(
).uss // 1024 // 1024
gpu_memory[gpu_idx]["process"][pid] = (
mem_mb,
host_memory_in_mbs,
p.cmdline(),
)
except Exception:
pass
except (ValueError, IndexError):
continue
except Exception as gpu_err:
logger.warning(f"nvidia-smi pmon error: {gpu_err}")

# Create default entry for GPU 0 if no GPUs detected
if not gpu_memory:
gpu_memory[0] = {
"total_used": system_used_mb,
"total": system_total_mb,
"process": {}
}
return gpu_memory
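
Both this fallback and the pynvml path in collect_status below build the same nested structure; an illustrative example (all numbers made up):

# gpu_memory maps GPU index -> usage summary in MiB plus per-process detail:
# {idx: {"total_used": MiB, "total": MiB,
#        "process": {pid: (gpu_mem_mb, host_uss_mb, cmdline)}}}
example_gpu_memory = {
    0: {
        "total_used": 2048,
        "total": 65536,
        "process": {
            1234: (2048, 512, ["python3", "-m", "pytest"]),
        },
    },
}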


def collect_status(item: pytest.Item):
if not IS_UNDER_CI_ENV:
return

import psutil
import pynvml

pynvml.nvmlInit()

handles = {
idx: pynvml.nvmlDeviceGetHandleByIndex(idx)
for idx in range(pynvml.nvmlDeviceGetCount())
}

deadline = time.perf_counter() + 60 # 1 min
observed_used = 0
global gpu_warning_threshold

while time.perf_counter() < deadline:
observed_used = max(
pynvml.nvmlDeviceGetMemoryInfo(device).used
for device in handles.values())
if observed_used <= gpu_warning_threshold:
break
time.sleep(1)
else:
gpu_warning_threshold = max(observed_used, gpu_warning_threshold)
warnings.warn(
f"Test {item.name} does not free up GPU memory correctly!")

gpu_memory = {}
for idx, device in handles.items():
total_used = pynvml.nvmlDeviceGetMemoryInfo(device).used // 1024 // 1024
total = pynvml.nvmlDeviceGetMemoryInfo(device).total // 1024 // 1024
detail = pynvml.nvmlDeviceGetComputeRunningProcesses(device)
process = {}

for entry in detail:
try:
p = psutil.Process(entry.pid)
host_memory_in_mbs = p.memory_full_info().uss // 1024 // 1024
process[entry.pid] = (
entry.usedGpuMemory // 1024 // 1024,
host_memory_in_mbs,
p.cmdline(),
)
except Exception:
pass

gpu_memory[idx] = {
"total_used": total_used,
"total": total,
"process": process
try:
import pynvml
pynvml.nvmlInit()

handles = {
idx: pynvml.nvmlDeviceGetHandleByIndex(idx)
for idx in range(pynvml.nvmlDeviceGetCount())
}

deadline = time.perf_counter() + 60 # 1 min
observed_used = 0
global gpu_warning_threshold

while time.perf_counter() < deadline:
observed_used = max(
pynvml.nvmlDeviceGetMemoryInfo(device).used
for device in handles.values())
if observed_used <= gpu_warning_threshold:
break
time.sleep(1)
else:
gpu_warning_threshold = max(observed_used, gpu_warning_threshold)
warnings.warn(
f"Test {item.name} does not free up GPU memory correctly!")

for idx, device in handles.items():
total_used = pynvml.nvmlDeviceGetMemoryInfo(
device).used // 1024 // 1024
total = pynvml.nvmlDeviceGetMemoryInfo(device).total // 1024 // 1024
detail = pynvml.nvmlDeviceGetComputeRunningProcesses(device)
process = {}

for entry in detail:
try:
p = psutil.Process(entry.pid)
host_memory_in_mbs = p.memory_full_info(
).uss // 1024 // 1024
process[entry.pid] = (
entry.usedGpuMemory // 1024 // 1024,
host_memory_in_mbs,
p.cmdline(),
)
except Exception:
pass

gpu_memory[idx] = {
"total_used": total_used,
"total": total,
"process": process
}
except Exception:
gpu_memory = get_gpu_memory_wo_pynvml()

print("\nCurrent memory status:")
print(gpu_memory)

8 changes: 7 additions & 1 deletion tests/integration/defs/sysinfo/get_sysinfo.py
@@ -191,7 +191,13 @@ def construct_gpu_properties(mako_opts, device_index=0):
assert gpu_name != "", "device_product_name is empty after removing substring 'NVIDIA' and leading/trailing whitespaces."

compute_capability = get_compute_capability(device_index)
gpu_memory = pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024**2)
try:
gpu_memory = pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024**
2)
except pynvml.NVMLError_NotSupported as e:
logger.warning("Unable to get GPU memory info: {}".format(e))
# Fallback to 8 GiB, expressed in MiB to match the nvml path above.
gpu_memory = 8 * 1024
# Gather GPU information
mako_opt_dict["gpu"] = gpu_name
mako_opt_dict["gpu_memory"] = gpu_memory
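
If a less arbitrary default is ever wanted here, one possible alternative (purely a sketch, not part of this change) is to derive the fallback from host RAM, since the GB10 exposes unified memory:

# Hypothetical alternative fallback: on unified-memory systems the host RAM
# total is a closer estimate than a fixed 8 GiB.
import psutil

gpu_memory = psutil.virtual_memory().total / (1024**2)  # MiB, matching the nvml path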
42 changes: 42 additions & 0 deletions tests/integration/test_lists/test-db/l0_gb10.yml
@@ -0,0 +1,42 @@
version: 0.0.1
# DGX Spark is also known as the GB10 Grace Blackwell Superchip.
l0_gb10:
- condition:
ranges:
system_gpu_count:
gte: 1
lte: 1
wildcards:
gpu:
- '*gb10*'
linux_distribution_name: ubuntu*
cpu: aarch64
terms:
stage: post_merge
backend: pytorch
tests:
# ------------- PyTorch tests ---------------
- unittest/_torch/attention/test_attention_mla.py
- test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
- test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
- accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[latency]
- condition:
ranges:
system_gpu_count:
gte: 1
lte: 1
wildcards:
gpu:
- '*gb10*'
linux_distribution_name: ubuntu*
cpu: aarch64
terms:
stage: pre_merge
backend: pytorch
tests:
# ------------- PyTorch tests ---------------
# The cases below are commented out because they failed on gb10.
# - unittest/_torch/modeling -k "modeling_mllama"
- unittest/_torch/modeling -k "modeling_out_of_tree"
# - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[CUTLASS-dtype0]
# - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[CUTLASS-dtype1]
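
The condition blocks above are evaluated by the test-db tooling; conceptually the matching works roughly as sketched below (illustrative only, not the actual matcher implementation, with field names taken from this file):

import fnmatch

def condition_matches(cond, env):
    # Illustrative: every range, wildcard and term must hold for the test
    # list to be selected on a given machine/stage combination.
    ranges_ok = all(
        v.get("gte", env[k]) <= env[k] <= v.get("lte", env[k])
        for k, v in cond.get("ranges", {}).items())
    wildcards_ok = all(
        any(fnmatch.fnmatch(str(env[k]), p)
            for p in (v if isinstance(v, list) else [v]))
        for k, v in cond.get("wildcards", {}).items())
    terms_ok = all(env.get(k) == v for k, v in cond.get("terms", {}).items())
    return ranges_ok and wildcards_ok and terms_ok

# Example: a single-GPU GB10 machine running the pre_merge PyTorch stage.
env = {"system_gpu_count": 1, "gpu": "gb10", "linux_distribution_name": "ubuntu24.04",
       "cpu": "aarch64", "stage": "pre_merge", "backend": "pytorch"}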