Skip to content

Commit d81ebb5

Browse files
authored
[https://nvbugs/5444687][fix] Cherry-pick online EPLB CI fix from main to release 1.1 (#8854)
Signed-off-by: Dongxu Yang <78518666+dongxuy04@users.noreply.github.com>
1 parent f5575a9 commit d81ebb5

File tree

7 files changed

+11
-11
lines changed

7 files changed

+11
-11
lines changed

cpp/tensorrt_llm/runtime/moeLoadBalancer/gdrwrap.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -201,13 +201,13 @@ void driver_get_version(gdr_t g, int* major, int* minor)
201201
int copy_to_mapping(gdr_mh_t handle, void* map_d_ptr, void const* h_ptr, size_t size)
202202
{
203203
CHECK_INITIALIZED();
204-
return GDRLOCKCALL(gdr_internal_copy_to_mapping(handle, map_d_ptr, h_ptr, size));
204+
return gdr_internal_copy_to_mapping(handle, map_d_ptr, h_ptr, size);
205205
}
206206

207207
int copy_from_mapping(gdr_mh_t handle, void* h_ptr, void const* map_d_ptr, size_t size)
208208
{
209209
CHECK_INITIALIZED();
210-
return GDRLOCKCALL(gdr_internal_copy_from_mapping(handle, h_ptr, map_d_ptr, size));
210+
return gdr_internal_copy_from_mapping(handle, h_ptr, map_d_ptr, size);
211211
}
212212

213213
void gdrCudaMalloc(void** ptr, void** devPtr, size_t mapSize, GdrMemDesc** memDesc, gdr_t handle)

cpp/tensorrt_llm/runtime/moeLoadBalancer/moeLoadBalancer.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -871,7 +871,7 @@ MoeLoadBalancer::MoeLoadBalancer(int epRank, int epSize, int layerUpdatesPerIter
871871
}
872872
}
873873

874-
mMultiThreadWorker.reset(new MultiThreadWorker(numCopyThreads));
874+
mMultiThreadWorker.reset(new MultiThreadWorker(numCopyThreads, mCudaDeviceId));
875875
}
876876

877877
MoeLoadBalancer::~MoeLoadBalancer()
@@ -1064,8 +1064,9 @@ void MoeLoadBalancer::waitCopyTaskDone(int64_t taskId)
10641064
}
10651065
}
10661066

1067-
MultiThreadWorker::MultiThreadWorker(int numThreads)
1067+
MultiThreadWorker::MultiThreadWorker(int numThreads, int cudaDeviceId)
10681068
: mNumThreads(numThreads)
1069+
, mCudaDeviceId(cudaDeviceId)
10691070
, mRunning(false)
10701071
, mNextTaskId(0)
10711072
{
@@ -1139,6 +1140,7 @@ void MultiThreadWorker::stop()
11391140

11401141
void MultiThreadWorker::workerLoop(int rank)
11411142
{
1143+
TLLM_CUDA_CHECK(cudaSetDevice(mCudaDeviceId));
11421144
auto& topologyDetector = TopologyDetector::getInstance();
11431145
topologyDetector.bindThreadByCurrentGpu(); // use relaxed mode
11441146
while (true)

cpp/tensorrt_llm/runtime/moeLoadBalancer/moeLoadBalancer.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ class SingleLayerMoeLoadBalancer
219219
class MultiThreadWorker
220220
{
221221
public:
222-
explicit MultiThreadWorker(int numThreads);
222+
explicit MultiThreadWorker(int numThreads, int cudaDeviceId);
223223
~MultiThreadWorker();
224224

225225
void start();
@@ -239,6 +239,7 @@ class MultiThreadWorker
239239
void workerLoop(int rank);
240240

241241
int mNumThreads;
242+
int mCudaDeviceId;
242243
std::vector<std::thread> mThreads;
243244
std::mutex mMutex;
244245
std::condition_variable mCondition;

jenkins/L0_Test.groovy

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
443443

444444
if (partition.clusterName == "dlcluster") {
445445
dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
446+
dockerArgs += " --device=/dev/gdrdrv:/dev/gdrdrv"
446447
}
447448
echo "Final dockerArgs: ${dockerArgs}"
448449
} else {

tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,4 @@ l0_gb200_multi_gpus:
7070
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
7171
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
7272
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
73-
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True]
74-
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2]
7573
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] TIMEOUT (90)

tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,8 @@ l0_rtx_pro_6000:
8585
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
8686
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
8787
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
88-
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True]
89-
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2]
88+
# - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True] # Verify GDRCopy availability on Blossom pods
89+
# - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2] # Verify GDRCopy availability on Blossom pods
9090
# - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False] # hopper only
9191
# - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]
9292
- accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0]

tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -325,8 +325,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_
325325
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp_trtllm] SKIP (https://nvbugs/5537738)
326326
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5503479)
327327
unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[pp1-ep1-disable_adp-enable_graph-tp8-trtllm-scout] SKIP (https://nvbugs/5541545)
328-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2] SKIP (https://nvbugs/5444687)
329-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True] SKIP (https://nvbugs/5444687)
330328
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5565604)
331329
unittest/_torch/multi_gpu_modeling/test_llama3.py::test_llama_3_3 SKIP (https://nvbugs/5565559)
332330
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized SKIP (https://nvbugs/5565521)

0 commit comments

Comments (0)