Skip to content

Commit d81ebb5

Browse files
authored
[https://nvbugs/5444687][fix] Cherry-pick online EPLB CI fix from main to release 1.1 (#8854)
Signed-off-by: Dongxu Yang <78518666+dongxuy04@users.noreply.github.com>
1 parent f5575a9 commit d81ebb5

File tree

7 files changed

+11
-11
lines changed

7 files changed

+11
-11
lines changed

cpp/tensorrt_llm/runtime/moeLoadBalancer/gdrwrap.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -201,13 +201,13 @@ void driver_get_version(gdr_t g, int* major, int* minor)
201201
int copy_to_mapping(gdr_mh_t handle, void* map_d_ptr, void const* h_ptr, size_t size)
202202
{
203203
CHECK_INITIALIZED();
204-
return GDRLOCKCALL(gdr_internal_copy_to_mapping(handle, map_d_ptr, h_ptr, size));
204+
return gdr_internal_copy_to_mapping(handle, map_d_ptr, h_ptr, size);
205205
}
206206

207207
int copy_from_mapping(gdr_mh_t handle, void* h_ptr, void const* map_d_ptr, size_t size)
208208
{
209209
CHECK_INITIALIZED();
210-
return GDRLOCKCALL(gdr_internal_copy_from_mapping(handle, h_ptr, map_d_ptr, size));
210+
return gdr_internal_copy_from_mapping(handle, h_ptr, map_d_ptr, size);
211211
}
212212

213213
void gdrCudaMalloc(void** ptr, void** devPtr, size_t mapSize, GdrMemDesc** memDesc, gdr_t handle)

cpp/tensorrt_llm/runtime/moeLoadBalancer/moeLoadBalancer.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -871,7 +871,7 @@ MoeLoadBalancer::MoeLoadBalancer(int epRank, int epSize, int layerUpdatesPerIter
871871
}
872872
}
873873

874-
mMultiThreadWorker.reset(new MultiThreadWorker(numCopyThreads));
874+
mMultiThreadWorker.reset(new MultiThreadWorker(numCopyThreads, mCudaDeviceId));
875875
}
876876

877877
MoeLoadBalancer::~MoeLoadBalancer()
@@ -1064,8 +1064,9 @@ void MoeLoadBalancer::waitCopyTaskDone(int64_t taskId)
10641064
}
10651065
}
10661066

1067-
MultiThreadWorker::MultiThreadWorker(int numThreads)
1067+
MultiThreadWorker::MultiThreadWorker(int numThreads, int cudaDeviceId)
10681068
: mNumThreads(numThreads)
1069+
, mCudaDeviceId(cudaDeviceId)
10691070
, mRunning(false)
10701071
, mNextTaskId(0)
10711072
{
@@ -1139,6 +1140,7 @@ void MultiThreadWorker::stop()
11391140

11401141
void MultiThreadWorker::workerLoop(int rank)
11411142
{
1143+
TLLM_CUDA_CHECK(cudaSetDevice(mCudaDeviceId));
11421144
auto& topologyDetector = TopologyDetector::getInstance();
11431145
topologyDetector.bindThreadByCurrentGpu(); // use relaxed mode
11441146
while (true)

cpp/tensorrt_llm/runtime/moeLoadBalancer/moeLoadBalancer.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ class SingleLayerMoeLoadBalancer
219219
class MultiThreadWorker
220220
{
221221
public:
222-
explicit MultiThreadWorker(int numThreads);
222+
explicit MultiThreadWorker(int numThreads, int cudaDeviceId);
223223
~MultiThreadWorker();
224224

225225
void start();
@@ -239,6 +239,7 @@ class MultiThreadWorker
239239
void workerLoop(int rank);
240240

241241
int mNumThreads;
242+
int mCudaDeviceId;
242243
std::vector<std::thread> mThreads;
243244
std::mutex mMutex;
244245
std::condition_variable mCondition;

jenkins/L0_Test.groovy

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
443443

444444
if (partition.clusterName == "dlcluster") {
445445
dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
446+
dockerArgs += " --device=/dev/gdrdrv:/dev/gdrdrv"
446447
}
447448
echo "Final dockerArgs: ${dockerArgs}"
448449
} else {

tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,4 @@ l0_gb200_multi_gpus:
7070
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
7171
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
7272
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
73-
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True]
74-
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2]
7573
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] TIMEOUT (90)

tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,8 @@ l0_rtx_pro_6000:
8585
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
8686
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
8787
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
88-
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True]
89-
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2]
88+
# - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True] # Verify GDRCopy availability on Blossom pods
89+
# - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2] # Verify GDRCopy availability on Blossom pods
9090
# - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False] # hopper only
9191
# - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]
9292
- accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0]

tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -325,8 +325,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_
325325
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp_trtllm] SKIP (https://nvbugs/5537738)
326326
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5503479)
327327
unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[pp1-ep1-disable_adp-enable_graph-tp8-trtllm-scout] SKIP (https://nvbugs/5541545)
328-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2] SKIP (https://nvbugs/5444687)
329-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True] SKIP (https://nvbugs/5444687)
330328
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5565604)
331329
unittest/_torch/multi_gpu_modeling/test_llama3.py::test_llama_3_3 SKIP (https://nvbugs/5565559)
332330
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized SKIP (https://nvbugs/5565521)

0 commit comments

Comments (0)