diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 3af32ebe4bd..db197606eb8 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -452,6 +452,12 @@ def shutdown(self): """ Signals the server to shutdown. """ + import traceback + traceback.print_stack() + import os + print( + f"====================== shutdown in executor is called pid: {os.getpid()}" + ) self.executor_request_queue.enqueue_shutdown_request() self.shutdown_event.wait() self.worker_thread.join() diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py index 5d7c23940f7..05c22de3183 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py @@ -221,7 +221,10 @@ def create_py_executor( tokenizer: Optional[TokenizerBase] = None, profiling_stage_data: Optional[dict] = None, ) -> PyExecutor: - + # import os + # print(f"====================== create_py_executor pid: {os.getpid()}") + # import traceback + # print(f"====================== backtrace: {traceback.print_stack()}") garbage_collection_gen0_threshold = llm_args.garbage_collection_gen0_threshold lora_config = llm_args.lora_config kv_connector_config = llm_args.kv_connector_config diff --git a/tensorrt_llm/commands/serve.py b/tensorrt_llm/commands/serve.py index 6943df0c1ab..cf7c77b100a 100644 --- a/tensorrt_llm/commands/serve.py +++ b/tensorrt_llm/commands/serve.py @@ -44,6 +44,9 @@ def _signal_handler_cleanup_child(signum, frame): """Signal handler to clean up the child process.""" + print( + f"================================================ server received signal {signal.Signals(signum).name}" + ) global _child_p_global if _child_p_global and _child_p_global.poll() is None: # Using print for safety in signal handlers diff --git a/tensorrt_llm/executor/base_worker.py b/tensorrt_llm/executor/base_worker.py index 00f04a1d0a1..2fb23ab0588 100644 --- a/tensorrt_llm/executor/base_worker.py +++ b/tensorrt_llm/executor/base_worker.py @@ -638,6 +638,7 @@ def submit(self, request: GenerationRequest) -> GenerationResult: return result def shutdown(self): + print(f"========================== Shutting down worker {self.rank}") if self.doing_shutdown: return else: diff --git a/tensorrt_llm/executor/proxy.py b/tensorrt_llm/executor/proxy.py index f9c502f85d8..3415d31ed5d 100644 --- a/tensorrt_llm/executor/proxy.py +++ b/tensorrt_llm/executor/proxy.py @@ -290,6 +290,9 @@ def pre_shutdown(self): self.request_queue.put_noblock(None, retry=4) def shutdown(self): + print( + f"====================== shutdown in GenerationExecutorProxy is called pid: {os.getpid()}" + ) if not self.workers_started: return @@ -325,6 +328,10 @@ def shutdown(self): self.result_queue.close() self.workers_started = False + print( + f"====================== shutdown in GenerationExecutorProxy 2 is called pid: {os.getpid()}" + ) + self.mpi_session.shutdown() # Process the errors in-case error during shutting down the threads diff --git a/tensorrt_llm/executor/ray_executor.py b/tensorrt_llm/executor/ray_executor.py index 0fc4fa28105..cc19cb593c4 100644 --- a/tensorrt_llm/executor/ray_executor.py +++ b/tensorrt_llm/executor/ray_executor.py @@ -294,7 +294,9 @@ def shutdown(self): self._shutdown_event.set() logger_debug(f"Shutting down RayExecutor", color="yellow") - + print( + f"====================== shutdown in RayExecutor is called pid: {os.getpid()}" + ) if 
hasattr(self, 'main_loop') and self.main_loop and hasattr( self, 'main_loop_task_obj') and self.main_loop_task_obj: logger_debug("Cancelling main loop task.", color="yellow") diff --git a/tensorrt_llm/executor/ray_gpu_worker.py b/tensorrt_llm/executor/ray_gpu_worker.py index b8a22af4720..371fe9d2938 100644 --- a/tensorrt_llm/executor/ray_gpu_worker.py +++ b/tensorrt_llm/executor/ray_gpu_worker.py @@ -151,6 +151,9 @@ def call_worker_method(self, method_name: str, *args, **kwargs): f"The RayGPUWorker has no method called '{method_name}'.") def shutdown(self): + print( + f"====================== shutdown in RayWorkerWrapper is called pid: {os.getpid()}" + ) if hasattr(self, 'worker'): self.worker.shutdown() @@ -298,6 +301,9 @@ def shutdown(self): return else: self.doing_shutdown = True + print( + f"====================== shutdown in RayGPUWorker is called pid: {os.getpid()}" + ) logger.debug(f'Worker {self.rank} shutting down...') diff --git a/tensorrt_llm/executor/rpc/rpc_server.py b/tensorrt_llm/executor/rpc/rpc_server.py index 6b598b98ea5..f3e8a2530ca 100644 --- a/tensorrt_llm/executor/rpc/rpc_server.py +++ b/tensorrt_llm/executor/rpc/rpc_server.py @@ -134,6 +134,9 @@ def shutdown(self, is_remote_call: bool = False) -> None: # Set the stop event to True, this will trigger immediate shutdown self._stop_event.set() + print( + f"==================================== RPCServer shutdown called, is_remote_call={is_remote_call}" + ) # Log pending requests that will be cancelled logger_debug( f"[server] RPCServer is shutting down: {self._num_pending_requests} pending requests will be cancelled" diff --git a/tensorrt_llm/executor/rpc_proxy.py b/tensorrt_llm/executor/rpc_proxy.py index 722609dea61..211e6add89c 100644 --- a/tensorrt_llm/executor/rpc_proxy.py +++ b/tensorrt_llm/executor/rpc_proxy.py @@ -187,8 +187,17 @@ def abort_request(self, request_id: int) -> None: return self.rpc_client.abort_request(request_id).remote() def shutdown(self): + import traceback + traceback.print_stack() + import os + print( + f"====================== shutdown in GenerationExecutorRpcProxy is called pid: {os.getpid()}" + ) if self._shutdown_event.is_set(): return + print( + f"====================== shutdown in GenerationExecutorRpcProxy 2 is called pid: {os.getpid()}" + ) self._shutdown_event.set() logger_debug(f"Shutting down GenerationExecutorRpcProxy", color="yellow") diff --git a/tensorrt_llm/executor/rpc_worker.py b/tensorrt_llm/executor/rpc_worker.py index 665e8a07234..66681f7496d 100644 --- a/tensorrt_llm/executor/rpc_worker.py +++ b/tensorrt_llm/executor/rpc_worker.py @@ -102,6 +102,9 @@ def shutdown(self): logger_debug(f"[worker] RpcWorker #{mpi_rank()} is shutting down", color="yellow") self.shutdown_event.set() + print( + f"====================== shutdown in RpcWorker is called pid: {os.getpid()}" + ) super().shutdown() logger_debug(f"[worker] RpcWorker #{mpi_rank()} is shutdown", color="yellow") diff --git a/tensorrt_llm/executor/utils.py b/tensorrt_llm/executor/utils.py index e52ea481fb0..c9e815bf339 100644 --- a/tensorrt_llm/executor/utils.py +++ b/tensorrt_llm/executor/utils.py @@ -101,6 +101,9 @@ def submit_sync(self, task: Callable, *args, **kwargs) -> List[Any]: return [future.result() for future in futures] def shutdown(self): + print( + f"==================================== shutdown ProcessPoolExecutor session" + ) self.mpi_pool.shutdown(wait=True) diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index c4917a86a53..77ff025f348 100644 --- 
a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -84,6 +84,9 @@ def start(self): self.start_thread(self.await_response_thread) def shutdown(self): + print( + f"====================== GenerationExecutorWorker shutdown is called pid: {os.getpid()}" + ) if self.doing_shutdown: return @@ -98,6 +101,9 @@ def shutdown(self): self.await_response_thread.stop() self.await_response_thread.join() + print( + f"====================== GenerationExecutorWorker engine shutdown is called pid: {os.getpid()}" + ) self.engine.shutdown() self.engine = None @@ -317,6 +323,9 @@ def notify_proxy_threads_to_quit(): else: raise ValueError(f"Unknown request type: {type(req)}") + print( + f"====================== Worker {mpi_rank()} received shutdown signal from proxy process." + ) notify_proxy_threads_to_quit() except GenerationExecutorWorker.WorkerExit as e: @@ -325,7 +334,11 @@ def notify_proxy_threads_to_quit(): except Exception as e: # other critical errors if is_leader: + print( + f"====================== Worker {mpi_rank()} hit a critical error, notifying proxy threads to quit." + ) notify_proxy_threads_to_quit() logger.error(traceback.format_exc()) # This will be captured by mpi4py and handled by future.done_callback raise e + logger_debug(f"Worker {mpi_rank()} exiting worker_main...\n", "green") diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index 6d3410bf3c2..c0be8080a22 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -172,6 +172,10 @@ def __init__(self, f"{self.__class__.__name__} got invalid argument: {key}" ) + import os + print( + f"====================== llm class is: {llm_args_cls} pid: {os.getpid()}" + ) self.args = llm_args_cls.from_kwargs( model=model, tokenizer=tokenizer, @@ -814,11 +818,21 @@ def _try_load_hf_model_config( @set_api_status("beta") def shutdown(self) -> None: + import traceback + traceback.print_stack() + import os + print(f"====================== shutdown is called pid: {os.getpid()}") if hasattr(self, "_executor") and self._executor is not None: + print( + f"====================== _executor shutdown is called pid: {os.getpid()}" + ) self._executor.shutdown() self._executor = None if hasattr(self, 'mpi_session') and self.mpi_session is not None: + print( + f"====================== mpi_session shutdown is called pid: {os.getpid()}" + ) self.mpi_session.shutdown() self.mpi_session = None @@ -837,6 +851,12 @@ def _check_health(self) -> bool: def _shutdown_wrapper(self_ref): # Retrieve the instance if it still exists instance = self_ref() + import traceback + traceback.print_stack() + import os + print( + f"====================== _shutdown_wrapper is called pid: {os.getpid()}" + ) if instance is not None: instance.shutdown() @@ -848,6 +868,12 @@ def __exit__( ) -> Literal[ False]: # https://github.com/microsoft/pyright/issues/7009#issuecomment-1894135045 del exc_value, traceback + import traceback + traceback.print_stack() + import os + print( + f"====================== LLM __exit__ is called pid: {os.getpid()}" + ) self.shutdown() return False # propagate exceptions @@ -855,6 +881,8 @@ def __getstate__(self): raise RuntimeError("LLM object can not be pickled.") def __del__(self): + print( + f"====================== LLM __del__ is called pid: {os.getpid()}") self.shutdown() diff --git a/tensorrt_llm/llmapi/mpi_session.py b/tensorrt_llm/llmapi/mpi_session.py index d32e5a7b7aa..8f856c7900b 100644 --- a/tensorrt_llm/llmapi/mpi_session.py +++ b/tensorrt_llm/llmapi/mpi_session.py @@ -160,6 +160,7 @@ def submit_sync(self, 
task: Callable[..., T], *args, **kwargs) -> List[T]: return [future.result() for future in futures] def shutdown(self, wait=True): + print(f"==================================== shutdown MPI pool session") if self.mpi_pool is not None: self.mpi_pool.shutdown(wait=wait) self.mpi_pool = None @@ -237,6 +238,9 @@ def submit_sync(self, task: Callable[..., T], *args, **kwargs) -> List[T]: def shutdown(self, wait=True): # Only shutdown the mpi_pool if this instance created it # For shared global mpi_pool, we don't shut it down + print( + f"==================================== shutdown is called MPI comm session" + ) if self.mpi_pool is not None and self.owns_mpi_pool: self.mpi_pool.shutdown(wait=wait) self.mpi_pool = None diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index ab9c1591c17..a2c323c460c 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -14,6 +14,7 @@ # limitations under the License. import os import sys +import time import pytest import torch @@ -59,9 +60,9 @@ def patched_start_mpi_pool(self): from tensorrt_llm.quantization import QuantAlgo from ..conftest import (get_device_count, get_device_memory, llm_models_root, - parametrize_with_ids, skip_no_hopper, - skip_post_blackwell, skip_pre_ada, skip_pre_blackwell, - skip_pre_hopper, skip_ray) + parametrize_with_ids, print_device_memory, + skip_no_hopper, skip_post_blackwell, skip_pre_ada, + skip_pre_blackwell, skip_pre_hopper, skip_ray) from .accuracy_core import (GSM8K, MMLU, CnnDailymail, GPQADiamond, JsonModeEval, LlmapiAccuracyTestHarness, LongBenchV2) @@ -533,7 +534,9 @@ class TestLlama3_2_1B(LlmapiAccuracyTestHarness): MODEL_PATH = f"{llm_models_root()}/llama-3.2-models/Llama-3.2-1B" EXAMPLE_FOLDER = "models/core/llama" - def test_auto_dtype(self): + @pytest.mark.parametrize("pp_size", [2, 4], ids=["pp2", "pp4"]) + def test_auto_dtype(self, pp_size): + print_device_memory() with LLM(self.MODEL_PATH) as llm: task = CnnDailymail(self.MODEL_NAME) task.evaluate(llm) @@ -1328,6 +1331,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): @parametrize_with_ids("mtp_nextn", [0, 2]) def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph, overlap_scheduler, torch_compile, enable_chunked_prefill): + print_device_memory() kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75) torch_compile_config = TorchCompileConfig( enable_fullgraph=True, @@ -1351,6 +1355,11 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph, speculative_config=mtp_config) as llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) + print_device_memory() + + time.sleep(60) + print(f"================= print mem after 60s") + print_device_memory() @pytest.mark.skip_less_device_memory(60000) def test_bfloat16_2_model_mtp(self): @@ -1406,6 +1415,10 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn, mtp_config = None if mtp_nextn > 0: mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) + + #time.sleep(5) + print(f"================= print mem before testing") + print_device_memory() with LLM(self.MODEL_PATH, tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size, @@ -1417,6 +1430,18 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn, speculative_config=mtp_config) as llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) + print(f"================= print mem after testing") + print_device_memory() + + #time.sleep(5) + 
print(f"================= print mem after testing outside") + print_device_memory() + + print(f"++++++++++++++++++++++++++++++++++++++++\n\n\n") + + #time.sleep(60) + #print(f"================= print mem after 60s") + #print_device_memory() @skip_pre_hopper @parametrize_with_ids("torch_compile", [False, True]) @@ -2263,6 +2288,13 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, if moe_backend == "TRTLLM" and sm_version in (120, 121): pytest.skip(f"{moe_backend} backend does not support SM 120 or 121") + import gc + gc.collect() + torch.cuda.empty_cache() + + print(f"\n--- nvidia-smi start to test ---") + print_device_memory() + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70) pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, @@ -2297,9 +2329,19 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, task = CnnDailymail(self.MODEL_NAME) task.evaluate(llm) # Commented out because GPQA takes too long to run - # task = GPQADiamond(self.MODEL_NAME) - # task.evaluate(llm, - # extra_evaluator_kwargs=dict(apply_chat_template=True)) + task = GPQADiamond(self.MODEL_NAME) + task.evaluate(llm, + extra_evaluator_kwargs=dict(apply_chat_template=True)) + print("=================================== test finishes") + print_device_memory() + + import gc + gc.collect() + torch.cuda.empty_cache() + + time.sleep(180) + print(f"\n--- nvidia-smi after testing after 180s ---") + print_device_memory() @skip_pre_blackwell @pytest.mark.parametrize( diff --git a/tests/integration/defs/conftest.py b/tests/integration/defs/conftest.py index 92d593bf206..933c1562532 100644 --- a/tests/integration/defs/conftest.py +++ b/tests/integration/defs/conftest.py @@ -2004,40 +2004,65 @@ def get_device_count(): return len(get_gpu_device_list()) -def get_device_memory(): - "get gpu memory" - memory = 0 +def get_device_memory_str(): with tempfile.TemporaryDirectory() as temp_dirname: suffix = ".exe" if is_windows() else "" - # TODO: Use NRSU because we can't assume nvidia-smi across all platforms. cmd = " ".join([ - "nvidia-smi" + suffix, "--query-gpu=memory.total", + "nvidia-smi" + suffix, + "--query-gpu=memory.total,memory.reserved,memory.used,memory.free", "--format=csv,noheader" ]) - # Try to get memory from nvidia-smi first, if failed, fallback to system memory from /proc/meminfo - # This fallback is needed for systems with unified memory (e.g. DGX Spark) + output = check_output(cmd, shell=True, cwd=temp_dirname) + return output.strip() + + +def get_device_memory(): + "get gpu memory" + memory = 0 + # Try to get memory from nvidia-smi first, if failed, fallback to system memory from /proc/meminfo + # This fallback is needed for systems with unified memory (e.g. 
DGX Spark) + try: + output = get_device_memory_str() + memory_str = output.strip().split()[0] + # Check if nvidia-smi returned a valid numeric value + if "N/A" in memory_str: + raise ValueError("nvidia-smi returned invalid memory info") + memory = int(memory_str) + except (sp.CalledProcessError, ValueError, IndexError): + # Fallback to system memory from /proc/meminfo (in kB, convert to MiB) try: - output = check_output(cmd, shell=True, cwd=temp_dirname) - memory_str = output.strip().split()[0] - # Check if nvidia-smi returned a valid numeric value - if "N/A" in memory_str: - raise ValueError("nvidia-smi returned invalid memory info") - memory = int(memory_str) - except (sp.CalledProcessError, ValueError, IndexError): - # Fallback to system memory from /proc/meminfo (in kB, convert to MiB) - try: - with open("/proc/meminfo", "r") as f: - for line in f: - if line.startswith("MemTotal:"): - memory = int( - line.split()[1]) // 1024 # Convert kB to MiB - break - except: - memory = 8192 # Default 8GB if all else fails + with open("/proc/meminfo", "r") as f: + for line in f: + if line.startswith("MemTotal:"): + memory = int( + line.split()[1]) // 1024 # Convert kB to MiB - break + except: + memory = 8192 # Default 8GB if all else fails return memory +def print_device_memory(): + memory_str = get_device_memory_str() + print(f"Device Memory:\ntotal: reserved: used: free: \n{memory_str}") + + mem_stats = torch.cuda.memory_stats() + torch_allocated_bytes = mem_stats["allocated_bytes.all.current"] + torch_reserved_bytes = mem_stats["reserved_bytes.all.current"] + print( + f"================================== torch mem stats: allocated {torch_allocated_bytes} reserved {torch_reserved_bytes}" + ) + print(f"\n--- nvidia-smi in print_device_memory ---") + sp.run(["nvidia-smi"], check=False) + + free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info() + total_used_bytes = total_gpu_memory - free_gpu_memory + print( + f"================================== torch mem info: free {free_gpu_memory}, total {total_gpu_memory}, used {total_used_bytes}" + ) + + def pytest_addoption(parser): parser.addoption( "--test-list", diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml index 57c3b6fd810..1174f6066c9 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml @@ -32,10 +32,10 @@ l0_gb200_multi_nodes: backend: pytorch tests: - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] TIMEOUT (180) - - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] TIMEOUT (180) - - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180) - - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp] TIMEOUT (180) - - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp] TIMEOUT (180) + - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] TIMEOUT (180) ISOLATION + - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180) ISOLATION + - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp] TIMEOUT (180) ISOLATION + - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp] TIMEOUT (180) ISOLATION - 
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (90) - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (90) - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp] TIMEOUT (90) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 6de1fa6b552..47b4d5ba673 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -324,7 +324,6 @@ accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] SKIP ( accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False] SKIP (https://nvbugs/5644632) test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5648560) test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False] SKIP (https://nvbugs/5648560) -accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp] SKIP (https://nvbugs/5629136) examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052) accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype SKIP (https://nvbugs/5648441) accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype SKIP (https://nvbugs/5648441) @@ -359,8 +358,6 @@ accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4] S test_e2e.py::test_openai_completions_example[trt] SKIP (https://nvbugs/5701450) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5701457) triton_server/test_triton_llm.py::test_llmapi_backend[4-0-disableDecoupleMode-tensorrt_llm] SKIP (https://nvbugs/5701480) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5701425) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5701425) unittest/_torch/modules/tests_lora_modules/test_lora_attention_pytorch_flow_vs_trt.py::TestLoraAttentionPytorchFlowVsTRT::test_lora_attention SKIP (https://nvbugs/5701421) accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[no_cuda_graph_overlap-cutlass] SKIP (https://nvbugs/5702795) accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass] SKIP (https://nvbugs/5702795) @@ -389,10 +386,7 @@ accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-c accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto] SKIP (https://nvbugs/5596343) unittest/_torch/speculative/test_spec_gate.py::test_spec_gate_e2e SKIP (https://nvbugs/5710045) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram SKIP (https://nvbugs/5569696) -accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp_trtllm] SKIP (https://nvbugs/5715568) -accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] SKIP (https://nvbugs/5715568) 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5721661) -accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] SKIP (https://nvbugs/5715568) unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py::test_flashinfer_attention_op_context_input_pos[cuda-dtype0-4-8-seq6] SKIP (https://nvbugs/5721907) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5722629) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_2gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) @@ -400,19 +394,9 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_mode unittest/llmapi/test_llm_pytorch.py::test_llm_reward_model SKIP (https://nvbugs/5670458) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[enable_configurable_moe-moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5727475) accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] SKIP (https://nvbugs/5740377) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740377) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740377) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5740377) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740377) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740377) accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_trtllm-torch_compile=False] SKIP (https://nvbugs/5740377) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5740087) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740087) accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=2] SKIP (https://nvbugs/5740075) accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2] SKIP (https://nvbugs/5740075) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP 
(https://nvbugs/5740075) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5740075) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740359) unittest/_torch/modeling/test_modeling_out_of_tree.py::TestOutOfTree::test_llm_api[False] SKIP (https://nvbugs/5739981) unittest/_torch/modeling/test_modeling_out_of_tree.py::TestOutOfTree::test_llm_api[True] SKIP (https://nvbugs/5739981) unittest/_torch/modeling/test_modeling_out_of_tree.py::TestOutOfTree::test_serve[True] SKIP (https://nvbugs/5739981) @@ -420,26 +404,14 @@ full:sm89/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ full:sm89/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp2pp2] SKIP (https://nvbugs/5596337) accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5721672) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=True-attn_backend=FLASHINFER-torch_compile=True] SKIP (https://nvbugs/5741304) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740377, https://nvbugs/5740075) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5740087) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740075) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740075) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740359) unittest/_torch/multi_gpu/test_allreduce.py::test_allreduce_fusion_patterns[2-residual_rms_norm_out_quant_fp8-hidden:7168-seqlen:8192] SKIP (https://nvbugs/5741392) unittest/executor/test_rpc.py::TestRpcCorrectness::test_incremental_task_async SKIP (https://nvbugs/5741476) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5740377) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5740377) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5740377) accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False] SKIP (https://nvbugs/5740377) 
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740359) examples/test_phi.py::test_phi_fp8_with_bf16_lora[phi-2] SKIP (https://nvbugs/5744293) examples/test_phi.py::test_llm_phi_1node_2gpus_summary[Phi-3.5-MoE-instruct-nb:1] SKIP (https://nvbugs/5744293) examples/test_phi.py::test_llm_phi_quantization_1gpu[phi-2-fp8-bfloat16] SKIP (https://nvbugs/5744293) test_e2e.py::test_trtllm_bench_llmapi_launch[pytorch_backend-llama-v3-llama3-8b] SKIP (https://nvbugs/5744432) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5740087) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5740075) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740075) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740075) test_e2e.py::test_trtllm_serve_multimodal_example SKIP (https://nvbugs/5747920) test_e2e.py::test_trtllm_serve_example SKIP (https://nvbugs/5747938) unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py::test_build_ad[meta-llama/Llama-4-Scout-17B-16E-Instruct-llm_extra_args8] SKIP (https://nvbugs/5747878) @@ -495,7 +467,6 @@ examples/serve/test_serve.py::test_config_file_loading[--config] SKIP (https://n full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740075) examples/test_ray.py::test_ray_disaggregated_serving[tp2] SKIP (https://nvbugs/5612502) disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5748683) -accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] SKIP (https://nvbugs/5715568) unittest/executor/test_rpc_proxy.py SKIP (https://nvbugs/5605741) unittest/executor/test_rpc_worker.py SKIP (https://nvbugs/5605741) unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_dist_backend.py::test_dist_backend_all_gather[torch] SKIP (https://nvbugs/5766986)