diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 3af32ebe4bd..db197606eb8 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -452,6 +452,12 @@ def shutdown(self): """ Signals the server to shutdown. """ + import traceback + traceback.print_stack() + import os + print( + f"====================== shutdown in executor is called pid: {os.getpid()}" + ) self.executor_request_queue.enqueue_shutdown_request() self.shutdown_event.wait() self.worker_thread.join() diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py index 5d7c23940f7..05c22de3183 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py @@ -221,7 +221,10 @@ def create_py_executor( tokenizer: Optional[TokenizerBase] = None, profiling_stage_data: Optional[dict] = None, ) -> PyExecutor: - + # import os + # print(f"====================== create_py_executor pid: {os.getpid()}") + # import traceback + # print(f"====================== backtrace: {traceback.print_stack()}") garbage_collection_gen0_threshold = llm_args.garbage_collection_gen0_threshold lora_config = llm_args.lora_config kv_connector_config = llm_args.kv_connector_config diff --git a/tensorrt_llm/commands/serve.py b/tensorrt_llm/commands/serve.py index 6943df0c1ab..cf7c77b100a 100644 --- a/tensorrt_llm/commands/serve.py +++ b/tensorrt_llm/commands/serve.py @@ -44,6 +44,9 @@ def _signal_handler_cleanup_child(signum, frame): """Signal handler to clean up the child process.""" + print( + f"================================================ server received signal {signal.Signals(signum).name}" + ) global _child_p_global if _child_p_global and _child_p_global.poll() is None: # Using print for safety in signal handlers diff --git a/tensorrt_llm/executor/base_worker.py b/tensorrt_llm/executor/base_worker.py index 00f04a1d0a1..2fb23ab0588 100644 --- a/tensorrt_llm/executor/base_worker.py +++ b/tensorrt_llm/executor/base_worker.py @@ -638,6 +638,7 @@ def submit(self, request: GenerationRequest) -> GenerationResult: return result def shutdown(self): + print(f"========================== Shutting down worker {self.rank}") if self.doing_shutdown: return else: diff --git a/tensorrt_llm/executor/proxy.py b/tensorrt_llm/executor/proxy.py index f9c502f85d8..3415d31ed5d 100644 --- a/tensorrt_llm/executor/proxy.py +++ b/tensorrt_llm/executor/proxy.py @@ -290,6 +290,9 @@ def pre_shutdown(self): self.request_queue.put_noblock(None, retry=4) def shutdown(self): + print( + f"====================== shutdown in GenerationExecutorProxy is called pid: {os.getpid()}" + ) if not self.workers_started: return @@ -325,6 +328,10 @@ def shutdown(self): self.result_queue.close() self.workers_started = False + print( + f"====================== shutdown in GenerationExecutorProxy 2 is called pid: {os.getpid()}" + ) + self.mpi_session.shutdown() # Process the errors in-case error during shutting down the threads diff --git a/tensorrt_llm/executor/ray_executor.py b/tensorrt_llm/executor/ray_executor.py index 0fc4fa28105..cc19cb593c4 100644 --- a/tensorrt_llm/executor/ray_executor.py +++ b/tensorrt_llm/executor/ray_executor.py @@ -294,7 +294,9 @@ def shutdown(self): self._shutdown_event.set() logger_debug(f"Shutting down RayExecutor", color="yellow") - + print( + f"====================== shutdown in RayExecutor is called pid: {os.getpid()}" + ) if 
hasattr(self, 'main_loop') and self.main_loop and hasattr( self, 'main_loop_task_obj') and self.main_loop_task_obj: logger_debug("Cancelling main loop task.", color="yellow") diff --git a/tensorrt_llm/executor/ray_gpu_worker.py b/tensorrt_llm/executor/ray_gpu_worker.py index b8a22af4720..371fe9d2938 100644 --- a/tensorrt_llm/executor/ray_gpu_worker.py +++ b/tensorrt_llm/executor/ray_gpu_worker.py @@ -151,6 +151,9 @@ def call_worker_method(self, method_name: str, *args, **kwargs): f"The RayGPUWorker has no method called '{method_name}'.") def shutdown(self): + print( + f"====================== shutdown in RayWorkerWrapper is called pid: {os.getpid()}" + ) if hasattr(self, 'worker'): self.worker.shutdown() @@ -298,6 +301,9 @@ def shutdown(self): return else: self.doing_shutdown = True + print( + f"====================== shutdown in RayGPUWorker is called pid: {os.getpid()}" + ) logger.debug(f'Worker {self.rank} shutting down...') diff --git a/tensorrt_llm/executor/rpc/rpc_server.py b/tensorrt_llm/executor/rpc/rpc_server.py index 6b598b98ea5..f3e8a2530ca 100644 --- a/tensorrt_llm/executor/rpc/rpc_server.py +++ b/tensorrt_llm/executor/rpc/rpc_server.py @@ -134,6 +134,9 @@ def shutdown(self, is_remote_call: bool = False) -> None: # Set the stop event to True, this will trigger immediate shutdown self._stop_event.set() + print( + f"==================================== RPCServer shutdown called, is_remote_call={is_remote_call}" + ) # Log pending requests that will be cancelled logger_debug( f"[server] RPCServer is shutting down: {self._num_pending_requests} pending requests will be cancelled" diff --git a/tensorrt_llm/executor/rpc_proxy.py b/tensorrt_llm/executor/rpc_proxy.py index 722609dea61..211e6add89c 100644 --- a/tensorrt_llm/executor/rpc_proxy.py +++ b/tensorrt_llm/executor/rpc_proxy.py @@ -187,8 +187,17 @@ def abort_request(self, request_id: int) -> None: return self.rpc_client.abort_request(request_id).remote() def shutdown(self): + import traceback + traceback.print_stack() + import os + print( + f"====================== shutdown in GenerationExecutorRpcProxy is called pid: {os.getpid()}" + ) if self._shutdown_event.is_set(): return + print( + f"====================== shutdown in GenerationExecutorRpcProxy 2 is called pid: {os.getpid()}" + ) self._shutdown_event.set() logger_debug(f"Shutting down GenerationExecutorRpcProxy", color="yellow") diff --git a/tensorrt_llm/executor/rpc_worker.py b/tensorrt_llm/executor/rpc_worker.py index 665e8a07234..66681f7496d 100644 --- a/tensorrt_llm/executor/rpc_worker.py +++ b/tensorrt_llm/executor/rpc_worker.py @@ -102,6 +102,9 @@ def shutdown(self): logger_debug(f"[worker] RpcWorker #{mpi_rank()} is shutting down", color="yellow") self.shutdown_event.set() + print( + f"====================== shutdown in RpcWorker is called pid: {os.getpid()}" + ) super().shutdown() logger_debug(f"[worker] RpcWorker #{mpi_rank()} is shutdown", color="yellow") diff --git a/tensorrt_llm/executor/utils.py b/tensorrt_llm/executor/utils.py index e52ea481fb0..c9e815bf339 100644 --- a/tensorrt_llm/executor/utils.py +++ b/tensorrt_llm/executor/utils.py @@ -101,6 +101,9 @@ def submit_sync(self, task: Callable, *args, **kwargs) -> List[Any]: return [future.result() for future in futures] def shutdown(self): + print( + f"==================================== shutdown ProcessPoolExecutor session" + ) self.mpi_pool.shutdown(wait=True) diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index c4917a86a53..77ff025f348 100644 --- 
a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -84,6 +84,9 @@ def start(self): self.start_thread(self.await_response_thread) def shutdown(self): + print( + f"====================== GenerationExecutorWorker shutdown is called pid: {os.getpid()}" + ) if self.doing_shutdown: return @@ -98,6 +101,9 @@ def shutdown(self): self.await_response_thread.stop() self.await_response_thread.join() + print( + f"====================== GenerationExecutorWorker engine shutdown is called pid: {os.getpid()}" + ) self.engine.shutdown() self.engine = None @@ -317,6 +323,9 @@ def notify_proxy_threads_to_quit(): else: raise ValueError(f"Unknown request type: {type(req)}") + print( + f"====================== Worker {mpi_rank()} received shutdown signal from proxy process." + ) notify_proxy_threads_to_quit() except GenerationExecutorWorker.WorkerExit as e: @@ -325,7 +334,11 @@ def notify_proxy_threads_to_quit(): except Exception as e: # other critical errors if is_leader: + print( + f"====================== Worker {mpi_rank()} hit a critical error, notifying proxy threads to quit." + ) notify_proxy_threads_to_quit() logger.error(traceback.format_exc()) # This will be captured by mpi4py and handled by future.done_callback raise e + logger_debug(f"Worker {mpi_rank()} exiting worker_main...\n", "green") diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index 6d3410bf3c2..c0be8080a22 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -172,6 +172,10 @@ def __init__(self, f"{self.__class__.__name__} got invalid argument: {key}" ) + import os + print( + f"====================== llm class is: {llm_args_cls} pid: {os.getpid()}" + ) self.args = llm_args_cls.from_kwargs( model=model, tokenizer=tokenizer, @@ -814,11 +818,21 @@ def _try_load_hf_model_config( @set_api_status("beta") def shutdown(self) -> None: + import traceback + traceback.print_stack() + import os + print(f"====================== shutdown is called pid: {os.getpid()}") if hasattr(self, "_executor") and self._executor is not None: + print( + f"====================== _executor shutdown is called pid: {os.getpid()}" + ) self._executor.shutdown() self._executor = None if hasattr(self, 'mpi_session') and self.mpi_session is not None: + print( + f"====================== mpi_session shutdown is called pid: {os.getpid()}" + ) self.mpi_session.shutdown() self.mpi_session = None @@ -837,6 +851,12 @@ def _check_health(self) -> bool: def _shutdown_wrapper(self_ref): # Retrieve the instance if it still exists instance = self_ref() + import traceback + traceback.print_stack() + import os + print( + f"====================== _shutdown_wrapper is called pid: {os.getpid()}" + ) if instance is not None: instance.shutdown() @@ -848,6 +868,12 @@ def __exit__( ) -> Literal[ False]: # https://github.com/microsoft/pyright/issues/7009#issuecomment-1894135045 del exc_value, traceback + import traceback + traceback.print_stack() + import os + print( + f"====================== LLM __exit__ is called pid: {os.getpid()}" + ) self.shutdown() return False # propagate exceptions @@ -855,6 +881,8 @@ def __getstate__(self): raise RuntimeError("LLM object can not be pickled.") def __del__(self): + print( + f"====================== LLM __del__ is called pid: {os.getpid()}") self.shutdown() diff --git a/tensorrt_llm/llmapi/mpi_session.py b/tensorrt_llm/llmapi/mpi_session.py index d32e5a7b7aa..8f856c7900b 100644 --- a/tensorrt_llm/llmapi/mpi_session.py +++ b/tensorrt_llm/llmapi/mpi_session.py @@ -160,6 +160,7 @@ def submit_sync(self, 
task: Callable[..., T], *args, **kwargs) -> List[T]: return [future.result() for future in futures] def shutdown(self, wait=True): + print(f"==================================== shutdown MPI pool session") if self.mpi_pool is not None: self.mpi_pool.shutdown(wait=wait) self.mpi_pool = None @@ -237,6 +238,9 @@ def submit_sync(self, task: Callable[..., T], *args, **kwargs) -> List[T]: def shutdown(self, wait=True): # Only shutdown the mpi_pool if this instance created it # For shared global mpi_pool, we don't shut it down + print( + f"==================================== shutdown is called MPI comm session" + ) if self.mpi_pool is not None and self.owns_mpi_pool: self.mpi_pool.shutdown(wait=wait) self.mpi_pool = None diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index ab9c1591c17..a2c323c460c 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -14,6 +14,7 @@ # limitations under the License. import os import sys +import time import pytest import torch @@ -59,9 +60,9 @@ def patched_start_mpi_pool(self): from tensorrt_llm.quantization import QuantAlgo from ..conftest import (get_device_count, get_device_memory, llm_models_root, - parametrize_with_ids, skip_no_hopper, - skip_post_blackwell, skip_pre_ada, skip_pre_blackwell, - skip_pre_hopper, skip_ray) + parametrize_with_ids, print_device_memory, + skip_no_hopper, skip_post_blackwell, skip_pre_ada, + skip_pre_blackwell, skip_pre_hopper, skip_ray) from .accuracy_core import (GSM8K, MMLU, CnnDailymail, GPQADiamond, JsonModeEval, LlmapiAccuracyTestHarness, LongBenchV2) @@ -533,7 +534,9 @@ class TestLlama3_2_1B(LlmapiAccuracyTestHarness): MODEL_PATH = f"{llm_models_root()}/llama-3.2-models/Llama-3.2-1B" EXAMPLE_FOLDER = "models/core/llama" - def test_auto_dtype(self): + @pytest.mark.parametrize("pp_size", [2, 4], ids=["pp2", "pp4"]) + def test_auto_dtype(self, pp_size): + print_device_memory() with LLM(self.MODEL_PATH) as llm: task = CnnDailymail(self.MODEL_NAME) task.evaluate(llm) @@ -1328,6 +1331,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): @parametrize_with_ids("mtp_nextn", [0, 2]) def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph, overlap_scheduler, torch_compile, enable_chunked_prefill): + print_device_memory() kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75) torch_compile_config = TorchCompileConfig( enable_fullgraph=True, @@ -1351,6 +1355,11 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph, speculative_config=mtp_config) as llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) + print_device_memory() + + time.sleep(60) + print(f"================= print mem after 60s") + print_device_memory() @pytest.mark.skip_less_device_memory(60000) def test_bfloat16_2_model_mtp(self): @@ -1406,6 +1415,10 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn, mtp_config = None if mtp_nextn > 0: mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) + + #time.sleep(5) + print(f"================= print mem before testing") + print_device_memory() with LLM(self.MODEL_PATH, tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size, @@ -1417,6 +1430,18 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn, speculative_config=mtp_config) as llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) + print(f"================= print mem after testing") + print_device_memory() + + #time.sleep(5) + 
print(f"================= print mem after testing outside") + print_device_memory() + + print(f"++++++++++++++++++++++++++++++++++++++++\n\n\n") + + #time.sleep(60) + #print(f"================= print mem after 60s") + #print_device_memory() @skip_pre_hopper @parametrize_with_ids("torch_compile", [False, True]) @@ -2263,6 +2288,13 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, if moe_backend == "TRTLLM" and sm_version in (120, 121): pytest.skip(f"{moe_backend} backend does not support SM 120 or 121") + import gc + gc.collect() + torch.cuda.empty_cache() + + print(f"\n--- nvidia-smi start to test ---") + print_device_memory() + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70) pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, @@ -2297,9 +2329,19 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, task = CnnDailymail(self.MODEL_NAME) task.evaluate(llm) # Commented out because GPQA takes too long to run - # task = GPQADiamond(self.MODEL_NAME) - # task.evaluate(llm, - # extra_evaluator_kwargs=dict(apply_chat_template=True)) + task = GPQADiamond(self.MODEL_NAME) + task.evaluate(llm, + extra_evaluator_kwargs=dict(apply_chat_template=True)) + print("=================================== test finishes") + print_device_memory() + + import gc + gc.collect() + torch.cuda.empty_cache() + + time.sleep(180) + print(f"\n--- nvidia-smi after testing after 180s ---") + print_device_memory() @skip_pre_blackwell @pytest.mark.parametrize( diff --git a/tests/integration/defs/conftest.py b/tests/integration/defs/conftest.py index 92d593bf206..933c1562532 100644 --- a/tests/integration/defs/conftest.py +++ b/tests/integration/defs/conftest.py @@ -2004,40 +2004,65 @@ def get_device_count(): return len(get_gpu_device_list()) -def get_device_memory(): - "get gpu memory" - memory = 0 +def get_device_memory_str(): with tempfile.TemporaryDirectory() as temp_dirname: suffix = ".exe" if is_windows() else "" - # TODO: Use NRSU because we can't assume nvidia-smi across all platforms. cmd = " ".join([ - "nvidia-smi" + suffix, "--query-gpu=memory.total", + "nvidia-smi" + suffix, + "--query-gpu=memory.total,memory.reserved,memory.used,memory.free", "--format=csv,noheader" ]) - # Try to get memory from nvidia-smi first, if failed, fallback to system memory from /proc/meminfo - # This fallback is needed for systems with unified memory (e.g. DGX Spark) + output = check_output(cmd, shell=True, cwd=temp_dirname) + return output.strip() + + +def get_device_memory(): + "get gpu memory" + memory = 0 + # Try to get memory from nvidia-smi first, if failed, fallback to system memory from /proc/meminfo + # This fallback is needed for systems with unified memory (e.g. 
DGX Spark) + try: + output = get_device_memory_str() + memory_str = output.strip().split()[0] + # Check if nvidia-smi returned a valid numeric value + if "N/A" in memory_str: + raise ValueError("nvidia-smi returned invalid memory info") + memory = int(memory_str) + except (sp.CalledProcessError, ValueError, IndexError): + # Fallback to system memory from /proc/meminfo (in kB, convert to MiB) try: - output = check_output(cmd, shell=True, cwd=temp_dirname) - memory_str = output.strip().split()[0] - # Check if nvidia-smi returned a valid numeric value - if "N/A" in memory_str: - raise ValueError("nvidia-smi returned invalid memory info") - memory = int(memory_str) - except (sp.CalledProcessError, ValueError, IndexError): - # Fallback to system memory from /proc/meminfo (in kB, convert to MiB) - try: - with open("/proc/meminfo", "r") as f: - for line in f: - if line.startswith("MemTotal:"): - memory = int( - line.split()[1]) // 1024 # Convert kB to MiB - break - except: - memory = 8192 # Default 8GB if all else fails + with open("/proc/meminfo", "r") as f: + for line in f: + if line.startswith("MemTotal:"): + memory = int( + line.split()[1]) // 1024 # Convert kB to MiB - break + except: + memory = 8192 # Default 8GB if all else fails return memory +def print_device_memory(): + memory_str = get_device_memory_str() + print(f"Device Memory:\ntotal: reserved: used: free: \n{memory_str}") + + mem_stats = torch.cuda.memory_stats() + torch_allocated_bytes = mem_stats["allocated_bytes.all.current"] + torch_reserved_bytes = mem_stats["reserved_bytes.all.current"] + print( + f"================================== torch mem stats: allocated {torch_allocated_bytes} reserved {torch_reserved_bytes}" + ) + print(f"\n--- nvidia-smi in print_device_memory ---") + sp.run(["nvidia-smi"], check=False) + + free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info() + total_used_bytes = total_gpu_memory - free_gpu_memory + print( + f"================================== torch mem info: free {free_gpu_memory}, total {total_gpu_memory}, used {total_used_bytes}" + ) + + def pytest_addoption(parser): parser.addoption( "--test-list", diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml index 57c3b6fd810..1174f6066c9 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml @@ -32,10 +32,10 @@ l0_gb200_multi_nodes: backend: pytorch tests: - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] TIMEOUT (180) - - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] TIMEOUT (180) - - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180) - - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp] TIMEOUT (180) - - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp] TIMEOUT (180) + - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] TIMEOUT (180) ISOLATION + - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180) ISOLATION + - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp] TIMEOUT (180) ISOLATION + - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp] TIMEOUT (180) ISOLATION - 
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (90) - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (90) - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp] TIMEOUT (90) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 6de1fa6b552..47b4d5ba673 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -324,7 +324,6 @@ accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] SKIP ( accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False] SKIP (https://nvbugs/5644632) test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5648560) test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False] SKIP (https://nvbugs/5648560) -accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp] SKIP (https://nvbugs/5629136) examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052) accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype SKIP (https://nvbugs/5648441) accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype SKIP (https://nvbugs/5648441) @@ -359,8 +358,6 @@ accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4] S test_e2e.py::test_openai_completions_example[trt] SKIP (https://nvbugs/5701450) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5701457) triton_server/test_triton_llm.py::test_llmapi_backend[4-0-disableDecoupleMode-tensorrt_llm] SKIP (https://nvbugs/5701480) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5701425) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5701425) unittest/_torch/modules/tests_lora_modules/test_lora_attention_pytorch_flow_vs_trt.py::TestLoraAttentionPytorchFlowVsTRT::test_lora_attention SKIP (https://nvbugs/5701421) accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[no_cuda_graph_overlap-cutlass] SKIP (https://nvbugs/5702795) accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass] SKIP (https://nvbugs/5702795) @@ -389,10 +386,7 @@ accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-c accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto] SKIP (https://nvbugs/5596343) unittest/_torch/speculative/test_spec_gate.py::test_spec_gate_e2e SKIP (https://nvbugs/5710045) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram SKIP (https://nvbugs/5569696) -accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp_trtllm] SKIP (https://nvbugs/5715568) -accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] SKIP (https://nvbugs/5715568) 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5721661) -accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] SKIP (https://nvbugs/5715568) unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py::test_flashinfer_attention_op_context_input_pos[cuda-dtype0-4-8-seq6] SKIP (https://nvbugs/5721907) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5722629) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_2gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) @@ -400,19 +394,9 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_mode unittest/llmapi/test_llm_pytorch.py::test_llm_reward_model SKIP (https://nvbugs/5670458) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[enable_configurable_moe-moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5727475) accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] SKIP (https://nvbugs/5740377) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740377) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740377) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5740377) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740377) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740377) accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_trtllm-torch_compile=False] SKIP (https://nvbugs/5740377) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5740087) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740087) accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=2] SKIP (https://nvbugs/5740075) accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2] SKIP (https://nvbugs/5740075) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP 
(https://nvbugs/5740075) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5740075) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740359) unittest/_torch/modeling/test_modeling_out_of_tree.py::TestOutOfTree::test_llm_api[False] SKIP (https://nvbugs/5739981) unittest/_torch/modeling/test_modeling_out_of_tree.py::TestOutOfTree::test_llm_api[True] SKIP (https://nvbugs/5739981) unittest/_torch/modeling/test_modeling_out_of_tree.py::TestOutOfTree::test_serve[True] SKIP (https://nvbugs/5739981) @@ -420,26 +404,14 @@ full:sm89/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ full:sm89/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp2pp2] SKIP (https://nvbugs/5596337) accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5721672) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=True-attn_backend=FLASHINFER-torch_compile=True] SKIP (https://nvbugs/5741304) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740377, https://nvbugs/5740075) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5740087) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740075) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740075) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740359) unittest/_torch/multi_gpu/test_allreduce.py::test_allreduce_fusion_patterns[2-residual_rms_norm_out_quant_fp8-hidden:7168-seqlen:8192] SKIP (https://nvbugs/5741392) unittest/executor/test_rpc.py::TestRpcCorrectness::test_incremental_task_async SKIP (https://nvbugs/5741476) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5740377) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5740377) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5740377) accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False] SKIP (https://nvbugs/5740377) 
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740359) examples/test_phi.py::test_phi_fp8_with_bf16_lora[phi-2] SKIP (https://nvbugs/5744293) examples/test_phi.py::test_llm_phi_1node_2gpus_summary[Phi-3.5-MoE-instruct-nb:1] SKIP (https://nvbugs/5744293) examples/test_phi.py::test_llm_phi_quantization_1gpu[phi-2-fp8-bfloat16] SKIP (https://nvbugs/5744293) test_e2e.py::test_trtllm_bench_llmapi_launch[pytorch_backend-llama-v3-llama3-8b] SKIP (https://nvbugs/5744432) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5740087) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5740075) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740075) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740075) test_e2e.py::test_trtllm_serve_multimodal_example SKIP (https://nvbugs/5747920) test_e2e.py::test_trtllm_serve_example SKIP (https://nvbugs/5747938) unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py::test_build_ad[meta-llama/Llama-4-Scout-17B-16E-Instruct-llm_extra_args8] SKIP (https://nvbugs/5747878) @@ -495,7 +467,6 @@ examples/serve/test_serve.py::test_config_file_loading[--config] SKIP (https://n full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740075) examples/test_ray.py::test_ray_disaggregated_serving[tp2] SKIP (https://nvbugs/5612502) disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5748683) -accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] SKIP (https://nvbugs/5715568) unittest/executor/test_rpc_proxy.py SKIP (https://nvbugs/5605741) unittest/executor/test_rpc_worker.py SKIP (https://nvbugs/5605741) unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_dist_backend.py::test_dist_backend_all_gather[torch] SKIP (https://nvbugs/5766986)