
Commit 84d107b

[https://nvbugs/5717993][fix] Add execution_stream across PyExecutor, KVCacheManager, PeftCacheManager to ensure proper CUDA stream synchronization between KV cache transfer operations and model forward kernels. (#10060)
Signed-off-by: SimengLiu-nv <[email protected]>
1 parent 0d2e271 commit 84d107b
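
The synchronization introduced here is the standard two-way stream handoff in PyTorch: the execution stream first waits on the default stream, the heavy kernels are launched on the execution stream, and the default stream then waits on the execution stream before consuming the results. A minimal, self-contained sketch of that pattern (illustrative only, not code from this commit):

import torch

# Dedicated stream for forward-like work, analogous to the commit's execution_stream.
execution_stream = torch.cuda.Stream()

x = torch.randn(1024, 1024, device="cuda")

# 1) The execution stream waits for work already queued on the default stream.
execution_stream.wait_stream(torch.cuda.current_stream())

# 2) Kernels are launched on the execution stream.
with torch.cuda.stream(execution_stream):
    y = x @ x

# 3) The default stream waits before anything issued on it reads y.
torch.cuda.current_stream().wait_stream(execution_stream)
print(y.sum().item())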

File tree: 12 files changed, +321 -36 lines changed

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 22 additions & 6 deletions
@@ -77,6 +77,7 @@ def __init__(
         speculative_config: SpeculativeConfig,
         sparse_attention_config: SparseAttentionConfig,
         profiling_stage_data: Optional[dict],
+        execution_stream: Optional[torch.cuda.Stream] = None,
     ):
         self._model_engine = model_engine
         self._draft_model_engine = draft_model_engine
@@ -97,6 +98,7 @@ def __init__(
         self._profiling_stage_data = profiling_stage_data
         self._kv_cache_manager_cls = get_kv_cache_manager_cls(
             model_engine.model.model_config)
+        self._execution_stream = execution_stream

     def _get_kv_size_per_token(self):
         model_config = self._model_engine.model.model_config
@@ -474,6 +476,7 @@ def _create_kv_cache_manager(
             max_beam_width=self._max_beam_width,
             kv_connector_manager=self._kv_connector_manager,
             estimating_kv_cache=estimating_kv_cache,
+            execution_stream=self._execution_stream,
         )

         # KVCacheManager (Non-draft) modifies the max_seq_len field, update it to self
@@ -527,14 +530,20 @@ def teardown_managers(self, resources: Dict) -> None:


 def _create_kv_cache_manager(
-        model_engine: PyTorchModelEngine, kv_cache_manager_cls,
-        mapping: Mapping, kv_cache_config: KvCacheConfig, tokens_per_block: int,
-        max_seq_len: int, max_batch_size: int,
+        model_engine: PyTorchModelEngine,
+        kv_cache_manager_cls,
+        mapping: Mapping,
+        kv_cache_config: KvCacheConfig,
+        tokens_per_block: int,
+        max_seq_len: int,
+        max_batch_size: int,
         spec_config: Optional[SpeculativeConfig],
         sparse_attn_config: Optional[SparseAttentionConfig],
-        max_num_tokens: int, max_beam_width: int,
+        max_num_tokens: int,
+        max_beam_width: int,
         kv_connector_manager: Optional[KvCacheConnectorManager],
-        estimating_kv_cache: bool) -> KVCacheManager:
+        estimating_kv_cache: bool,
+        execution_stream: Optional[torch.cuda.Stream] = None) -> KVCacheManager:
     """
     Returns:
         A KVCacheManager instance for the given model_engine
@@ -580,6 +589,7 @@ def _create_kv_cache_manager(
             if not estimating_kv_cache else None,
             sparse_attn_config=sparse_attn_config,
             is_estimating_kv_cache=estimating_kv_cache,
+            execution_stream=execution_stream,
         )
     elif is_nemotron_hybrid(config):
         if max_beam_width > 1:
@@ -623,6 +633,7 @@ def _create_kv_cache_manager(
             dtype=kv_cache_dtype,
             spec_config=spec_config,
             is_estimating_kv_cache=estimating_kv_cache,
+            execution_stream=execution_stream,
         )
     elif is_qwen3_next(config):
         if max_beam_width > 1:
@@ -672,6 +683,7 @@ def _create_kv_cache_manager(
             dtype=kv_cache_dtype,
             spec_config=spec_config,
             is_estimating_kv_cache=estimating_kv_cache,
+            execution_stream=execution_stream,
         )
     else:
         # NOTE: this is a workaround for VSWA to switch to calculate_max_num_blocks_from_cpp in KVCahceManager
@@ -700,6 +712,7 @@ def _create_kv_cache_manager(
             if not estimating_kv_cache else None,
             sparse_attn_config=sparse_attn_config,
             is_estimating_kv_cache=estimating_kv_cache,
+            execution_stream=execution_stream,
         )
     return kv_cache_manager

@@ -727,6 +740,7 @@ def create_py_executor_instance(
         scheduler_config: Optional[SchedulerConfig] = None,
         cache_transceiver_config: Optional[CacheTransceiverConfig] = None,
         virtual_memory_pools: Optional[dict] = None,
+        execution_stream: Optional[torch.cuda.Stream] = None,
 ) -> PyExecutor:
     kv_cache_manager = resources.get(ResourceManagerType.KV_CACHE_MANAGER, None)

@@ -813,6 +827,7 @@ def create_py_executor_instance(
             lora_config=lora_config,
             model_config=model_binding_config,
             world_config=world_config,
+            execution_stream=execution_stream,
         )
         resources[ResourceManagerType.PEFT_CACHE_MANAGER] = peft_cache_manager
         model_engine.set_lora_model_config(
@@ -875,7 +890,8 @@ def create_py_executor_instance(
         kv_connector_manager=kv_connector_manager,
         max_seq_len=max_seq_len,
         peft_cache_config=peft_cache_config,
-        virtual_memory_pools=virtual_memory_pools)
+        virtual_memory_pools=virtual_memory_pools,
+        execution_stream=execution_stream)


 def create_torch_sampler_args(
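
Functionally, the hunks above are plumbing: the new execution_stream keyword is threaded from KvCacheCreator and create_py_executor_instance into each KV cache manager constructor, defaulting to None so existing callers keep the old behavior. A hedged sketch of that optional-stream threading (the function name make_manager is illustrative, not one of the real constructors):

import torch
from typing import Optional

def make_manager(execution_stream: Optional[torch.cuda.Stream] = None) -> torch.cuda.Stream:
    # Fall back to a private stream when the caller does not supply one,
    # which preserves the pre-change behavior.
    return execution_stream if execution_stream is not None else torch.cuda.Stream()

shared = torch.cuda.Stream()
assert make_manager(shared) is shared      # new path: caller-provided shared stream
assert make_manager() is not shared        # old path: private per-manager stream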

tensorrt_llm/_torch/pyexecutor/mamba_cache_manager.py

Lines changed: 2 additions & 0 deletions
@@ -197,6 +197,7 @@ def __init__(
         dtype: DataType = DataType.HALF,
         spec_config: Optional["DecodingBaseConfig"] = None,
         is_estimating_kv_cache: bool = False,
+        execution_stream: Optional[torch.cuda.Stream] = None,
     ) -> None:

         # mamba hybrid cache requires block reuse to be disabled in KV cache config
@@ -234,6 +235,7 @@ def __init__(
             spec_config=spec_config,
             layer_mask=layer_mask,
             is_estimating_kv_cache=is_estimating_kv_cache,
+            execution_stream=execution_stream,
         )

     def prepare_resources(self, scheduled_batch: ScheduledRequests):

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 37 additions & 8 deletions
@@ -136,11 +136,22 @@ def __init__(self,
                  kv_connector_manager: Optional[KvCacheConnectorManager] = None,
                  max_seq_len: Optional[int] = None,
                  peft_cache_config: Optional[PeftCacheConfig] = None,
-                 virtual_memory_pools: Optional[dict] = None):
+                 virtual_memory_pools: Optional[dict] = None,
+                 execution_stream: Optional[torch.cuda.Stream] = None):
         super(PyExecutor, self).__init__()
         self.device_id = torch.cuda.current_device()
         self.global_rank = dist.rank

+        # Store the execution stream for model forward operations.
+        # This stream is used for proper synchronization with KVCacheTransferManager.
+        # execution_stream can be provided by create_py_executor
+        # Create a new stream if none provided
+        self.execution_stream = execution_stream if execution_stream is not None else torch.cuda.Stream(
+        )
+        logger.info(
+            f"[PyExecutor] execution_stream initialized: {self.execution_stream}. "
+        )
+
         self.peft_cache_config = peft_cache_config

         self.iter_counter = 0
@@ -245,10 +256,19 @@ def __init__(self,
         self.inflight_req_ids = ReqIdsSet()

         # During warmup, we don't enable the profiler
+        # Run warmup on the execution_stream for proper synchronization with
+        # KVCacheTransferManager's onboard/offload operations.
         self.is_warmup = True
-        self.model_engine.warmup(self.resource_manager)
-        if self.draft_model_engine is not None:
-            self.draft_model_engine.warmup(self.resource_manager)
+
+        self.execution_stream.wait_stream(torch.cuda.current_stream())
+        with torch.cuda.stream(self.execution_stream):
+            self.model_engine.warmup(self.resource_manager)
+            if self.draft_model_engine is not None:
+                self.draft_model_engine.warmup(self.resource_manager)
+
+        # Ensure the default stream waits for execution_stream to complete
+        # before subsequent operations.
+        torch.cuda.current_stream().wait_stream(self.execution_stream)
         self.is_warmup = False

         self.is_shutdown = False
@@ -2231,10 +2251,19 @@ def forward(scheduled_requests, resource_manager, new_tensors_device,
             a.py_return_context_logits
             for a in scheduled_requests.context_requests)
         cache_indirection_buffer = self.sampler.get_cache_indirection()
-        outputs = forward(scheduled_requests, self.resource_manager,
-                          new_tensors_device, gather_context_logits,
-                          cache_indirection_buffer,
-                          num_accepted_tokens_device)
+
+        # Run model forward on the execution stream for proper synchronization
+        # with KVCacheTransferManager's onboard/offload operations.
+        self.execution_stream.wait_stream(torch.cuda.current_stream())
+        with torch.cuda.stream(self.execution_stream):
+            outputs = forward(scheduled_requests, self.resource_manager,
+                              new_tensors_device, gather_context_logits,
+                              cache_indirection_buffer,
+                              num_accepted_tokens_device)
+
+        # Ensure the default stream waits for execution_stream to complete
+        # before downstream operations use the outputs.
+        torch.cuda.current_stream().wait_stream(self.execution_stream)

         self._kv_connector_wait_for_save()
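
The comments in the hunks above explain why warmup and forward move onto execution_stream: KVCacheTransferManager issues its onboard/offload copies on the KV cache manager's stream, and when that stream is the same one the forward kernels run on, CUDA's per-stream ordering serializes the copies and the kernels without extra events. A small illustrative sketch of that ordering guarantee (not code from this commit):

import torch

execution_stream = torch.cuda.Stream()

host_block = torch.randn(4, 1024, pin_memory=True)     # stand-in for an offloaded KV block
device_block = torch.empty(4, 1024, device="cuda")

with torch.cuda.stream(execution_stream):
    # "Onboard" copy, standing in for a KV cache block transfer.
    device_block.copy_(host_block, non_blocking=True)
    # "Forward" kernel on the same stream: it is guaranteed to observe the
    # completed copy because both operations are ordered on execution_stream.
    out = device_block * 2.0

# Hand the result back to the default stream before reading it there.
torch.cuda.current_stream().wait_stream(execution_stream)
print(out.mean().item())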

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 10 additions & 0 deletions
@@ -601,6 +601,13 @@ def drafting_loop_wrapper(model):
     resources = {}
     estimating_kv_cache = False
     kv_cache_creator = None
+
+    # Create the execution stream for model forward operations
+    # for proper synchronization with KVCacheTransferManager's onboard/offload operations.
+    execution_stream = torch.cuda.Stream()
+    logger.info(
+        f"[create_py_executor] Created execution_stream: {execution_stream}")
+
     if model_engine.model.model_config.is_generation:
         #NOTE: non-generation models do not have kv cache
         kv_cache_creator = KvCacheCreator(
@@ -619,6 +626,7 @@ def drafting_loop_wrapper(model):
             speculative_config=spec_config,
             profiling_stage_data=profiling_stage_data,
             sparse_attention_config=sparse_attention_config,
+            execution_stream=execution_stream,
         )
         estimating_kv_cache = kv_cache_creator.try_prepare_estimation()
         with allocation_scope(
@@ -676,6 +684,7 @@ def drafting_loop_wrapper(model):
             scheduler_config=scheduler_config,
             cache_transceiver_config=cache_transceiver_config,
             virtual_memory_pools=vm_pools if not estimating_kv_cache else None,
+            execution_stream=execution_stream,
         )
         # Originally, peft_cache_config might be mutated inside
         # create_py_executor_instance. Restore it here.
@@ -736,6 +745,7 @@ def drafting_loop_wrapper(model):
             scheduler_config=scheduler_config,
             cache_transceiver_config=cache_transceiver_config,
             virtual_memory_pools=vm_pools,
+            execution_stream=execution_stream,
         )

     _adjust_torch_mem_fraction()

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 17 additions & 7 deletions
@@ -176,6 +176,7 @@ def __init__(
         indexer_k_cache_quant_block_size: int = 128,
         indexer_k_cache_index_head_dim: int = 0,
         is_estimating_kv_cache: bool = False,
+        execution_stream: Optional[torch.cuda.Stream] = None,
         **kwargs,
     ) -> None:
         self.mapping = mapping
@@ -351,9 +352,13 @@ def append_to_kv_heads_per_layer(num_kv_heads_per_layer: List[int],
         # Set up temp_attention_window_inputs
         temp_attention_window_inputs = self._set_temp_attention_window_inputs()

-        # Note that this stream is unused for now. Will be used for copying to host
-        # when that feature is enabled.
-        self._stream = torch.cuda.Stream()
+        # Use the provided execution stream for proper synchronization with KVCacheTransferManager.
+        # The execution stream is the stream where model forward kernels run, and KVCacheTransferManager
+        # needs to synchronize with it for onboard/offload operations.
+        # If no execution stream is provided, create a new one (for backward compatibility).
+        self._stream = execution_stream if execution_stream is not None else torch.cuda.Stream(
+        )
+        logger.info(f"[KVCacheManager] execution_stream: {self._stream}")
         kwargs = {
             'num_kv_heads_per_layer': self.num_kv_heads_per_layer,
             'size_per_head': head_dim,
@@ -365,7 +370,7 @@ def append_to_kv_heads_per_layer(num_kv_heads_per_layer: List[int],
             'temp_attention_window_inputs': temp_attention_window_inputs,
             'dtype': dtype,
             'sink_token_length': sink_token_length,
-            'stream': self._stream.cuda_stream,
+            'stream': self._stream.cuda_stream,  # Pass to BufferManager
             'max_sequence_length': max_seq_len,
             'enable_block_reuse': kv_cache_config.enable_block_reuse,
             'onboard_blocks': kv_cache_config.onboard_blocks,
@@ -1442,7 +1447,8 @@ def __init__(self,
                  peft_cache_config: PeftCacheConfig,
                  lora_config: LoraConfig,
                  model_config: ModelConfigCpp,
-                 world_config: WorldConfig | None = None):
+                 world_config: WorldConfig | None = None,
+                 execution_stream: Optional[torch.cuda.Stream] = None):
         import tensorrt_llm.bindings as _tb

         peft_cache_config = peft_cache_config._to_pybind()
@@ -1467,8 +1473,12 @@ def __init__(self,
             world_config = _tb.WorldConfig()

         BufferManager = tensorrt_llm.bindings.internal.runtime.BufferManager
-        buffer_manager = BufferManager(torch.cuda.current_stream().cuda_stream,
-                                       True)
+        buffer_manager_stream = execution_stream.cuda_stream if execution_stream is not None else torch.cuda.current_stream(
+        ).cuda_stream
+        buffer_manager = BufferManager(buffer_manager_stream, True)
+        logger.info(
+            f"[PeftCacheManager] buffer_manager_stream: {buffer_manager_stream}"
+        )
         self.impl = PeftCacheManagerCpp(config=peft_cache_manager_config,
                                         model_config=model_config,
                                         world_config=world_config,
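
Both managers end up driving the C++ side with the same raw stream handle: torch.cuda.Stream exposes the underlying cudaStream_t as an integer via .cuda_stream, which is what the KVCacheManager kwargs and the PeftCacheManager's BufferManager receive above. A minimal sketch of that handle sharing (illustrative only, not the real bindings):

import torch

execution_stream = torch.cuda.Stream()

# The raw cudaStream_t handle, as an int, is what gets passed to the bindings.
handle = execution_stream.cuda_stream

# Falling back to the default stream's handle mirrors the execution_stream-is-None path.
fallback_handle = torch.cuda.current_stream().cuda_stream

print(hex(handle), hex(fallback_handle))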

tensorrt_llm/evaluate/lm_eval.py

Lines changed: 29 additions & 9 deletions
@@ -380,23 +380,43 @@ def _adjust_config(task_dict, random_seed):

     @contextmanager
     def _patch_lm_eval(self):
-        if self.dataset_path is None:
-            yield
-            return
+        from pathlib import Path

         import lm_eval
-        self._task_config_post_init = lm_eval.api.task.TaskConfig.__post_init__
+        import lm_eval.tasks
+
+        # Patch Path.relative_to to handle custom task paths outside lm_eval/tasks
+        # This is needed with lm_eval>=0.4.9.2 with new function pretty_print_task (a local function inside
+        # get_task_dict) calls yaml_path.relative_to(lm_eval_tasks_path) which fails
+        # when the yaml is from tensorrt_llm/evaluate/lm_eval_tasks
+        original_relative_to = Path.relative_to
+
+        def _patched_relative_to(self, other, *args, **kwargs):
+            try:
+                return original_relative_to(self, other, *args, **kwargs)
+            except ValueError:
+                # Return absolute path if relative_to fails (path not under base)
+                return self
+
+        Path.relative_to = _patched_relative_to
+
+        # Optionally patch dataset_path if provided
+        original_post_init = None
+        if self.dataset_path is not None:
+            original_post_init = lm_eval.api.task.TaskConfig.__post_init__

-        def _patched(task_config, *args, **kwargs):
-            task_config.dataset_path = self.dataset_path
-            self._task_config_post_init(task_config, *args, **kwargs)
+            def _patched_post_init(task_config, *args, **kwargs):
+                task_config.dataset_path = self.dataset_path
+                original_post_init(task_config, *args, **kwargs)

-        lm_eval.api.task.TaskConfig.__post_init__ = _patched
+            lm_eval.api.task.TaskConfig.__post_init__ = _patched_post_init

         try:
             yield
         finally:
-            lm_eval.api.task.TaskConfig.__post_init__ = self._task_config_post_init
+            Path.relative_to = original_relative_to
+            if original_post_init is not None:
+                lm_eval.api.task.TaskConfig.__post_init__ = original_post_init

     def generate_samples(self) -> Iterable[tuple]:
         raise NotImplementedError()
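
The Path.relative_to patch above can be read in isolation as a scoped monkeypatch that degrades gracefully when a path is not under the expected base. A standalone sketch of the same idea (the helper name lenient_relative_to is hypothetical, not part of this commit):

from contextlib import contextmanager
from pathlib import Path

@contextmanager
def lenient_relative_to():
    # Temporarily make Path.relative_to return the path unchanged instead of
    # raising ValueError when the path is not located under the given base.
    original = Path.relative_to

    def patched(self, other, *args, **kwargs):
        try:
            return original(self, other, *args, **kwargs)
        except ValueError:
            return self

    Path.relative_to = patched
    try:
        yield
    finally:
        Path.relative_to = original

with lenient_relative_to():
    # A custom task YAML outside lm_eval/tasks no longer raises during pretty-printing.
    print(Path("/opt/custom/task.yaml").relative_to("/usr/lib/lm_eval/tasks"))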

tests/integration/test_lists/test-db/l0_a100.yml

Lines changed: 2 additions & 1 deletion
@@ -15,7 +15,8 @@ l0_a100:
   tests:
   - unittest/llmapi/test_llm_pytorch.py
   - unittest/llmapi/test_mpi_session.py ISOLATION
-  - unittest/llmapi/test_memory_profiling.py # profile kvcache for vision encoder
+  - unittest/llmapi/test_memory_profiling.py::test_profile_kvcache # profile kvcache for vision encoder
+  - unittest/llmapi/test_memory_profiling.py::test_pyexecutor_and_kvcache_share_execution_stream # test that PyExecutor and KVCacheManager share the same execution_stream
   - unittest/trt/model_api/test_model_quantization.py
   # executor
   - unittest/executor/test_base_worker.py

tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 4 additions & 3 deletions
@@ -76,9 +76,10 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_dummy_load_format
-  - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0] TIMEOUT (90)
-  - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5] TIMEOUT (90)
-  - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9] TIMEOUT (90)
+  # Waive known failures in https://nvbugs/5774869
+  # - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0] TIMEOUT (90)
+  # - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5] TIMEOUT (90)
+  # - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9] TIMEOUT (90)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=False-eagle3_one_model=False]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=True-eagle3_one_model=True]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=False-eagle3_one_model=True]

tests/integration/test_lists/waives.txt

Lines changed: 1 addition & 1 deletion
@@ -298,7 +298,7 @@ full:L40S/accuracy/test_cli_flow.py::TestGpt2::test_weight_streaming_plugin SKIP
 full:L40S/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp1pp2] SKIP (https://nvbugs/5596337)
 accuracy/test_llm_api.py::TestMixtral8x7BInstruct::test_awq_tp2 SKIP (https://nvbugs/5598847)
 examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3.5-MoE-instruct] SKIP (https://nvbugs/5465143)
-unittest/llmapi/test_memory_profiling.py SKIP (https://nvbugs/5580781)
+unittest/llmapi/test_memory_profiling.py::test_profile_kvcache SKIP (https://nvbugs/5580781)
 triton_server/test_triton.py::test_llava[llava] SKIP (https://nvbugs/5547414)
 full:RTX/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5569696)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto] SKIP (https://nvbugs/5596343)
