Skip to content

Commit 83e47b9

Browse files
committed
[https://nvbugs/5717993][fix] Add execution stream in PyExecutor and pass to BufferManager in KVCacheTransferManager to sync kvCache transfers with execution kernels.
Signed-off-by: SimengLiu-nv <simengl@nvidia.com>
1 parent 24c47f8 commit 83e47b9

File tree

7 files changed

+89
-16
lines changed

7 files changed

+89
-16
lines changed

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,8 +149,8 @@ def __init__(self,
149149
self.execution_stream = execution_stream if execution_stream is not None else torch.cuda.Stream(
150150
)
151151
logger.info(
152-
f"[PyExecutor] execution_stream initialized: {self.execution_stream}, "
153-
f"cuda_stream ptr: 0x{self.execution_stream.cuda_stream:x}, ")
152+
f"[PyExecutor] execution_stream initialized: {self.execution_stream}. "
153+
)
154154

155155
self.peft_cache_config = peft_cache_config
156156

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -605,9 +605,8 @@ def drafting_loop_wrapper(model):
605605
# Create the execution stream for model forward operations
606606
# for proper synchronization with KVCacheTransferManager's onboard/offload operations.
607607
execution_stream = torch.cuda.Stream()
608-
logger.debug(
609-
f"[create_py_executor] Created execution_stream: {execution_stream}, "
610-
f"cuda_stream ptr: 0x{execution_stream.cuda_stream:x}")
608+
logger.info(
609+
f"[create_py_executor] Created execution_stream: {execution_stream}")
611610

612611
if model_engine.model.model_config.is_generation:
613612
#NOTE: non-generation models do not have kv cache

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -358,8 +358,7 @@ def append_to_kv_heads_per_layer(num_kv_heads_per_layer: List[int],
358358
# If no execution stream is provided, create a new one (for backward compatibility).
359359
self._stream = execution_stream if execution_stream is not None else torch.cuda.Stream(
360360
)
361-
logger.info(f"[KVCacheManager] execution_stream: {self._stream}, "
362-
f"cuda_stream ptr: 0x{self._stream.cuda_stream:x}, ")
361+
logger.info(f"[KVCacheManager] execution_stream: {self._stream}")
363362
kwargs = {
364363
'num_kv_heads_per_layer': self.num_kv_heads_per_layer,
365364
'size_per_head': head_dim,
@@ -371,7 +370,7 @@ def append_to_kv_heads_per_layer(num_kv_heads_per_layer: List[int],
371370
'temp_attention_window_inputs': temp_attention_window_inputs,
372371
'dtype': dtype,
373372
'sink_token_length': sink_token_length,
374-
'stream': self._stream.cuda_stream, # Passed to BufferManager
373+
'stream': self._stream.cuda_stream, # Pass to BufferManager
375374
'max_sequence_length': max_seq_len,
376375
'enable_block_reuse': kv_cache_config.enable_block_reuse,
377376
'onboard_blocks': kv_cache_config.onboard_blocks,
@@ -1457,8 +1456,12 @@ def __init__(self,
14571456
world_config = _tb.WorldConfig()
14581457

14591458
BufferManager = tensorrt_llm.bindings.internal.runtime.BufferManager
1460-
# Use the provided execution stream for proper synchronization with lora cache.
1461-
buffer_manager = BufferManager(execution_stream.cuda_stream, True)
1459+
buffer_manager_stream = execution_stream.cuda_stream if execution_stream is not None else torch.cuda.current_stream(
1460+
).cuda_stream
1461+
buffer_manager = BufferManager(buffer_manager_stream, True)
1462+
logger.info(
1463+
f"[PeftCacheManager] buffer_manager_stream: {buffer_manager_stream}, "
1464+
f"cuda_stream ptr: 0x{buffer_manager_stream:x}")
14621465
self.impl = PeftCacheManagerCpp(config=peft_cache_manager_config,
14631466
model_config=model_config,
14641467
world_config=world_config,

tests/integration/test_lists/test-db/l0_a100.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ l0_a100:
1515
tests:
1616
- unittest/llmapi/test_llm_pytorch.py
1717
- unittest/llmapi/test_mpi_session.py ISOLATION
18-
- unittest/llmapi/test_memory_profiling.py # profile kvcache for vision encoder
18+
- unittest/llmapi/test_memory_profiling.py::test_profile_kvcache # profile kvcache for vision encoder
19+
- unittest/llmapi/test_memory_profiling.py::test_pyexecutor_and_kvcache_share_execution_stream # test that PyExecutor and KVCacheManager share the same execution_stream
1920
- unittest/trt/model_api/test_model_quantization.py
2021
# executor
2122
- unittest/executor/test_base_worker.py

tests/integration/test_lists/waives.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,7 @@ full:L40S/accuracy/test_cli_flow.py::TestGpt2::test_weight_streaming_plugin SKIP
302302
full:L40S/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp1pp2] SKIP (https://nvbugs/5596337)
303303
accuracy/test_llm_api.py::TestMixtral8x7BInstruct::test_awq_tp2 SKIP (https://nvbugs/5598847)
304304
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3.5-MoE-instruct] SKIP (https://nvbugs/5465143)
305-
unittest/llmapi/test_memory_profiling.py SKIP (https://nvbugs/5580781)
305+
unittest/llmapi/test_memory_profiling.py::test_profile_kvcache SKIP (https://nvbugs/5580781)
306306
triton_server/test_triton.py::test_llava[llava] SKIP (https://nvbugs/5547414)
307307
full:RTX/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5569696)
308308
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto] SKIP (https://nvbugs/5596343)

tests/unittest/_torch/executor/test_pytorch_model_engine.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -484,9 +484,6 @@ def test_prepare_tp_inputs_with_helix_parallelism(self) -> None:
484484

485485
def test_kv_cache_manager_with_execution_stream(self):
486486
"""Test that KVCacheManager uses the provided execution_stream.
487-
488-
This test verifies the fix for bug 5717993: Stream synchronization
489-
across TRT-LLM, regarding KV cache allocation and kernels' execution.
490487
"""
491488
# Create a dedicated execution stream
492489
execution_stream = torch.cuda.Stream()
@@ -509,7 +506,8 @@ def test_kv_cache_manager_with_execution_stream(self):
509506
batch.context_requests = requests
510507
batch.generation_requests = []
511508
kv_cache_manager.prepare_resources(batch)
512-
model_engine.forward(batch, resource_manager)
509+
with torch.cuda.stream(execution_stream):
510+
model_engine.forward(batch, resource_manager)
513511

514512
# Verify the stream is still the same after forward pass
515513
self.assertEqual(

tests/unittest/llmapi/test_memory_profiling.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
from tensorrt_llm._torch.pyexecutor.py_executor_creator import \
55
create_py_executor
6+
from tensorrt_llm._torch.pyexecutor.resource_manager import ResourceManagerType
67
from tensorrt_llm.llmapi import (BuildConfig, CapacitySchedulerPolicy,
78
DynamicBatchConfig, SchedulerConfig)
89
from tensorrt_llm.llmapi.llm_args import (CudaGraphConfig, KvCacheConfig,
@@ -75,3 +76,74 @@ def test_profile_kvcache():
7576
torch.cuda.empty_cache()
7677

7778
assert vlm_activation_bytes_with_mm_reqs > vlm_activation_bytes_no_mm_reqs, f"Activation bytes should be higher with mm reqs, but got {vlm_activation_bytes_with_mm_reqs} for mm reqs and {vlm_activation_bytes_no_mm_reqs} without mm reqs"
79+
80+
81+
def test_pyexecutor_and_kvcache_share_execution_stream():
82+
"""Test that PyExecutor and KVCacheManager share the same execution_stream.
83+
84+
The execution_stream is created once in create_py_executor and passed to:
85+
- KVCacheManager (via KvCacheCreator -> _create_kv_cache_manager)
86+
- PyExecutor (via create_py_executor_instance)
87+
88+
Both components must use the same stream for proper synchronization.
89+
"""
90+
# Use a simple model for testing
91+
MODEL = "llama-3.2-models/llama-3.2-1b-instruct"
92+
MODEL_PATH = get_model_path(MODEL)
93+
94+
kv_cache_config = KvCacheConfig(enable_block_reuse=False,
95+
free_gpu_memory_fraction=0.5)
96+
97+
build_config = BuildConfig(max_beam_width=1, max_num_tokens=4096)
98+
scheduler_config = SchedulerConfig(
99+
capacity_scheduler_policy=CapacitySchedulerPolicy.GUARANTEED_NO_EVICT, )
100+
backend = "pytorch"
101+
llm_args = {
102+
"model": MODEL,
103+
"scheduler_config": scheduler_config,
104+
"tokenizer": None,
105+
"tensor_parallel_size": 1,
106+
"pipeline_parallel_size": 1,
107+
"moe_expert_parallel_size": None,
108+
"gpus_per_node": 1,
109+
"trust_remote_code": False,
110+
"max_batch_size": build_config.max_batch_size,
111+
"max_num_tokens": build_config.max_num_tokens,
112+
"max_beam_width": build_config.max_beam_width,
113+
"max_seq_len": build_config.max_seq_len,
114+
"kv_cache_config": kv_cache_config,
115+
"backend": backend,
116+
"num_postprocess_workers": 0,
117+
"postprocess_tokenizer_dir": MODEL,
118+
"reasoning_parser": None,
119+
"fail_fast_on_attention_window_too_large": False,
120+
}
121+
122+
torchllm_args = TorchLlmArgs(**llm_args)
123+
124+
py_executor = create_py_executor(llm_args=torchllm_args,
125+
checkpoint_dir=MODEL_PATH)
126+
127+
# Get the KVCacheManager from the resource manager
128+
kv_cache_manager = py_executor.resource_manager.get_resource_manager(
129+
ResourceManagerType.KV_CACHE_MANAGER)
130+
131+
# Verify both PyExecutor and KVCacheManager have execution_stream
132+
assert py_executor.execution_stream is not None, \
133+
"PyExecutor should have an execution_stream"
134+
assert kv_cache_manager is not None, \
135+
"KVCacheManager should exist in resource_manager"
136+
assert hasattr(kv_cache_manager, '_stream'), \
137+
"KVCacheManager should have _stream attribute"
138+
139+
# Verify they share the same CUDA stream pointer
140+
assert py_executor.execution_stream.cuda_stream == kv_cache_manager._stream.cuda_stream, \
141+
f"PyExecutor.execution_stream ({py_executor.execution_stream.cuda_stream}) " \
142+
f"should have the same cuda_stream pointer as KVCacheManager._stream ({kv_cache_manager._stream.cuda_stream})"
143+
144+
# Verify they are the exact same stream object
145+
assert py_executor.execution_stream is kv_cache_manager._stream, \
146+
"PyExecutor.execution_stream and KVCacheManager._stream should be the exact same stream object"
147+
148+
py_executor.shutdown()
149+
torch.cuda.empty_cache()

0 commit comments

Comments
 (0)