Skip to content

Commit 83e47b9

Browse files
committed
[https://nvbugs/5717993][fix] Add execution stream in PyExecutor and pass to BufferManager in KVCacheTransferManager to sync kvCache transfers with execution kernels.
Signed-off-by: SimengLiu-nv <simengl@nvidia.com>
1 parent 24c47f8 commit 83e47b9

File tree

7 files changed

+89
-16
lines changed

7 files changed

+89
-16
lines changed

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,8 +149,8 @@ def __init__(self,
149149
self.execution_stream = execution_stream if execution_stream is not None else torch.cuda.Stream(
150150
)
151151
logger.info(
152-
f"[PyExecutor] execution_stream initialized: {self.execution_stream}, "
153-
f"cuda_stream ptr: 0x{self.execution_stream.cuda_stream:x}, ")
152+
f"[PyExecutor] execution_stream initialized: {self.execution_stream}. "
153+
)
154154

155155
self.peft_cache_config = peft_cache_config
156156

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -605,9 +605,8 @@ def drafting_loop_wrapper(model):
605605
# Create the execution stream for model forward operations
606606
# for proper synchronization with KVCacheTransferManager's onboard/offload operations.
607607
execution_stream = torch.cuda.Stream()
608-
logger.debug(
609-
f"[create_py_executor] Created execution_stream: {execution_stream}, "
610-
f"cuda_stream ptr: 0x{execution_stream.cuda_stream:x}")
608+
logger.info(
609+
f"[create_py_executor] Created execution_stream: {execution_stream}")
611610

612611
if model_engine.model.model_config.is_generation:
613612
#NOTE: non-generation models do not have kv cache

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -358,8 +358,7 @@ def append_to_kv_heads_per_layer(num_kv_heads_per_layer: List[int],
358358
# If no execution stream is provided, create a new one (for backward compatibility).
359359
self._stream = execution_stream if execution_stream is not None else torch.cuda.Stream(
360360
)
361-
logger.info(f"[KVCacheManager] execution_stream: {self._stream}, "
362-
f"cuda_stream ptr: 0x{self._stream.cuda_stream:x}, ")
361+
logger.info(f"[KVCacheManager] execution_stream: {self._stream}")
363362
kwargs = {
364363
'num_kv_heads_per_layer': self.num_kv_heads_per_layer,
365364
'size_per_head': head_dim,
@@ -371,7 +370,7 @@ def append_to_kv_heads_per_layer(num_kv_heads_per_layer: List[int],
371370
'temp_attention_window_inputs': temp_attention_window_inputs,
372371
'dtype': dtype,
373372
'sink_token_length': sink_token_length,
374-
'stream': self._stream.cuda_stream, # Passed to BufferManager
373+
'stream': self._stream.cuda_stream, # Pass to BufferManager
375374
'max_sequence_length': max_seq_len,
376375
'enable_block_reuse': kv_cache_config.enable_block_reuse,
377376
'onboard_blocks': kv_cache_config.onboard_blocks,
@@ -1457,8 +1456,12 @@ def __init__(self,
14571456
world_config = _tb.WorldConfig()
14581457

14591458
BufferManager = tensorrt_llm.bindings.internal.runtime.BufferManager
1460-
# Use the provided execution stream for proper synchronization with lora cache.
1461-
buffer_manager = BufferManager(execution_stream.cuda_stream, True)
1459+
buffer_manager_stream = execution_stream.cuda_stream if execution_stream is not None else torch.cuda.current_stream(
1460+
).cuda_stream
1461+
buffer_manager = BufferManager(buffer_manager_stream, True)
1462+
logger.info(
1463+
f"[PeftCacheManager] buffer_manager_stream: {buffer_manager_stream}, "
1464+
f"cuda_stream ptr: 0x{buffer_manager_stream:x}")
14621465
self.impl = PeftCacheManagerCpp(config=peft_cache_manager_config,
14631466
model_config=model_config,
14641467
world_config=world_config,

tests/integration/test_lists/test-db/l0_a100.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ l0_a100:
1515
tests:
1616
- unittest/llmapi/test_llm_pytorch.py
1717
- unittest/llmapi/test_mpi_session.py ISOLATION
18-
- unittest/llmapi/test_memory_profiling.py # profile kvcache for vision encoder
18+
- unittest/llmapi/test_memory_profiling.py::test_profile_kvcache # profile kvcache for vision encoder
19+
- unittest/llmapi/test_memory_profiling.py::test_pyexecutor_and_kvcache_share_execution_stream # test that PyExecutor and KVCacheManager share the same execution_stream
1920
- unittest/trt/model_api/test_model_quantization.py
2021
# executor
2122
- unittest/executor/test_base_worker.py

tests/integration/test_lists/waives.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,7 @@ full:L40S/accuracy/test_cli_flow.py::TestGpt2::test_weight_streaming_plugin SKIP
302302
full:L40S/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp1pp2] SKIP (https://nvbugs/5596337)
303303
accuracy/test_llm_api.py::TestMixtral8x7BInstruct::test_awq_tp2 SKIP (https://nvbugs/5598847)
304304
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3.5-MoE-instruct] SKIP (https://nvbugs/5465143)
305-
unittest/llmapi/test_memory_profiling.py SKIP (https://nvbugs/5580781)
305+
unittest/llmapi/test_memory_profiling.py::test_profile_kvcache SKIP (https://nvbugs/5580781)
306306
triton_server/test_triton.py::test_llava[llava] SKIP (https://nvbugs/5547414)
307307
full:RTX/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5569696)
308308
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto] SKIP (https://nvbugs/5596343)

tests/unittest/_torch/executor/test_pytorch_model_engine.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -484,9 +484,6 @@ def test_prepare_tp_inputs_with_helix_parallelism(self) -> None:
484484

485485
def test_kv_cache_manager_with_execution_stream(self):
486486
"""Test that KVCacheManager uses the provided execution_stream.
487-
488-
This test verifies the fix for bug 5717993: Stream synchronization
489-
across TRT-LLM, regarding KV cache allocation and kernels' execution.
490487
"""
491488
# Create a dedicated execution stream
492489
execution_stream = torch.cuda.Stream()
@@ -509,7 +506,8 @@ def test_kv_cache_manager_with_execution_stream(self):
509506
batch.context_requests = requests
510507
batch.generation_requests = []
511508
kv_cache_manager.prepare_resources(batch)
512-
model_engine.forward(batch, resource_manager)
509+
with torch.cuda.stream(execution_stream):
510+
model_engine.forward(batch, resource_manager)
513511

514512
# Verify the stream is still the same after forward pass
515513
self.assertEqual(

tests/unittest/llmapi/test_memory_profiling.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
from tensorrt_llm._torch.pyexecutor.py_executor_creator import \
55
create_py_executor
6+
from tensorrt_llm._torch.pyexecutor.resource_manager import ResourceManagerType
67
from tensorrt_llm.llmapi import (BuildConfig, CapacitySchedulerPolicy,
78
DynamicBatchConfig, SchedulerConfig)
89
from tensorrt_llm.llmapi.llm_args import (CudaGraphConfig, KvCacheConfig,
@@ -75,3 +76,74 @@ def test_profile_kvcache():
7576
torch.cuda.empty_cache()
7677

7778
assert vlm_activation_bytes_with_mm_reqs > vlm_activation_bytes_no_mm_reqs, f"Activation bytes should be higher with mm reqs, but got {vlm_activation_bytes_with_mm_reqs} for mm reqs and {vlm_activation_bytes_no_mm_reqs} without mm reqs"
79+
80+
81+
def test_pyexecutor_and_kvcache_share_execution_stream():
82+
"""Test that PyExecutor and KVCacheManager share the same execution_stream.
83+
84+
The execution_stream is created once in create_py_executor and passed to:
85+
- KVCacheManager (via KvCacheCreator -> _create_kv_cache_manager)
86+
- PyExecutor (via create_py_executor_instance)
87+
88+
Both components must use the same stream for proper synchronization.
89+
"""
90+
# Use a simple model for testing
91+
MODEL = "llama-3.2-models/llama-3.2-1b-instruct"
92+
MODEL_PATH = get_model_path(MODEL)
93+
94+
kv_cache_config = KvCacheConfig(enable_block_reuse=False,
95+
free_gpu_memory_fraction=0.5)
96+
97+
build_config = BuildConfig(max_beam_width=1, max_num_tokens=4096)
98+
scheduler_config = SchedulerConfig(
99+
capacity_scheduler_policy=CapacitySchedulerPolicy.GUARANTEED_NO_EVICT, )
100+
backend = "pytorch"
101+
llm_args = {
102+
"model": MODEL,
103+
"scheduler_config": scheduler_config,
104+
"tokenizer": None,
105+
"tensor_parallel_size": 1,
106+
"pipeline_parallel_size": 1,
107+
"moe_expert_parallel_size": None,
108+
"gpus_per_node": 1,
109+
"trust_remote_code": False,
110+
"max_batch_size": build_config.max_batch_size,
111+
"max_num_tokens": build_config.max_num_tokens,
112+
"max_beam_width": build_config.max_beam_width,
113+
"max_seq_len": build_config.max_seq_len,
114+
"kv_cache_config": kv_cache_config,
115+
"backend": backend,
116+
"num_postprocess_workers": 0,
117+
"postprocess_tokenizer_dir": MODEL,
118+
"reasoning_parser": None,
119+
"fail_fast_on_attention_window_too_large": False,
120+
}
121+
122+
torchllm_args = TorchLlmArgs(**llm_args)
123+
124+
py_executor = create_py_executor(llm_args=torchllm_args,
125+
checkpoint_dir=MODEL_PATH)
126+
127+
# Get the KVCacheManager from the resource manager
128+
kv_cache_manager = py_executor.resource_manager.get_resource_manager(
129+
ResourceManagerType.KV_CACHE_MANAGER)
130+
131+
# Verify both PyExecutor and KVCacheManager have execution_stream
132+
assert py_executor.execution_stream is not None, \
133+
"PyExecutor should have an execution_stream"
134+
assert kv_cache_manager is not None, \
135+
"KVCacheManager should exist in resource_manager"
136+
assert hasattr(kv_cache_manager, '_stream'), \
137+
"KVCacheManager should have _stream attribute"
138+
139+
# Verify they share the same CUDA stream pointer
140+
assert py_executor.execution_stream.cuda_stream == kv_cache_manager._stream.cuda_stream, \
141+
f"PyExecutor.execution_stream ({py_executor.execution_stream.cuda_stream}) " \
142+
f"should have the same cuda_stream pointer as KVCacheManager._stream ({kv_cache_manager._stream.cuda_stream})"
143+
144+
# Verify they are the exact same stream object
145+
assert py_executor.execution_stream is kv_cache_manager._stream, \
146+
"PyExecutor.execution_stream and KVCacheManager._stream should be the exact same stream object"
147+
148+
py_executor.shutdown()
149+
torch.cuda.empty_cache()

0 commit comments

Comments
 (0)