diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
index 5985c92efec..cc060ae22b6 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -1696,7 +1696,7 @@ def flag_if_kv_transfer_timed_out(req: LlmRequest, type: str) -> None:
                 )
                 req.py_kv_transfer_timed_out = True

-        for req, _ in self.ctx_in_transmission_requests:
+        for req, _, _ in self.ctx_in_transmission_requests.values():
             flag_if_kv_transfer_timed_out(req, "context")

         for req in self.active_requests:
diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py
index 2e9c59a9485..4a276bebacb 100644
--- a/tests/unittest/llmapi/test_llm.py
+++ b/tests/unittest/llmapi/test_llm.py
@@ -4,7 +4,6 @@
 import json
 import os
 import sys
-import time

 # Required for test_generate_with_seed to pass.
 # See the discussion in https://github.com/NVIDIA/TensorRT-LLM/pull/4264#issuecomment-2943269891
@@ -29,13 +28,11 @@
 from tensorrt_llm import LLM as LLM_torch
 from tensorrt_llm._tensorrt_engine import LLM
 from tensorrt_llm.bindings import executor as tllm
-from tensorrt_llm.disaggregated_params import DisaggregatedParams
 from tensorrt_llm.executor import (GenerationExecutorWorker, GenerationRequest,
                                    GenerationResult, LoRARequest,
                                    PromptAdapterRequest, RequestError)
-from tensorrt_llm.llmapi import (BuildCacheConfig, CacheTransceiverConfig,
-                                 EagleDecodingConfig, KvCacheConfig,
-                                 KvCacheRetentionConfig,
+from tensorrt_llm.llmapi import (BuildCacheConfig, EagleDecodingConfig,
+                                 KvCacheConfig, KvCacheRetentionConfig,
                                  LookaheadDecodingConfig, MedusaDecodingConfig,
                                  RequestOutput)
 from tensorrt_llm.llmapi import TrtLlmArgs as LlmArgs
@@ -2567,61 +2564,3 @@ def test_llm_api_draft_target():
         prompt = output.prompt
         generated_text = output.outputs[0].text
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-
-def test_llm_context_only_timed_out():
-    tp_size = 1
-    use_overlap = False
-    enable_iter_req_stats = False
-
-    llm_args_extra = {}
-
-    llm_args_extra.update(
-        dict(enable_iter_perf_stats=True,
-             enable_iter_req_stats=enable_iter_req_stats,
-             disable_overlap_scheduler=not use_overlap))
-    LLM_CLASS = LLM_torch
-
-    llm = LLM_CLASS(model=llama_model_path,
-                    kv_cache_config=global_kvcache_config,
-                    tensor_parallel_size=tp_size,
-                    cache_transceiver_config=CacheTransceiverConfig(
-                        backend="DEFAULT", kv_transfer_timeout_ms=1000),
-                    **llm_args_extra)
-
-    max_tokens = 1
-    sampling_params = SamplingParams(max_tokens=max_tokens)
-
-    disaggregated_params = DisaggregatedParams(request_type="context_only")
-
-    prompts0 = [
-        "What is your name?",
-    ]
-    prompts1 = [
-        "Nvidia is awesome because",
-    ]
-
-    # Send context-only request
-    for output in llm.generate(prompts1,
-                               sampling_params=sampling_params,
-                               disaggregated_params=disaggregated_params):
-        print(output)
-
-    results = llm.get_stats(2)
-    assert len(results) == 1
-    context_only_used_num_blocks = results[0]["kvCacheStats"]["usedNumBlocks"]
-    print(f"Context only used num blocks: {context_only_used_num_blocks}")
-
-    # Sleep 5 seconds to allow context only request to time out
-    time.sleep(5)
-
-    # Send regular request
-    for output in llm.generate(prompts0, sampling_params=sampling_params):
-        print(output)
-
-    # Get number of allocated blocks
-    results = llm.get_stats(2)
-    assert len(results) == 1
-    final_used_num_blocks = results[0]["kvCacheStats"]["usedNumBlocks"]
-
-    assert final_used_num_blocks == 0
diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py
index 04a74745abc..def1da13332 100644
--- a/tests/unittest/llmapi/test_llm_pytorch.py
+++ b/tests/unittest/llmapi/test_llm_pytorch.py
@@ -1,13 +1,15 @@
 import random
+import time
 from contextlib import contextmanager, nullcontext
 from typing import Optional

 import pytest

 from tensorrt_llm import LLM
+from tensorrt_llm.disaggregated_params import DisaggregatedParams
 from tensorrt_llm.executor import GenerationExecutorWorker
 from tensorrt_llm.executor.rpc_proxy import GenerationExecutorRpcProxy
-from tensorrt_llm.llmapi import KvCacheConfig
+from tensorrt_llm.llmapi import CacheTransceiverConfig, KvCacheConfig
 from tensorrt_llm.llmapi.llm_args import NGramDecodingConfig, PeftCacheConfig
 from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
 from tensorrt_llm.metrics import MetricNames
@@ -961,3 +963,62 @@ async def test_llm_rpc_streaming():
         outputs.append(output.outputs[0].text)
     "".join(outputs)
     print(f"get result: {outputs}")
+
+
+@pytest.mark.threadleak(enabled=False)
+@pytest.mark.part0
+def test_llm_context_only_timed_out():
+    tp_size = 1
+    use_overlap = False
+    enable_iter_req_stats = False
+
+    llm_args_extra = {}
+
+    llm_args_extra.update(
+        dict(enable_iter_perf_stats=True,
+             enable_iter_req_stats=enable_iter_req_stats,
+             disable_overlap_scheduler=not use_overlap))
+
+    llm = LLM(model=llama_model_path,
+              kv_cache_config=global_kvcache_config,
+              tensor_parallel_size=tp_size,
+              cache_transceiver_config=CacheTransceiverConfig(
+                  backend="DEFAULT", kv_transfer_timeout_ms=1000),
+              **llm_args_extra)
+
+    max_tokens = 1
+    sampling_params = SamplingParams(max_tokens=max_tokens)
+
+    disaggregated_params = DisaggregatedParams(request_type="context_only")
+
+    prompts0 = [
+        "What is your name?",
+    ]
+    prompts1 = [
+        "Nvidia is awesome because",
+    ]
+
+    # Send context-only request
+    for output in llm.generate(prompts1,
+                               sampling_params=sampling_params,
+                               disaggregated_params=disaggregated_params):
+        print(output)
+
+    results = llm.get_stats(2)
+    assert len(results) == 1
+    context_only_used_num_blocks = results[0]["kvCacheStats"]["usedNumBlocks"]
+    print(f"Context only used num blocks: {context_only_used_num_blocks}")
+
+    # Sleep 5 seconds to allow context only request to time out
+    time.sleep(5)
+
+    # Send regular request
+    for output in llm.generate(prompts0, sampling_params=sampling_params):
+        print(output)
+
+    # Get number of allocated blocks
+    results = llm.get_stats(2)
+    assert len(results) == 1
+    final_used_num_blocks = results[0]["kvCacheStats"]["usedNumBlocks"]
+
+    assert final_used_num_blocks == 0
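
For context on the py_executor.py hunk, here is a minimal, self-contained sketch of the container shape the fixed loop assumes. FakeRequest, FakeExecutor, and the (request, start_time, transfer_handle) value layout are illustrative stand-ins rather than the actual TensorRT-LLM classes; the only detail taken from the diff is that ctx_in_transmission_requests is now treated as a dict whose 3-tuple values are iterated via .values() instead of an iterable of (request, _) pairs.

# Sketch only: hypothetical stand-ins for the real LlmRequest / PyExecutor types.
from dataclasses import dataclass, field
from typing import Any, Dict, Tuple


@dataclass
class FakeRequest:
    request_id: int
    py_kv_transfer_timed_out: bool = False


@dataclass
class FakeExecutor:
    # Assumed layout: request_id -> (request, start_time, transfer_handle)
    ctx_in_transmission_requests: Dict[int, Tuple[FakeRequest, float, Any]] = field(
        default_factory=dict)

    def flag_context_requests(self) -> None:
        # Mirrors the fixed loop shape: iterate .values() and unpack three
        # elements, keeping only the request itself.
        for req, _, _ in self.ctx_in_transmission_requests.values():
            req.py_kv_transfer_timed_out = True


executor = FakeExecutor({7: (FakeRequest(7), 0.0, None)})
executor.flag_context_requests()
assert executor.ctx_in_transmission_requests[7][0].py_kv_transfer_timed_out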