Commit e088937

Moving transfer timeout test to test_llm_pytorch, fixing broken py_executor.py
Signed-off-by: Patrice Castonguay <[email protected]>
1 parent f877823 commit e088937

3 files changed: +64, -65 lines


tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 1 addition & 1 deletion
@@ -1669,7 +1669,7 @@ def flag_if_kv_transfer_timed_out(req: LlmRequest, type: str) -> None:
                 )
                 req.py_kv_transfer_timed_out = True
 
-        for req, _ in self.ctx_in_transmission_requests:
+        for req, _, _ in self.ctx_in_transmission_requests.values():
            flag_if_kv_transfer_timed_out(req, "context")
 
        for req in self.active_requests:
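A minimal sketch of why the one-line fix above is needed, assuming ctx_in_transmission_requests changed from a list of 2-tuples to a dict whose values are 3-tuples. The FakeRequest class, the dict keys, and the tuple contents below are illustrative only; they are not taken from the real py_executor.py:

# Hypothetical stand-in for LlmRequest, for illustration only.
class FakeRequest:
    def __init__(self, request_id):
        self.request_id = request_id
        self.py_kv_transfer_timed_out = False

# Assumed new shape: request id -> (request, block_id, start_time).
ctx_in_transmission_requests = {
    0: (FakeRequest(0), 7, 123.4),
    1: (FakeRequest(1), 9, 125.0),
}

# Iterating the dict directly yields its integer keys, so the old
# `for req, _ in ctx_in_transmission_requests:` raises a TypeError.
# Iterating .values() and unpacking all three fields works:
for req, _, _ in ctx_in_transmission_requests.values():
    req.py_kv_transfer_timed_out = True

assert all(entry[0].py_kv_transfer_timed_out
           for entry in ctx_in_transmission_requests.values())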

tests/unittest/llmapi/test_llm.py

Lines changed: 2 additions & 63 deletions
@@ -4,7 +4,6 @@
 import json
 import os
 import sys
-import time
 
 # Required for test_generate_with_seed to pass.
 # See the discussion in https://github.com/NVIDIA/TensorRT-LLM/pull/4264#issuecomment-2943269891
@@ -29,13 +28,11 @@
 from tensorrt_llm import LLM as LLM_torch
 from tensorrt_llm._tensorrt_engine import LLM
 from tensorrt_llm.bindings import executor as tllm
-from tensorrt_llm.disaggregated_params import DisaggregatedParams
 from tensorrt_llm.executor import (GenerationExecutorWorker, GenerationRequest,
                                    GenerationResult, LoRARequest,
                                    PromptAdapterRequest, RequestError)
-from tensorrt_llm.llmapi import (BuildCacheConfig, CacheTransceiverConfig,
-                                 EagleDecodingConfig, KvCacheConfig,
-                                 KvCacheRetentionConfig,
+from tensorrt_llm.llmapi import (BuildCacheConfig, EagleDecodingConfig,
+                                 KvCacheConfig, KvCacheRetentionConfig,
                                  LookaheadDecodingConfig, MedusaDecodingConfig,
                                  RequestOutput)
 from tensorrt_llm.llmapi import TrtLlmArgs as LlmArgs
@@ -2567,61 +2564,3 @@ def test_llm_api_draft_target():
     prompt = output.prompt
     generated_text = output.outputs[0].text
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-
-def test_llm_context_only_timed_out():
-    tp_size = 1
-    use_overlap = False
-    enable_iter_req_stats = False
-
-    llm_args_extra = {}
-
-    llm_args_extra.update(
-        dict(enable_iter_perf_stats=True,
-             enable_iter_req_stats=enable_iter_req_stats,
-             disable_overlap_scheduler=not use_overlap))
-    LLM_CLASS = LLM_torch
-
-    llm = LLM_CLASS(model=llama_model_path,
-                    kv_cache_config=global_kvcache_config,
-                    tensor_parallel_size=tp_size,
-                    cache_transceiver_config=CacheTransceiverConfig(
-                        backend="DEFAULT", kv_transfer_timeout_ms=1000),
-                    **llm_args_extra)
-
-    max_tokens = 1
-    sampling_params = SamplingParams(max_tokens=max_tokens)
-
-    disaggregated_params = DisaggregatedParams(request_type="context_only")
-
-    prompts0 = [
-        "What is your name?",
-    ]
-    prompts1 = [
-        "Nvidia is awesome because",
-    ]
-
-    # Send context-only request
-    for output in llm.generate(prompts1,
-                               sampling_params=sampling_params,
-                               disaggregated_params=disaggregated_params):
-        print(output)
-
-    results = llm.get_stats(2)
-    assert len(results) == 1
-    context_only_used_num_blocks = results[0]["kvCacheStats"]["usedNumBlocks"]
-    print(f"Context only used num blocks: {context_only_used_num_blocks}")
-
-    # Sleep 5 seconds to allow context only request to time out
-    time.sleep(5)
-
-    # Send regular request
-    for output in llm.generate(prompts0, sampling_params=sampling_params):
-        print(output)
-
-    # Get number of allocated blocks
-    results = llm.get_stats(2)
-    assert len(results) == 1
-    final_used_num_blocks = results[0]["kvCacheStats"]["usedNumBlocks"]
-
-    assert final_used_num_blocks == 0

tests/unittest/llmapi/test_llm_pytorch.py

Lines changed: 61 additions & 1 deletion
@@ -1,13 +1,15 @@
 import random
+import time
 from contextlib import contextmanager, nullcontext
 from typing import Optional
 
 import pytest
 
 from tensorrt_llm import LLM
+from tensorrt_llm.disaggregated_params import DisaggregatedParams
 from tensorrt_llm.executor import GenerationExecutorWorker
 from tensorrt_llm.executor.rpc_proxy import GenerationExecutorRpcProxy
-from tensorrt_llm.llmapi import KvCacheConfig
+from tensorrt_llm.llmapi import CacheTransceiverConfig, KvCacheConfig
 from tensorrt_llm.llmapi.llm_args import NGramDecodingConfig, PeftCacheConfig
 from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
 from tensorrt_llm.metrics import MetricNames
@@ -961,3 +963,61 @@ async def test_llm_rpc_streaming():
         outputs.append(output.outputs[0].text)
     "".join(outputs)
     print(f"get result: {outputs}")
+
+
+@pytest.mark.part0
+def test_llm_context_only_timed_out():
+    tp_size = 1
+    use_overlap = False
+    enable_iter_req_stats = False
+
+    llm_args_extra = {}
+
+    llm_args_extra.update(
+        dict(enable_iter_perf_stats=True,
+             enable_iter_req_stats=enable_iter_req_stats,
+             disable_overlap_scheduler=not use_overlap))
+
+    llm = LLM(model=llama_model_path,
+              kv_cache_config=global_kvcache_config,
+              tensor_parallel_size=tp_size,
+              cache_transceiver_config=CacheTransceiverConfig(
+                  backend="DEFAULT", kv_transfer_timeout_ms=1000),
+              **llm_args_extra)
+
+    max_tokens = 1
+    sampling_params = SamplingParams(max_tokens=max_tokens)
+
+    disaggregated_params = DisaggregatedParams(request_type="context_only")
+
+    prompts0 = [
+        "What is your name?",
+    ]
+    prompts1 = [
+        "Nvidia is awesome because",
+    ]
+
+    # Send context-only request
+    for output in llm.generate(prompts1,
+                               sampling_params=sampling_params,
+                               disaggregated_params=disaggregated_params):
+        print(output)
+
+    results = llm.get_stats(2)
+    assert len(results) == 1
+    context_only_used_num_blocks = results[0]["kvCacheStats"]["usedNumBlocks"]
+    print(f"Context only used num blocks: {context_only_used_num_blocks}")
+
+    # Sleep 5 seconds to allow context only request to time out
+    time.sleep(5)
+
+    # Send regular request
+    for output in llm.generate(prompts0, sampling_params=sampling_params):
+        print(output)
+
+    # Get number of allocated blocks
+    results = llm.get_stats(2)
+    assert len(results) == 1
+    final_used_num_blocks = results[0]["kvCacheStats"]["usedNumBlocks"]
+
+    assert final_used_num_blocks == 0
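Once this change lands, the relocated test can be selected directly by its pytest node id. A minimal sketch, assuming it is run from the repository root with the model assets referenced by llama_model_path and a suitable GPU available:

# Run only the relocated timeout test; equivalent to the command line
#   pytest -q tests/unittest/llmapi/test_llm_pytorch.py::test_llm_context_only_timed_out
# The test also carries the part0 marker, so a `-m part0` selection would include it too.
import sys

import pytest

sys.exit(
    pytest.main([
        "-q",
        "tests/unittest/llmapi/test_llm_pytorch.py::test_llm_context_only_timed_out",
    ]))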
