2 changes: 1 addition & 1 deletion tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -1696,7 +1696,7 @@ def flag_if_kv_transfer_timed_out(req: LlmRequest, type: str) -> None:
)
req.py_kv_transfer_timed_out = True

-        for req, _ in self.ctx_in_transmission_requests:
+        for req, _, _ in self.ctx_in_transmission_requests.values():
flag_if_kv_transfer_timed_out(req, "context")

for req in self.active_requests:
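For context on the change above: the loop now unpacks 3-tuples from a dict's `.values()` instead of 2-tuples from a list, which implies `ctx_in_transmission_requests` became a dict keyed by request id. A minimal sketch of the assumed shape follows; the tuple field names (`block_id`, `expiry`) and the placeholder values are illustrative assumptions, not taken from this PR.

```python
# Hypothetical sketch only -- the field names below are assumptions, not from this PR.
# Before: ctx_in_transmission_requests was iterated as (request, extra) pairs.
# After:  it is assumed to be a dict keyed by request id whose values are 3-tuples,
#         which is why the loop unpacks `for req, _, _ in ...values()`.
ctx_in_transmission_requests = {
    "req-0": ("request_obj_0", "block_id_0", 1234.5),  # (request, block_id, expiry)
    "req-1": ("request_obj_1", "block_id_1", 1236.0),
}

for req, _, _ in ctx_in_transmission_requests.values():
    # Only the request object is needed by the timeout check.
    print(req)
```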
65 changes: 2 additions & 63 deletions tests/unittest/llmapi/test_llm.py
@@ -4,7 +4,6 @@
import json
import os
import sys
-import time

# Required for test_generate_with_seed to pass.
# See the discussion in https://github.com/NVIDIA/TensorRT-LLM/pull/4264#issuecomment-2943269891
@@ -29,13 +28,11 @@
from tensorrt_llm import LLM as LLM_torch
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.bindings import executor as tllm
-from tensorrt_llm.disaggregated_params import DisaggregatedParams
from tensorrt_llm.executor import (GenerationExecutorWorker, GenerationRequest,
                                   GenerationResult, LoRARequest,
                                   PromptAdapterRequest, RequestError)
-from tensorrt_llm.llmapi import (BuildCacheConfig, CacheTransceiverConfig,
-                                 EagleDecodingConfig, KvCacheConfig,
-                                 KvCacheRetentionConfig,
+from tensorrt_llm.llmapi import (BuildCacheConfig, EagleDecodingConfig,
+                                 KvCacheConfig, KvCacheRetentionConfig,
                                  LookaheadDecodingConfig, MedusaDecodingConfig,
                                  RequestOutput)
from tensorrt_llm.llmapi import TrtLlmArgs as LlmArgs
@@ -2567,61 +2564,3 @@ def test_llm_api_draft_target():
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


def test_llm_context_only_timed_out():
tp_size = 1
use_overlap = False
enable_iter_req_stats = False

llm_args_extra = {}

llm_args_extra.update(
dict(enable_iter_perf_stats=True,
enable_iter_req_stats=enable_iter_req_stats,
disable_overlap_scheduler=not use_overlap))
LLM_CLASS = LLM_torch

llm = LLM_CLASS(model=llama_model_path,
kv_cache_config=global_kvcache_config,
tensor_parallel_size=tp_size,
cache_transceiver_config=CacheTransceiverConfig(
backend="DEFAULT", kv_transfer_timeout_ms=1000),
**llm_args_extra)

max_tokens = 1
sampling_params = SamplingParams(max_tokens=max_tokens)

disaggregated_params = DisaggregatedParams(request_type="context_only")

prompts0 = [
"What is your name?",
]
prompts1 = [
"Nvidia is awesome because",
]

# Send context-only request
for output in llm.generate(prompts1,
sampling_params=sampling_params,
disaggregated_params=disaggregated_params):
print(output)

results = llm.get_stats(2)
assert len(results) == 1
context_only_used_num_blocks = results[0]["kvCacheStats"]["usedNumBlocks"]
print(f"Context only used num blocks: {context_only_used_num_blocks}")

# Sleep 5 seconds to allow context only request to time out
time.sleep(5)

# Send regular request
for output in llm.generate(prompts0, sampling_params=sampling_params):
print(output)

# Get number of allocated blocks
results = llm.get_stats(2)
assert len(results) == 1
final_used_num_blocks = results[0]["kvCacheStats"]["usedNumBlocks"]

assert final_used_num_blocks == 0
63 changes: 62 additions & 1 deletion tests/unittest/llmapi/test_llm_pytorch.py
@@ -1,13 +1,15 @@
import random
+import time
from contextlib import contextmanager, nullcontext
from typing import Optional

import pytest

from tensorrt_llm import LLM
+from tensorrt_llm.disaggregated_params import DisaggregatedParams
from tensorrt_llm.executor import GenerationExecutorWorker
from tensorrt_llm.executor.rpc_proxy import GenerationExecutorRpcProxy
-from tensorrt_llm.llmapi import KvCacheConfig
+from tensorrt_llm.llmapi import CacheTransceiverConfig, KvCacheConfig
from tensorrt_llm.llmapi.llm_args import NGramDecodingConfig, PeftCacheConfig
from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
from tensorrt_llm.metrics import MetricNames
@@ -961,3 +963,62 @@ async def test_llm_rpc_streaming():
outputs.append(output.outputs[0].text)
"".join(outputs)
print(f"get result: {outputs}")


@pytest.mark.threadleak(enabled=False)
@pytest.mark.part0
def test_llm_context_only_timed_out():
tp_size = 1
use_overlap = False
enable_iter_req_stats = False

llm_args_extra = {}

llm_args_extra.update(
dict(enable_iter_perf_stats=True,
enable_iter_req_stats=enable_iter_req_stats,
disable_overlap_scheduler=not use_overlap))

llm = LLM(model=llama_model_path,
kv_cache_config=global_kvcache_config,
tensor_parallel_size=tp_size,
cache_transceiver_config=CacheTransceiverConfig(
backend="DEFAULT", kv_transfer_timeout_ms=1000),
**llm_args_extra)

max_tokens = 1
sampling_params = SamplingParams(max_tokens=max_tokens)

disaggregated_params = DisaggregatedParams(request_type="context_only")

prompts0 = [
"What is your name?",
]
prompts1 = [
"Nvidia is awesome because",
]

# Send context-only request
for output in llm.generate(prompts1,
sampling_params=sampling_params,
disaggregated_params=disaggregated_params):
print(output)

results = llm.get_stats(2)
assert len(results) == 1
context_only_used_num_blocks = results[0]["kvCacheStats"]["usedNumBlocks"]
print(f"Context only used num blocks: {context_only_used_num_blocks}")

# Sleep 5 seconds to allow context only request to time out
time.sleep(5)

# Send regular request
for output in llm.generate(prompts0, sampling_params=sampling_params):
print(output)

# Get number of allocated blocks
results = llm.get_stats(2)
assert len(results) == 1
final_used_num_blocks = results[0]["kvCacheStats"]["usedNumBlocks"]

assert final_used_num_blocks == 0