@@ -4,7 +4,6 @@
 import json
 import os
 import sys
-import time

 # Required for test_generate_with_seed to pass.
 # See the discussion in https://github.com/NVIDIA/TensorRT-LLM/pull/4264#issuecomment-2943269891
@@ -29,13 +28,11 @@
 from tensorrt_llm import LLM as LLM_torch
 from tensorrt_llm._tensorrt_engine import LLM
 from tensorrt_llm.bindings import executor as tllm
-from tensorrt_llm.disaggregated_params import DisaggregatedParams
 from tensorrt_llm.executor import (GenerationExecutorWorker, GenerationRequest,
                                    GenerationResult, LoRARequest,
                                    PromptAdapterRequest, RequestError)
-from tensorrt_llm.llmapi import (BuildCacheConfig, CacheTransceiverConfig,
-                                 EagleDecodingConfig, KvCacheConfig,
-                                 KvCacheRetentionConfig,
+from tensorrt_llm.llmapi import (BuildCacheConfig, EagleDecodingConfig,
+                                 KvCacheConfig, KvCacheRetentionConfig,
                                  LookaheadDecodingConfig, MedusaDecodingConfig,
                                  RequestOutput)
 from tensorrt_llm.llmapi import TrtLlmArgs as LlmArgs
@@ -2567,61 +2564,3 @@ def test_llm_api_draft_target():
         prompt = output.prompt
         generated_text = output.outputs[0].text
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-
-def test_llm_context_only_timed_out():
-    tp_size = 1
-    use_overlap = False
-    enable_iter_req_stats = False
-
-    llm_args_extra = {}
-
-    llm_args_extra.update(
-        dict(enable_iter_perf_stats=True,
-             enable_iter_req_stats=enable_iter_req_stats,
-             disable_overlap_scheduler=not use_overlap))
-    LLM_CLASS = LLM_torch
-
-    llm = LLM_CLASS(model=llama_model_path,
-                    kv_cache_config=global_kvcache_config,
-                    tensor_parallel_size=tp_size,
-                    cache_transceiver_config=CacheTransceiverConfig(
-                        backend="DEFAULT", kv_transfer_timeout_ms=1000),
-                    **llm_args_extra)
-
-    max_tokens = 1
-    sampling_params = SamplingParams(max_tokens=max_tokens)
-
-    disaggregated_params = DisaggregatedParams(request_type="context_only")
-
-    prompts0 = [
-        "What is your name?",
-    ]
-    prompts1 = [
-        "Nvidia is awesome because",
-    ]
-
-    # Send context-only request
-    for output in llm.generate(prompts1,
-                               sampling_params=sampling_params,
-                               disaggregated_params=disaggregated_params):
-        print(output)
-
-    results = llm.get_stats(2)
-    assert len(results) == 1
-    context_only_used_num_blocks = results[0]["kvCacheStats"]["usedNumBlocks"]
-    print(f"Context only used num blocks: {context_only_used_num_blocks}")
-
-    # Sleep 5 seconds to allow context only request to time out
-    time.sleep(5)
-
-    # Send regular request
-    for output in llm.generate(prompts0, sampling_params=sampling_params):
-        print(output)
-
-    # Get number of allocated blocks
-    results = llm.get_stats(2)
-    assert len(results) == 1
-    final_used_num_blocks = results[0]["kvCacheStats"]["usedNumBlocks"]
-
-    assert final_used_num_blocks == 0