File tree: tests/integration/defs/disaggregated/test_configs — 2 files changed, +78 −0 lines.
First config file (added, 39 lines):

model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
hostname: localhost
port: 8000
backend: "pytorch"
cuda_graph_config: null
free_gpu_memory_fraction: 0.2
context_servers:
  num_instances: 1
  max_batch_size: 8
  max_num_tokens: 3000
  max_seq_len: 4096
  tensor_parallel_size: 1
  pipeline_parallel_size: 1
  kv_cache_config:
    enable_block_reuse: False
    free_gpu_memory_fraction: 0.2
    enable_partial_reuse: False

  cache_transceiver_config:
    backend: DEFAULT
    transceiver_runtime: PYTHON
  urls:
    - "localhost:8001"
generation_servers:
  num_instances: 1
  tensor_parallel_size: 1
  pipeline_parallel_size: 1
  max_batch_size: 256
  max_num_tokens: 4096
  max_seq_len: 4096
  kv_cache_config:
    enable_block_reuse: False
    free_gpu_memory_fraction: 0.2
    enable_partial_reuse: False
  cache_transceiver_config:
    backend: DEFAULT
    transceiver_runtime: PYTHON
  urls:
    - "localhost:8002"
Second config file (added, 39 lines; differs from the first only in context_servers.pipeline_parallel_size: 4):

model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
hostname: localhost
port: 8000
backend: "pytorch"
cuda_graph_config: null
free_gpu_memory_fraction: 0.2
context_servers:
  num_instances: 1
  max_batch_size: 8
  max_num_tokens: 3000
  max_seq_len: 4096
  tensor_parallel_size: 1
  pipeline_parallel_size: 4
  kv_cache_config:
    enable_block_reuse: False
    free_gpu_memory_fraction: 0.2
    enable_partial_reuse: False

  cache_transceiver_config:
    backend: DEFAULT
    transceiver_runtime: PYTHON
  urls:
    - "localhost:8001"
generation_servers:
  num_instances: 1
  tensor_parallel_size: 1
  pipeline_parallel_size: 1
  max_batch_size: 256
  max_num_tokens: 4096
  max_seq_len: 4096
  kv_cache_config:
    enable_block_reuse: False
    free_gpu_memory_fraction: 0.2
    enable_partial_reuse: False
  cache_transceiver_config:
    backend: DEFAULT
    transceiver_runtime: PYTHON
  urls:
    - "localhost:8002"
You can’t perform that action at this time.
0 commit comments