2 files changed, +2 -2 lines changed

@@ -224,7 +224,7 @@ policy:
   use_cuda_graphs_for_non_decode_steps: true # Enable CUDA graphs for prefill/context processing
   enable_chunked_prefill: true # Split long prefills into chunks for better memory management
   unified_memory_level: 0 # Unified memory usage level (0=disabled, higher values enable more aggressive paging)
-  max_tokens: 16834 # Maximum number of tokens to use in a single step. Analogous to vllm's max_num_batched_tokens
+  max_tokens: 16384 # Maximum number of tokens to use in a single step. Analogous to vllm's max_num_batched_tokens
   vllm_cfg:
     async_engine: false
     precision: ${policy.precision}
@@ -150,7 +150,7 @@ policy:
   use_cuda_graphs_for_non_decode_steps: true # Enable CUDA graphs for prefill/context processing
   enable_chunked_prefill: true # Split long prefills into chunks for better memory management
   unified_memory_level: 0 # Unified memory usage level (0=disabled, higher values enable more aggressive paging)
-  max_tokens: 16834 # Maximum number of tokens to use in a single step. Analogous to vllm's max_num_batched_tokens
+  max_tokens: 16384 # Maximum number of tokens to use in a single step. Analogous to vllm's max_num_batched_tokens

   vllm_cfg:
     tensor_parallel_size: 1
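Both hunks correct the same digit transposition: 16834 becomes 16384, i.e. 2**14, the conventional power-of-two token budget. Since the comment describes max_tokens as analogous to vllm's max_num_batched_tokens, here is a minimal Python sketch of how the corrected value would be passed on the vLLM side; the model name and the exact keyword plumbing are illustrative assumptions, not taken from this PR.

# Minimal sketch, assuming a recent vLLM where LLM() forwards these
# engine arguments. Nothing below is from the PR itself.
from vllm import LLM

assert 16384 == 2**14  # the fixed value is a power of two; 16834 was a digit transposition

llm = LLM(
    model="facebook/opt-125m",      # placeholder model, not from this repo
    max_num_batched_tokens=16384,   # vLLM analogue of the config's max_tokens
    enable_chunked_prefill=True,    # mirrors enable_chunked_prefill in the config
)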