We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 3b751fa · commit a33d5c2 (Copy full SHA for a33d5c2)
model-engine/model_engine_server/inference/vllm/vllm_batch.py
@@ -201,7 +201,7 @@ def determine_max_concurrent_requests(
201
# anecdotally, we're seeing the engine able to handle around 7req/s (for outlines), so set to 30 * 7 ~= 200
202
if any(
203
request.to_sampling_params(
204
- default_max_tokens=0, logits_processor_pattern=None
+ default_max_tokens=1, logits_processor_pattern=None
205
).guided_decoding
206
for request in requests
207
):
0 commit comments