
Commit 9665313

[V1] Set structured output backend to auto by default (#15724)
Signed-off-by: Russell Bryant <[email protected]>
1 parent 0c54fc7 commit 9665313

File tree

4 files changed: +22 -68 lines changed

tests/entrypoints/openai/test_chat.py

Lines changed: 11 additions & 63 deletions
@@ -20,8 +20,6 @@
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 
-GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
-
 
 @pytest.fixture(scope="module")
 def monkeypatch_module():
@@ -487,20 +485,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
     assert last_completion_tokens == 10
 
 
-# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
-# (i.e. using the same ordering as in the Completions API tests), the test
-# will fail on the second `guided_decoding_backend` even when I swap their order
-# (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256)
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat(client: openai.AsyncOpenAI,
-                                  is_v1_server: bool,
-                                  guided_decoding_backend: str,
                                   sample_guided_choice):
-
-    if is_v1_server and guided_decoding_backend != 'xgrammar':
-        pytest.skip("Only xgrammar backend is supported with V1")
-
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -515,8 +502,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
         messages=messages,
         max_completion_tokens=10,
         temperature=0.7,
-        extra_body=dict(guided_choice=sample_guided_choice,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_choice=sample_guided_choice))
     choice1 = chat_completion.choices[0].message.content
     assert choice1 in sample_guided_choice
 
@@ -530,22 +516,16 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
         messages=messages,
         max_completion_tokens=10,
         temperature=0.7,
-        extra_body=dict(guided_choice=sample_guided_choice,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_choice=sample_guided_choice))
     choice2 = chat_completion.choices[0].message.content
     assert choice2 in sample_guided_choice
     assert choice1 != choice2
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
-                                guided_decoding_backend: str,
+async def test_guided_json_chat(client: openai.AsyncOpenAI,
                                 sample_json_schema):
 
-    if is_v1_server:
-        pytest.skip("sample_json_schema has features unsupported in V1")
-
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -560,8 +540,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=1000,
-        extra_body=dict(guided_json=sample_json_schema,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_json=sample_json_schema))
     message = chat_completion.choices[0].message
     assert message.content is not None
     json1 = json.loads(message.content)
@@ -578,8 +557,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=1000,
-        extra_body=dict(guided_json=sample_json_schema,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_json=sample_json_schema))
     message = chat_completion.choices[0].message
     assert message.content is not None
     json2 = json.loads(message.content)
@@ -589,13 +567,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_guided_regex_chat(client: openai.AsyncOpenAI,
-                                 is_v1_server: bool,
-                                 guided_decoding_backend: str, sample_regex):
-
-    if is_v1_server and guided_decoding_backend != 'xgrammar':
-        pytest.skip("Only xgrammar backend is supported with V1")
+async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex):
 
     messages = [{
         "role": "system",
@@ -610,8 +582,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=20,
-        extra_body=dict(guided_regex=sample_regex,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_regex=sample_regex))
     ip1 = chat_completion.choices[0].message.content
     assert ip1 is not None
     assert re.fullmatch(sample_regex, ip1) is not None
@@ -622,8 +593,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=20,
-        extra_body=dict(guided_regex=sample_regex,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_regex=sample_regex))
     ip2 = chat_completion.choices[0].message.content
     assert ip2 is not None
     assert re.fullmatch(sample_regex, ip2) is not None
@@ -652,15 +622,9 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
-                                           is_v1_server: bool,
-                                           guided_decoding_backend: str,
                                            sample_guided_choice):
 
-    if is_v1_server and guided_decoding_backend != 'xgrammar':
-        pytest.skip("Only xgrammar backend is supported with V1")
-
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -676,8 +640,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
         max_completion_tokens=10,
         logprobs=True,
         top_logprobs=5,
-        extra_body=dict(guided_choice=sample_guided_choice,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_choice=sample_guided_choice))
 
     assert chat_completion.choices[0].logprobs is not None
     assert chat_completion.choices[0].logprobs.content is not None
@@ -689,14 +652,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
-                              guided_decoding_backend: str,
-                              sample_json_schema):
-
-    if is_v1_server:
-        pytest.skip("sample_json_schema has features unsupported on V1")
-
+async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -728,7 +684,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
                 "name": "dummy_function_name"
             }
         },
-        extra_body=dict(guided_decoding_backend=guided_decoding_backend))
+    )
     message = chat_completion.choices[0].message
     assert len(message.content) == 0
     json_string = message.tool_calls[0].function.arguments
@@ -763,7 +719,6 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
                 "name": "dummy_function_name"
             }
         },
-        extra_body=dict(guided_decoding_backend=guided_decoding_backend),
         stream=True)
 
     output = []
@@ -888,7 +843,6 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
         model=model_name,
         tools=tools,
         tool_choice="required",
-        extra_body=dict(guided_decoding_backend="outlines"),
     )
 
     assert chat_completion.choices[0].message.tool_calls is not None
@@ -900,7 +854,6 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
         model=model_name,
         tools=tools,
         tool_choice="required",
-        extra_body=dict(guided_decoding_backend="outlines"),
         stream=True,
     )
 
@@ -914,12 +867,7 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
 
 @pytest.mark.asyncio
 async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
-                                                  is_v1_server: bool,
                                                   sample_json_schema):
-
-    if is_v1_server:
-        pytest.skip("sample_json_schema has features unsupported on V1")
-
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"

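For reference, the request shape these tests now exercise no longer names a backend at all. A minimal sketch of the same call against a running vLLM OpenAI-compatible server (the endpoint, model, and choice list here are illustrative, not part of this commit):

import openai

# Hypothetical local deployment; point base_url at your own server.
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.chat.completions.create(
    model="HuggingFaceH4/zephyr-7b-beta",
    messages=[{"role": "user", "content": "Pick a color."}],
    max_completion_tokens=10,
    # No guided_decoding_backend is passed: after this commit the server
    # resolves the backend itself ("auto" on V1, "xgrammar" on V0).
    extra_body=dict(guided_choice=["red", "green", "blue"]),
)
print(completion.choices[0].message.content)  # constrained to one of the three
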
vllm/config.py

Lines changed: 2 additions & 2 deletions
@@ -2976,7 +2976,7 @@ class DecodingConfig:
 
     # Which guided decoding algo to use.
     # 'outlines' / 'lm-format-enforcer' / 'xgrammar'
-    guided_decoding_backend: str = 'xgrammar'
+    guided_decoding_backend: str = "auto" if envs.VLLM_USE_V1 else "xgrammar"
 
     reasoning_backend: Optional[str] = None
 
@@ -3001,7 +3001,7 @@ def compute_hash(self) -> str:
 
     def __post_init__(self):
         v0_valid_guided_backends = [
-            'outlines', 'lm-format-enforcer', 'xgrammar'
+            'outlines', 'lm-format-enforcer', 'xgrammar', 'auto'
        ]
         v1_valid_guided_backends = ['xgrammar', 'guidance', 'auto']

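In effect, the default is now environment-dependent. A simplified sketch of how the field resolves, using os.environ directly as a stand-in for vllm.envs.VLLM_USE_V1 (the real flag logic lives in vllm/envs.py):

import os

# Stand-in for vllm.envs.VLLM_USE_V1; illustrative only.
VLLM_USE_V1 = os.environ.get("VLLM_USE_V1") == "1"

# Mirrors the new DecodingConfig default: "auto" on V1, "xgrammar" on V0.
# Either value is valid, since 'auto' was added to the V0 list as well.
guided_decoding_backend = "auto" if VLLM_USE_V1 else "xgrammar"
print(guided_decoding_backend)
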
vllm/engine/arg_utils.py

Lines changed: 3 additions & 3 deletions
@@ -182,7 +182,7 @@ class EngineArgs:
     enable_chunked_prefill: Optional[bool] = None
     disable_chunked_mm_input: bool = False
 
-    guided_decoding_backend: str = 'xgrammar'
+    guided_decoding_backend: str = DecodingConfig.guided_decoding_backend
     logits_processor_pattern: Optional[str] = None
 
     speculative_config: Optional[Dict[str, Any]] = None
@@ -407,13 +407,13 @@ def get_kwargs(cls: type[Any]) -> Dict[str, Any]:
         parser.add_argument(
             '--guided-decoding-backend',
             type=str,
-            default='xgrammar',
+            default=DecodingConfig.guided_decoding_backend,
             help='Which engine will be used for guided decoding'
             ' (JSON schema / regex etc) by default. Currently support '
             'https://github.com/mlc-ai/xgrammar and '
             'https://github.com/guidance-ai/llguidance.'
             'Valid backend values are "xgrammar", "guidance", and "auto". '
-            'With "auto", we will make opinionated choices based on request'
+            'With "auto", we will make opinionated choices based on request '
             'contents and what the backend libraries currently support, so '
             'the behavior is subject to change in each release.')
         parser.add_argument(

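Because both the EngineArgs field and the CLI default now reference DecodingConfig.guided_decoding_backend, the defaults can no longer drift apart. A quick sketch of the inherited value (import paths as of this commit; the printed result depends on VLLM_USE_V1 at import time):

from vllm.config import DecodingConfig
from vllm.engine.arg_utils import EngineArgs

# EngineArgs no longer hard-codes 'xgrammar'; it inherits the single
# source of truth on DecodingConfig.
args = EngineArgs(model="HuggingFaceH4/zephyr-7b-beta")
assert args.guided_decoding_backend == DecodingConfig.guided_decoding_backend
print(args.guided_decoding_backend)  # "auto" under V1, "xgrammar" under V0
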
vllm/model_executor/guided_decoding/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -33,6 +33,12 @@ def fallback_or_error(guided_params: GuidedDecodingParams, message: str,
         logger.warning("%s Falling back to use %s instead.", message, fallback)
         guided_params.backend = fallback
 
+    # `auto` was added for V1 to explicitly declare a mode that has fallbacks
+    # in place. If that is specified with V0, treat it as `xgrammar`, as we have
+    # fallbacks enabled for that and it is the V0 default.
+    if guided_params.backend == "auto":
+        guided_params.backend = "xgrammar"
+
     # lm-format-enforce doesn't support grammar, fallback to xgrammar
     if guided_params.backend_name == "lm-format-enforcer":
         if guided_params.grammar is not None:

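Condensed, the normalization runs ahead of the existing per-backend fallbacks on the V0 path. A simplified sketch of that flow, assuming the enclosing helper is this file's maybe_backend_fallback with everything else elided:

def maybe_backend_fallback(guided_params):
    # New in this commit: 'auto' is a V1 notion, so on the V0 path it simply
    # means the V0 default, 'xgrammar', which already has fallbacks wired up.
    if guided_params.backend == "auto":
        guided_params.backend = "xgrammar"

    # ...existing checks follow, e.g. lm-format-enforcer cannot handle
    # grammar requests and falls back to xgrammar.
    return guided_params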