@@ -20,8 +20,6 @@
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"

-GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
-

 @pytest.fixture(scope="module")
 def monkeypatch_module():
@@ -487,20 +485,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
     assert last_completion_tokens == 10


-# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
-# (i.e. using the same ordering as in the Completions API tests), the test
-# will fail on the second `guided_decoding_backend` even when I swap their order
-# (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256)
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat(client: openai.AsyncOpenAI,
-                                  is_v1_server: bool,
-                                  guided_decoding_backend: str,
                                   sample_guided_choice):
-
-    if is_v1_server and guided_decoding_backend != 'xgrammar':
-        pytest.skip("Only xgrammar backend is supported with V1")
-
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -515,8 +502,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
         messages=messages,
         max_completion_tokens=10,
         temperature=0.7,
-        extra_body=dict(guided_choice=sample_guided_choice,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_choice=sample_guided_choice))
     choice1 = chat_completion.choices[0].message.content
     assert choice1 in sample_guided_choice

@@ -530,22 +516,16 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
         messages=messages,
         max_completion_tokens=10,
         temperature=0.7,
-        extra_body=dict(guided_choice=sample_guided_choice,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_choice=sample_guided_choice))
     choice2 = chat_completion.choices[0].message.content
     assert choice2 in sample_guided_choice
     assert choice1 != choice2


 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
-                                guided_decoding_backend: str,
+async def test_guided_json_chat(client: openai.AsyncOpenAI,
                                 sample_json_schema):

-    if is_v1_server:
-        pytest.skip("sample_json_schema has features unsupported in V1")
-
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -560,8 +540,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=1000,
-        extra_body=dict(guided_json=sample_json_schema,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_json=sample_json_schema))
     message = chat_completion.choices[0].message
     assert message.content is not None
     json1 = json.loads(message.content)
@@ -578,8 +557,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=1000,
-        extra_body=dict(guided_json=sample_json_schema,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_json=sample_json_schema))
     message = chat_completion.choices[0].message
     assert message.content is not None
     json2 = json.loads(message.content)
@@ -589,13 +567,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,


 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_guided_regex_chat(client: openai.AsyncOpenAI,
-                                 is_v1_server: bool,
-                                 guided_decoding_backend: str, sample_regex):
-
-    if is_v1_server and guided_decoding_backend != 'xgrammar':
-        pytest.skip("Only xgrammar backend is supported with V1")
+async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex):

     messages = [{
         "role": "system",
@@ -610,8 +582,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=20,
-        extra_body=dict(guided_regex=sample_regex,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_regex=sample_regex))
     ip1 = chat_completion.choices[0].message.content
     assert ip1 is not None
     assert re.fullmatch(sample_regex, ip1) is not None
@@ -622,8 +593,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=20,
-        extra_body=dict(guided_regex=sample_regex,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_regex=sample_regex))
     ip2 = chat_completion.choices[0].message.content
     assert ip2 is not None
     assert re.fullmatch(sample_regex, ip2) is not None
@@ -652,15 +622,9 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):


 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
-                                           is_v1_server: bool,
-                                           guided_decoding_backend: str,
                                            sample_guided_choice):

-    if is_v1_server and guided_decoding_backend != 'xgrammar':
-        pytest.skip("Only xgrammar backend is supported with V1")
-
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -676,8 +640,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
         max_completion_tokens=10,
         logprobs=True,
         top_logprobs=5,
-        extra_body=dict(guided_choice=sample_guided_choice,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_choice=sample_guided_choice))

     assert chat_completion.choices[0].logprobs is not None
     assert chat_completion.choices[0].logprobs.content is not None
@@ -689,14 +652,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,


 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
-                              guided_decoding_backend: str,
-                              sample_json_schema):
-
-    if is_v1_server:
-        pytest.skip("sample_json_schema has features unsupported on V1")
-
+async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -728,7 +684,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
                 "name": "dummy_function_name"
             }
         },
-        extra_body=dict(guided_decoding_backend=guided_decoding_backend))
+    )
     message = chat_completion.choices[0].message
     assert len(message.content) == 0
     json_string = message.tool_calls[0].function.arguments
@@ -763,7 +719,6 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
                 "name": "dummy_function_name"
             }
         },
-        extra_body=dict(guided_decoding_backend=guided_decoding_backend),
         stream=True)

     output = []
@@ -888,7 +843,6 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
         model=model_name,
         tools=tools,
         tool_choice="required",
-        extra_body=dict(guided_decoding_backend="outlines"),
     )

     assert chat_completion.choices[0].message.tool_calls is not None
@@ -900,7 +854,6 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
         model=model_name,
         tools=tools,
         tool_choice="required",
-        extra_body=dict(guided_decoding_backend="outlines"),
         stream=True,
     )

@@ -914,12 +867,7 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,

 @pytest.mark.asyncio
 async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
-                                                  is_v1_server: bool,
                                                   sample_json_schema):
-
-    if is_v1_server:
-        pytest.skip("sample_json_schema has features unsupported on V1")
-
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"