Skip to content

Commit 299cb1f

Browse files
committed
[emu] Fix ellm reasoning_effort param (#912)
* Provider ellm now properly passes the `reasoning_effort` disable value
* Add gpt-5.2/5.4 check
* Add vLLM explicit-disable check
* Add Gemini 3.1 Pro check
* Update preset model config (gpt-5.4 family, Qwen 3.5 MoEs; removed Gemini 3 Pro Preview)
1 parent f6a4d51 commit 299cb1f

File tree

3 files changed

+201
-58
lines changed

3 files changed

+201
-58
lines changed

services/api/src/owl/configs/preset_models.json

Lines changed: 114 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,70 @@
11
[
2+
{
3+
"meta": {
4+
"icon": "openai"
5+
},
6+
"id": "openai/gpt-5.4",
7+
"name": "OpenAI GPT-5.4",
8+
"type": "llm",
9+
"context_length": 1050000,
10+
"max_output_tokens": 128000,
11+
"capabilities": ["chat", "image", "reasoning", "tool"],
12+
"languages": ["en", "mul"],
13+
"llm_input_cost_per_mtoken": 2.5,
14+
"llm_output_cost_per_mtoken": 15.0,
15+
"deployments": [
16+
{
17+
"name": "OpenAI GPT-5.4 Deployment",
18+
"provider": "openai",
19+
"routing_id": "openai/gpt-5.4",
20+
"api_base": ""
21+
}
22+
]
23+
},
24+
{
25+
"meta": {
26+
"icon": "openai"
27+
},
28+
"id": "openai/gpt-5.4-mini",
29+
"name": "OpenAI GPT-5.4 Mini",
30+
"type": "llm",
31+
"context_length": 400000,
32+
"max_output_tokens": 128000,
33+
"capabilities": ["chat", "image", "reasoning", "tool"],
34+
"languages": ["en", "mul"],
35+
"llm_input_cost_per_mtoken": 0.75,
36+
"llm_output_cost_per_mtoken": 4.5,
37+
"deployments": [
38+
{
39+
"name": "OpenAI GPT-5.4 Mini Deployment",
40+
"provider": "openai",
41+
"routing_id": "openai/gpt-5.4-mini",
42+
"api_base": ""
43+
}
44+
]
45+
},
46+
{
47+
"meta": {
48+
"icon": "openai"
49+
},
50+
"id": "openai/gpt-5.4-nano",
51+
"name": "OpenAI GPT-5.4 Nano",
52+
"type": "llm",
53+
"context_length": 400000,
54+
"max_output_tokens": 128000,
55+
"capabilities": ["chat", "image", "reasoning", "tool"],
56+
"languages": ["en", "mul"],
57+
"llm_input_cost_per_mtoken": 0.2,
58+
"llm_output_cost_per_mtoken": 1.25,
59+
"deployments": [
60+
{
61+
"name": "OpenAI GPT-5.4 Nano Deployment",
62+
"provider": "openai",
63+
"routing_id": "openai/gpt-5.4-nano",
64+
"api_base": ""
65+
}
66+
]
67+
},
268
{
369
"meta": {
470
"icon": "openai"
@@ -241,28 +307,6 @@
241307
}
242308
]
243309
},
244-
{
245-
"meta": {
246-
"icon": "google"
247-
},
248-
"id": "google/gemini-3-pro-preview",
249-
"name": "Google Gemini 3 Pro Preview",
250-
"type": "llm",
251-
"context_length": 1048576,
252-
"max_output_tokens": 65536,
253-
"capabilities": ["chat", "image", "reasoning", "tool"],
254-
"languages": ["en", "mul"],
255-
"llm_input_cost_per_mtoken": 4.0,
256-
"llm_output_cost_per_mtoken": 18.0,
257-
"deployments": [
258-
{
259-
"name": "Google Gemini 3 Pro Preview Deployment",
260-
"provider": "gemini",
261-
"routing_id": "gemini/gemini-3-pro-preview",
262-
"api_base": ""
263-
}
264-
]
265-
},
266310
{
267311
"meta": {
268312
"icon": "google"
@@ -285,30 +329,6 @@
285329
}
286330
]
287331
},
288-
{
289-
"meta": {
290-
"icon": "meta"
291-
},
292-
"id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
293-
"name": "Meta Llama 4 Scout (109B-A17B)",
294-
"type": "llm",
295-
"context_length": 262144,
296-
"capabilities": ["chat", "image"],
297-
"languages": ["en", "mul"],
298-
"llm_input_cost_per_mtoken": 0.15,
299-
"llm_output_cost_per_mtoken": 0.5,
300-
"deployments": [
301-
{
302-
"name": "Meta Llama 4 Scout (109B-A17B) Deployment",
303-
"huggingface_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
304-
"cpu_count": "4",
305-
"memory_gb": "24",
306-
"required_vram": "140",
307-
"num_replicas": 1,
308-
"provider": "vllm"
309-
}
310-
]
311-
},
312332
{
313333
"meta": {
314334
"icon": "meta"
@@ -429,6 +449,54 @@
429449
}
430450
]
431451
},
452+
{
453+
"meta": {
454+
"icon": "qwen"
455+
},
456+
"id": "Qwen/Qwen3.5-122B-A10B",
457+
"name": "Qwen 3.5 (122B-A10B)",
458+
"type": "llm",
459+
"context_length": 256000,
460+
"capabilities": ["chat", "image", "reasoning", "tool"],
461+
"languages": ["en", "mul"],
462+
"llm_input_cost_per_mtoken": 0.4,
463+
"llm_output_cost_per_mtoken": 2.0,
464+
"deployments": [
465+
{
466+
"name": "Qwen 3.5 (122B-A10B) Deployment",
467+
"huggingface_id": "Qwen/Qwen3.5-122B-A10B-FP8",
468+
"cpu_count": "8",
469+
"memory_gb": "16",
470+
"required_vram": "150",
471+
"num_replicas": 1,
472+
"provider": "vllm"
473+
}
474+
]
475+
},
476+
{
477+
"meta": {
478+
"icon": "qwen"
479+
},
480+
"id": "Qwen/Qwen3.5-35B-A3B",
481+
"name": "Qwen 3.5 (35B-A3B)",
482+
"type": "llm",
483+
"context_length": 256000,
484+
"capabilities": ["chat", "image", "reasoning", "tool"],
485+
"languages": ["en", "mul"],
486+
"llm_input_cost_per_mtoken": 0.25,
487+
"llm_output_cost_per_mtoken": 0.8,
488+
"deployments": [
489+
{
490+
"name": "Qwen 3.5 (35B-A3B) Deployment",
491+
"huggingface_id": "Qwen/Qwen3.5-35B-A3B-FP8",
492+
"cpu_count": "8",
493+
"memory_gb": "16",
494+
"required_vram": "50",
495+
"num_replicas": 1,
496+
"provider": "vllm"
497+
}
498+
]
499+
},
432500
{
433501
"meta": {
434502
"icon": "qwen"

services/api/src/owl/utils/lm.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -728,6 +728,15 @@ def _prepare_hyperparams(
728728
# Non-reasoning model does not require further processing
729729
if not ctx.is_reasoning_model:
730730
return
731+
# handle vLLM reasoning (only applicable to some models) only disable when explicitly requested
732+
if ctx.inference_provider in (
733+
OnPremProvider.VLLM,
734+
OnPremProvider.VLLM_AMD,
735+
CloudProvider.VLLM_CLOUD,
736+
):
737+
if reasoning_effort in ("disable", "none"):
738+
hyperparams["extra_body"] = {"chat_template_kwargs": {"enable_thinking": False}}
739+
return
731740
# Disable reasoning if requested
732741
if (
733742
reasoning_effort in ("disable", "minimal", "none")
@@ -736,6 +745,7 @@ def _prepare_hyperparams(
736745
):
737746
if ctx.inference_provider == CloudProvider.ELLM:
738747
hyperparams["reasoning_effort"] = "disable"
748+
hyperparams["allowed_openai_params"] = ["reasoning_effort"]
739749
return
740750
elif ctx.inference_provider == CloudProvider.GEMINI:
741751
# 3/3.1-Pro cannot disable thinking
@@ -751,8 +761,12 @@ def _prepare_hyperparams(
751761
hyperparams["thinking"] = {"type": "disabled"}
752762
return
753763
elif ctx.inference_provider == CloudProvider.OPENAI:
754-
if "gpt-5.1" in ctx.routing_id:
755-
# gpt-5.1: Supported values are: 'none', 'low', 'medium', and 'high'.
764+
if (
765+
"gpt-5.1" in ctx.routing_id
766+
or "gpt-5.2" in ctx.routing_id
767+
or "gpt-5.4" in ctx.routing_id
768+
):
769+
# gpt-5.1/2/4: Supported values are: 'none', 'low', 'medium', and 'high'.
756770
hyperparams["reasoning"] = {
757771
"effort": "none",
758772
"summary": reasoning_summary,
@@ -776,13 +790,6 @@ def _prepare_hyperparams(
776790
"summary": reasoning_summary,
777791
}
778792
return
779-
elif ctx.inference_provider in (
780-
OnPremProvider.VLLM,
781-
OnPremProvider.VLLM_AMD,
782-
CloudProvider.VLLM_CLOUD,
783-
):
784-
hyperparams["extra_body"] = {"chat_template_kwargs": {"enable_thinking": False}}
785-
return
786793
logger.warning(
787794
(
788795
f'Disabling reasoning is not supported for model "{self.config.id}" '
@@ -807,7 +814,7 @@ def _prepare_hyperparams(
807814
elif ctx.inference_provider in [CloudProvider.GEMINI, CloudProvider.ANTHROPIC]:
808815
# Gemini 3-Pro recommends reasoning_effort
809816
# https://ai.google.dev/gemini-api/docs/openai
810-
if "3-pro" in ctx.routing_id:
817+
if "3-pro" in ctx.routing_id or "3.1-pro" in ctx.routing_id:
811818
hyperparams["reasoning_effort"] = (
812819
"high" if reasoning_effort == "high" else "low"
813820
)

services/api/tests/utils/test_lm.py

Lines changed: 70 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from types import SimpleNamespace
22

3-
from owl.types import CloudProvider, ModelProvider
4-
from owl.utils.lm import DeploymentRouter
3+
from owl.types import CloudProvider, ModelProvider, OnPremProvider
4+
from owl.utils.lm import DeploymentContext, DeploymentRouter
55

66

77
def _make_router(*, owned_by: str = "openai") -> DeploymentRouter:
@@ -10,6 +10,26 @@ def _make_router(*, owned_by: str = "openai") -> DeploymentRouter:
1010
return router
1111

1212

13+
def _make_ellm_context(*, is_reasoning_model: bool = True) -> DeploymentContext:
14+
return DeploymentContext(
15+
deployment=SimpleNamespace(provider=CloudProvider.ELLM),
16+
api_key="dummy",
17+
routing_id="Qwen/Qwen3.5-35B-A3B",
18+
inference_provider=CloudProvider.ELLM,
19+
is_reasoning_model=is_reasoning_model,
20+
)
21+
22+
23+
def _make_vllm_context(*, is_reasoning_model: bool = True) -> DeploymentContext:
24+
return DeploymentContext(
25+
deployment=SimpleNamespace(provider=OnPremProvider.VLLM),
26+
api_key="dummy",
27+
routing_id="Qwen/Qwen3.5-35B-A3B",
28+
inference_provider=OnPremProvider.VLLM,
29+
is_reasoning_model=is_reasoning_model,
30+
)
31+
32+
1333
def test_inference_provider_should_prefer_vllm_cloud_over_owned_by() -> None:
1434
router = _make_router()
1535

@@ -28,3 +48,51 @@ def test_inference_provider_should_use_owned_by_for_azure_openai() -> None:
2848
router = _make_router()
2949

3050
assert router._inference_provider(CloudProvider.AZURE, "openai") == ModelProvider.OPENAI
51+
52+
53+
def test_ellm_default_disables_reasoning() -> None:
54+
router = _make_router()
55+
ctx = _make_ellm_context()
56+
hyperparams: dict[str, object] = {}
57+
58+
router._prepare_hyperparams(ctx, hyperparams)
59+
60+
assert hyperparams["reasoning_effort"] == "disable"
61+
assert hyperparams["allowed_openai_params"] == ["reasoning_effort"]
62+
63+
64+
def test_ellm_explicitly_disable_reasoning() -> None:
65+
router = _make_router()
66+
ctx = _make_ellm_context()
67+
hyperparams: dict[str, object] = {"reasoning_effort": "disable"}
68+
69+
router._prepare_hyperparams(ctx, hyperparams)
70+
71+
assert hyperparams["reasoning_effort"] == "disable"
72+
assert hyperparams["allowed_openai_params"] == ["reasoning_effort"]
73+
74+
75+
def test_vllm_default_does_not_disable_thinking() -> None:
76+
router = _make_router()
77+
ctx = _make_vllm_context()
78+
hyperparams: dict[str, object] = {}
79+
80+
router._prepare_hyperparams(ctx, hyperparams)
81+
82+
assert "extra_body" not in hyperparams
83+
84+
85+
def test_vllm_explicitly_disable_thinking() -> None:
86+
router = _make_router()
87+
ctx = _make_vllm_context()
88+
hyperparams: dict[str, object] = {"reasoning_effort": "disable"}
89+
90+
router._prepare_hyperparams(ctx, hyperparams)
91+
92+
assert hyperparams["extra_body"] == {"chat_template_kwargs": {"enable_thinking": False}}
93+
94+
hyperparams = {"reasoning_effort": "none"}
95+
96+
router._prepare_hyperparams(ctx, hyperparams)
97+
98+
assert hyperparams["extra_body"] == {"chat_template_kwargs": {"enable_thinking": False}}

0 commit comments

Comments (0)