Skip to content

Commit 299cb1f

Browse files
committed
[emu] Fix ellm reasoning_effort param (#912)
* Provider ellm now properly passes the `reasoning_effort` disable value
* Add gpt-5.2/5.4 check
* Add vLLM explicit-disable check
* Add Gemini 3.1 Pro check
* Update preset model config (gpt-5.4 family, Qwen 3.5 MoEs; removed Gemini 3 Pro Preview)
1 parent f6a4d51 commit 299cb1f

File tree

3 files changed

+201
-58
lines changed

3 files changed

+201
-58
lines changed

services/api/src/owl/configs/preset_models.json

Lines changed: 114 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,70 @@
11
[
2+
{
3+
"meta": {
4+
"icon": "openai"
5+
},
6+
"id": "openai/gpt-5.4",
7+
"name": "OpenAI GPT-5.4",
8+
"type": "llm",
9+
"context_length": 1050000,
10+
"max_output_tokens": 128000,
11+
"capabilities": ["chat", "image", "reasoning", "tool"],
12+
"languages": ["en", "mul"],
13+
"llm_input_cost_per_mtoken": 2.5,
14+
"llm_output_cost_per_mtoken": 15.0,
15+
"deployments": [
16+
{
17+
"name": "OpenAI GPT-5.4 Deployment",
18+
"provider": "openai",
19+
"routing_id": "openai/gpt-5.4",
20+
"api_base": ""
21+
}
22+
]
23+
},
24+
{
25+
"meta": {
26+
"icon": "openai"
27+
},
28+
"id": "openai/gpt-5.4-mini",
29+
"name": "OpenAI GPT-5.4 Mini",
30+
"type": "llm",
31+
"context_length": 400000,
32+
"max_output_tokens": 128000,
33+
"capabilities": ["chat", "image", "reasoning", "tool"],
34+
"languages": ["en", "mul"],
35+
"llm_input_cost_per_mtoken": 0.75,
36+
"llm_output_cost_per_mtoken": 4.5,
37+
"deployments": [
38+
{
39+
"name": "OpenAI GPT-5.4 Mini Deployment",
40+
"provider": "openai",
41+
"routing_id": "openai/gpt-5.4-mini",
42+
"api_base": ""
43+
}
44+
]
45+
},
46+
{
47+
"meta": {
48+
"icon": "openai"
49+
},
50+
"id": "openai/gpt-5.4-nano",
51+
"name": "OpenAI GPT-5.4 Nano",
52+
"type": "llm",
53+
"context_length": 400000,
54+
"max_output_tokens": 128000,
55+
"capabilities": ["chat", "image", "reasoning", "tool"],
56+
"languages": ["en", "mul"],
57+
"llm_input_cost_per_mtoken": 0.2,
58+
"llm_output_cost_per_mtoken": 1.25,
59+
"deployments": [
60+
{
61+
"name": "OpenAI GPT-5.4 Nano Deployment",
62+
"provider": "openai",
63+
"routing_id": "openai/gpt-5.4-nano",
64+
"api_base": ""
65+
}
66+
]
67+
},
268
{
369
"meta": {
470
"icon": "openai"
@@ -241,28 +307,6 @@
241307
}
242308
]
243309
},
244-
{
245-
"meta": {
246-
"icon": "google"
247-
},
248-
"id": "google/gemini-3-pro-preview",
249-
"name": "Google Gemini 3 Pro Preview",
250-
"type": "llm",
251-
"context_length": 1048576,
252-
"max_output_tokens": 65536,
253-
"capabilities": ["chat", "image", "reasoning", "tool"],
254-
"languages": ["en", "mul"],
255-
"llm_input_cost_per_mtoken": 4.0,
256-
"llm_output_cost_per_mtoken": 18.0,
257-
"deployments": [
258-
{
259-
"name": "Google Gemini 3 Pro Preview Deployment",
260-
"provider": "gemini",
261-
"routing_id": "gemini/gemini-3-pro-preview",
262-
"api_base": ""
263-
}
264-
]
265-
},
266310
{
267311
"meta": {
268312
"icon": "google"
@@ -285,30 +329,6 @@
285329
}
286330
]
287331
},
288-
{
289-
"meta": {
290-
"icon": "meta"
291-
},
292-
"id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
293-
"name": "Meta Llama 4 Scout (109B-A17B)",
294-
"type": "llm",
295-
"context_length": 262144,
296-
"capabilities": ["chat", "image"],
297-
"languages": ["en", "mul"],
298-
"llm_input_cost_per_mtoken": 0.15,
299-
"llm_output_cost_per_mtoken": 0.5,
300-
"deployments": [
301-
{
302-
"name": "Meta Llama 4 Scout (109B-A17B) Deployment",
303-
"huggingface_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
304-
"cpu_count": "4",
305-
"memory_gb": "24",
306-
"required_vram": "140",
307-
"num_replicas": 1,
308-
"provider": "vllm"
309-
}
310-
]
311-
},
312332
{
313333
"meta": {
314334
"icon": "meta"
@@ -429,6 +449,54 @@
429449
}
430450
]
431451
},
452+
{
453+
"meta": {
454+
"icon": "qwen"
455+
},
456+
"id": "Qwen/Qwen3.5-122B-A10B",
457+
"name": "Qwen 3.5 (122B-A10B)",
458+
"type": "llm",
459+
"context_length": 256000,
460+
"capabilities": ["chat", "image", "reasoning", "tool"],
461+
"languages": ["en", "mul"],
462+
"llm_input_cost_per_mtoken": 0.4,
463+
"llm_output_cost_per_mtoken": 2.0,
464+
"deployments": [
465+
{
466+
"name": "Qwen 3.5 (122B-A10B) Deployment",
467+
"huggingface_id": "Qwen/Qwen3.5-122B-A10B-FP8",
468+
"cpu_count": "8",
469+
"memory_gb": "16",
470+
"required_vram": "150",
471+
"num_replicas": 1,
472+
"provider": "vllm"
473+
}
474+
]
475+
},
476+
{
477+
"meta": {
478+
"icon": "qwen"
479+
},
480+
"id": "Qwen/Qwen3.5-35B-A3B",
481+
"name": "Qwen 3.5 (35B-A3B)",
482+
"type": "llm",
483+
"context_length": 256000,
484+
"capabilities": ["chat", "image", "reasoning", "tool"],
485+
"languages": ["en", "mul"],
486+
"llm_input_cost_per_mtoken": 0.25,
487+
"llm_output_cost_per_mtoken": 0.8,
488+
"deployments": [
489+
{
490+
"name": "Qwen 3.5 (35B-A3B) Deployment",
491+
"huggingface_id": "Qwen/Qwen3.5-35B-A3B-FP8",
492+
"cpu_count": "8",
493+
"memory_gb": "16",
494+
"required_vram": "50",
495+
"num_replicas": 1,
496+
"provider": "vllm"
497+
}
498+
]
499+
},
432500
{
433501
"meta": {
434502
"icon": "qwen"

services/api/src/owl/utils/lm.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -728,6 +728,15 @@ def _prepare_hyperparams(
728728
# Non-reasoning model does not require further processing
729729
if not ctx.is_reasoning_model:
730730
return
731+
# handle vLLM reasoning (only applicable to some models) only disable when explicitly requested
732+
if ctx.inference_provider in (
733+
OnPremProvider.VLLM,
734+
OnPremProvider.VLLM_AMD,
735+
CloudProvider.VLLM_CLOUD,
736+
):
737+
if reasoning_effort in ("disable", "none"):
738+
hyperparams["extra_body"] = {"chat_template_kwargs": {"enable_thinking": False}}
739+
return
731740
# Disable reasoning if requested
732741
if (
733742
reasoning_effort in ("disable", "minimal", "none")
@@ -736,6 +745,7 @@ def _prepare_hyperparams(
736745
):
737746
if ctx.inference_provider == CloudProvider.ELLM:
738747
hyperparams["reasoning_effort"] = "disable"
748+
hyperparams["allowed_openai_params"] = ["reasoning_effort"]
739749
return
740750
elif ctx.inference_provider == CloudProvider.GEMINI:
741751
# 3/3.1-Pro cannot disable thinking
@@ -751,8 +761,12 @@ def _prepare_hyperparams(
751761
hyperparams["thinking"] = {"type": "disabled"}
752762
return
753763
elif ctx.inference_provider == CloudProvider.OPENAI:
754-
if "gpt-5.1" in ctx.routing_id:
755-
# gpt-5.1: Supported values are: 'none', 'low', 'medium', and 'high'.
764+
if (
765+
"gpt-5.1" in ctx.routing_id
766+
or "gpt-5.2" in ctx.routing_id
767+
or "gpt-5.4" in ctx.routing_id
768+
):
769+
# gpt-5.1/2/4: Supported values are: 'none', 'low', 'medium', and 'high'.
756770
hyperparams["reasoning"] = {
757771
"effort": "none",
758772
"summary": reasoning_summary,
@@ -776,13 +790,6 @@ def _prepare_hyperparams(
776790
"summary": reasoning_summary,
777791
}
778792
return
779-
elif ctx.inference_provider in (
780-
OnPremProvider.VLLM,
781-
OnPremProvider.VLLM_AMD,
782-
CloudProvider.VLLM_CLOUD,
783-
):
784-
hyperparams["extra_body"] = {"chat_template_kwargs": {"enable_thinking": False}}
785-
return
786793
logger.warning(
787794
(
788795
f'Disabling reasoning is not supported for model "{self.config.id}" '
@@ -807,7 +814,7 @@ def _prepare_hyperparams(
807814
elif ctx.inference_provider in [CloudProvider.GEMINI, CloudProvider.ANTHROPIC]:
808815
# Gemini 3-Pro recommends reasoning_effort
809816
# https://ai.google.dev/gemini-api/docs/openai
810-
if "3-pro" in ctx.routing_id:
817+
if "3-pro" in ctx.routing_id or "3.1-pro" in ctx.routing_id:
811818
hyperparams["reasoning_effort"] = (
812819
"high" if reasoning_effort == "high" else "low"
813820
)

services/api/tests/utils/test_lm.py

Lines changed: 70 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from types import SimpleNamespace
22

3-
from owl.types import CloudProvider, ModelProvider
4-
from owl.utils.lm import DeploymentRouter
3+
from owl.types import CloudProvider, ModelProvider, OnPremProvider
4+
from owl.utils.lm import DeploymentContext, DeploymentRouter
55

66

77
def _make_router(*, owned_by: str = "openai") -> DeploymentRouter:
@@ -10,6 +10,26 @@ def _make_router(*, owned_by: str = "openai") -> DeploymentRouter:
1010
return router
1111

1212

13+
def _make_ellm_context(*, is_reasoning_model: bool = True) -> DeploymentContext:
14+
return DeploymentContext(
15+
deployment=SimpleNamespace(provider=CloudProvider.ELLM),
16+
api_key="dummy",
17+
routing_id="Qwen/Qwen3.5-35B-A3B",
18+
inference_provider=CloudProvider.ELLM,
19+
is_reasoning_model=is_reasoning_model,
20+
)
21+
22+
23+
def _make_vllm_context(*, is_reasoning_model: bool = True) -> DeploymentContext:
24+
return DeploymentContext(
25+
deployment=SimpleNamespace(provider=OnPremProvider.VLLM),
26+
api_key="dummy",
27+
routing_id="Qwen/Qwen3.5-35B-A3B",
28+
inference_provider=OnPremProvider.VLLM,
29+
is_reasoning_model=is_reasoning_model,
30+
)
31+
32+
1333
def test_inference_provider_should_prefer_vllm_cloud_over_owned_by() -> None:
1434
router = _make_router()
1535

@@ -28,3 +48,51 @@ def test_inference_provider_should_use_owned_by_for_azure_openai() -> None:
2848
router = _make_router()
2949

3050
assert router._inference_provider(CloudProvider.AZURE, "openai") == ModelProvider.OPENAI
51+
52+
53+
def test_ellm_default_disables_reasoning() -> None:
54+
router = _make_router()
55+
ctx = _make_ellm_context()
56+
hyperparams: dict[str, object] = {}
57+
58+
router._prepare_hyperparams(ctx, hyperparams)
59+
60+
assert hyperparams["reasoning_effort"] == "disable"
61+
assert hyperparams["allowed_openai_params"] == ["reasoning_effort"]
62+
63+
64+
def test_ellm_explicitly_disable_reasoning() -> None:
65+
router = _make_router()
66+
ctx = _make_ellm_context()
67+
hyperparams: dict[str, object] = {"reasoning_effort": "disable"}
68+
69+
router._prepare_hyperparams(ctx, hyperparams)
70+
71+
assert hyperparams["reasoning_effort"] == "disable"
72+
assert hyperparams["allowed_openai_params"] == ["reasoning_effort"]
73+
74+
75+
def test_vllm_default_does_not_disable_thinking() -> None:
76+
router = _make_router()
77+
ctx = _make_vllm_context()
78+
hyperparams: dict[str, object] = {}
79+
80+
router._prepare_hyperparams(ctx, hyperparams)
81+
82+
assert "extra_body" not in hyperparams
83+
84+
85+
def test_vllm_explicitly_disable_thinking() -> None:
86+
router = _make_router()
87+
ctx = _make_vllm_context()
88+
hyperparams: dict[str, object] = {"reasoning_effort": "disable"}
89+
90+
router._prepare_hyperparams(ctx, hyperparams)
91+
92+
assert hyperparams["extra_body"] == {"chat_template_kwargs": {"enable_thinking": False}}
93+
94+
hyperparams = {"reasoning_effort": "none"}
95+
96+
router._prepare_hyperparams(ctx, hyperparams)
97+
98+
assert hyperparams["extra_body"] == {"chat_template_kwargs": {"enable_thinking": False}}

0 commit comments

Comments (0)