Skip to content

Commit b6d675f

Browse files
Infer max_output_tokens from model family when metadata is missing
1 parent dd35a5e commit b6d675f

File tree

6 files changed

+66
-5
lines changed

6 files changed

+66
-5
lines changed

backend/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
"""Backend sync worker service."""
22

3-
__version__ = "0.6.30"
3+
__version__ = "0.6.31"

frontend/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
"""Frontend API and UI service."""
22

3-
__version__ = "0.6.30"
3+
__version__ = "0.6.31"

proxy/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.6.30"
1+
__version__ = "0.6.31"

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "litellm-companion"
3-
version = "0.6.30"
3+
version = "0.6.31"
44
description = "Synchronize models from Ollama or OpenAI-compatible endpoints into LiteLLM"
55
authors = [
66
{name = "LiteLLM Companion Authors", email = "dev@example.com"}

shared/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
"""Shared code between backend and frontend services."""
22

3-
__version__ = "0.6.30"
3+
__version__ = "0.6.31"

shared/models.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,60 @@ def _fallback_context_window(model_id: str, context_window: int | None) -> int |
418418
return None
419419

420420

421+
def _fallback_max_output_tokens(
422+
model_id: str,
423+
max_output_tokens: int | None,
424+
max_tokens: int | None,
425+
context_window: int | None,
426+
model_type: str | None,
427+
) -> int | None:
428+
"""Infer max_output_tokens when upstream metadata omits it."""
429+
if max_output_tokens is not None:
430+
return max_output_tokens
431+
432+
# Embedding-style models do not have completion output token budgets.
433+
lowered_type = (model_type or "").lower()
434+
lowered_id = model_id.lower()
435+
if "embedding" in lowered_type or "embed" in lowered_id:
436+
return None
437+
438+
# When a provider exposes max_tokens, use that as the best available bound.
439+
if max_tokens is not None:
440+
return max_tokens
441+
442+
# Family/model heuristics for common chat/completion models in this deployment.
443+
if "gpt-oss-120b" in lowered_id or "gpt-oss:120b" in lowered_id:
444+
return 131072
445+
if "gpt-oss-20b" in lowered_id or "gpt-oss:20b" in lowered_id:
446+
return 65536
447+
if "deepseek-v3" in lowered_id:
448+
return 65536
449+
if "qwen3-coder" in lowered_id:
450+
return 65536
451+
if "qwen3" in lowered_id or "qwen-3" in lowered_id:
452+
return 32768
453+
if "kimi-k2.5" in lowered_id:
454+
return 65536
455+
if "kimi-k2" in lowered_id:
456+
return 32768
457+
if "minimax-m2.5" in lowered_id:
458+
return 65536
459+
if "glm-5" in lowered_id:
460+
return 65536
461+
if "glm-4.7" in lowered_id or "glm-4.6" in lowered_id:
462+
return 32768
463+
if "llama-3.3" in lowered_id:
464+
return 16384
465+
if "llama-3.1" in lowered_id or "llama-3.2" in lowered_id:
466+
return 8192
467+
468+
# Last resort: derive from context window, but keep sane limits.
469+
if context_window is not None:
470+
return max(2048, min(context_window // 2, 65536))
471+
472+
return None
473+
474+
421475
def _extract_tags(raw: dict) -> list[str]:
422476
"""Extract and normalize tags from common payload sections."""
423477

@@ -595,6 +649,13 @@ def from_raw(cls, model_id: str, raw: dict, database_id: str | None = None) -> "
595649
model_type = _extract_model_type(model_id, raw, capabilities)
596650
capabilities = _ensure_capabilities(model_id, capabilities, model_type)
597651
context_window = _fallback_context_window(model_id, context_window)
652+
max_output_tokens = _fallback_max_output_tokens(
653+
model_id=model_id,
654+
max_output_tokens=max_output_tokens,
655+
max_tokens=max_tokens,
656+
context_window=context_window,
657+
model_type=model_type,
658+
)
598659
tags = _extract_tags(raw)
599660

600661
return cls(

0 commit comments

Comments
 (0)