Skip to content

Commit 793a7fd

Browse files
authored
Merge pull request BerriAI#20333 from BerriAI/litellm_tuesday_cicd_release_final
Litellm tuesday cicd release final
2 parents 80acd4c + 21e95c7 commit 793a7fd

File tree

15 files changed

+43
-463
lines changed

15 files changed

+43
-463
lines changed

ci_cd/security_scans.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ run_grype_scans() {
154154
"CVE-2025-15367" # No fix available yet
155155
"CVE-2025-12781" # No fix available yet
156156
"CVE-2025-11468" # No fix available yet
157+
"CVE-2026-1299" # Python 3.13 email module header injection - not applicable, LiteLLM doesn't use BytesGenerator for email serialization
157158
)
158159

159160
# Build JSON array of allowlisted CVE IDs for jq

docs/my-website/docs/proxy/config_settings.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -545,6 +545,9 @@ router_settings:
545545
| DEFAULT_MAX_TOKENS | Default maximum tokens for LLM calls. Default is 4096
546546
| DEFAULT_MAX_TOKENS_FOR_TRITON | Default maximum tokens for Triton models. Default is 2000
547547
| DEFAULT_MAX_REDIS_BATCH_CACHE_SIZE | Default maximum size for redis batch cache. Default is 1000
548+
| DEFAULT_MCP_SEMANTIC_FILTER_EMBEDDING_MODEL | Default embedding model for MCP semantic tool filtering. Default is "text-embedding-3-small"
549+
| DEFAULT_MCP_SEMANTIC_FILTER_SIMILARITY_THRESHOLD | Default similarity threshold for MCP semantic tool filtering. Default is 0.3
550+
| DEFAULT_MCP_SEMANTIC_FILTER_TOP_K | Default number of top results to return for MCP semantic tool filtering. Default is 10
548551
| DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT | Default token count for mock response completions. Default is 20
549552
| DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT | Default token count for mock response prompts. Default is 10
550553
| DEFAULT_MODEL_CREATED_AT_TIME | Default creation timestamp for models. Default is 1677610602
@@ -802,6 +805,7 @@ router_settings:
802805
| MAXIMUM_TRACEBACK_LINES_TO_LOG | Maximum number of lines to log in traceback in LiteLLM Logs UI. Default is 100
803806
| MAX_RETRY_DELAY | Maximum delay in seconds for retrying requests. Default is 8.0
804807
| MAX_LANGFUSE_INITIALIZED_CLIENTS | Maximum number of Langfuse clients to initialize on proxy. Default is 50. This is set since Langfuse initializes 1 thread every time a client is initialized. We've had an incident in the past where we reached 100% CPU utilization because Langfuse was initialized several times.
808+
| MAX_MCP_SEMANTIC_FILTER_TOOLS_HEADER_LENGTH | Maximum header length for MCP semantic filter tools. Default is 150
805809
| MIN_NON_ZERO_TEMPERATURE | Minimum non-zero temperature value. Default is 0.0001
806810
| MINIMUM_PROMPT_CACHE_TOKEN_COUNT | Minimum token count for caching a prompt. Default is 1024
807811
| MISTRAL_API_BASE | Base URL for Mistral API. Default is https://api.mistral.ai

litellm-proxy-extras/litellm_proxy_extras/migrations/20260129103648_add_verificationtoken_indexes/migration.sql

Lines changed: 0 additions & 8 deletions
This file was deleted.

litellm-proxy-extras/litellm_proxy_extras/schema.prisma

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -305,16 +305,6 @@ model LiteLLM_VerificationToken {
305305
litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id])
306306
litellm_organization_table LiteLLM_OrganizationTable? @relation(fields: [organization_id], references: [organization_id])
307307
object_permission LiteLLM_ObjectPermissionTable? @relation(fields: [object_permission_id], references: [object_permission_id])
308-
309-
// SELECT COUNT(*) FROM (SELECT "public"."LiteLLM_VerificationToken"."token" FROM "public"."LiteLLM_VerificationToken" WHERE ("public"."LiteLLM_VerificationToken"."user_id" = $1 AND ("public"."LiteLLM_VerificationToken"."team_id" IS NULL OR "public"."LiteLLM_VerificationToken"."team_id" <> $2)) OFFSET $3 ) AS "sub"
310-
// SELECT ... FROM "public"."LiteLLM_VerificationToken" WHERE "public"."LiteLLM_VerificationToken"."user_id" = $1 OFFSET $2
311-
@@index([user_id, team_id])
312-
313-
// SELECT ... FROM "public"."LiteLLM_VerificationToken" WHERE "public"."LiteLLM_VerificationToken"."team_id" = $1 OFFSET $2
314-
@@index([team_id])
315-
316-
// SELECT ... FROM "public"."LiteLLM_VerificationToken" WHERE (("public"."LiteLLM_VerificationToken"."expires" IS NULL OR "public"."LiteLLM_VerificationToken"."expires" > $1) AND "public"."LiteLLM_VerificationToken"."budget_reset_at" < $2) OFFSET $3
317-
@@index([budget_reset_at, expires])
318308
}
319309

320310
// Audit table for deleted keys - preserves spend and key information for historical tracking

litellm/proxy/auth/model_checks.py

Lines changed: 2 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -64,27 +64,6 @@ def _get_models_from_access_groups(
6464
return all_models
6565

6666

67-
def get_access_groups_from_models(
68-
model_access_groups: Dict[str, List[str]],
69-
models: List[str],
70-
) -> List[str]:
71-
"""
72-
Extract access group names from a models list.
73-
74-
Given a models list like ["gpt-4", "beta-models", "claude-v1"]
75-
and access groups like {"beta-models": ["gpt-5", "gpt-6"]},
76-
returns ["beta-models"].
77-
78-
This is used to pass allowed access groups to the router for filtering
79-
deployments during load balancing (GitHub issue #18333).
80-
"""
81-
access_groups = []
82-
for model in models:
83-
if model in model_access_groups:
84-
access_groups.append(model)
85-
return access_groups
86-
87-
8867
async def get_mcp_server_ids(
8968
user_api_key_dict: UserAPIKeyAuth,
9069
) -> List[str]:
@@ -101,6 +80,7 @@ async def get_mcp_server_ids(
10180

10281
# Make a direct SQL query to get just the mcp_servers
10382
try:
83+
10484
result = await prisma_client.db.litellm_objectpermissiontable.find_unique(
10585
where={"object_permission_id": user_api_key_dict.object_permission_id},
10686
)
@@ -196,7 +176,6 @@ def get_complete_model_list(
196176
"""
197177

198178
unique_models = []
199-
200179
def append_unique(models):
201180
for model in models:
202181
if model not in unique_models:
@@ -209,7 +188,7 @@ def append_unique(models):
209188
else:
210189
append_unique(proxy_model_list)
211190
if include_model_access_groups:
212-
append_unique(list(model_access_groups.keys())) # TODO: keys order
191+
append_unique(list(model_access_groups.keys())) # TODO: keys order
213192

214193
if user_model:
215194
append_unique([user_model])

litellm/proxy/litellm_pre_call_utils.py

Lines changed: 0 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1021,37 +1021,6 @@ async def add_litellm_data_to_request( # noqa: PLR0915
10211021
"user_api_key_user_max_budget"
10221022
] = user_api_key_dict.user_max_budget
10231023

1024-
# Extract allowed access groups for router filtering (GitHub issue #18333)
1025-
# This allows the router to filter deployments based on key's and team's access groups
1026-
# NOTE: We keep key and team access groups SEPARATE because a key doesn't always
1027-
# inherit all team access groups (per maintainer feedback).
1028-
if llm_router is not None:
1029-
from litellm.proxy.auth.model_checks import get_access_groups_from_models
1030-
1031-
model_access_groups = llm_router.get_model_access_groups()
1032-
1033-
# Key-level access groups (from user_api_key_dict.models)
1034-
key_models = list(user_api_key_dict.models) if user_api_key_dict.models else []
1035-
key_allowed_access_groups = get_access_groups_from_models(
1036-
model_access_groups=model_access_groups, models=key_models
1037-
)
1038-
if key_allowed_access_groups:
1039-
data[_metadata_variable_name][
1040-
"user_api_key_allowed_access_groups"
1041-
] = key_allowed_access_groups
1042-
1043-
# Team-level access groups (from user_api_key_dict.team_models)
1044-
team_models = (
1045-
list(user_api_key_dict.team_models) if user_api_key_dict.team_models else []
1046-
)
1047-
team_allowed_access_groups = get_access_groups_from_models(
1048-
model_access_groups=model_access_groups, models=team_models
1049-
)
1050-
if team_allowed_access_groups:
1051-
data[_metadata_variable_name][
1052-
"user_api_key_team_allowed_access_groups"
1053-
] = team_allowed_access_groups
1054-
10551024
data[_metadata_variable_name]["user_api_key_metadata"] = user_api_key_dict.metadata
10561025
_headers = dict(request.headers)
10571026
_headers.pop(

litellm/proxy/schema.prisma

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -305,16 +305,6 @@ model LiteLLM_VerificationToken {
305305
litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id])
306306
litellm_organization_table LiteLLM_OrganizationTable? @relation(fields: [organization_id], references: [organization_id])
307307
object_permission LiteLLM_ObjectPermissionTable? @relation(fields: [object_permission_id], references: [object_permission_id])
308-
309-
// SELECT COUNT(*) FROM (SELECT "public"."LiteLLM_VerificationToken"."token" FROM "public"."LiteLLM_VerificationToken" WHERE ("public"."LiteLLM_VerificationToken"."user_id" = $1 AND ("public"."LiteLLM_VerificationToken"."team_id" IS NULL OR "public"."LiteLLM_VerificationToken"."team_id" <> $2)) OFFSET $3 ) AS "sub"
310-
// SELECT ... FROM "public"."LiteLLM_VerificationToken" WHERE "public"."LiteLLM_VerificationToken"."user_id" = $1 OFFSET $2
311-
@@index([user_id, team_id])
312-
313-
// SELECT ... FROM "public"."LiteLLM_VerificationToken" WHERE "public"."LiteLLM_VerificationToken"."team_id" = $1 OFFSET $2
314-
@@index([team_id])
315-
316-
// SELECT ... FROM "public"."LiteLLM_VerificationToken" WHERE (("public"."LiteLLM_VerificationToken"."expires" IS NULL OR "public"."LiteLLM_VerificationToken"."expires" > $1) AND "public"."LiteLLM_VerificationToken"."budget_reset_at" < $2) OFFSET $3
317-
@@index([budget_reset_at, expires])
318308
}
319309

320310
// Audit table for deleted keys - preserves spend and key information for historical tracking

litellm/router.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,6 @@
8888
is_clientside_credential,
8989
)
9090
from litellm.router_utils.common_utils import (
91-
filter_deployments_by_access_groups,
9291
filter_team_based_models,
9392
filter_web_search_deployments,
9493
)
@@ -8088,17 +8087,10 @@ async def async_get_healthy_deployments(
80888087
request_kwargs=request_kwargs,
80898088
)
80908089

8091-
verbose_router_logger.debug(f"healthy_deployments after web search filter: {healthy_deployments}")
8092-
8093-
# Filter by allowed access groups (GitHub issue #18333)
8094-
# This prevents cross-team load balancing when teams have models with same name in different access groups
8095-
healthy_deployments = filter_deployments_by_access_groups(
8096-
healthy_deployments=healthy_deployments,
8097-
request_kwargs=request_kwargs,
8090+
verbose_router_logger.debug(
8091+
f"healthy_deployments after web search filter: {healthy_deployments}"
80988092
)
80998093

8100-
verbose_router_logger.debug(f"healthy_deployments after access group filter: {healthy_deployments}")
8101-
81028094
if isinstance(healthy_deployments, dict):
81038095
return healthy_deployments
81048096

litellm/router_utils/common_utils.py

Lines changed: 2 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,6 @@ def filter_team_based_models(
7575
if deployment.get("model_info", {}).get("id") not in ids_to_remove
7676
]
7777

78-
7978
def _deployment_supports_web_search(deployment: Dict) -> bool:
8079
"""
8180
Check if a deployment supports web search.
@@ -113,7 +112,7 @@ def filter_web_search_deployments(
113112
is_web_search_request = False
114113
tools = request_kwargs.get("tools") or []
115114
for tool in tools:
116-
# These are the two websearch tools for OpenAI / Azure.
115+
# These are the two websearch tools for OpenAI / Azure.
117116
if tool.get("type") == "web_search" or tool.get("type") == "web_search_preview":
118117
is_web_search_request = True
119118
break
@@ -122,82 +121,8 @@ def filter_web_search_deployments(
122121
return healthy_deployments
123122

124123
# Filter out deployments that don't support web search
125-
final_deployments = [
126-
d for d in healthy_deployments if _deployment_supports_web_search(d)
127-
]
124+
final_deployments = [d for d in healthy_deployments if _deployment_supports_web_search(d)]
128125
if len(healthy_deployments) > 0 and len(final_deployments) == 0:
129126
verbose_logger.warning("No deployments support web search for request")
130127
return final_deployments
131128

132-
133-
def filter_deployments_by_access_groups(
134-
healthy_deployments: Union[List[Dict], Dict],
135-
request_kwargs: Optional[Dict] = None,
136-
) -> Union[List[Dict], Dict]:
137-
"""
138-
Filter deployments to only include those matching the user's allowed access groups.
139-
140-
Reads from TWO separate metadata fields (per maintainer feedback):
141-
- `user_api_key_allowed_access_groups`: Access groups from the API Key's models.
142-
- `user_api_key_team_allowed_access_groups`: Access groups from the Team's models.
143-
144-
A deployment is included if its access_groups overlap with EITHER the key's
145-
or the team's allowed access groups. Deployments with no access_groups are
146-
always included (not restricted).
147-
148-
This prevents cross-team load balancing when multiple teams have models with
149-
the same name but in different access groups (GitHub issue #18333).
150-
"""
151-
if request_kwargs is None:
152-
return healthy_deployments
153-
154-
if isinstance(healthy_deployments, dict):
155-
return healthy_deployments
156-
157-
metadata = request_kwargs.get("metadata") or {}
158-
litellm_metadata = request_kwargs.get("litellm_metadata") or {}
159-
160-
# Gather key-level allowed access groups
161-
key_allowed_access_groups = (
162-
metadata.get("user_api_key_allowed_access_groups")
163-
or litellm_metadata.get("user_api_key_allowed_access_groups")
164-
or []
165-
)
166-
167-
# Gather team-level allowed access groups
168-
team_allowed_access_groups = (
169-
metadata.get("user_api_key_team_allowed_access_groups")
170-
or litellm_metadata.get("user_api_key_team_allowed_access_groups")
171-
or []
172-
)
173-
174-
# Combine both for the final allowed set
175-
combined_allowed_access_groups = list(key_allowed_access_groups) + list(
176-
team_allowed_access_groups
177-
)
178-
179-
# If no access groups specified from either source, return all deployments (backwards compatible)
180-
if not combined_allowed_access_groups:
181-
return healthy_deployments
182-
183-
allowed_set = set(combined_allowed_access_groups)
184-
filtered = []
185-
for deployment in healthy_deployments:
186-
model_info = deployment.get("model_info") or {}
187-
deployment_access_groups = model_info.get("access_groups") or []
188-
189-
# If deployment has no access groups, include it (not restricted)
190-
if not deployment_access_groups:
191-
filtered.append(deployment)
192-
continue
193-
194-
# Include if any of deployment's groups overlap with allowed groups
195-
if set(deployment_access_groups) & allowed_set:
196-
filtered.append(deployment)
197-
198-
if len(healthy_deployments) > 0 and len(filtered) == 0:
199-
verbose_logger.warning(
200-
f"No deployments match allowed access groups {combined_allowed_access_groups}"
201-
)
202-
203-
return filtered

litellm/router_utils/fallback_event_handlers.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -113,16 +113,8 @@ async def run_async_fallback(
113113
The most recent exception if all fallback model groups fail.
114114
"""
115115

116-
### BASE CASE ### MAX FALLBACK DEPTH REACHED
117-
if fallback_depth >= max_fallbacks:
118-
raise original_exception
119-
120-
### CHECK IF MODEL GROUP LIST EXHAUSTED
121-
if original_model_group in fallback_model_group:
122-
fallback_group_length = len(fallback_model_group) - 1
123-
else:
124-
fallback_group_length = len(fallback_model_group)
125-
if fallback_depth >= fallback_group_length:
116+
### BASE CASE ### MAX FALLBACK DEPTH REACHED
117+
if fallback_depth >= max_fallbacks:
126118
raise original_exception
127119

128120
error_from_fallbacks = original_exception

0 commit comments

Comments
 (0)