Commit 663dbc6

Merge pull request #14477 from BerriAI/litellm_dev_09_11_2025_p2
`/v1/messages` - don't send content block after message w/ finish reason + usage block + `/key/unblock` - support hashed tokens
2 parents 8b3b943 + 0c8b311 commit 663dbc6

7 files changed (+660, -131 lines)

litellm/constants.py

Lines changed: 7 additions & 3 deletions
@@ -15,7 +15,7 @@
     os.getenv("DEFAULT_SQS_FLUSH_INTERVAL_SECONDS", 10)
 )
 DEFAULT_NUM_WORKERS_LITELLM_PROXY = int(
-    os.getenv("DEFAULT_NUM_WORKERS_LITELLM_PROXY", os.cpu_count() or 4)
+    os.getenv("DEFAULT_NUM_WORKERS_LITELLM_PROXY", 1)
 )
 DEFAULT_SQS_BATCH_SIZE = int(os.getenv("DEFAULT_SQS_BATCH_SIZE", 512))
 SQS_SEND_MESSAGE_ACTION = "SendMessage"
@@ -60,7 +60,9 @@
     os.getenv("DEFAULT_REASONING_EFFORT_MINIMAL_THINKING_BUDGET_GEMINI_2_5_PRO", 128)
 )
 DEFAULT_REASONING_EFFORT_MINIMAL_THINKING_BUDGET_GEMINI_2_5_FLASH_LITE = int(
-    os.getenv("DEFAULT_REASONING_EFFORT_MINIMAL_THINKING_BUDGET_GEMINI_2_5_FLASH_LITE", 512)
+    os.getenv(
+        "DEFAULT_REASONING_EFFORT_MINIMAL_THINKING_BUDGET_GEMINI_2_5_FLASH_LITE", 512
+    )
 )
 
 # Generic fallback for unknown models
@@ -949,7 +951,9 @@
 DB_SPEND_UPDATE_JOB_NAME = "db_spend_update_job"
 PROMETHEUS_EMIT_BUDGET_METRICS_JOB_NAME = "prometheus_emit_budget_metrics"
 CLOUDZERO_EXPORT_USAGE_DATA_JOB_NAME = "cloudzero_export_usage_data"
-CLOUDZERO_MAX_FETCHED_DATA_RECORDS = int(os.getenv("CLOUDZERO_MAX_FETCHED_DATA_RECORDS", 50000))
+CLOUDZERO_MAX_FETCHED_DATA_RECORDS = int(
+    os.getenv("CLOUDZERO_MAX_FETCHED_DATA_RECORDS", 50000)
+)
 SPEND_LOG_CLEANUP_JOB_NAME = "spend_log_cleanup"
 SPEND_LOG_RUN_LOOPS = int(os.getenv("SPEND_LOG_RUN_LOOPS", 500))
 SPEND_LOG_CLEANUP_BATCH_SIZE = int(os.getenv("SPEND_LOG_CLEANUP_BATCH_SIZE", 1000))
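
For reference, every constant above follows the same pattern: read an environment variable, fall back to a literal default, and coerce to int, so the new single-worker default can still be overridden at deploy time. A minimal standalone sketch of that resolution (the helper name is illustrative, not LiteLLM's API):

import os

def resolve_int_setting(name: str, default: int) -> int:
    # Mirrors the os.getenv(...) pattern used throughout litellm/constants.py.
    return int(os.getenv(name, default))

# With nothing exported, the proxy now defaults to a single worker;
# export DEFAULT_NUM_WORKERS_LITELLM_PROXY to raise it.
workers = resolve_int_setting("DEFAULT_NUM_WORKERS_LITELLM_PROXY", 1)
print(workers)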

litellm/llms/anthropic/experimental_pass_through/adapters/streaming_iterator.py

Lines changed: 71 additions & 64 deletions
@@ -28,17 +28,14 @@ class AnthropicStreamWrapper(AdapterCompletionStreamWrapper):
         TextBlock,
     )
 
-    def __init__(self, completion_stream: Any, model: str):
-        super().__init__(completion_stream)
-        self.model = model
-
     sent_first_chunk: bool = False
     sent_content_block_start: bool = False
     sent_content_block_finish: bool = False
     current_content_block_type: Literal["text", "tool_use"] = "text"
     sent_last_message: bool = False
     holding_chunk: Optional[Any] = None
     holding_stop_reason_chunk: Optional[Any] = None
+    queued_usage_chunk: bool = False
     current_content_block_index: int = 0
     current_content_block_start: ContentBlockContentBlockDict = TextBlock(
         type="text",
@@ -47,6 +44,10 @@ def __init__(self, completion_stream: Any, model: str):
     pending_new_content_block: bool = False
     chunk_queue: deque = deque()  # Queue for buffering multiple chunks
 
+    def __init__(self, completion_stream: Any, model: str):
+        super().__init__(completion_stream)
+        self.model = model
+
     def __next__(self):
         from .transformation import LiteLLMAnthropicMessagesAdapter
 
@@ -217,77 +218,83 @@ async def __anext__(self):  # noqa: PLR0915
 
                     # Queue the merged chunk and reset
                     self.chunk_queue.append(merged_chunk)
+                    self.queued_usage_chunk = True
                     self.holding_stop_reason_chunk = None
                     return self.chunk_queue.popleft()
 
                 # Check if this processed chunk has a stop_reason - hold it for next chunk
 
-                if should_start_new_block and not self.sent_content_block_finish:
-                    # Queue the sequence: content_block_stop -> content_block_start -> current_chunk
-
-                    # 1. Stop current content block
-                    self.chunk_queue.append(
-                        {
-                            "type": "content_block_stop",
-                            "index": max(self.current_content_block_index - 1, 0),
-                        }
-                    )
-
-                    # 2. Start new content block
-                    self.chunk_queue.append(
-                        {
-                            "type": "content_block_start",
-                            "index": self.current_content_block_index,
-                            "content_block": self.current_content_block_start,
-                        }
-                    )
-
-                    # 3. Queue the current chunk (don't lose it!)
-                    self.chunk_queue.append(processed_chunk)
-
-                    # Reset state for new block
-                    self.sent_content_block_finish = False
-
-                    # Return the first queued item
-                    return self.chunk_queue.popleft()
-
-                if (
-                    processed_chunk["type"] == "message_delta"
-                    and self.sent_content_block_finish is False
-                ):
-                    # Queue both the content_block_stop and the holding chunk
-                    self.chunk_queue.append(
-                        {
-                            "type": "content_block_stop",
-                            "index": self.current_content_block_index,
-                        }
-                    )
-                    self.sent_content_block_finish = True
-                    if processed_chunk.get("delta", {}).get("stop_reason") is not None:
-
-                        self.holding_stop_reason_chunk = processed_chunk
-                    else:
-                        self.chunk_queue.append(processed_chunk)
-                    return self.chunk_queue.popleft()
-                elif self.holding_chunk is not None:
-                    # Queue both chunks
-                    self.chunk_queue.append(self.holding_chunk)
-                    self.chunk_queue.append(processed_chunk)
-                    self.holding_chunk = None
-                    return self.chunk_queue.popleft()
-                else:
-                    # Queue the current chunk
-                    self.chunk_queue.append(processed_chunk)
-                    return self.chunk_queue.popleft()
+                if not self.queued_usage_chunk:
+                    if should_start_new_block and not self.sent_content_block_finish:
+                        # Queue the sequence: content_block_stop -> content_block_start -> current_chunk
+
+                        # 1. Stop current content block
+                        self.chunk_queue.append(
+                            {
+                                "type": "content_block_stop",
+                                "index": max(self.current_content_block_index - 1, 0),
+                            }
+                        )
+
+                        # 2. Start new content block
+                        self.chunk_queue.append(
+                            {
+                                "type": "content_block_start",
+                                "index": self.current_content_block_index,
+                                "content_block": self.current_content_block_start,
+                            }
+                        )
+
+                        # 3. Queue the current chunk (don't lose it!)
+                        self.chunk_queue.append(processed_chunk)
+
+                        # Reset state for new block
+                        self.sent_content_block_finish = False
+
+                        # Return the first queued item
+                        return self.chunk_queue.popleft()
+
+                    if (
+                        processed_chunk["type"] == "message_delta"
+                        and self.sent_content_block_finish is False
+                    ):
+                        # Queue both the content_block_stop and the holding chunk
+                        self.chunk_queue.append(
+                            {
+                                "type": "content_block_stop",
+                                "index": self.current_content_block_index,
+                            }
+                        )
+                        self.sent_content_block_finish = True
+                        if (
+                            processed_chunk.get("delta", {}).get("stop_reason")
+                            is not None
+                        ):
+                            self.holding_stop_reason_chunk = processed_chunk
+                        else:
+                            self.chunk_queue.append(processed_chunk)
+                        return self.chunk_queue.popleft()
+                    elif self.holding_chunk is not None:
+                        # Queue both chunks
+                        self.chunk_queue.append(self.holding_chunk)
+                        self.chunk_queue.append(processed_chunk)
+                        self.holding_chunk = None
+                        return self.chunk_queue.popleft()
+                    else:
+                        # Queue the current chunk
+                        self.chunk_queue.append(processed_chunk)
+                        return self.chunk_queue.popleft()
 
             # Handle any remaining held chunks after stream ends
-            if self.holding_stop_reason_chunk is not None:
-                self.chunk_queue.append(self.holding_stop_reason_chunk)
-                self.holding_stop_reason_chunk = None
+            if not self.queued_usage_chunk:
+                if self.holding_stop_reason_chunk is not None:
+                    self.chunk_queue.append(self.holding_stop_reason_chunk)
+                    self.holding_stop_reason_chunk = None
 
-            if self.holding_chunk is not None:
-                self.chunk_queue.append(self.holding_chunk)
-                self.holding_chunk = None
+                if self.holding_chunk is not None:
+                    self.chunk_queue.append(self.holding_chunk)
+                    self.holding_chunk = None
 
             if not self.sent_last_message:
                 self.sent_last_message = True
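
The net effect of the new `queued_usage_chunk` flag: once the adapter has queued the merged `message_delta` that carries the finish reason and the usage block, any further `content_block_*` events are suppressed, so a client never sees a content block start after the message has already reported its stop reason. A toy, self-contained sketch of that gating idea (not the adapter's API; event shapes are simplified):

from collections import deque
from typing import Any, Dict, Iterable, List

def gate_events(events: Iterable[Dict[str, Any]]) -> List[Dict[str, Any]]:
    # Illustrative only: drop content_block_* events once a message_delta
    # carrying stop_reason + usage has already been emitted.
    out: deque = deque()
    queued_usage_chunk = False
    for event in events:
        if queued_usage_chunk and event["type"].startswith("content_block"):
            continue  # no content blocks after the finish-reason + usage delta
        out.append(event)
        if event["type"] == "message_delta" and event.get("usage") is not None:
            queued_usage_chunk = True
    return list(out)

sample = [
    {"type": "content_block_start", "index": 0},
    {"type": "content_block_delta", "index": 0},
    {"type": "content_block_stop", "index": 0},
    {"type": "message_delta", "delta": {"stop_reason": "end_turn"}, "usage": {"output_tokens": 5}},
    {"type": "content_block_start", "index": 1},  # suppressed by the gate
    {"type": "message_stop"},
]
print([e["type"] for e in gate_events(sample)])
# ['content_block_start', 'content_block_delta', 'content_block_stop', 'message_delta', 'message_stop']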

litellm/proxy/_new_secret_config.yaml

Lines changed: 3 additions & 0 deletions
@@ -7,3 +7,6 @@ model_list:
   - model_name: wildcard_models/*
     litellm_params:
       model: openai/*
+  - model_name: xai-grok-3
+    litellm_params:
+      model: xai/grok-3
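
The added entry exposes `xai/grok-3` under the alias `xai-grok-3`. A usage sketch against a proxy started with this config, using the standard OpenAI client (the base URL, port, and virtual key below are assumptions for a local setup):

from openai import OpenAI

# Assumes a locally running LiteLLM proxy on the default port with a valid virtual key.
client = OpenAI(base_url="http://localhost:4000", api_key="sk-1234")

resp = client.chat.completions.create(
    model="xai-grok-3",  # the alias defined in model_list above
    messages=[{"role": "user", "content": "Hello from the proxy"}],
)
print(resp.choices[0].message.content)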

litellm/proxy/hooks/parallel_request_limiter_v3.py

Lines changed: 25 additions & 17 deletions
@@ -6,6 +6,7 @@
 
 import os
 from datetime import datetime
+from math import floor
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -17,7 +18,7 @@
     Union,
     cast,
 )
-from math import floor
+
 from fastapi import HTTPException
 
 from litellm import DualCache
@@ -95,6 +96,7 @@
     return results
     """
 
+
 class RateLimitDescriptorRateLimitObject(TypedDict, total=False):
     requests_per_unit: Optional[int]
     tokens_per_unit: Optional[int]
@@ -480,10 +482,15 @@ async def async_pre_call_hook(
                 },
             )
         )
-        
+
         # Team Member rate limits
-        if user_api_key_dict.user_id and (user_api_key_dict.team_member_rpm_limit is not None or user_api_key_dict.team_member_tpm_limit is not None):
-            team_member_value = f"{user_api_key_dict.team_id}:{user_api_key_dict.user_id}"
+        if user_api_key_dict.user_id and (
+            user_api_key_dict.team_member_rpm_limit is not None
+            or user_api_key_dict.team_member_tpm_limit is not None
+        ):
+            team_member_value = (
+                f"{user_api_key_dict.team_id}:{user_api_key_dict.user_id}"
+            )
             descriptors.append(
                 RateLimitDescriptor(
                     key="team_member",
@@ -557,13 +564,13 @@ async def async_pre_call_hook(
         # Find which descriptor hit the limit
         for i, status in enumerate(response["statuses"]):
             if status["code"] == "OVER_LIMIT":
-                descriptor = descriptors[floor(i/2)]
+                descriptor = descriptors[floor(i / 2)]
                 raise HTTPException(
                     status_code=429,
                     detail=f"Rate limit exceeded for {descriptor['key']}: {descriptor['value']}. Remaining: {status['limit_remaining']}",
                     headers={
                         "retry-after": str(self.window_size),
-                        "rate_limit_type": str(status["rate_limit_type"])
+                        "rate_limit_type": str(status["rate_limit_type"]),
                     },  # Retry after 1 minute
                 )
 
@@ -613,7 +620,9 @@ async def async_increment_tokens_with_ttl_preservation(
 
         # Check if script is available
        if self.token_increment_script is None:
-            verbose_proxy_logger.debug("TTL preservation script not available, using regular pipeline")
+            verbose_proxy_logger.debug(
+                "TTL preservation script not available, using regular pipeline"
+            )
             await self.internal_usage_cache.dual_cache.async_increment_cache_pipeline(
                 increment_list=pipeline_operations,
                 litellm_parent_otel_span=parent_otel_span,
@@ -628,7 +637,7 @@ async def async_increment_tokens_with_ttl_preservation(
         for op in pipeline_operations:
             # Convert None TTL to 0 for Lua script
             ttl_value = op["ttl"] if op["ttl"] is not None else 0
-            
+
             verbose_proxy_logger.debug(
                 f"Executing TTL-preserving increment for key={op['key']}, "
                 f"increment={op['increment_value']}, ttl={ttl_value}"
@@ -693,16 +702,15 @@ async def async_log_success_event(self, kwargs, response_obj, start_time, end_ti
         )
 
         # Get metadata from kwargs
-        user_api_key = kwargs["litellm_params"]["metadata"].get("user_api_key")
-        user_api_key_user_id = kwargs["litellm_params"]["metadata"].get(
-            "user_api_key_user_id"
-        )
-        user_api_key_team_id = kwargs["litellm_params"]["metadata"].get(
-            "user_api_key_team_id"
-        )
-        user_api_key_end_user_id = kwargs.get("user") or kwargs["litellm_params"][
-            "metadata"
-        ].get("user_api_key_end_user_id")
+        litellm_metadata = kwargs["litellm_params"]["metadata"]
+        if litellm_metadata is None:
+            return
+        user_api_key = litellm_metadata.get("user_api_key")
+        user_api_key_user_id = litellm_metadata.get("user_api_key_user_id")
+        user_api_key_team_id = litellm_metadata.get("user_api_key_team_id")
+        user_api_key_end_user_id = kwargs.get("user") or litellm_metadata.get(
+            "user_api_key_end_user_id"
+        )
         model_group = get_model_group_from_litellm_kwargs(kwargs)
 
         # Get total tokens from response
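
One detail worth spelling out from the hunk above: the limiter emits two statuses per descriptor (one for requests_per_unit, one for tokens_per_unit), so when status `i` reports OVER_LIMIT, the owning descriptor is `floor(i / 2)`. A small illustrative check of that mapping (the descriptor names here are hypothetical):

from math import floor

# Each rate-limit descriptor yields two statuses: requests, then tokens.
descriptors = ["api_key", "team_member", "model_per_key"]
statuses = [f"{d}:{kind}" for d in descriptors for kind in ("requests", "tokens")]

for i in range(len(statuses)):
    assert statuses[i].startswith(descriptors[floor(i / 2)])
print("status 3 ->", descriptors[floor(3 / 2)])  # team_member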

litellm/proxy/management_endpoints/key_management_endpoints.py

Lines changed: 9 additions & 4 deletions
@@ -346,6 +346,7 @@ def handle_key_type(data: GenerateKeyRequest, data_json: dict) -> dict:
         data_json["allowed_routes"] = ["info_routes"]
     return data_json
 
+
 async def validate_team_id_used_in_service_account_request(
     team_id: Optional[str],
     prisma_client: Optional[PrismaClient],
@@ -358,13 +359,13 @@ async def validate_team_id_used_in_service_account_request(
             status_code=400,
             detail="team_id is required for service account keys. Please specify `team_id` in the request body.",
         )
-    
+
     if prisma_client is None:
         raise HTTPException(
             status_code=400,
             detail="prisma_client is required for service account keys. Please specify `prisma_client` in the request body.",
         )
-    
+
     # check if team_id exists in the database
     team = await prisma_client.db.litellm_teamtable.find_unique(
         where={"team_id": team_id},
@@ -376,6 +377,7 @@
         )
     return True
 
+
 async def _common_key_generation_helper(  # noqa: PLR0915
     data: GenerateKeyRequest,
     user_api_key_dict: UserAPIKeyAuth,
@@ -557,7 +559,7 @@ async def _common_key_generation_helper(  # noqa: PLR0915
                 status_code=400,
                 detail={
                     "error": f"Invalid key format. LiteLLM Virtual Key must start with 'sk-'. Received: {data.key}"
-                }
+                },
             )
 
     response = await generate_key_helper_fn(
@@ -2885,7 +2887,10 @@ async def unblock_key(
             param="key",
             code=status.HTTP_400_BAD_REQUEST,
         )
-    hashed_token = hash_token(token=data.key)
+    if data.key.startswith("sk-"):
+        hashed_token = hash_token(token=data.key)
+    else:
+        hashed_token = data.key
 
     if litellm.store_audit_logs is True:
         # make an audit log for key update
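
With this change, `/key/unblock` accepts either the raw virtual key, which is hashed before lookup, or an already-hashed token, which is used as-is. A sketch of calling the endpoint both ways (the proxy URL, admin key, and token values below are placeholders, not real credentials):

import requests

PROXY_BASE = "http://localhost:4000"           # assumed local proxy
HEADERS = {"Authorization": "Bearer sk-1234"}  # assumed admin/master key

# 1) Raw virtual key: starts with "sk-", so the proxy hashes it before lookup.
requests.post(f"{PROXY_BASE}/key/unblock", headers=HEADERS, json={"key": "sk-my-virtual-key"})

# 2) Hashed token (the stored token value): anything not starting with "sk-"
#    is now treated as already hashed and used directly.
requests.post(f"{PROXY_BASE}/key/unblock", headers=HEADERS, json={"key": "<hashed-token>"})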
