Commit 663dbc6

Merge pull request #14477 from BerriAI/litellm_dev_09_11_2025_p2
`/v1/messages` - don't send content block after message w/ finish reason + usage block + `/key/unblock` - support hashed tokens
2 parents 8b3b943 + 0c8b311 commit 663dbc6

7 files changed (+660, -131 lines)

litellm/constants.py

Lines changed: 7 additions & 3 deletions
@@ -15,7 +15,7 @@
     os.getenv("DEFAULT_SQS_FLUSH_INTERVAL_SECONDS", 10)
 )
 DEFAULT_NUM_WORKERS_LITELLM_PROXY = int(
-    os.getenv("DEFAULT_NUM_WORKERS_LITELLM_PROXY", os.cpu_count() or 4)
+    os.getenv("DEFAULT_NUM_WORKERS_LITELLM_PROXY", 1)
 )
 DEFAULT_SQS_BATCH_SIZE = int(os.getenv("DEFAULT_SQS_BATCH_SIZE", 512))
 SQS_SEND_MESSAGE_ACTION = "SendMessage"
@@ -60,7 +60,9 @@
     os.getenv("DEFAULT_REASONING_EFFORT_MINIMAL_THINKING_BUDGET_GEMINI_2_5_PRO", 128)
 )
 DEFAULT_REASONING_EFFORT_MINIMAL_THINKING_BUDGET_GEMINI_2_5_FLASH_LITE = int(
-    os.getenv("DEFAULT_REASONING_EFFORT_MINIMAL_THINKING_BUDGET_GEMINI_2_5_FLASH_LITE", 512)
+    os.getenv(
+        "DEFAULT_REASONING_EFFORT_MINIMAL_THINKING_BUDGET_GEMINI_2_5_FLASH_LITE", 512
+    )
 )
 
 # Generic fallback for unknown models
@@ -949,7 +951,9 @@
 DB_SPEND_UPDATE_JOB_NAME = "db_spend_update_job"
 PROMETHEUS_EMIT_BUDGET_METRICS_JOB_NAME = "prometheus_emit_budget_metrics"
 CLOUDZERO_EXPORT_USAGE_DATA_JOB_NAME = "cloudzero_export_usage_data"
-CLOUDZERO_MAX_FETCHED_DATA_RECORDS = int(os.getenv("CLOUDZERO_MAX_FETCHED_DATA_RECORDS", 50000))
+CLOUDZERO_MAX_FETCHED_DATA_RECORDS = int(
+    os.getenv("CLOUDZERO_MAX_FETCHED_DATA_RECORDS", 50000)
+)
 SPEND_LOG_CLEANUP_JOB_NAME = "spend_log_cleanup"
 SPEND_LOG_RUN_LOOPS = int(os.getenv("SPEND_LOG_RUN_LOOPS", 500))
 SPEND_LOG_CLEANUP_BATCH_SIZE = int(os.getenv("SPEND_LOG_CLEANUP_BATCH_SIZE", 1000))
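
For reference, every constant above follows the same pattern: read an environment variable, fall back to a literal default, and coerce to int, so the new single-worker default can still be overridden at deploy time. A minimal standalone sketch of that resolution (the helper name is illustrative, not LiteLLM's API):

import os

def resolve_int_setting(name: str, default: int) -> int:
    # Mirrors the os.getenv(...) pattern used throughout litellm/constants.py.
    return int(os.getenv(name, default))

# With nothing exported, the proxy now defaults to a single worker;
# export DEFAULT_NUM_WORKERS_LITELLM_PROXY to raise it.
workers = resolve_int_setting("DEFAULT_NUM_WORKERS_LITELLM_PROXY", 1)
print(workers)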

litellm/llms/anthropic/experimental_pass_through/adapters/streaming_iterator.py

Lines changed: 71 additions & 64 deletions
@@ -28,17 +28,14 @@ class AnthropicStreamWrapper(AdapterCompletionStreamWrapper):
         TextBlock,
     )
 
-    def __init__(self, completion_stream: Any, model: str):
-        super().__init__(completion_stream)
-        self.model = model
-
     sent_first_chunk: bool = False
     sent_content_block_start: bool = False
     sent_content_block_finish: bool = False
     current_content_block_type: Literal["text", "tool_use"] = "text"
     sent_last_message: bool = False
     holding_chunk: Optional[Any] = None
     holding_stop_reason_chunk: Optional[Any] = None
+    queued_usage_chunk: bool = False
     current_content_block_index: int = 0
     current_content_block_start: ContentBlockContentBlockDict = TextBlock(
         type="text",
@@ -47,6 +44,10 @@ def __init__(self, completion_stream: Any, model: str):
     pending_new_content_block: bool = False
     chunk_queue: deque = deque()  # Queue for buffering multiple chunks
 
+    def __init__(self, completion_stream: Any, model: str):
+        super().__init__(completion_stream)
+        self.model = model
+
     def __next__(self):
         from .transformation import LiteLLMAnthropicMessagesAdapter
 
@@ -217,77 +218,83 @@ async def __anext__(self):  # noqa: PLR0915
 
                     # Queue the merged chunk and reset
                     self.chunk_queue.append(merged_chunk)
+                    self.queued_usage_chunk = True
                     self.holding_stop_reason_chunk = None
                     return self.chunk_queue.popleft()
 
                 # Check if this processed chunk has a stop_reason - hold it for next chunk
 
-                if should_start_new_block and not self.sent_content_block_finish:
-                    # Queue the sequence: content_block_stop -> content_block_start -> current_chunk
-
-                    # 1. Stop current content block
-                    self.chunk_queue.append(
-                        {
-                            "type": "content_block_stop",
-                            "index": max(self.current_content_block_index - 1, 0),
-                        }
-                    )
-
-                    # 2. Start new content block
-                    self.chunk_queue.append(
-                        {
-                            "type": "content_block_start",
-                            "index": self.current_content_block_index,
-                            "content_block": self.current_content_block_start,
-                        }
-                    )
-
-                    # 3. Queue the current chunk (don't lose it!)
-                    self.chunk_queue.append(processed_chunk)
-
-                    # Reset state for new block
-                    self.sent_content_block_finish = False
-
-                    # Return the first queued item
-                    return self.chunk_queue.popleft()
-
-                if (
-                    processed_chunk["type"] == "message_delta"
-                    and self.sent_content_block_finish is False
-                ):
-                    # Queue both the content_block_stop and the holding chunk
-                    self.chunk_queue.append(
-                        {
-                            "type": "content_block_stop",
-                            "index": self.current_content_block_index,
-                        }
-                    )
-                    self.sent_content_block_finish = True
-                    if processed_chunk.get("delta", {}).get("stop_reason") is not None:
-
-                        self.holding_stop_reason_chunk = processed_chunk
-                    else:
-                        self.chunk_queue.append(processed_chunk)
-                    return self.chunk_queue.popleft()
-                elif self.holding_chunk is not None:
-                    # Queue both chunks
-                    self.chunk_queue.append(self.holding_chunk)
-                    self.chunk_queue.append(processed_chunk)
-                    self.holding_chunk = None
-                    return self.chunk_queue.popleft()
-                else:
-                    # Queue the current chunk
-                    self.chunk_queue.append(processed_chunk)
-                    return self.chunk_queue.popleft()
+                if not self.queued_usage_chunk:
+                    if should_start_new_block and not self.sent_content_block_finish:
+                        # Queue the sequence: content_block_stop -> content_block_start -> current_chunk
+
+                        # 1. Stop current content block
+                        self.chunk_queue.append(
+                            {
+                                "type": "content_block_stop",
+                                "index": max(self.current_content_block_index - 1, 0),
+                            }
+                        )
+
+                        # 2. Start new content block
+                        self.chunk_queue.append(
+                            {
+                                "type": "content_block_start",
+                                "index": self.current_content_block_index,
+                                "content_block": self.current_content_block_start,
+                            }
+                        )
+
+                        # 3. Queue the current chunk (don't lose it!)
+                        self.chunk_queue.append(processed_chunk)
+
+                        # Reset state for new block
+                        self.sent_content_block_finish = False
+
+                        # Return the first queued item
+                        return self.chunk_queue.popleft()
+
+                    if (
+                        processed_chunk["type"] == "message_delta"
+                        and self.sent_content_block_finish is False
+                    ):
+                        # Queue both the content_block_stop and the holding chunk
+                        self.chunk_queue.append(
+                            {
+                                "type": "content_block_stop",
+                                "index": self.current_content_block_index,
+                            }
+                        )
+                        self.sent_content_block_finish = True
+                        if (
+                            processed_chunk.get("delta", {}).get("stop_reason")
+                            is not None
+                        ):
+                            self.holding_stop_reason_chunk = processed_chunk
+                        else:
+                            self.chunk_queue.append(processed_chunk)
+                        return self.chunk_queue.popleft()
+                    elif self.holding_chunk is not None:
+                        # Queue both chunks
+                        self.chunk_queue.append(self.holding_chunk)
+                        self.chunk_queue.append(processed_chunk)
+                        self.holding_chunk = None
+                        return self.chunk_queue.popleft()
+                    else:
+                        # Queue the current chunk
+                        self.chunk_queue.append(processed_chunk)
+                        return self.chunk_queue.popleft()
 
             # Handle any remaining held chunks after stream ends
-            if self.holding_stop_reason_chunk is not None:
-                self.chunk_queue.append(self.holding_stop_reason_chunk)
-                self.holding_stop_reason_chunk = None
+            if not self.queued_usage_chunk:
+                if self.holding_stop_reason_chunk is not None:
+                    self.chunk_queue.append(self.holding_stop_reason_chunk)
+                    self.holding_stop_reason_chunk = None
 
-            if self.holding_chunk is not None:
-                self.chunk_queue.append(self.holding_chunk)
-                self.holding_chunk = None
+                if self.holding_chunk is not None:
+                    self.chunk_queue.append(self.holding_chunk)
+                    self.holding_chunk = None
 
             if not self.sent_last_message:
                 self.sent_last_message = True
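
The net effect of the new `queued_usage_chunk` flag: once the adapter has queued the merged `message_delta` that carries the finish reason and the usage block, any further `content_block_*` events are suppressed, so a client never sees a content block start after the message has already reported its stop reason. A toy, self-contained sketch of that gating idea (not the adapter's API; event shapes are simplified):

from collections import deque
from typing import Any, Dict, Iterable, List

def gate_events(events: Iterable[Dict[str, Any]]) -> List[Dict[str, Any]]:
    # Illustrative only: drop content_block_* events once a message_delta
    # carrying stop_reason + usage has already been emitted.
    out: deque = deque()
    queued_usage_chunk = False
    for event in events:
        if queued_usage_chunk and event["type"].startswith("content_block"):
            continue  # no content blocks after the finish-reason + usage delta
        out.append(event)
        if event["type"] == "message_delta" and event.get("usage") is not None:
            queued_usage_chunk = True
    return list(out)

sample = [
    {"type": "content_block_start", "index": 0},
    {"type": "content_block_delta", "index": 0},
    {"type": "content_block_stop", "index": 0},
    {"type": "message_delta", "delta": {"stop_reason": "end_turn"}, "usage": {"output_tokens": 5}},
    {"type": "content_block_start", "index": 1},  # suppressed by the gate
    {"type": "message_stop"},
]
print([e["type"] for e in gate_events(sample)])
# ['content_block_start', 'content_block_delta', 'content_block_stop', 'message_delta', 'message_stop']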

litellm/proxy/_new_secret_config.yaml

Lines changed: 3 additions & 0 deletions
@@ -7,3 +7,6 @@ model_list:
   - model_name: wildcard_models/*
     litellm_params:
       model: openai/*
+  - model_name: xai-grok-3
+    litellm_params:
+      model: xai/grok-3
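
The added entry exposes `xai/grok-3` under the alias `xai-grok-3`. A usage sketch against a proxy started with this config, using the standard OpenAI client (the base URL, port, and virtual key below are assumptions for a local setup):

from openai import OpenAI

# Assumes a locally running LiteLLM proxy on the default port with a valid virtual key.
client = OpenAI(base_url="http://localhost:4000", api_key="sk-1234")

resp = client.chat.completions.create(
    model="xai-grok-3",  # the alias defined in model_list above
    messages=[{"role": "user", "content": "Hello from the proxy"}],
)
print(resp.choices[0].message.content)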

litellm/proxy/hooks/parallel_request_limiter_v3.py

Lines changed: 25 additions & 17 deletions
@@ -6,6 +6,7 @@
 
 import os
 from datetime import datetime
+from math import floor
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -17,7 +18,7 @@
     Union,
     cast,
 )
-from math import floor
+
 from fastapi import HTTPException
 
 from litellm import DualCache
@@ -95,6 +96,7 @@
     return results
     """
 
+
 class RateLimitDescriptorRateLimitObject(TypedDict, total=False):
     requests_per_unit: Optional[int]
     tokens_per_unit: Optional[int]
@@ -480,10 +482,15 @@ async def async_pre_call_hook(
                 },
             )
         )
-        
+
         # Team Member rate limits
-        if user_api_key_dict.user_id and (user_api_key_dict.team_member_rpm_limit is not None or user_api_key_dict.team_member_tpm_limit is not None):
-            team_member_value = f"{user_api_key_dict.team_id}:{user_api_key_dict.user_id}"
+        if user_api_key_dict.user_id and (
+            user_api_key_dict.team_member_rpm_limit is not None
+            or user_api_key_dict.team_member_tpm_limit is not None
+        ):
+            team_member_value = (
+                f"{user_api_key_dict.team_id}:{user_api_key_dict.user_id}"
+            )
             descriptors.append(
                 RateLimitDescriptor(
                     key="team_member",
@@ -557,13 +564,13 @@ async def async_pre_call_hook(
         # Find which descriptor hit the limit
         for i, status in enumerate(response["statuses"]):
             if status["code"] == "OVER_LIMIT":
-                descriptor = descriptors[floor(i/2)]
+                descriptor = descriptors[floor(i / 2)]
                 raise HTTPException(
                     status_code=429,
                     detail=f"Rate limit exceeded for {descriptor['key']}: {descriptor['value']}. Remaining: {status['limit_remaining']}",
                     headers={
                         "retry-after": str(self.window_size),
-                        "rate_limit_type": str(status["rate_limit_type"])
+                        "rate_limit_type": str(status["rate_limit_type"]),
                     },  # Retry after 1 minute
                 )
 
@@ -613,7 +620,9 @@ async def async_increment_tokens_with_ttl_preservation(
 
         # Check if script is available
        if self.token_increment_script is None:
-            verbose_proxy_logger.debug("TTL preservation script not available, using regular pipeline")
+            verbose_proxy_logger.debug(
+                "TTL preservation script not available, using regular pipeline"
+            )
             await self.internal_usage_cache.dual_cache.async_increment_cache_pipeline(
                 increment_list=pipeline_operations,
                 litellm_parent_otel_span=parent_otel_span,
@@ -628,7 +637,7 @@ async def async_increment_tokens_with_ttl_preservation(
         for op in pipeline_operations:
             # Convert None TTL to 0 for Lua script
             ttl_value = op["ttl"] if op["ttl"] is not None else 0
-            
+
             verbose_proxy_logger.debug(
                 f"Executing TTL-preserving increment for key={op['key']}, "
                 f"increment={op['increment_value']}, ttl={ttl_value}"
@@ -693,16 +702,15 @@ async def async_log_success_event(self, kwargs, response_obj, start_time, end_ti
         )
 
         # Get metadata from kwargs
-        user_api_key = kwargs["litellm_params"]["metadata"].get("user_api_key")
-        user_api_key_user_id = kwargs["litellm_params"]["metadata"].get(
-            "user_api_key_user_id"
-        )
-        user_api_key_team_id = kwargs["litellm_params"]["metadata"].get(
-            "user_api_key_team_id"
-        )
-        user_api_key_end_user_id = kwargs.get("user") or kwargs["litellm_params"][
-            "metadata"
-        ].get("user_api_key_end_user_id")
+        litellm_metadata = kwargs["litellm_params"]["metadata"]
+        if litellm_metadata is None:
+            return
+        user_api_key = litellm_metadata.get("user_api_key")
+        user_api_key_user_id = litellm_metadata.get("user_api_key_user_id")
+        user_api_key_team_id = litellm_metadata.get("user_api_key_team_id")
+        user_api_key_end_user_id = kwargs.get("user") or litellm_metadata.get(
+            "user_api_key_end_user_id"
+        )
         model_group = get_model_group_from_litellm_kwargs(kwargs)
 
         # Get total tokens from response
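
One detail worth spelling out from the hunk above: the limiter emits two statuses per descriptor (one for requests_per_unit, one for tokens_per_unit), so when status `i` reports OVER_LIMIT, the owning descriptor is `floor(i / 2)`. A small illustrative check of that mapping (the descriptor names here are hypothetical):

from math import floor

# Each rate-limit descriptor yields two statuses: requests, then tokens.
descriptors = ["api_key", "team_member", "model_per_key"]
statuses = [f"{d}:{kind}" for d in descriptors for kind in ("requests", "tokens")]

for i in range(len(statuses)):
    assert statuses[i].startswith(descriptors[floor(i / 2)])
print("status 3 ->", descriptors[floor(3 / 2)])  # team_member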

litellm/proxy/management_endpoints/key_management_endpoints.py

Lines changed: 9 additions & 4 deletions
@@ -346,6 +346,7 @@ def handle_key_type(data: GenerateKeyRequest, data_json: dict) -> dict:
         data_json["allowed_routes"] = ["info_routes"]
     return data_json
 
+
 async def validate_team_id_used_in_service_account_request(
     team_id: Optional[str],
     prisma_client: Optional[PrismaClient],
@@ -358,13 +359,13 @@ async def validate_team_id_used_in_service_account_request(
             status_code=400,
             detail="team_id is required for service account keys. Please specify `team_id` in the request body.",
         )
-    
+
     if prisma_client is None:
         raise HTTPException(
             status_code=400,
             detail="prisma_client is required for service account keys. Please specify `prisma_client` in the request body.",
         )
-    
+
     # check if team_id exists in the database
     team = await prisma_client.db.litellm_teamtable.find_unique(
         where={"team_id": team_id},
@@ -376,6 +377,7 @@
         )
     return True
 
+
 async def _common_key_generation_helper(  # noqa: PLR0915
     data: GenerateKeyRequest,
     user_api_key_dict: UserAPIKeyAuth,
@@ -557,7 +559,7 @@ async def _common_key_generation_helper(  # noqa: PLR0915
                 status_code=400,
                 detail={
                     "error": f"Invalid key format. LiteLLM Virtual Key must start with 'sk-'. Received: {data.key}"
-                }
+                },
             )
 
     response = await generate_key_helper_fn(
@@ -2885,7 +2887,10 @@ async def unblock_key(
             param="key",
             code=status.HTTP_400_BAD_REQUEST,
         )
-    hashed_token = hash_token(token=data.key)
+    if data.key.startswith("sk-"):
+        hashed_token = hash_token(token=data.key)
+    else:
+        hashed_token = data.key
 
     if litellm.store_audit_logs is True:
         # make an audit log for key update
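
With this change, `/key/unblock` accepts either the raw virtual key, which is hashed before lookup, or an already-hashed token, which is used as-is. A sketch of calling the endpoint both ways (the proxy URL, admin key, and token values below are placeholders, not real credentials):

import requests

PROXY_BASE = "http://localhost:4000"           # assumed local proxy
HEADERS = {"Authorization": "Bearer sk-1234"}  # assumed admin/master key

# 1) Raw virtual key: starts with "sk-", so the proxy hashes it before lookup.
requests.post(f"{PROXY_BASE}/key/unblock", headers=HEADERS, json={"key": "sk-my-virtual-key"})

# 2) Hashed token (the stored token value): anything not starting with "sk-"
#    is now treated as already hashed and used directly.
requests.post(f"{PROXY_BASE}/key/unblock", headers=HEADERS, json={"key": "<hashed-token>"})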
