Skip to content

Commit 02db2e8

Browse files
[Performance] RPS Improvement +500 RPS when sending the user field (#14616)
* perf tool
* fix: cache type issue
* fix: exception hanging & cache setting — (1) removed unhandled exceptions, (2) set cache value to a dict
1 parent 68105ce commit 02db2e8

File tree

3 files changed

+134
-12
lines changed

3 files changed

+134
-12
lines changed

litellm/proxy/auth/auth_checks.py

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -469,16 +469,13 @@ def check_in_budget(end_user_obj: LiteLLM_EndUserTable):
469469
# check if in cache
470470
cached_user_obj = await user_api_key_cache.async_get_cache(key=_key)
471471
if cached_user_obj is not None:
472-
if isinstance(cached_user_obj, dict):
473-
return_obj = LiteLLM_EndUserTable(**cached_user_obj)
474-
check_in_budget(end_user_obj=return_obj)
475-
return return_obj
476-
elif isinstance(cached_user_obj, LiteLLM_EndUserTable):
477-
return_obj = cached_user_obj
478-
check_in_budget(end_user_obj=return_obj)
479-
return return_obj
472+
# Convert cached dict to LiteLLM_EndUserTable instance
473+
return_obj = LiteLLM_EndUserTable(**cached_user_obj)
474+
check_in_budget(end_user_obj=return_obj)
475+
return return_obj
476+
480477
# else, check db
481-
try:
478+
try:
482479
response = await prisma_client.db.litellm_endusertable.find_unique(
483480
where={"user_id": end_user_id},
484481
include={"litellm_budget_table": True},
@@ -487,9 +484,9 @@ def check_in_budget(end_user_obj: LiteLLM_EndUserTable):
487484
if response is None:
488485
raise Exception
489486

490-
# save the end-user object to cache
487+
# save the end-user object to cache (always store as dict for consistency)
491488
await user_api_key_cache.async_set_cache(
492-
key="end_user_id:{}".format(end_user_id), value=response
489+
key="end_user_id:{}".format(end_user_id), value=response.dict()
493490
)
494491

495492
_response = LiteLLM_EndUserTable(**response.dict())
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
"""
2+
Performance utilities for LiteLLM proxy server.
3+
4+
This module provides performance monitoring and profiling functionality for endpoint
5+
performance analysis using cProfile with configurable sampling rates.
6+
"""
7+
8+
import asyncio
9+
import cProfile
10+
import functools
11+
import threading
12+
from pathlib import Path as PathLib
13+
14+
from litellm._logging import verbose_proxy_logger
15+
16+
# Global profiling state.
# A single shared cProfile.Profile accumulates stats across all sampled
# requests; the locks below keep its lifecycle and the sample counter
# safe under concurrent requests.
_profile_lock = threading.Lock()  # guards creation, enable/disable, and stat dumps of _profiler
_profiler = None  # lazily-created shared cProfile.Profile (None until first sampled request)
_last_profile_file_path = None  # most recent output path used by profile_endpoint
_sample_counter = 0  # monotonically increasing request count for deterministic sampling
_sample_counter_lock = threading.Lock()  # guards _sample_counter increments
22+
23+
24+
def _should_sample(profile_sampling_rate: float) -> bool:
    """Decide whether the current request should be profiled.

    Uses a deterministic, lock-protected global counter so the fraction of
    sampled requests converges to ``profile_sampling_rate`` over time.

    Args:
        profile_sampling_rate: Fraction of requests to sample (0.0 to 1.0).

    Returns:
        True when this request should be profiled.
    """
    if profile_sampling_rate >= 1.0:
        return True  # Always sample
    if profile_sampling_rate <= 0.0:
        return False  # Never sample

    global _sample_counter
    with _sample_counter_lock:
        _sample_counter += 1
        count = _sample_counter
    # Sample whenever the integer part of count*rate advances. Unlike the
    # previous `count % int(1.0 / rate) == 0`, this is exact for any rate:
    # e.g. rate=0.6 now samples 60% of requests instead of 100%
    # (int(1/0.6) truncated to 1, which sampled every request).
    return int(count * profile_sampling_rate) > int((count - 1) * profile_sampling_rate)
38+
39+
40+
def _start_profiling(profile_sampling_rate: float) -> None:
    """Create and enable the shared cProfile profiler on first call.

    Subsequent calls are no-ops: a single global profiler accumulates stats
    across all sampled requests for the process lifetime.

    Args:
        profile_sampling_rate: Configured sampling rate, logged for context.
    """
    global _profiler
    with _profile_lock:
        if _profiler is None:
            _profiler = cProfile.Profile()
            _profiler.enable()
            # Lazy %-style args avoid building the message when the log
            # level filters it out (hot-path logging idiom).
            verbose_proxy_logger.info(
                "Profiling started with sampling rate: %s", profile_sampling_rate
            )
48+
49+
50+
def _start_profiling_for_request(profile_sampling_rate: float) -> bool:
    """Begin profiling for this request when the sampler selects it.

    Args:
        profile_sampling_rate: Fraction of requests to profile (0.0 to 1.0).

    Returns:
        True when this request is being profiled, False otherwise.
    """
    # Guard clause: skip requests the sampler does not select.
    if not _should_sample(profile_sampling_rate):
        return False
    _start_profiling(profile_sampling_rate)
    return True
56+
57+
58+
def _save_stats(profile_file: PathLib) -> None:
    """Dump accumulated profiler stats to *profile_file*.

    cProfile requires the profiler to be disabled while dumping; a
    try/finally guarantees it is re-enabled even when ``dump_stats`` fails,
    so profiling keeps running for subsequent requests. (The previous
    version re-enabled in the except handler on a best-effort basis.)

    Args:
        profile_file: Destination path for the pstats dump.
    """
    with _profile_lock:
        if _profiler is None:
            return  # nothing to save yet
        try:
            _profiler.disable()
            try:
                _profiler.dump_stats(str(profile_file))
            finally:
                # Always resume profiling, success or failure.
                _profiler.enable()
            verbose_proxy_logger.debug("Profiling stats saved to %s", profile_file)
        except Exception as e:
            verbose_proxy_logger.error("Error saving profiling stats: %s", e)
77+
78+
79+
def profile_endpoint(
    sampling_rate: float = 1.0, profile_file: str = "endpoint_profile.pstat"
):
    """Decorator that profiles a sampled fraction of endpoint calls.

    Works with both async and sync endpoints. Stats are saved after each
    sampled call — including calls that raise — via try/finally, which
    removes the duplicated save logic the original had in both the success
    and exception branches.

    Args:
        sampling_rate: Fraction of requests to profile (0.0 to 1.0).
            - 1.0: profile all requests (100%)
            - 0.1: profile 1 in 10 requests (10%)
            - 0.0: profile no requests (0%)
        profile_file: Path the pstats output is written to after each
            sampled request. The default preserves the previously
            hard-coded ``endpoint_profile.pstat``.
    """

    def decorator(func):
        file_path_obj = PathLib(profile_file)

        def _record_path() -> None:
            # Remember the most recent output path for external inspection.
            global _last_profile_file_path
            _last_profile_file_path = file_path_obj

        if asyncio.iscoroutinefunction(func):

            @functools.wraps(func)
            async def async_wrapper(*args, **kwargs):
                is_sampling = _start_profiling_for_request(sampling_rate)
                _record_path()
                try:
                    return await func(*args, **kwargs)
                finally:
                    # Save on success *and* on exception.
                    if is_sampling:
                        _save_stats(file_path_obj)

            return async_wrapper

        @functools.wraps(func)
        def sync_wrapper(*args, **kwargs):
            is_sampling = _start_profiling_for_request(sampling_rate)
            _record_path()
            try:
                return func(*args, **kwargs)
            finally:
                # Save on success *and* on exception.
                if is_sampling:
                    _save_stats(file_path_obj)

        return sync_wrapper

    return decorator

tests/proxy_unit_tests/test_auth_checks.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ async def test_get_end_user_object(customer_spend, customer_budget):
4848
)
4949
_cache = DualCache()
5050
_key = "end_user_id:{}".format(end_user_id)
51-
_cache.set_cache(key=_key, value=end_user_obj)
51+
_cache.set_cache(key=_key, value=end_user_obj.model_dump())
5252
try:
5353
await get_end_user_object(
5454
end_user_id=end_user_id,

0 commit comments

Comments
 (0)