fix: fix race conditions

krrishdholakia · krrishdholakia · commit 03f2be1e203f · 2025-09-14T09:41:04.000-07:00
diff --git a/tests/test_litellm/litellm_core_utils/test_logging_worker.py b/tests/test_litellm/litellm_core_utils/test_logging_worker.py
@@ -1,11 +1,13 @@
 """
 Tests for the LoggingWorker class to ensure graceful shutdown handling.
 """
+
 import asyncio
 import contextvars
-import pytest
 from unittest.mock import AsyncMock, patch
 
+import pytest
+
 from litellm.litellm_core_utils.logging_worker import LoggingWorker
 
 
@@ -21,7 +23,9 @@ def logging_worker(self):
     async def test_graceful_shutdown_with_clear_queue(self, logging_worker):
         """Test that cancellation triggers clear_queue to prevent 'never awaited' warnings."""
         # Mock the clear_queue method to verify it's called during cancellation
-        with patch.object(logging_worker, "clear_queue", new_callable=AsyncMock) as mock_clear_queue:
+        with patch.object(
+            logging_worker, "clear_queue", new_callable=AsyncMock
+        ) as mock_clear_queue:
             # Start the worker
             logging_worker.start()
 
@@ -64,7 +68,9 @@ async def test_clear_queue_processes_remaining_items(self, logging_worker):
     async def test_worker_handles_cancellation_gracefully(self, logging_worker):
         """Test that the worker handles cancellation without throwing exceptions."""
         # Mock verbose_logger to capture debug messages
-        with patch("litellm.litellm_core_utils.logging_worker.verbose_logger") as mock_logger:
+        with patch(
+            "litellm.litellm_core_utils.logging_worker.verbose_logger"
+        ) as mock_logger:
             # Start the worker
             logging_worker.start()
 
@@ -131,90 +137,133 @@ async def test_queue_full_handling(self, logging_worker):
         small_worker._ensure_queue()
 
         # Mock verbose_logger to capture exception messages
-        with patch("litellm.litellm_core_utils.logging_worker.verbose_logger") as mock_logger:
+        with patch(
+            "litellm.litellm_core_utils.logging_worker.verbose_logger"
+        ) as mock_logger:
             # Fill the queue beyond capacity
             mock_coro = AsyncMock()
             for _ in range(5):  # More than max_queue_size of 2
                 small_worker.enqueue(mock_coro())
 
             # Should have logged queue full exceptions
-            exception_calls = [call for call in mock_logger.exception.call_args_list if "queue is full" in str(call)]
+            exception_calls = [
+                call
+                for call in mock_logger.exception.call_args_list
+                if "queue is full" in str(call)
+            ]
             assert len(exception_calls) > 0
 
     @pytest.mark.asyncio
     async def test_context_propagation(self, logging_worker):
         """Test that enqueued tasks execute in their original context."""
         # Create a context variable for testing
-        test_context_var: contextvars.ContextVar[str] = contextvars.ContextVar('test_context_var')
-        
-        # Track results from multiple tasks
+        test_context_var: contextvars.ContextVar[str] = contextvars.ContextVar(
+            "test_context_var"
+        )
+
+        # Track results from multiple tasks using asyncio.Event for synchronization
         task_results = []
-        
+        completion_events = {}
+
         async def test_task(task_id: str):
             """A test coroutine that checks if it can access the context variable."""
-            # Sleep a bit to simulate real work and ensure context persists
-            await asyncio.sleep(0.1)
-            
             try:
                 # Try to get the context variable value
                 value = test_context_var.get()
-                task_results.append({
-                    'task_id': task_id,
-                    'context_value': value,
-                    'context_accessible': True
-                })
+                task_results.append(
+                    {
+                        "task_id": task_id,
+                        "context_value": value,
+                        "context_accessible": True,
+                    }
+                )
             except LookupError:
                 # Context variable not found
-                task_results.append({
-                    'task_id': task_id,
-                    'context_accessible': False,
-                    'context_value': None
-                })
-        
+                task_results.append(
+                    {
+                        "task_id": task_id,
+                        "context_accessible": False,
+                        "context_value": None,
+                    }
+                )
+            finally:
+                # Signal that this task is complete
+                completion_events[task_id].set()
+
+        # Create completion events for each task
+        completion_events["task_1"] = asyncio.Event()
+        completion_events["task_2"] = asyncio.Event()
+        completion_events["task_3"] = asyncio.Event()
+
         # Start the logging worker
         logging_worker.start()
-        
+
+        # Give the worker a moment to start
+        await asyncio.sleep(0.1)
+
         # Create two separate contexts and enqueue tasks from each
-        
+
         # Context 1: Set context var to "context_1"
         ctx1 = contextvars.copy_context()
         ctx1.run(test_context_var.set, "context_1")
         ctx1.run(logging_worker.enqueue, test_task("task_1"))
-        
-        # Context 2: Set context var to "context_2"  
+
+        # Context 2: Set context var to "context_2"
         ctx2 = contextvars.copy_context()
         ctx2.run(test_context_var.set, "context_2")
         ctx2.run(logging_worker.enqueue, test_task("task_2"))
-        
+
         # Context 3: No context variable set (should get LookupError)
         ctx3 = contextvars.copy_context()
         ctx3.run(logging_worker.enqueue, test_task("task_3"))
-        
-        # Wait for all tasks to be processed
-        await asyncio.sleep(0.5)
-        
+
+        # Wait for all tasks to complete with a reasonable timeout
+        try:
+            await asyncio.wait_for(
+                asyncio.gather(
+                    completion_events["task_1"].wait(),
+                    completion_events["task_2"].wait(),
+                    completion_events["task_3"].wait(),
+                ),
+                timeout=5.0,
+            )
+        except asyncio.TimeoutError:
+            pytest.fail("Tasks did not complete within timeout")
+
         # Stop the worker
         await logging_worker.stop()
-        
+
         # Sort results by task_id for consistent testing
-        task_results.sort(key=lambda x: x['task_id'])
-        
+        task_results.sort(key=lambda x: x["task_id"])
+
         # Verify that each task saw its own context
-        assert len(task_results) == 3, f"Expected 3 results, got {len(task_results)}"
-        
+        assert (
+            len(task_results) == 3
+        ), f"Expected 3 results, got {len(task_results)}: {task_results}"
+
         # Task 1 should see "context_1"
-        task1_result = next((r for r in task_results if r['task_id'] == 'task_1'), None)
+        task1_result = next((r for r in task_results if r["task_id"] == "task_1"), None)
         assert task1_result is not None, "Task 1 result not found"
-        assert task1_result['context_accessible'] is True, "Task 1 should have access to context variable"
-        assert task1_result['context_value'] == "context_1", f"Task 1 should see 'context_1', got: {task1_result['context_value']}"
-        
+        assert (
+            task1_result["context_accessible"] is True
+        ), "Task 1 should have access to context variable"
+        assert (
+            task1_result["context_value"] == "context_1"
+        ), f"Task 1 should see 'context_1', got: {task1_result['context_value']}"
+
         # Task 2 should see "context_2"
-        task2_result = next((r for r in task_results if r['task_id'] == 'task_2'), None)
+        task2_result = next((r for r in task_results if r["task_id"] == "task_2"), None)
         assert task2_result is not None, "Task 2 result not found"
-        assert task2_result['context_accessible'] is True, "Task 2 should have access to context variable"
-        assert task2_result['context_value'] == "context_2", f"Task 2 should see 'context_2', got: {task2_result['context_value']}"
-        
+        assert (
+            task2_result["context_accessible"] is True
+        ), "Task 2 should have access to context variable"
+        assert (
+            task2_result["context_value"] == "context_2"
+        ), f"Task 2 should see 'context_2', got: {task2_result['context_value']}"
+
         # Task 3 should not have access to the context variable
-        task3_result = next((r for r in task_results if r['task_id'] == 'task_3'), None)
+        task3_result = next((r for r in task_results if r["task_id"] == "task_3"), None)
         assert task3_result is not None, "Task 3 result not found"
-        assert task3_result['context_accessible'] is False, "Task 3 should not have access to context variable"
+        assert (
+            task3_result["context_accessible"] is False
+        ), "Task 3 should not have access to context variable"