
Commit adc89e2

make litellm async
1 parent ad95605 commit adc89e2

File tree

6 files changed: +161 −23 lines changed

stagehand/handlers/extract_handler.py
stagehand/handlers/observe_handler.py
stagehand/llm/client.py
stagehand/llm/inference.py
test_async_performance.py
tests/mocks/mock_llm.py

stagehand/handlers/extract_handler.py

Lines changed: 1 addition & 1 deletion

@@ -105,7 +105,7 @@ async def extract(
         schema = transformed_schema = DefaultExtractSchema
 
         # Use inference to call the LLM
-        extraction_response = extract_inference(
+        extraction_response = await extract_inference(
             instruction=instruction,
             tree_elements=output_string,
             schema=transformed_schema,

stagehand/handlers/observe_handler.py

Lines changed: 1 addition & 1 deletion

@@ -74,7 +74,7 @@ async def observe(
         iframes = tree.get("iframes", [])
 
         # use inference to call the llm
-        observation_response = observe_inference(
+        observation_response = await observe_inference(
             instruction=instruction,
             tree_elements=output_string,
             llm_client=self.stagehand.llm,
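
Both handler edits above are the same one-token migration: the handlers' extract and observe methods are already async def, so once the inference helpers become coroutines (see stagehand/llm/inference.py below) their results must be awaited rather than used directly. A minimal, self-contained sketch of the failure mode this avoids; the observe_inference stand-in here is hypothetical, not the library's code:

import asyncio

async def observe_inference(instruction: str) -> dict:
    # Hypothetical stand-in for the now-async inference helper.
    await asyncio.sleep(0.01)
    return {"elements": []}

async def handler_missing_await(instruction: str):
    # Without `await`, this returns a coroutine object, not the result dict.
    return observe_inference(instruction)

async def handler_with_await(instruction: str):
    return await observe_inference(instruction)

async def main():
    good = await handler_with_await("find the login button")
    bad = await handler_missing_await("find the login button")
    print(good)       # {'elements': []}
    print(type(bad))  # <class 'coroutine'>, plus a "never awaited" RuntimeWarning

asyncio.run(main())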

stagehand/llm/client.py

Lines changed: 7 additions & 7 deletions

@@ -60,7 +60,7 @@ def __init__(
                 f"Set global litellm.api_base to {value}", category="llm"
             )
 
-    def create_response(
+    async def create_response(
         self,
         *,
         messages: list[dict[str, str]],
@@ -77,7 +77,7 @@ def create_response(
                 Overrides the default_model if provided.
             function_name: The name of the Stagehand function calling this method (ACT, OBSERVE, etc.)
                 Used for metrics tracking.
-            **kwargs: Additional parameters to pass directly to litellm.completion
+            **kwargs: Additional parameters to pass directly to litellm.acompletion
                 (e.g., temperature, max_tokens, stream=True, specific provider arguments).
 
         Returns:
@@ -87,7 +87,7 @@ def create_response(
 
         Raises:
             ValueError: If no model is specified (neither default nor in the call).
-            Exception: Propagates exceptions from litellm.completion.
+            Exception: Propagates exceptions from litellm.acompletion.
         """
         completion_model = model or self.default_model
         if not completion_model:
@@ -115,16 +115,16 @@ def create_response(
             filtered_params["temperature"] = 1
 
         self.logger.debug(
-            f"Calling litellm.completion with model={completion_model} and params: {filtered_params}",
+            f"Calling litellm.acompletion with model={completion_model} and params: {filtered_params}",
            category="llm",
        )
 
        try:
            # Start tracking inference time
            start_time = start_inference_timer()
 
-            # Use litellm's completion function
-            response = litellm.completion(**filtered_params)
+            # Use litellm's async completion function
+            response = await litellm.acompletion(**filtered_params)
 
            # Calculate inference time
            inference_time_ms = get_inference_time_ms(start_time)
@@ -136,6 +136,6 @@ def create_response(
            return response
 
        except Exception as e:
-            self.logger.error(f"Error calling litellm.completion: {e}", category="llm")
+            self.logger.error(f"Error calling litellm.acompletion: {e}", category="llm")
            # Consider more specific exception handling based on litellm errors
            raise
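
Because create_response now awaits litellm.acompletion, the event loop stays free while a completion is in flight, so independent calls can be overlapped. A minimal sketch under assumptions: the constructor arguments mirror the test script added in this commit, a MagicMock stands in for the Stagehand logger, and real provider credentials would be needed for the calls to actually succeed:

import asyncio
from unittest.mock import MagicMock

from stagehand.llm.client import LLMClient

async def main():
    client = LLMClient(stagehand_logger=MagicMock(), default_model="gpt-4o")

    # Two independent completions now overlap instead of running back to back.
    first, second = await asyncio.gather(
        client.create_response(messages=[{"role": "user", "content": "Say hello."}]),
        client.create_response(messages=[{"role": "user", "content": "Say goodbye."}]),
    )
    print(first.choices[0].message.content)
    print(second.choices[0].message.content)

asyncio.run(main())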

stagehand/llm/inference.py

Lines changed: 5 additions & 5 deletions

@@ -21,7 +21,7 @@
 
 
 # TODO: kwargs
-def observe(
+async def observe(
     instruction: str,
     tree_elements: str,
     llm_client: Any,
@@ -66,7 +66,7 @@ def observe(
     try:
         # Call the LLM
         logger.info("Calling LLM")
-        response = llm_client.create_response(
+        response = await llm_client.create_response(
             model=llm_client.default_model,
             messages=messages,
             response_format=ObserveInferenceSchema,
@@ -123,7 +123,7 @@ def observe(
     }
 
 
-def extract(
+async def extract(
     instruction: str,
     tree_elements: str,
     schema: Optional[Union[type[BaseModel], dict]] = None,
@@ -177,7 +177,7 @@ def extract(
 
     # Call the LLM with appropriate parameters
     try:
-        extract_response = llm_client.create_response(
+        extract_response = await llm_client.create_response(
             model=llm_client.default_model,
             messages=extract_messages,
             response_format=response_format,
@@ -227,7 +227,7 @@ def extract(
     # Call LLM for metadata
     try:
         metadata_start_time = time.time()
-        metadata_response = llm_client.create_response(
+        metadata_response = await llm_client.create_response(
             model=llm_client.default_model,
             messages=metadata_messages,
             response_format=metadata_schema,
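
With observe and extract now coroutines, any caller that is not itself async has to drive an event loop. A minimal sketch of a synchronous bridge, assuming extract also accepts llm_client as a keyword argument (as observe does in the test script below); this wrapper is illustrative, not part of the library:

import asyncio
from stagehand.llm.inference import extract

def extract_sync(instruction: str, tree_elements: str, llm_client, **kwargs):
    # Bridge for synchronous callers: run the now-async helper to completion.
    # Only valid when no event loop is already running in this thread.
    return asyncio.run(
        extract(
            instruction=instruction,
            tree_elements=tree_elements,
            llm_client=llm_client,
            **kwargs,
        )
    )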

test_async_performance.py

Lines changed: 138 additions & 0 deletions (new file)

"""Test script to verify async LLM calls are non-blocking"""

import asyncio
import time
from unittest.mock import AsyncMock, MagicMock
from stagehand.llm.client import LLMClient
from stagehand.llm.inference import observe, extract


async def simulate_slow_llm_response(delay=1.0):
    """Simulate a slow LLM API response"""
    await asyncio.sleep(delay)
    return MagicMock(
        usage=MagicMock(prompt_tokens=100, completion_tokens=50),
        choices=[MagicMock(message=MagicMock(content='{"elements": []}'))]
    )


async def test_parallel_execution():
    """Test that multiple LLM calls can run in parallel"""
    print("\n🧪 Testing parallel async execution...")

    # Create mock LLM client
    mock_logger = MagicMock()
    mock_logger.info = MagicMock()
    mock_logger.debug = MagicMock()
    mock_logger.error = MagicMock()

    llm_client = LLMClient(
        stagehand_logger=mock_logger,
        default_model="gpt-4o"
    )

    # Mock the async create_response to simulate delay
    async def mock_create_response(**kwargs):
        return await simulate_slow_llm_response(1.0)

    llm_client.create_response = mock_create_response

    # Measure time for parallel execution
    start_time = time.time()

    # Run 3 observe calls in parallel
    tasks = [
        observe("Find button 1", "DOM content 1", llm_client, logger=mock_logger),
        observe("Find button 2", "DOM content 2", llm_client, logger=mock_logger),
        observe("Find button 3", "DOM content 3", llm_client, logger=mock_logger),
    ]

    results = await asyncio.gather(*tasks)
    parallel_time = time.time() - start_time

    print(f"✅ Parallel execution of 3 calls took: {parallel_time:.2f}s")
    print("   Expected ~1s (running in parallel), not 3s (sequential)")

    # Verify results
    assert len(results) == 3
    for i, result in enumerate(results, 1):
        assert "elements" in result
        print(f"   Result {i}: {result}")

    # Test sequential execution for comparison
    print("\n🧪 Testing sequential execution for comparison...")
    start_time = time.time()

    result1 = await observe("Find button 1", "DOM content 1", llm_client, logger=mock_logger)
    result2 = await observe("Find button 2", "DOM content 2", llm_client, logger=mock_logger)
    result3 = await observe("Find button 3", "DOM content 3", llm_client, logger=mock_logger)

    sequential_time = time.time() - start_time
    print(f"✅ Sequential execution of 3 calls took: {sequential_time:.2f}s")
    print("   Expected ~3s (running sequentially)")

    # Parallel should be significantly faster
    assert parallel_time < sequential_time * 0.5, "Parallel execution should be much faster than sequential"

    print("\n🎉 Async implementation is working correctly!")
    print(f"   Parallel speedup: {sequential_time/parallel_time:.2f}x faster")


async def test_real_llm_async():
    """Test with real LiteLLM to ensure the async implementation works"""
    print("\n🧪 Testing with real LiteLLM (using mock responses)...")

    import litellm
    from unittest.mock import patch

    # Mock litellm.acompletion to return test data
    async def mock_acompletion(**kwargs):
        await asyncio.sleep(0.1)  # Small delay to simulate API call
        return MagicMock(
            usage=MagicMock(prompt_tokens=100, completion_tokens=50),
            choices=[MagicMock(message=MagicMock(content='{"elements": [{"selector": "#test"}]}'))]
        )

    with patch('litellm.acompletion', new=mock_acompletion):
        mock_logger = MagicMock()
        mock_logger.info = MagicMock()
        mock_logger.debug = MagicMock()
        mock_logger.error = MagicMock()

        llm_client = LLMClient(
            stagehand_logger=mock_logger,
            default_model="gpt-4o"
        )

        # Test that the actual async call works
        response = await llm_client.create_response(
            messages=[{"role": "user", "content": "test"}],
            model="gpt-4o"
        )

        assert response is not None
        print("✅ Real LiteLLM async call successful")
        print(f"   Response: {response.choices[0].message.content}")


async def main():
    """Run all tests"""
    print("=" * 50)
    print("ASYNC IMPLEMENTATION VERIFICATION")
    print("=" * 50)

    try:
        await test_parallel_execution()
        await test_real_llm_async()

        print("\n" + "=" * 50)
        print("✅ ALL TESTS PASSED - ASYNC IS WORKING!")
        print("=" * 50)

    except Exception as e:
        print(f"\n❌ Test failed: {e}")
        raise


if __name__ == "__main__":
    asyncio.run(main())
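
Assuming the package is importable from the repository root, the script can presumably be run directly with python test_async_performance.py. Both tests replace the network layer (one by swapping out create_response, one by patching litellm.acompletion), so no real provider key appears to be required.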

tests/mocks/mock_llm.py

Lines changed: 9 additions & 9 deletions

@@ -258,7 +258,7 @@ def get_usage_stats(self) -> Dict[str, int]:
             "total_tokens": total_prompt_tokens + total_completion_tokens
         }
 
-    def create_response(
+    async def create_response(
         self,
         *,
         messages: list[dict[str, str]],
@@ -274,13 +274,13 @@ def create_response(
         # Fall back to content-based detection
         content = str(messages).lower()
         response_type = self._determine_response_type(content)
-
+
         # Track the call
         self.call_count += 1
         self.last_messages = messages
         self.last_model = model or self.default_model
         self.last_kwargs = kwargs
-
+
         # Store call in history
         call_info = {
             "messages": messages,
@@ -290,26 +290,26 @@ def create_response(
             "timestamp": asyncio.get_event_loop().time()
         }
         self.call_history.append(call_info)
-
+
         # Simulate failure if configured
         if self.should_fail:
             raise Exception(self.failure_message)
-
+
         # Check for custom responses first
         if response_type in self.custom_responses:
             response_data = self.custom_responses[response_type]
             if callable(response_data):
                 response_data = response_data(messages, **kwargs)
             return self._create_response(response_data, model=self.last_model)
-
+
         # Use default response mapping
         response_generator = self.response_mapping.get(response_type, self._default_response)
         response_data = response_generator(messages, **kwargs)
-
+
         response = self._create_response(response_data, model=self.last_model)
-
+
         # Call metrics callback if set
        if self.metrics_callback:
            self.metrics_callback(response, 100, response_type)  # 100ms mock inference time
-
+
         return response
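
Since the mock client's create_response is now a coroutine, it keeps the same awaitable contract as the real LLMClient. Ad-hoc stubs in other tests should follow suit; a minimal sketch using the standard library's AsyncMock (not the repository's mock class):

import asyncio
from unittest.mock import AsyncMock

async def demo():
    # AsyncMock makes the stubbed method awaitable, matching the new interface.
    fake_client = AsyncMock()
    fake_client.create_response.return_value = {"choices": []}

    response = await fake_client.create_response(
        messages=[{"role": "user", "content": "click the login button"}]
    )
    print(response)  # {'choices': []}

asyncio.run(demo())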
