import asyncio
import contextlib
import hashlib
import logging
import os
import threading
import time
from collections import deque
from typing import Any

from dotenv import load_dotenv
from litellm import acompletion, completion, litellm
from litellm.exceptions import (
    APIConnectionError,
    RateLimitError,
    Timeout,
)
from tenacity import AsyncRetrying, retry_if_exception_type, wait_exponential
1719
1820RETRYABLE_EXCEPTIONS = (
1921 APIConnectionError ,
@@ -248,8 +250,17 @@ def _safe_release_sync(self) -> None:
248250
249251 async def acquire_async (self ) -> None :
250252 """Async leaky-bucket acquisition (release is scheduled)."""
251- if self ._async_sem is None :
253+ # Create new semaphore for each event loop to avoid binding issues
254+ try :
255+ loop = asyncio .get_running_loop ()
256+ if (
257+ self ._async_sem is None
258+ or getattr (self ._async_sem , "_loop" , None ) != loop
259+ ):
260+ self ._async_sem = asyncio .Semaphore (self .requests_per_second )
261+ except RuntimeError :
252262 self ._async_sem = asyncio .Semaphore (self .requests_per_second )
263+
253264 await self ._async_sem .acquire ()
254265 delay = 1.0 / float (self .requests_per_second )
255266 asyncio .get_running_loop ().call_later (delay , self ._safe_release_async )
@@ -260,7 +271,250 @@ def _safe_release_async(self) -> None:
260271 with contextlib .suppress (ValueError ):
261272 self ._async_sem .release ()
262273
263- # ... (rest of the code remains the same)
274+
# Module-wide limiter shared by every ModuleLLM instance; both generate()
# and agenerate() acquire a slot from it before each provider call.
_global_rate_limiter = GlobalRateLimiter(requests_per_second=20)
277+
278+
279+ class ModuleLLM :
280+ """
281+ A module that provides a simple interface for using LLMs with performance optimizations.
282+
283+ Note : Currently supports OpenAI, Anthropic, xAI, Huggingface, Ollama, OpenRouter, NovitaAI, Gemini
284+ """
285+
286+ def __init__ (
287+ self ,
288+ llm_model : str ,
289+ api_base : str | None = None ,
290+ system_prompt : str | None = None ,
291+ enable_caching : bool = False ,
292+ enable_batching : bool = False ,
293+ cache_size : int = 1000 ,
294+ cache_ttl : float = 300.0 ,
295+ batch_size : int = 10 ,
296+ ):
297+ """
298+ Initialize LLM module with optional performance optimizations
299+
300+ Args:
301+ llm_model: The model to use for LLM in format
302+ "{provider}/{model}" (for example, "openai/gpt-4o").
303+ api_base: The API base to use if LLM provider is Ollama
304+ system_prompt: The system prompt to use for LLM
305+ enable_caching: Enable response caching for performance
306+ enable_batching: Enable request batching for performance
307+ cache_size: Maximum number of cached responses
308+ cache_ttl: Cache time-to-live in seconds
309+ batch_size: Number of requests to batch together
310+
311+ Raises:
312+ ValueError: If llm_model is not in the expected "{provider}/{model}"
313+ format, or if the provider API key is missing.
314+ """
315+ self .api_base = api_base
316+ self .llm_model = llm_model
317+ self .system_prompt = system_prompt
318+
319+ # Performance optimizations
320+ self .enable_caching = enable_caching
321+ self .enable_batching = enable_batching
322+
323+ # Initialize optimization components
324+ if enable_caching :
325+ self .cache = ResponseCache (max_size = cache_size , default_ttl = cache_ttl )
326+
327+ if enable_batching :
328+ self .batcher = RequestBatcher (batch_size = batch_size )
329+ # Start batch processing task only if event loop is running
330+ try :
331+ asyncio .get_running_loop ()
332+ self ._batch_task = asyncio .create_task (self .batcher ._process_batch ())
333+ except RuntimeError :
334+ # No event loop running, will create task when needed
335+ self ._batch_task = None
336+
337+ self .connection_pool = ConnectionPool ()
338+
339+ # Performance tracking
340+ self .request_count = 0
341+ self .cache_hits = 0
342+ self .batch_count = 0
343+
344+ if "/" not in llm_model :
345+ raise ValueError (
346+ f"Invalid model format '{ llm_model } '. "
347+ "Expected '{provider}/{model}', e.g. 'openai/gpt-4o'."
348+ )
349+
350+ provider = self .llm_model .split ("/" )[0 ].upper ()
351+
352+ if provider in ["OLLAMA" , "OLLAMA_CHAT" ]:
353+ if self .api_base is None :
354+ self .api_base = "http://localhost:11434"
355+ logger .warning (
356+ "Using default Ollama API base: %s. If inference is not working, you may need to set the API base to the correct URL." ,
357+ self .api_base ,
358+ )
359+ self .api_key = "your_default_api_key" # Add this line
360+ else :
361+ try :
362+ self .api_key = os .environ [f"{ provider } _API_KEY" ]
363+ except KeyError as err :
364+ raise ValueError (
365+ f"No API key found for { provider } . Please set the { provider } _API_KEY environment variable (e.g., in your .env file)."
366+ ) from err
367+
368+ if not litellm .supports_function_calling (model = self .llm_model ):
369+ logger .warning (
370+ "%s does not support function calling. This model may not be able to use tools. Please check the model documentation at https://docs.litellm.ai/docs/providers for more information." ,
371+ self .llm_model ,
372+ )
373+
374+ def _build_messages (self , prompt : str | list [str ] | None = None ) -> list [dict ]:
375+ """
376+ Format the prompt messages for the LLM of the form : {"role": ..., "content": ...}
377+
378+ Args:
379+ prompt: The prompt to generate a response for (str, list of strings, or None)
380+
381+ Returns:
382+ The messages for the LLM
383+ """
384+ messages = []
385+
386+ # Always include a system message. Default to empty string if no system prompt to support Ollama
387+ system_content = self .system_prompt if self .system_prompt else ""
388+ messages .append ({"role" : "system" , "content" : system_content })
389+
390+ if prompt :
391+ if isinstance (prompt , str ):
392+ messages .append ({"role" : "user" , "content" : prompt })
393+ elif isinstance (prompt , list ):
394+ # Use extend to add all prompts from the list
395+ messages .extend ([{"role" : "user" , "content" : p } for p in prompt ])
396+
397+ return messages
398+
399+ def generate (
400+ self ,
401+ prompt : str | list [str ] | None = None ,
402+ tool_schema : list [dict ] | None = None ,
403+ tool_choice : str = "auto" ,
404+ response_format : dict | object | None = None ,
405+ ) -> str :
406+ """
407+ Generate a response from LLM using litellm based on prompt
408+
409+ Args:
410+ prompt: The prompt to generate a response for (str, list of strings, or None)
411+ tool_schema: The schema of tools to use
412+ tool_choice: The choice of tool to use
413+ response_format: The format of response
414+
415+ Returns:
416+ The response from the LLM
417+ """
418+ # Apply global rate limiting
419+ _global_rate_limiter .acquire_sync ()
420+ try :
421+ self .request_count += 1
422+ messages = self ._build_messages (prompt )
423+
424+ # Check cache first if enabled
425+ cached_response = None
426+ if self .enable_caching :
427+ cached_response = self .cache .get (self .llm_model , messages )
428+ if cached_response is not None :
429+ self .cache_hits += 1
430+ return cached_response
431+
432+ completion_kwargs = {
433+ "model" : self .llm_model ,
434+ "messages" : messages ,
435+ "tools" : tool_schema ,
436+ "tool_choice" : tool_choice if tool_schema else None ,
437+ "response_format" : response_format ,
438+ }
439+ if self .api_base :
440+ completion_kwargs ["api_base" ] = self .api_base
441+
442+ response = completion (** completion_kwargs )
443+
444+ # Cache response if enabled
445+ if self .enable_caching :
446+ self .cache .set (self .llm_model , messages , response )
447+
448+ return response
449+ finally :
450+ # Sync limiter releases via timer
451+ pass
452+
453+ async def agenerate (
454+ self ,
455+ prompt : str | list [str ] | None = None ,
456+ tool_schema : list [dict ] | None = None ,
457+ tool_choice : str = "auto" ,
458+ response_format : dict | object | None = None ,
459+ ) -> str :
460+ """
461+ Asynchronous version of generate() method for parallel LLM calls.
462+ """
463+ # Apply global rate limiting
464+ await _global_rate_limiter .acquire_async ()
465+ try :
466+ self .request_count += 1
467+ messages = self ._build_messages (prompt )
468+
469+ # Check cache first if enabled
470+ cached_response = None
471+ if self .enable_caching :
472+ cached_response = self .cache .get (self .llm_model , messages )
473+ if cached_response is not None :
474+ self .cache_hits += 1
475+ return cached_response
476+
477+ # Use batching if enabled
478+ if self .enable_batching :
479+ request_data = {
480+ "model" : self .llm_model ,
481+ "messages" : messages ,
482+ "tools" : tool_schema ,
483+ "tool_choice" : tool_choice if tool_schema else None ,
484+ "response_format" : response_format ,
485+ "api_base" : self .api_base ,
486+ }
487+ response = await self .batcher .add_request (request_data )
488+ self .batch_count += 1
489+ else :
490+ async for attempt in AsyncRetrying (
491+ wait = wait_exponential (
492+ multiplier = 1.1 , min = 1 , max = 5
493+ ), # Gentler backoff
494+ retry = retry_if_exception_type (RETRYABLE_EXCEPTIONS ),
495+ reraise = True ,
496+ ):
497+ with attempt :
498+ completion_kwargs = {
499+ "model" : self .llm_model ,
500+ "messages" : messages ,
501+ "tools" : tool_schema ,
502+ "tool_choice" : tool_choice if tool_schema else None ,
503+ "response_format" : response_format ,
504+ }
505+ if self .api_base :
506+ completion_kwargs ["api_base" ] = self .api_base
507+
508+ response = await acompletion (** completion_kwargs )
509+
510+ # Cache response if enabled
511+ if self .enable_caching :
512+ self .cache .set (self .llm_model , messages , response )
513+
514+ return response
515+ finally :
516+ # Async limiter releases via scheduled callback
517+ pass
264518
265519 def get_performance_stats (self ) -> dict :
266520 """Get performance statistics."""
@@ -284,3 +538,34 @@ async def cleanup(self):
284538 await self ._batch_task
285539
286540 self .connection_pool .cleanup ()
541+
542+
# Helper methods for performance stats and cleanup. They read ModuleLLM
# state (request_count, cache_hits, batch_count, cache, connection_pool,
# _batch_task), so they must be attached to ModuleLLM: attaching them to
# GlobalRateLimiter (as a previous revision did) produced methods that
# always raised AttributeError when called.
def _get_performance_stats(self) -> dict:
    """Return request/cache/batch counters, plus cache stats when enabled."""
    stats = {
        "request_count": self.request_count,
        "cache_hits": self.cache_hits,
        # max(1, ...) guards against division by zero before any request.
        "cache_hit_rate": self.cache_hits / max(1, self.request_count),
        "batch_count": self.batch_count,
    }

    if self.enable_caching:
        stats.update(self.cache.get_stats())

    return stats


async def _async_cleanup(self):
    """Cancel the batch task (if any) and release pooled connections."""
    # _batch_task may be absent (batching disabled) or None (no event loop
    # was running at init time); only cancel a real task.
    task = getattr(self, "_batch_task", None)
    if task is not None:
        task.cancel()
        with contextlib.suppress(asyncio.CancelledError):
            await task

    self.connection_pool.cleanup()


# Attach only when ModuleLLM does not already define them, so any in-class
# implementation always wins over these module-level fallbacks.
if not hasattr(ModuleLLM, "get_performance_stats"):
    ModuleLLM.get_performance_stats = _get_performance_stats
if not hasattr(ModuleLLM, "async_cleanup"):
    ModuleLLM.async_cleanup = _async_cleanup
0 commit comments