2 changes: 2 additions & 0 deletions src/mcp_as_a_judge/constants.py
@@ -15,3 +15,5 @@
DATABASE_URL = "sqlite://:memory:"
MAX_SESSION_RECORDS = 20 # Maximum records to keep per session (FIFO)
MAX_TOTAL_SESSIONS = 50 # Maximum total sessions to keep (LRU cleanup)
MAX_CONTEXT_TOKENS = 50000  # Maximum tokens for session context (1 token ≈ 4 characters)
MAX_RESPONSE_TOKENS = 5000 # Maximum tokens for LLM responses
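
The "1 token ≈ 4 characters" note implies a character-based estimate rather than a real tokenizer. A minimal sketch of such an estimator follows; the name estimate_tokens and its placement are assumptions, since the token_utils module is not shown in this diff:

def estimate_tokens(text: str) -> int:
    """Rough token count using the 1 token ≈ 4 characters heuristic."""
    if not text:
        return 0
    # Ceiling division so short non-empty strings still count as at least one token.
    return (len(text) + 3) // 4
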
69 changes: 36 additions & 33 deletions src/mcp_as_a_judge/db/conversation_history_service.py
@@ -7,12 +7,17 @@
3. Managing session-based conversation history
"""

from typing import Any

from mcp_as_a_judge.db import (
ConversationHistoryDB,
ConversationRecord,
create_database_provider,
)
from mcp_as_a_judge.db.db_config import Config
from mcp_as_a_judge.db.token_utils import (
filter_records_by_token_limit,
)
from mcp_as_a_judge.logging_config import get_logger

# Set up logger
@@ -35,34 +40,54 @@ def __init__(
self.config = config
self.db = db_provider or create_database_provider(config)

async def load_context_for_enrichment(
self, session_id: str
async def load_filtered_context_for_enrichment(
self, session_id: str, current_prompt: str = "", ctx: Any = None
) -> list[ConversationRecord]:
"""
Load recent conversation records for LLM context enrichment.

Two-level filtering approach:
1. Database already enforces storage limits (record count + token limits)
2. Load-time filtering ensures history + current prompt fits within LLM context limits

Args:
session_id: Session identifier
current_prompt: Current prompt that will be sent to LLM (for token calculation)
ctx: MCP context for model detection and accurate token counting (optional)

Returns:
List of conversation records for LLM context
List of conversation records for LLM context (filtered for LLM limits)
"""
logger.info(f"🔍 Loading conversation history for session: {session_id}")

# Load recent conversations for this session
recent_records = await self.db.get_session_conversations(
session_id=session_id,
limit=self.config.database.max_session_records, # load last X records (same as save limit)
)
# Load all conversations for this session - database already contains
# records within storage limits, but we may need to filter further for LLM context
recent_records = await self.db.get_session_conversations(session_id)

logger.info(f"📚 Retrieved {len(recent_records)} conversation records from DB")
return recent_records

async def save_tool_interaction(
# Apply LLM context filtering: ensure history + current prompt will fit within token limit
# This filters the list without modifying the database (only token limit matters for LLM)
# Pass ctx for accurate token counting when available
filtered_records = await filter_records_by_token_limit(
recent_records, current_prompt=current_prompt, ctx=ctx
)

logger.info(
f"✅ Returning {len(filtered_records)} conversation records for LLM context"
)
return filtered_records

async def save_tool_interaction_and_cleanup(
self, session_id: str, tool_name: str, tool_input: str, tool_output: str
) -> str:
"""
Save a tool interaction as a conversation record.
Save a tool interaction as a conversation record and perform automatic cleanup in the provider layer.

After saving, the database provider automatically performs cleanup to enforce limits:
- Removes old records if session exceeds MAX_SESSION_RECORDS (20)
- Removes old records if session exceeds MAX_CONTEXT_TOKENS (50,000)
- Removes least recently used sessions if total sessions exceed MAX_TOTAL_SESSIONS (50)

Args:
session_id: Session identifier from AI agent
@@ -87,28 +112,6 @@ async def save_tool_interaction(
logger.info(f"✅ Saved conversation record with ID: {record_id}")
return record_id
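
The cleanup described in the docstring above lives in the database provider, which is not part of this diff. A rough sketch of the per-session policy it implies, assuming records are ordered most-recent-first (all names here are illustrative, not the provider's actual API):

from mcp_as_a_judge.constants import MAX_CONTEXT_TOKENS, MAX_SESSION_RECORDS
from mcp_as_a_judge.db import ConversationRecord


def records_to_delete(records: list[ConversationRecord]) -> list[ConversationRecord]:
    """Illustrative only: pick which records to drop so the session honors both limits."""
    kept = 0
    used_tokens = 0
    for record in records:  # assumed most-recent-first
        if kept >= MAX_SESSION_RECORDS or used_tokens + record.tokens > MAX_CONTEXT_TOKENS:
            break
        kept += 1
        used_tokens += record.tokens
    # Everything past the cut-off is the FIFO tail: the oldest records are removed.
    return records[kept:]

The LRU cleanup across sessions (MAX_TOTAL_SESSIONS) would need the provider's session index and is not sketched here.
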

async def get_conversation_history(
self, session_id: str
) -> list[ConversationRecord]:
"""
Get conversation history for a session to be injected into user prompts.

Args:
session_id: Session identifier

Returns:
List of conversation records for the session (most recent first)
"""
logger.info(f"🔄 Loading conversation history for session {session_id}")

context_records = await self.load_context_for_enrichment(session_id)

logger.info(
f"📝 Retrieved {len(context_records)} conversation records for session {session_id}"
)

return context_records

def format_conversation_history_as_json_array(
self, conversation_history: list[ConversationRecord]
) -> list[dict]:
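
filter_records_by_token_limit is imported from token_utils, which is outside this diff. Based on the docstring above ("ensure history + current prompt will fit within the token limit"), a plausible sketch of that filter, reusing the get_llm_input_limit helper introduced later in this diff and the hypothetical estimate_tokens sketched near the constants; the signature and the way the model name is read from ctx are assumptions:

from typing import Any

from mcp_as_a_judge.db import ConversationRecord
from mcp_as_a_judge.db.dynamic_token_limits import get_llm_input_limit


async def filter_records_by_token_limit(
    records: list[ConversationRecord],
    current_prompt: str = "",
    ctx: Any = None,
) -> list[ConversationRecord]:
    """Keep the newest records whose tokens, plus the current prompt, fit the input budget."""
    # Assumption: how the model name is derived from ctx is not shown in this diff.
    model_name = getattr(ctx, "model", None) if ctx is not None else None
    budget = get_llm_input_limit(model_name) - estimate_tokens(current_prompt)

    kept: list[ConversationRecord] = []
    used = 0
    for record in records:  # assumed most-recent-first, matching get_session_conversations
        if used + record.tokens > budget:
            break
        kept.append(record)
        used += record.tokens
    return kept
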
131 changes: 131 additions & 0 deletions src/mcp_as_a_judge/db/dynamic_token_limits.py
@@ -0,0 +1,131 @@
"""
Dynamic token limits based on actual model capabilities.

This module provides dynamic token limit calculation based on the actual model
being used, replacing hardcoded MAX_CONTEXT_TOKENS and MAX_RESPONSE_TOKENS
with model-specific limits from LiteLLM.
"""

from dataclasses import dataclass

from mcp_as_a_judge.constants import MAX_CONTEXT_TOKENS, MAX_RESPONSE_TOKENS
from mcp_as_a_judge.logging_config import get_logger

# Set up logger
logger = get_logger(__name__)


@dataclass
class ModelLimits:
"""Model-specific token limits."""

context_window: int # Total context window size
max_input_tokens: int # Maximum tokens for input (context + prompt)
max_output_tokens: int # Maximum tokens for output/response
model_name: str # Model name for reference
source: str # Where the limits came from ("litellm", "hardcoded", "estimated")


# Cache for model limits to avoid repeated API calls
_model_limits_cache: dict[str, ModelLimits] = {}


def get_model_limits(model_name: str | None = None) -> ModelLimits:
"""
Get token limits: start with hardcoded, upgrade from cache or LiteLLM if available.
"""
# Start with hardcoded defaults
limits = ModelLimits(
context_window=MAX_CONTEXT_TOKENS + MAX_RESPONSE_TOKENS,
max_input_tokens=MAX_CONTEXT_TOKENS,
max_output_tokens=MAX_RESPONSE_TOKENS,
model_name=model_name or "unknown",
source="hardcoded",
)

# If no model name, return hardcoded
if not model_name:
return limits

# Try to upgrade from cache
if model_name in _model_limits_cache:
return _model_limits_cache[model_name]

# Try to upgrade from LiteLLM
try:
import litellm

model_info = litellm.get_model_info(model_name)

# Extract values with proper fallbacks
context_window = model_info.get("max_tokens")
if context_window is not None:
context_window = int(context_window)
else:
context_window = limits.context_window

max_input_tokens = model_info.get("max_input_tokens")
if max_input_tokens is not None:
max_input_tokens = int(max_input_tokens)
else:
max_input_tokens = limits.max_input_tokens

max_output_tokens = model_info.get("max_output_tokens")
if max_output_tokens is not None:
max_output_tokens = int(max_output_tokens)
else:
max_output_tokens = limits.max_output_tokens

limits = ModelLimits(
context_window=context_window,
max_input_tokens=max_input_tokens,
max_output_tokens=max_output_tokens,
model_name=model_name,
source="litellm",
)

# Cache and return what we have
_model_limits_cache[model_name] = limits
logger.debug(
f"Retrieved model limits from LiteLLM for {model_name}: {limits.max_input_tokens} input tokens"
)

except ImportError:
logger.debug("LiteLLM not available, using hardcoded defaults")
except Exception as e:
logger.debug(f"Failed to get model info from LiteLLM for {model_name}: {e}")
# Continue with hardcoded defaults

return limits


def get_llm_input_limit(model_name: str | None = None) -> int:
"""
Get dynamic context token limit for conversation history.

This replaces the hardcoded MAX_CONTEXT_TOKENS with model-specific limits.

Args:
model_name: Name of the model (optional)

Returns:
Maximum tokens for conversation history/context
"""
limits = get_model_limits(model_name)
return limits.max_input_tokens


def get_llm_output_limit(model_name: str | None = None) -> int:
"""
Get dynamic response token limit for LLM output.

This replaces the hardcoded MAX_RESPONSE_TOKENS with model-specific limits.

Args:
model_name: Name of the model (optional)

Returns:
Maximum tokens for LLM response/output
"""
limits = get_model_limits(model_name)
return limits.max_output_tokens
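
A quick usage example for the module above; the model name is illustrative, and whether LiteLLM has metadata for it determines whether the result is model-specific or the hardcoded fallback:

from mcp_as_a_judge.db.dynamic_token_limits import (
    get_llm_input_limit,
    get_llm_output_limit,
    get_model_limits,
)

limits = get_model_limits("gpt-4o-mini")   # model name is illustrative
print(limits.source)                        # "litellm" if metadata was found, else "hardcoded"
print(get_llm_input_limit("gpt-4o-mini"))   # model-specific input budget
print(get_llm_output_limit(None))           # no model name -> MAX_RESPONSE_TOKENS (5000)
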
3 changes: 3 additions & 0 deletions src/mcp_as_a_judge/db/interface.py
@@ -21,6 +21,9 @@ class ConversationRecord(SQLModel, table=True):
source: str # tool name
input: str # tool input query
output: str # tool output string
tokens: int = Field(
default=0
) # combined token count for input + output (1 token ≈ 4 characters)
timestamp: datetime = Field(
default_factory=datetime.utcnow, index=True
) # when the record was created
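
For illustration, a record carrying the new tokens field might be built like this; session_id and any other fields hidden by the truncated diff are assumed, and the token count reuses the hypothetical estimate_tokens helper sketched near the constants:

tool_input = '{"plan": "..."}'       # illustrative payload
tool_output = '{"approved": true}'
record = ConversationRecord(
    session_id="session-123",        # assumed field, hidden by the truncated diff
    source="judge_coding_plan",      # illustrative tool name
    input=tool_input,
    output=tool_output,
    tokens=estimate_tokens(tool_input) + estimate_tokens(tool_output),
)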