
Commit 3b54c38

dori committed
feat: implement dynamic token limits with model-specific context management

This commit introduces a comprehensive token management system that replaces hardcoded limits with dynamic, model-specific token limits while maintaining backward compatibility.

## Key Features Added:

### Dynamic Token Limits (NEW)
- `src/mcp_as_a_judge/db/dynamic_token_limits.py`: New module providing model-specific token limits with LiteLLM integration
- Initialization pattern: start with hardcoded defaults, upgrade from cache or LiteLLM API if available, return whatever is available
- Caching system to avoid repeated API calls for model information

### Enhanced Token Calculation
- `src/mcp_as_a_judge/db/token_utils.py`: Upgraded to async functions with accurate LiteLLM token counting and character-based fallback
- Unified model detection from LLM config or MCP sampling context
- Functions: `calculate_tokens_in_string`, `calculate_tokens_in_record`, `filter_records_by_token_limit` (all now async)

### Two-Level Token Management
- **Database Level**: Storage limits enforced during save operations
  - Record count limit (20 per session)
  - Token count limit (dynamic based on model, fallback to 50K)
  - LRU session cleanup (50 total sessions max)
- **Load Level**: LLM context limits enforced during retrieval
  - Ensures history + current prompt fits within model's input limit
  - FIFO removal of oldest records when limits exceeded

### Updated Service Layer
- `src/mcp_as_a_judge/db/conversation_history_service.py`: Added await for async token filtering function
- `src/mcp_as_a_judge/db/providers/sqlite_provider.py`: Integrated dynamic token limits in cleanup operations

### Test Infrastructure
- `tests/test_helpers/`: New test utilities package
- `tests/test_helpers/token_utils_helpers.py`: Helper functions for token calculation testing and model cache management
- `tests/test_improved_token_counting.py`: Comprehensive async test suite
- Updated existing tests to support async token functions

## Implementation Details:

### Model Detection Strategy:
1. Try LLM configuration (fast, synchronous)
2. Try MCP sampling detection (async, requires context)
3. Fallback to None with hardcoded limits

### Token Limit Logic:
- **On Load**: Check total history + current prompt tokens against model max input
- **On Save**: Two-step cleanup (record count limit, then token limit)
- **FIFO Removal**: Always remove oldest records first to preserve recent context

### Backward Compatibility:
- All existing method signatures preserved with alias support
- Graceful fallback when model information unavailable
- No breaking changes to existing functionality

## Files Changed:
- Modified: 5 core files (service, provider, token utils, server)
- Added: 3 new files (dynamic limits, test helpers)
- Enhanced: 2 test files with async support

## Testing:
- All 160 tests pass (1 skipped for integration-only)
- Comprehensive coverage of token calculation, limits, and cleanup logic
- Edge cases and error handling verified

This implementation follows the user's preferred patterns:
- Configuration-based approach with rational fallbacks
- Clean separation of concerns between storage and LLM limits
- Efficient FIFO cleanup maintaining recent conversation context
1 parent 24f9cc7 commit 3b54c38

File tree

10 files changed: +517, -70 lines

src/mcp_as_a_judge/db/conversation_history_service.py

Lines changed: 8 additions & 4 deletions
@@ -13,7 +13,9 @@
     create_database_provider,
 )
 from mcp_as_a_judge.db.db_config import Config
-from mcp_as_a_judge.db.token_utils import filter_records_by_token_limit
+from mcp_as_a_judge.db.token_utils import (
+    filter_records_by_token_limit,
+)
 from mcp_as_a_judge.logging_config import get_logger
 
 # Set up logger
@@ -37,7 +39,7 @@ def __init__(
         self.db = db_provider or create_database_provider(config)
 
     async def load_filtered_context_for_enrichment(
-        self, session_id: str, current_prompt: str = ""
+        self, session_id: str, current_prompt: str = "", ctx=None
     ) -> list[ConversationRecord]:
         """
         Load recent conversation records for LLM context enrichment.
@@ -49,6 +51,7 @@ async def load_filtered_context_for_enrichment(
         Args:
             session_id: Session identifier
             current_prompt: Current prompt that will be sent to LLM (for token calculation)
+            ctx: MCP context for model detection and accurate token counting (optional)
 
         Returns:
             List of conversation records for LLM context (filtered for LLM limits)
@@ -63,8 +66,9 @@ async def load_filtered_context_for_enrichment(
 
         # Apply LLM context filtering: ensure history + current prompt will fit within token limit
         # This filters the list without modifying the database (only token limit matters for LLM)
-        filtered_records = filter_records_by_token_limit(
-            recent_records, current_prompt=current_prompt
+        # Pass ctx for accurate token counting when available
+        filtered_records = await filter_records_by_token_limit(
+            recent_records, current_prompt=current_prompt, ctx=ctx
         )
 
         logger.info(
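
The diff above now awaits `filter_records_by_token_limit`, whose `token_utils.py` implementation is not shown in this commit excerpt. As a rough illustration of the load-level behavior the commit message describes (keep the newest records whose tokens, plus the current prompt, fit the model's input limit), here is a self-contained sketch; the record shape and the numbers are assumptions for illustration, not the project's actual types:

```python
from dataclasses import dataclass


@dataclass
class Record:
    content: str
    tokens: int


def fifo_filter(
    records_newest_first: list[Record], prompt_tokens: int, max_input_tokens: int
) -> list[Record]:
    """Keep the newest records whose combined tokens (plus the prompt) fit the input limit."""
    kept: list[Record] = []
    running = prompt_tokens
    for record in records_newest_first:  # newest first, so the oldest records drop out (FIFO)
        if running + record.tokens <= max_input_tokens:
            kept.append(record)
            running += record.tokens
        else:
            break
    return kept


# With a 50K fallback limit and a 1K prompt, only the two newest records fit:
history = [Record("newest", 20_000), Record("older", 25_000), Record("oldest", 10_000)]
print([r.content for r in fifo_filter(history, 1_000, 50_000)])  # ['newest', 'older']
```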
src/mcp_as_a_judge/db/dynamic_token_limits.py

Lines changed: 106 additions & 0 deletions

@@ -0,0 +1,106 @@
+"""
+Dynamic token limits based on actual model capabilities.
+
+This module provides dynamic token limit calculation based on the actual model
+being used, replacing hardcoded MAX_CONTEXT_TOKENS and MAX_RESPONSE_TOKENS
+with model-specific limits from LiteLLM.
+"""
+
+from dataclasses import dataclass
+
+from mcp_as_a_judge.constants import MAX_CONTEXT_TOKENS, MAX_RESPONSE_TOKENS
+
+
+@dataclass
+class ModelLimits:
+    """Model-specific token limits."""
+
+    context_window: int  # Total context window size
+    max_input_tokens: int  # Maximum tokens for input (context + prompt)
+    max_output_tokens: int  # Maximum tokens for output/response
+    model_name: str  # Model name for reference
+    source: str  # Where the limits came from ("litellm", "hardcoded", "estimated")
+
+
+# Cache for model limits to avoid repeated API calls
+_model_limits_cache: dict[str, ModelLimits] = {}
+
+
+def get_model_limits(model_name: str | None = None) -> ModelLimits:
+    """
+    Get token limits: start with hardcoded, upgrade from cache or LiteLLM if available.
+    """
+    # Start with hardcoded defaults
+    limits = ModelLimits(
+        context_window=MAX_CONTEXT_TOKENS + MAX_RESPONSE_TOKENS,
+        max_input_tokens=MAX_CONTEXT_TOKENS,
+        max_output_tokens=MAX_RESPONSE_TOKENS,
+        model_name=model_name or "unknown",
+        source="hardcoded",
+    )
+
+    # If no model name, return hardcoded
+    if not model_name:
+        return limits
+
+    # Try to upgrade from cache
+    if model_name in _model_limits_cache:
+        return _model_limits_cache[model_name]
+
+    # Try to upgrade from LiteLLM
+    try:
+        import litellm
+
+        model_info = litellm.get_model_info(model_name)
+
+        limits = ModelLimits(
+            context_window=model_info.get("max_tokens", limits.context_window),
+            max_input_tokens=model_info.get(
+                "max_input_tokens", limits.max_input_tokens
+            ),
+            max_output_tokens=model_info.get(
+                "max_output_tokens", limits.max_output_tokens
+            ),
+            model_name=model_name,
+            source="litellm",
+        )
+
+        # Cache and return what we have
+        _model_limits_cache[model_name] = limits
+
+    except Exception:
+        pass
+
+    return limits
+
+
+def get_llm_input_limit(model_name: str | None = None) -> int:
+    """
+    Get dynamic context token limit for conversation history.
+
+    This replaces the hardcoded MAX_CONTEXT_TOKENS with model-specific limits.
+
+    Args:
+        model_name: Name of the model (optional)
+
+    Returns:
+        Maximum tokens for conversation history/context
+    """
+    limits = get_model_limits(model_name)
+    return limits.max_input_tokens
+
+
+def get_llm_output_limit(model_name: str | None = None) -> int:
+    """
+    Get dynamic response token limit for LLM output.
+
+    This replaces the hardcoded MAX_RESPONSE_TOKENS with model-specific limits.
+
+    Args:
+        model_name: Name of the model (optional)
+
+    Returns:
+        Maximum tokens for LLM response/output
+    """
+    limits = get_model_limits(model_name)
+    return limits.max_output_tokens

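A quick usage sketch for the new module above (the model name is illustrative; if LiteLLM does not recognize it, the hardcoded `MAX_CONTEXT_TOKENS`/`MAX_RESPONSE_TOKENS` defaults come back with `source="hardcoded"`):

```python
from mcp_as_a_judge.db.dynamic_token_limits import get_llm_input_limit, get_model_limits

limits = get_model_limits("gpt-4o")  # illustrative model name
print(limits.source, limits.max_input_tokens, limits.max_output_tokens)

# Callers that only need the history/input budget:
input_budget = get_llm_input_limit("gpt-4o")
```

Repeated lookups for the same model hit the in-module cache instead of calling `litellm.get_model_info` again.
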
src/mcp_as_a_judge/db/providers/sqlite_provider.py

Lines changed: 18 additions & 16 deletions
@@ -11,10 +11,10 @@
 from sqlalchemy import create_engine
 from sqlmodel import Session, SQLModel, desc, select
 
-from mcp_as_a_judge.constants import MAX_CONTEXT_TOKENS
 from mcp_as_a_judge.db.cleanup_service import ConversationCleanupService
+from mcp_as_a_judge.db.dynamic_token_limits import get_llm_input_limit
 from mcp_as_a_judge.db.interface import ConversationHistoryDB, ConversationRecord
-from mcp_as_a_judge.db.token_utils import calculate_record_tokens
+from mcp_as_a_judge.db.token_utils import calculate_tokens_in_record, detect_model_name
 from mcp_as_a_judge.logging_config import get_logger
 
 # Set up logger
@@ -94,16 +94,14 @@ def _cleanup_excess_sessions(self) -> int:
         """
         return self._cleanup_service.cleanup_excess_sessions()
 
-    def _cleanup_old_messages(self, session_id: str) -> int:
+    async def _cleanup_old_messages(self, session_id: str) -> int:
         """
-        Remove old messages from a session using efficient hybrid FIFO strategy.
+        Remove old messages from a session using token-based FIFO cleanup.
 
-        Two-step process:
-        1. If record count > max_records, remove oldest record
-        2. If total tokens > max_tokens, remove oldest records until within limit
+        Uses dynamic token limits based on current model (get_llm_input_limit).
+        Removes oldest records until total tokens are within the model's input limit.
 
         Optimization: Single DB query with ORDER BY, then in-memory list operations.
-        Eliminates 2 extra database queries compared to naive implementation.
         """
         with Session(self.engine) as session:
             # Get current records ordered by timestamp DESC (newest first for token calculation)
@@ -140,25 +138,29 @@ def _cleanup_old_messages(self, session_id: str) -> int:
                 # Update our in-memory list to reflect the deletion
                 current_records.remove(oldest_record)
 
-            # STEP 2: Handle token limit (list is already sorted newest first - perfect for token calculation)
+            # STEP 2: Handle token limit using dynamic model-specific limits
             current_tokens = sum(record.tokens for record in current_records)
 
+            # Get dynamic token limit based on current model
+            model_name = await detect_model_name()
+            max_input_tokens = get_llm_input_limit(model_name)
+
             logger.info(
                 f" 🔢 {len(current_records)} records, {current_tokens} tokens "
-                f"(max: {MAX_CONTEXT_TOKENS})"
+                f"(max: {max_input_tokens} for model: {model_name or 'default'})"
             )
 
-            if current_tokens > MAX_CONTEXT_TOKENS:
+            if current_tokens > max_input_tokens:
                 logger.info(
-                    f" 🚨 Token limit exceeded, removing oldest records to fit within {MAX_CONTEXT_TOKENS} tokens"
+                    f" 🚨 Token limit exceeded, removing oldest records to fit within {max_input_tokens} tokens"
                 )
 
                 # Calculate which records to keep (newest first, within token limit)
                 records_to_keep = []
                 running_tokens = 0
 
                 for record in current_records:  # Already ordered newest first
-                    if running_tokens + record.tokens <= MAX_CONTEXT_TOKENS:
+                    if running_tokens + record.tokens <= max_input_tokens:
                         records_to_keep.append(record)
                         running_tokens += record.tokens
                     else:
@@ -220,7 +222,7 @@ async def save_conversation(
         is_new_session = self._is_new_session(session_id)
 
         # Calculate token count for input + output
-        token_count = calculate_record_tokens(input_data, output)
+        token_count = await calculate_tokens_in_record(input_data, output)
 
         # Create new record
         record = ConversationRecord(
@@ -244,9 +246,9 @@ async def save_conversation(
             logger.info(f"🆕 New session detected: {session_id}, running LRU cleanup")
             self._cleanup_excess_sessions()
 
-        # Per-session FIFO cleanup: maintain max 20 records per session
+        # Per-session FIFO cleanup: maintain max records per session and model-specific token limits
         # (runs on every save)
-        self._cleanup_old_messages(session_id)
+        await self._cleanup_old_messages(session_id)
 
         return record_id
 
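The provider above awaits `calculate_tokens_in_record` and `detect_model_name` from `token_utils.py`, which is not part of this excerpt. A minimal sketch of the counting behavior the commit message describes (accurate LiteLLM counting when the model is known, character-based fallback otherwise); the function name and the ~4 characters/token heuristic are assumptions, not the project's actual code:

```python
import litellm


async def count_tokens(text: str, model_name: str | None = None) -> int:
    """Sketch: accurate count via LiteLLM when possible, rough heuristic otherwise."""
    if model_name:
        try:
            # Uses the model's own tokenizer when LiteLLM recognizes the model.
            return litellm.token_counter(model=model_name, text=text)
        except Exception:
            pass
    # Fallback: approximate tokens as characters / 4.
    return max(1, len(text) // 4)
```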