2 changes: 2 additions & 0 deletions src/mcp_as_a_judge/constants.py
@@ -15,3 +15,5 @@
DATABASE_URL = "sqlite://:memory:"
MAX_SESSION_RECORDS = 20 # Maximum records to keep per session (FIFO)
MAX_TOTAL_SESSIONS = 50 # Maximum total sessions to keep (LRU cleanup)
MAX_CONTEXT_TOKENS = 50000  # Maximum tokens for session context (1 token ≈ 4 characters)
MAX_RESPONSE_TOKENS = 5000 # Maximum tokens for LLM responses
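
The "1 token ≈ 4 characters" note implies a character-based estimate rather than a real tokenizer. A minimal sketch of such an estimator follows; the name estimate_tokens and its placement are assumptions, since the token_utils module is not shown in this diff:

def estimate_tokens(text: str) -> int:
    """Rough token count using the 1 token ≈ 4 characters heuristic."""
    if not text:
        return 0
    # Ceiling division so short non-empty strings still count as at least one token.
    return (len(text) + 3) // 4
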
69 changes: 36 additions & 33 deletions src/mcp_as_a_judge/db/conversation_history_service.py
@@ -7,12 +7,17 @@
3. Managing session-based conversation history
"""

from typing import Any

from mcp_as_a_judge.db import (
ConversationHistoryDB,
ConversationRecord,
create_database_provider,
)
from mcp_as_a_judge.db.db_config import Config
from mcp_as_a_judge.db.token_utils import (
filter_records_by_token_limit,
)
from mcp_as_a_judge.logging_config import get_logger

# Set up logger
@@ -35,34 +40,54 @@ def __init__(
self.config = config
self.db = db_provider or create_database_provider(config)

async def load_context_for_enrichment(
self, session_id: str
async def load_filtered_context_for_enrichment(
self, session_id: str, current_prompt: str = "", ctx: Any = None
) -> list[ConversationRecord]:
"""
Load recent conversation records for LLM context enrichment.

Two-level filtering approach:
1. Database already enforces storage limits (record count + token limits)
2. Load-time filtering ensures history + current prompt fits within LLM context limits

Args:
session_id: Session identifier
current_prompt: Current prompt that will be sent to LLM (for token calculation)
ctx: MCP context for model detection and accurate token counting (optional)

Returns:
List of conversation records for LLM context
List of conversation records for LLM context (filtered for LLM limits)
"""
logger.info(f"🔍 Loading conversation history for session: {session_id}")

# Load recent conversations for this session
recent_records = await self.db.get_session_conversations(
session_id=session_id,
limit=self.config.database.max_session_records, # load last X records (same as save limit)
)
# Load all conversations for this session - database already contains
# records within storage limits, but we may need to filter further for LLM context
recent_records = await self.db.get_session_conversations(session_id)

logger.info(f"📚 Retrieved {len(recent_records)} conversation records from DB")
return recent_records

async def save_tool_interaction(
# Apply LLM context filtering: ensure history + current prompt will fit within token limit
# This filters the list without modifying the database (only token limit matters for LLM)
# Pass ctx for accurate token counting when available
filtered_records = await filter_records_by_token_limit(
recent_records, current_prompt=current_prompt, ctx=ctx
)

logger.info(
f"✅ Returning {len(filtered_records)} conversation records for LLM context"
)
return filtered_records

async def save_tool_interaction_and_cleanup(
self, session_id: str, tool_name: str, tool_input: str, tool_output: str
) -> str:
"""
Save a tool interaction as a conversation record.
Save a tool interaction as a conversation record and perform automatic cleanup in the provider layer.

After saving, the database provider automatically performs cleanup to enforce limits:
- Removes old records if session exceeds MAX_SESSION_RECORDS (20)
- Removes old records if session exceeds MAX_CONTEXT_TOKENS (50,000)
- Removes least recently used sessions if total sessions exceed MAX_TOTAL_SESSIONS (50)

Args:
session_id: Session identifier from AI agent
@@ -87,28 +112,6 @@ async def save_tool_interaction(
logger.info(f"✅ Saved conversation record with ID: {record_id}")
return record_id
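
The cleanup described in the docstring above lives in the database provider, which is not part of this diff. A rough sketch of the per-session policy it implies, assuming records are ordered most-recent-first (all names here are illustrative, not the provider's actual API):

from mcp_as_a_judge.constants import MAX_CONTEXT_TOKENS, MAX_SESSION_RECORDS
from mcp_as_a_judge.db import ConversationRecord


def records_to_delete(records: list[ConversationRecord]) -> list[ConversationRecord]:
    """Illustrative only: pick which records to drop so the session honors both limits."""
    kept = 0
    used_tokens = 0
    for record in records:  # assumed most-recent-first
        if kept >= MAX_SESSION_RECORDS or used_tokens + record.tokens > MAX_CONTEXT_TOKENS:
            break
        kept += 1
        used_tokens += record.tokens
    # Everything past the cut-off is the FIFO tail: the oldest records are removed.
    return records[kept:]

The LRU cleanup across sessions (MAX_TOTAL_SESSIONS) would need the provider's session index and is not sketched here.
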

async def get_conversation_history(
self, session_id: str
) -> list[ConversationRecord]:
"""
Get conversation history for a session to be injected into user prompts.

Args:
session_id: Session identifier

Returns:
List of conversation records for the session (most recent first)
"""
logger.info(f"🔄 Loading conversation history for session {session_id}")

context_records = await self.load_context_for_enrichment(session_id)

logger.info(
f"📝 Retrieved {len(context_records)} conversation records for session {session_id}"
)

return context_records

def format_conversation_history_as_json_array(
self, conversation_history: list[ConversationRecord]
) -> list[dict]:
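
filter_records_by_token_limit is imported from token_utils, which is outside this diff. Based on the docstring above ("ensure history + current prompt will fit within the token limit"), a plausible sketch of that filter, reusing the get_llm_input_limit helper introduced later in this diff and the hypothetical estimate_tokens sketched near the constants; the signature and the way the model name is read from ctx are assumptions:

from typing import Any

from mcp_as_a_judge.db import ConversationRecord
from mcp_as_a_judge.db.dynamic_token_limits import get_llm_input_limit


async def filter_records_by_token_limit(
    records: list[ConversationRecord],
    current_prompt: str = "",
    ctx: Any = None,
) -> list[ConversationRecord]:
    """Keep the newest records whose tokens, plus the current prompt, fit the input budget."""
    # Assumption: how the model name is derived from ctx is not shown in this diff.
    model_name = getattr(ctx, "model", None) if ctx is not None else None
    budget = get_llm_input_limit(model_name) - estimate_tokens(current_prompt)

    kept: list[ConversationRecord] = []
    used = 0
    for record in records:  # assumed most-recent-first, matching get_session_conversations
        if used + record.tokens > budget:
            break
        kept.append(record)
        used += record.tokens
    return kept
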
131 changes: 131 additions & 0 deletions src/mcp_as_a_judge/db/dynamic_token_limits.py
@@ -0,0 +1,131 @@
"""
Dynamic token limits based on actual model capabilities.

This module provides dynamic token limit calculation based on the actual model
being used, replacing hardcoded MAX_CONTEXT_TOKENS and MAX_RESPONSE_TOKENS
with model-specific limits from LiteLLM.
"""

from dataclasses import dataclass

from mcp_as_a_judge.constants import MAX_CONTEXT_TOKENS, MAX_RESPONSE_TOKENS
from mcp_as_a_judge.logging_config import get_logger

# Set up logger
logger = get_logger(__name__)


@dataclass
class ModelLimits:
"""Model-specific token limits."""

context_window: int # Total context window size
max_input_tokens: int # Maximum tokens for input (context + prompt)
max_output_tokens: int # Maximum tokens for output/response
model_name: str # Model name for reference
source: str # Where the limits came from ("litellm", "hardcoded", "estimated")


# Cache for model limits to avoid repeated API calls
_model_limits_cache: dict[str, ModelLimits] = {}


def get_model_limits(model_name: str | None = None) -> ModelLimits:
"""
Get token limits: start with hardcoded, upgrade from cache or LiteLLM if available.
"""
# Start with hardcoded defaults
limits = ModelLimits(
context_window=MAX_CONTEXT_TOKENS + MAX_RESPONSE_TOKENS,
max_input_tokens=MAX_CONTEXT_TOKENS,
max_output_tokens=MAX_RESPONSE_TOKENS,
model_name=model_name or "unknown",
source="hardcoded",
)

# If no model name, return hardcoded
if not model_name:
return limits

# Try to upgrade from cache
if model_name in _model_limits_cache:
return _model_limits_cache[model_name]

# Try to upgrade from LiteLLM
try:
import litellm

model_info = litellm.get_model_info(model_name)

# Extract values with proper fallbacks
context_window = model_info.get("max_tokens")
if context_window is not None:
context_window = int(context_window)
else:
context_window = limits.context_window

max_input_tokens = model_info.get("max_input_tokens")
if max_input_tokens is not None:
max_input_tokens = int(max_input_tokens)
else:
max_input_tokens = limits.max_input_tokens

max_output_tokens = model_info.get("max_output_tokens")
if max_output_tokens is not None:
max_output_tokens = int(max_output_tokens)
else:
max_output_tokens = limits.max_output_tokens

limits = ModelLimits(
context_window=context_window,
max_input_tokens=max_input_tokens,
max_output_tokens=max_output_tokens,
model_name=model_name,
source="litellm",
)

# Cache and return what we have
_model_limits_cache[model_name] = limits
logger.debug(
f"Retrieved model limits from LiteLLM for {model_name}: {limits.max_input_tokens} input tokens"
)

except ImportError:
logger.debug("LiteLLM not available, using hardcoded defaults")
except Exception as e:
logger.debug(f"Failed to get model info from LiteLLM for {model_name}: {e}")
# Continue with hardcoded defaults

return limits


def get_llm_input_limit(model_name: str | None = None) -> int:
"""
Get dynamic context token limit for conversation history.

This replaces the hardcoded MAX_CONTEXT_TOKENS with model-specific limits.

Args:
model_name: Name of the model (optional)

Returns:
Maximum tokens for conversation history/context
"""
limits = get_model_limits(model_name)
return limits.max_input_tokens


def get_llm_output_limit(model_name: str | None = None) -> int:
"""
Get dynamic response token limit for LLM output.

This replaces the hardcoded MAX_RESPONSE_TOKENS with model-specific limits.

Args:
model_name: Name of the model (optional)

Returns:
Maximum tokens for LLM response/output
"""
limits = get_model_limits(model_name)
return limits.max_output_tokens
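
A quick usage example for the module above; the model name is illustrative, and whether LiteLLM has metadata for it determines whether the result is model-specific or the hardcoded fallback:

from mcp_as_a_judge.db.dynamic_token_limits import (
    get_llm_input_limit,
    get_llm_output_limit,
    get_model_limits,
)

limits = get_model_limits("gpt-4o-mini")   # model name is illustrative
print(limits.source)                        # "litellm" if metadata was found, else "hardcoded"
print(get_llm_input_limit("gpt-4o-mini"))   # model-specific input budget
print(get_llm_output_limit(None))           # no model name -> MAX_RESPONSE_TOKENS (5000)
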
3 changes: 3 additions & 0 deletions src/mcp_as_a_judge/db/interface.py
@@ -21,6 +21,9 @@ class ConversationRecord(SQLModel, table=True):
source: str # tool name
input: str # tool input query
output: str # tool output string
tokens: int = Field(
default=0
) # combined token count for input + output (1 token ≈ 4 characters)
timestamp: datetime = Field(
default_factory=datetime.utcnow, index=True
) # when the record was created
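
For illustration, a record carrying the new tokens field might be built like this; session_id and any other fields hidden by the truncated diff are assumed, and the token count reuses the hypothetical estimate_tokens helper sketched near the constants:

tool_input = '{"plan": "..."}'       # illustrative payload
tool_output = '{"approved": true}'
record = ConversationRecord(
    session_id="session-123",        # assumed field, hidden by the truncated diff
    source="judge_coding_plan",      # illustrative tool name
    input=tool_input,
    output=tool_output,
    tokens=estimate_tokens(tool_input) + estimate_tokens(tool_output),
)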