Commit 19d5af5

Add context usage percentage to working memory endpoints
- Add context_usage_percentage field to WorkingMemoryResponse model
- Add _calculate_context_usage_percentage() helper function
- Update GET /v1/working-memory/{session_id} to return percentage
- Update PUT /v1/working-memory/{session_id} to return percentage based on final state (after potential summarization)
- Percentage calculated as (current_tokens / token_threshold) * 100 where token_threshold = context_window * 0.7
- Returns None when no model info provided, otherwise 0-100% value

Resolves #37

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-authored-by: Andrew Brookins <[email protected]>
1 parent: c3fd604 · commit: 19d5af5
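The formula in the commit message is easy to sanity-check in isolation. A minimal standalone sketch of the calculation — the function name and example numbers below are illustrative, not taken from this diff:

def context_usage_percentage(current_tokens: int, context_window: int) -> float:
    # Same 70%-of-context threshold the server uses before auto-summarization
    token_threshold = int(context_window * 0.7)
    percentage = (current_tokens / token_threshold) * 100.0
    return min(percentage, 100.0)  # capped at 100 for display

# Example: 3,500 tokens against a 10,000-token window
# -> threshold = 7,000 -> 50.0%
print(context_usage_percentage(3_500, 10_000))  # 50.0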

3 files changed: +73 additions, -11 deletions

agent_memory_server/api.py

Lines changed: 59 additions & 2 deletions
@@ -63,6 +63,41 @@ def _calculate_messages_token_count(messages: list[MemoryMessage]) -> int:
     return total_tokens
 
 
+def _calculate_context_usage_percentage(
+    messages: list[MemoryMessage],
+    model_name: ModelNameLiteral | None,
+    context_window_max: int | None,
+) -> float | None:
+    """
+    Calculate the percentage of context window used before auto-summarization triggers.
+
+    Args:
+        messages: List of messages to calculate token count for
+        model_name: The client's LLM model name for context window determination
+        context_window_max: Direct specification of context window max tokens
+
+    Returns:
+        Percentage (0-100) of context used, or None if no model info provided
+    """
+    if not messages or (not model_name and not context_window_max):
+        return None
+
+    # Calculate current token usage
+    current_tokens = _calculate_messages_token_count(messages)
+
+    # Get effective token limit for the client's model
+    max_tokens = _get_effective_token_limit(model_name, context_window_max)
+
+    # Use the same threshold as _summarize_working_memory (70% of context window)
+    token_threshold = int(max_tokens * 0.7)
+
+    # Calculate percentage of threshold used
+    percentage = (current_tokens / token_threshold) * 100.0
+
+    # Cap at 100% for display purposes
+    return min(percentage, 100.0)
+
+
 async def _summarize_working_memory(
     memory: WorkingMemory,
     model_name: ModelNameLiteral | None = None,
@@ -269,7 +304,18 @@ async def get_working_memory(
 
     logger.debug(f"Working mem: {working_mem}")
 
-    return working_mem
+    # Calculate context usage percentage
+    context_usage_percentage = _calculate_context_usage_percentage(
+        messages=working_mem.messages,
+        model_name=model_name,
+        context_window_max=context_window_max,
+    )
+
+    # Return WorkingMemoryResponse with percentage
+    return WorkingMemoryResponse(
+        **working_mem.model_dump(),
+        context_usage_percentage=context_usage_percentage,
+    )
 
 
 @router.put("/v1/working-memory/{session_id}", response_model=WorkingMemoryResponse)
@@ -348,7 +394,18 @@ async def put_working_memory(
         namespace=updated_memory.namespace,
     )
 
-    return updated_memory
+    # Calculate context usage percentage based on the final state (after potential summarization)
+    context_usage_percentage = _calculate_context_usage_percentage(
+        messages=updated_memory.messages,
+        model_name=model_name,
+        context_window_max=context_window_max,
+    )
+
+    # Return WorkingMemoryResponse with percentage
+    return WorkingMemoryResponse(
+        **updated_memory.model_dump(),
+        context_usage_percentage=context_usage_percentage,
+    )
 
 
 @router.delete("/v1/working-memory/{session_id}", response_model=AckResponse)
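For orientation, a hypothetical client-side sketch of consuming the new field. Nothing below is part of the commit: the base URL, port, and the assumption that model_name travels as a query parameter are guesses based on the endpoint signature.

import httpx

# Fetch working memory for a session, supplying model info so the
# server can compute context_usage_percentage (otherwise it is None).
resp = httpx.get(
    "http://localhost:8000/v1/working-memory/my-session",  # assumed host/port
    params={"model_name": "gpt-4o"},  # assumed query-parameter transport
)
resp.raise_for_status()
usage = resp.json().get("context_usage_percentage")

if usage is not None and usage >= 80:
    # Nearing the 70%-of-context threshold, so summarization will trigger soon
    print(f"Context usage at {usage:.0f}% of the summarization threshold")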

agent_memory_server/models.py

Lines changed: 5 additions & 0 deletions
@@ -222,6 +222,11 @@ class WorkingMemory(BaseModel):
 class WorkingMemoryResponse(WorkingMemory):
     """Response containing working memory"""
 
+    context_usage_percentage: float | None = Field(
+        default=None,
+        description="Percentage of context window used before auto-summarization triggers (0-100)",
+    )
+
 
 class WorkingMemoryRequest(BaseModel):
     """Request parameters for working memory operations"""

tests/test_full_integration.py

Lines changed: 9 additions & 9 deletions
@@ -773,9 +773,9 @@ async def test_memory_prompt_with_long_term_search(
         )
         for msg in messages
     )
-    assert (
-        relevant_context_found
-    ), f"No relevant memory context found in messages: {messages}"
+    assert relevant_context_found, (
+        f"No relevant memory context found in messages: {messages}"
+    )
 
     # Cleanup
     await client.delete_long_term_memories([m.id for m in test_memories])
@@ -1079,9 +1079,9 @@ async def test_full_workflow_integration(
     )
     print(f"No topic filter search results: {no_topic_search}")
 
-    assert (
-        len(search_results["memories"]) > 0
-    ), f"No memories found in search results: {search_results}"
+    assert len(search_results["memories"]) > 0, (
+        f"No memories found in search results: {search_results}"
+    )
 
     # 6. Test tool integration with a realistic scenario
     tool_call = {
@@ -1126,9 +1126,9 @@ async def test_full_workflow_integration(
         m for m in long_term_memories.memories if m.id.startswith(memory_id_prefix)
     ]
 
-    assert (
-        len(our_memories) == 0
-    ), f"Expected 0 of our memories but found {len(our_memories)}: {our_memories}"
+    assert len(our_memories) == 0, (
+        f"Expected 0 of our memories but found {len(our_memories)}: {our_memories}"
+    )
 
 
 @pytest.mark.integration
