@@ -63,6 +63,45 @@ def _calculate_messages_token_count(messages: list[MemoryMessage]) -> int:
6363 return total_tokens
6464
6565
def _calculate_context_usage_percentages(
    messages: list[MemoryMessage],
    model_name: ModelNameLiteral | None,
    context_window_max: int | None,
) -> tuple[float | None, float | None]:
    """
    Calculate context usage percentages for total usage and until summarization triggers.

    Args:
        messages: List of messages to calculate token count for
        model_name: The client's LLM model name for context window determination
        context_window_max: Direct specification of context window max tokens

    Returns:
        Tuple of (total_percentage, until_summarization_percentage)
        - total_percentage: Percentage (0-100) of total context window used
        - until_summarization_percentage: Percentage (0-100) until summarization triggers
        Both values are None if no messages or no model info provided
    """
    if not messages or (not model_name and not context_window_max):
        return None, None

    # Calculate current token usage
    current_tokens = _calculate_messages_token_count(messages)

    # Get effective token limit for the client's model
    max_tokens = _get_effective_token_limit(model_name, context_window_max)
    if max_tokens <= 0:
        # Defensive: a zero/negative limit would divide by zero below;
        # treat it the same as "no model info".
        return None, None

    # Calculate percentage of total context window used
    total_percentage = (current_tokens / max_tokens) * 100.0

    # Calculate percentage until summarization threshold. int() truncation can
    # yield 0 for tiny windows or summarization_threshold == 0, which would
    # raise ZeroDivisionError — treat that as "already at/over the threshold".
    token_threshold = int(max_tokens * settings.summarization_threshold)
    if token_threshold <= 0:
        until_summarization_percentage = 100.0
    else:
        until_summarization_percentage = (current_tokens / token_threshold) * 100.0

    # Cap both at 100% for display purposes
    return min(total_percentage, 100.0), min(until_summarization_percentage, 100.0)
104+
66105async def _summarize_working_memory (
67106 memory : WorkingMemory ,
68107 model_name : ModelNameLiteral | None = None ,
@@ -88,8 +127,8 @@ async def _summarize_working_memory(
88127 max_tokens = _get_effective_token_limit (model_name , context_window_max )
89128
90129 # Reserve space for new messages, function calls, and response generation
91- # Use 70% of context window to leave room for new content
92- token_threshold = int (max_tokens * 0.7 )
130+ # Use configurable threshold to leave room for new content
131+ token_threshold = int (max_tokens * settings . summarization_threshold )
93132
94133 if current_tokens <= token_threshold :
95134 return memory
@@ -269,7 +308,22 @@ async def get_working_memory(
269308
270309 logger .debug (f"Working mem: { working_mem } " )
271310
272- return working_mem
311+ # Calculate context usage percentages
312+ total_percentage , until_summarization_percentage = (
313+ _calculate_context_usage_percentages (
314+ messages = working_mem .messages ,
315+ model_name = model_name ,
316+ context_window_max = context_window_max ,
317+ )
318+ )
319+
320+ # Return WorkingMemoryResponse with both percentage values
321+ working_mem_data = working_mem .model_dump ()
322+ working_mem_data ["context_percentage_total_used" ] = total_percentage
323+ working_mem_data ["context_percentage_until_summarization" ] = (
324+ until_summarization_percentage
325+ )
326+ return WorkingMemoryResponse (** working_mem_data )
273327
274328
275329@router .put ("/v1/working-memory/{session_id}" , response_model = WorkingMemoryResponse )
@@ -348,7 +402,22 @@ async def put_working_memory(
348402 namespace = updated_memory .namespace ,
349403 )
350404
351- return updated_memory
405+ # Calculate context usage percentages based on the final state (after potential summarization)
406+ total_percentage , until_summarization_percentage = (
407+ _calculate_context_usage_percentages (
408+ messages = updated_memory .messages ,
409+ model_name = model_name ,
410+ context_window_max = context_window_max ,
411+ )
412+ )
413+
414+ # Return WorkingMemoryResponse with both percentage values
415+ updated_memory_data = updated_memory .model_dump ()
416+ updated_memory_data ["context_percentage_total_used" ] = total_percentage
417+ updated_memory_data ["context_percentage_until_summarization" ] = (
418+ until_summarization_percentage
419+ )
420+ return WorkingMemoryResponse (** updated_memory_data )
352421
353422
354423@router .delete ("/v1/working-memory/{session_id}" , response_model = AckResponse )
0 commit comments