
Commit c3d60c8

feat: add debug output with token/cost metrics
- Added llm_end callback to openai_client.py and llm.py with:
  - tokens_in, tokens_out from usage
  - cost from litellm.completion_cost()
  - latency_ms timing
- Updated trace.py and status.py to display metrics
- Added show_metrics param to enable debug mode metrics display
- Agent.py passes metrics flag to enable_status_output

Debug output now shows:

[17:18:22] Calling LLM (gpt-4o-mini)...
[17:18:24] │ 📊 gpt-4o-mini: 104→14 tokens [1.5s]
[17:18:24] ▸ get_weather(city='Tokyo')
[17:18:24] Calling LLM (gpt-4o-mini)...
[17:18:25] │ 📊 gpt-4o-mini: 133→8 tokens [0.7s]

Response: The weather in Tokyo is sunny.
1 parent 8c509b4 commit c3d60c8

6 files changed, +168 −17 lines


src/praisonai-agents/basic-agents-tools.py

Lines changed: 2 additions & 2 deletions
@@ -5,9 +5,9 @@ def get_weather(city: str) -> str:
 
 agent = Agent(
     instructions="You are a helpful assistant",
-    llm="openai/gpt-4o-mini",
+    llm="gpt-4o-mini",
     tools=[get_weather],
-    output="status"
+    output="debug"
 )
 
 agent.start("What is the weather in Tokyo?")
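For reference, a minimal sketch of the full example script after this change; the import line and the body of get_weather are assumptions (only the function signature and the Agent(...) call appear in the hunk above):

from praisonaiagents import Agent  # assumed import path

def get_weather(city: str) -> str:
    # Hypothetical tool body; the real example returns a canned weather string.
    return f"The weather in {city} is sunny."

agent = Agent(
    instructions="You are a helpful assistant",
    llm="gpt-4o-mini",
    tools=[get_weather],
    output="debug"  # new preset: status output plus token/cost metrics
)

agent.start("What is the weather in Tokyo?")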

src/praisonai-agents/praisonaiagents/agent/agent.py

Lines changed: 3 additions & 1 deletion
@@ -500,11 +500,13 @@ def __init__(
                 if not is_status_output_enabled():
                     output_format = "jsonl" if json_output else "text"
                     # simple_output=True means status preset (no timestamps)
+                    # metrics=True means debug preset (show token/cost info)
                     enable_status_output(
                         redact=True,
                         use_color=True,
                         format=output_format,
-                        show_timestamps=not simple_output
+                        show_timestamps=not simple_output,
+                        show_metrics=metrics
                     )
             except ImportError:
                 pass  # Status module not available

src/praisonai-agents/praisonaiagents/llm/llm.py

Lines changed: 33 additions & 0 deletions
@@ -1574,6 +1574,39 @@ def get_response(
             if self.metrics:
                 self._track_token_usage(final_response, self.model)
 
+            # Trigger llm_end callback with metrics for debug output
+            llm_latency_ms = (time.time() - current_time) * 1000
+
+            # Extract usage - handle both dict and ModelResponse object
+            tokens_in = 0
+            tokens_out = 0
+            if isinstance(final_response, dict):
+                usage = final_response.get("usage", {})
+                tokens_in = usage.get("prompt_tokens", 0)
+                tokens_out = usage.get("completion_tokens", 0)
+            else:
+                # ModelResponse object
+                usage = getattr(final_response, 'usage', None)
+                if usage:
+                    tokens_in = getattr(usage, 'prompt_tokens', 0) or 0
+                    tokens_out = getattr(usage, 'completion_tokens', 0) or 0
+
+            # Calculate cost if available
+            llm_cost = None
+            try:
+                llm_cost = litellm.completion_cost(completion_response=final_response)
+            except Exception:
+                pass
+
+            execute_sync_callback(
+                'llm_end',
+                model=self.model,
+                tokens_in=tokens_in,
+                tokens_out=tokens_out,
+                cost=llm_cost,
+                latency_ms=llm_latency_ms
+            )
+
             # Execute callbacks and display based on verbose setting
             generation_time_val = time.time() - current_time
             response_content = f"Reasoning:\n{reasoning_content}\n\nAnswer:\n{response_text}" if reasoning_content else response_text
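For illustration, the same usage/cost extraction as a standalone helper. This is a sketch only: litellm.completion_cost() is the real litellm call used above, while extract_llm_metrics is a hypothetical name, not part of this codebase:

import litellm

def extract_llm_metrics(final_response):
    """Hypothetical helper mirroring the extraction logic above."""
    if isinstance(final_response, dict):
        usage = final_response.get("usage", {}) or {}
        tokens_in = usage.get("prompt_tokens", 0)
        tokens_out = usage.get("completion_tokens", 0)
    else:
        # ModelResponse-style object with a .usage attribute
        usage = getattr(final_response, "usage", None)
        tokens_in = (getattr(usage, "prompt_tokens", 0) or 0) if usage else 0
        tokens_out = (getattr(usage, "completion_tokens", 0) or 0) if usage else 0

    cost = None
    try:
        # Estimated USD cost; raises for models litellm cannot price.
        cost = litellm.completion_cost(completion_response=final_response)
    except Exception:
        cost = None

    return tokens_in, tokens_out, cost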

src/praisonai-agents/praisonaiagents/llm/openai_client.py

Lines changed: 29 additions & 0 deletions
@@ -1205,6 +1205,35 @@ def chat_completion_with_tools(
             if not final_response:
                 return None
 
+            # Trigger llm_end callback with metrics for debug output
+            llm_end_time = time.perf_counter()
+            llm_latency_ms = (llm_end_time - start_time) * 1000
+
+            # Extract usage info if available
+            usage = getattr(final_response, 'usage', None)
+            tokens_in = getattr(usage, 'prompt_tokens', 0) if usage else 0
+            tokens_out = getattr(usage, 'completion_tokens', 0) if usage else 0
+
+            # Calculate cost if litellm available
+            cost = None
+            try:
+                import litellm
+                if hasattr(final_response, 'model_dump'):
+                    cost = litellm.completion_cost(completion_response=final_response.model_dump())
+                elif isinstance(final_response, dict):
+                    cost = litellm.completion_cost(completion_response=final_response)
+            except Exception:
+                pass  # Cost calculation is optional
+
+            execute_sync_callback(
+                'llm_end',
+                model=model,
+                tokens_in=tokens_in,
+                tokens_out=tokens_out,
+                cost=cost,
+                latency_ms=llm_latency_ms
+            )
+
             # Check for tool calls
             tool_calls = getattr(final_response.choices[0].message, 'tool_calls', None)

src/praisonai-agents/praisonaiagents/output/status.py

Lines changed: 54 additions & 9 deletions
@@ -79,12 +79,14 @@ def __init__(
         redact: bool = True,
         use_color: bool = True,
         show_timestamps: bool = True,  # NEW: control timestamp display
+        show_metrics: bool = False,  # Enable metrics display for debug mode
     ):
         self._file = file or sys.stderr  # Use stderr to not interfere with agent output
         self._format = format
         self._redact = redact
         self._use_color = use_color
         self._show_timestamps = show_timestamps
+        self._show_metrics = show_metrics
         self._console = None
         self._tool_start_times: Dict[str, float] = {}
         self._lock = threading.Lock()  # Per-sink lock for thread safety
@@ -170,23 +172,55 @@ def llm_start(self, model: str = None, agent_name: Optional[str] = None) -> None
         else:
             self._emit_text(f"▸ AI → {context}...", ts, "yellow")
 
-    def llm_end(self, duration_ms: Optional[float] = None, agent_name: Optional[str] = None) -> None:
-        """Record LLM call end."""
+    def llm_end(
+        self,
+        duration_ms: Optional[float] = None,
+        agent_name: Optional[str] = None,
+        model: Optional[str] = None,
+        tokens_in: int = 0,
+        tokens_out: int = 0,
+        cost: Optional[float] = None,
+        latency_ms: Optional[float] = None,
+    ) -> None:
+        """Record LLM call end with optional metrics for debug mode."""
         ts = time.time()
 
-        # Calculate duration if not provided
-        if duration_ms is None and hasattr(self, '_llm_start_time'):
+        # Use latency_ms if provided, otherwise calculate from start time
+        if latency_ms is not None:
+            duration_ms = latency_ms
+        elif duration_ms is None and hasattr(self, '_llm_start_time'):
             start_ts = self._llm_start_time
             if start_ts:
                 duration_ms = (ts - start_ts) * 1000
 
-        duration_str = f" [{_format_duration(duration_ms)}]" if duration_ms else ""
+        # Track session totals for summary
+        if not hasattr(self, '_session_tokens_in'):
+            self._session_tokens_in = 0
+            self._session_tokens_out = 0
+            self._session_cost = 0.0
+            self._session_llm_calls = 0
+
+        self._session_tokens_in += tokens_in
+        self._session_tokens_out += tokens_out
+        if cost:
+            self._session_cost += cost
+        self._session_llm_calls += 1
+
+        # Only show metrics line in debug mode (when show_metrics is enabled)
+        show_metrics = getattr(self, '_show_metrics', False)
 
         if self._format == "jsonl":
-            self._emit_jsonl("llm_end", agent_name=agent_name, timestamp=ts, duration_ms=duration_ms)
-        else:
-            prefix = f"[{agent_name}] " if agent_name else ""
-            self._emit_text(f"{prefix}✓ LLM responded{duration_str}", ts, "green")
+            self._emit_jsonl("llm_end", agent_name=agent_name, timestamp=ts,
+                             duration_ms=duration_ms, model=model,
+                             tokens_in=tokens_in, tokens_out=tokens_out, cost=cost)
+        elif show_metrics and (tokens_in > 0 or tokens_out > 0):
+            # Debug mode: show metrics line
+            duration_str = f" [{_format_duration(duration_ms)}]" if duration_ms else ""
+            model_str = model.split('/')[-1] if model else "?"  # Short model name
+            cost_str = f" (~${cost:.4f})" if cost else ""
+
+            metrics_line = f" │ 📊 {model_str}: {tokens_in}→{tokens_out} tokens{cost_str}{duration_str}"
+            self._emit_text(metrics_line, ts, "dim", show_timestamp=False)
 
     def tool_start(self, tool_name: str, tool_args: Optional[Dict[str, Any]] = None, agent_name: Optional[str] = None) -> None:
         """Record tool start - stores info for inline display with result."""
@@ -285,6 +319,7 @@ def enable_status_output(
     redact: bool = True,
    use_color: bool = True,
     show_timestamps: bool = True,
+    show_metrics: bool = False,  # Enable metrics for debug mode
 ) -> StatusOutput:
     """
     Enable actions output mode globally.
@@ -298,6 +333,7 @@ def enable_status_output(
         redact: Whether to redact sensitive data (default: True)
         use_color: Whether to use colored output (default: True)
         show_timestamps: Whether to show timestamps (default: True)
+        show_metrics: Whether to show token/cost metrics (default: False)
 
     Returns:
         StatusOutput instance for programmatic access
@@ -313,6 +349,7 @@ def enable_status_output(
         redact=redact,
         use_color=use_color,
         show_timestamps=show_timestamps,
+        show_metrics=show_metrics,
     )
     _status_output_enabled = True
 
@@ -360,11 +397,19 @@ def on_llm_start(model: str = None, agent_name: str = None, **kwargs):
 
         _status_output.llm_start(model=model, agent_name=agent_name)
 
+    def on_llm_end(model: str = None, tokens_in: int = 0, tokens_out: int = 0, cost: float = None, latency_ms: float = None, **kwargs):
+        """Callback for LLM call completion with metrics."""
+        if not _status_output_enabled or _status_output is None:
+            return
+
+        _status_output.llm_end(model=model, tokens_in=tokens_in, tokens_out=tokens_out, cost=cost, latency_ms=latency_ms)
+
     # Register the callbacks
     register_display_callback('tool_call', on_tool_call)
     register_display_callback('interaction', on_interaction)
     register_display_callback('error', on_error)
     register_display_callback('llm_start', on_llm_start)
+    register_display_callback('llm_end', on_llm_end)
 
     return _status_output
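Since enable_status_output() returns the StatusOutput instance for programmatic access, the new metrics path can be exercised directly. A minimal sketch, assuming the module path implied by the file location; the token and cost values are made up:

from praisonaiagents.output.status import enable_status_output  # assumed module path

status = enable_status_output(show_metrics=True)
status.llm_start(model="gpt-4o-mini")
status.llm_end(model="gpt-4o-mini", tokens_in=104, tokens_out=14,
               cost=0.0021, latency_ms=1500.0)
# Expected metrics line (roughly, per the commit message example):
#  │ 📊 gpt-4o-mini: 104→14 tokens (~$0.0021) [1.5s]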

src/praisonai-agents/praisonaiagents/output/trace.py

Lines changed: 47 additions & 5 deletions
@@ -98,12 +98,32 @@ def llm_start(self, model: str = None) -> None:
         model_str = f" ({model})" if model else ""
         self._emit(f"Calling LLM{model_str}...", "cyan")
 
-    def llm_end(self, duration_ms: float = None) -> None:
-        """Record LLM call end."""
-        if duration_ms is None and self._llm_start_time:
+    def llm_end(
+        self,
+        duration_ms: float = None,
+        model: str = None,
+        tokens_in: int = 0,
+        tokens_out: int = 0,
+        cost: float = None,
+    ) -> None:
+        """Record LLM call end with optional metrics."""
+        # If duration_ms was passed and is positive, use it directly
+        # Otherwise calculate from internal tracking
+        if duration_ms is not None and duration_ms > 0:
+            pass  # Use the passed value
+        elif self._llm_start_time:
             duration_ms = (time.time() - self._llm_start_time) * 1000
-        duration_str = f" ({duration_ms/1000:.1f}s)" if duration_ms else ""
-        self._emit(f"LLM responded{duration_str}", "green")
+
+        duration_str = f" [{duration_ms/1000:.1f}s]" if duration_ms and duration_ms > 0 else ""
+
+        # Show metrics if tokens are available
+        if tokens_in > 0 or tokens_out > 0:
+            model_str = model.split('/')[-1] if model else "?"
+            cost_str = f" (~${cost:.4f})" if cost and cost > 0 else ""
+            metrics_line = f" │ 📊 {model_str}: {tokens_in}→{tokens_out} tokens{cost_str}{duration_str}"
+            self._emit(metrics_line, "dim")
+        else:
+            self._emit(f"LLM responded{duration_str}", "green")
         self._llm_start_time = None
 
     def tool_start(self, tool_name: str, tool_args: Dict[str, Any] = None) -> None:
@@ -266,10 +286,32 @@ def on_error(message: str = None, **kwargs):
         if message:
             _trace_output.error(message)
 
+    def on_llm_start(model: str = None, agent_name: str = None, **kwargs):
+        """Callback for LLM call start."""
+        if not _trace_output_enabled or _trace_output is None:
+            return
+
+        _trace_output.llm_start(model=model)
+
+    def on_llm_end(model: str = None, tokens_in: int = 0, tokens_out: int = 0, cost: float = None, latency_ms: float = None, **kwargs):
+        """Callback for LLM call completion with optional metrics."""
+        if not _trace_output_enabled or _trace_output is None:
+            return
+
+        _trace_output.llm_end(
+            duration_ms=latency_ms,
+            model=model,
+            tokens_in=tokens_in,
+            tokens_out=tokens_out,
+            cost=cost
+        )
+
     # Register the callbacks
     register_display_callback('tool_call', on_tool_call)
     register_display_callback('interaction', on_interaction)
     register_display_callback('error', on_error)
+    register_display_callback('llm_start', on_llm_start)
+    register_display_callback('llm_end', on_llm_end)
 
     return _trace_output
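The new 'llm_end' event is also available to user code through the same callback registry. A sketch, assuming register_display_callback is exported from the top-level package (the import location is an assumption); the keyword arguments match what execute_sync_callback passes in this commit:

from praisonaiagents import register_display_callback  # assumed export location

def log_llm_metrics(model=None, tokens_in=0, tokens_out=0,
                    cost=None, latency_ms=None, **kwargs):
    # Receives the same payload the status/trace sinks consume above.
    latency = f"{latency_ms:.0f}ms" if latency_ms is not None else "n/a"
    print(f"{model}: {tokens_in}->{tokens_out} tokens, cost={cost}, latency={latency}")

register_display_callback('llm_end', log_llm_metrics)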
