Merge pull request #1381 from MervinPraison/claude/issue-1370-20260413-0932

MervinPraison · web-flow · commit 89d0a8ba3373 · 2026-04-15T10:57:54.000+01:00
fix: Address 3 critical production robustness gaps
diff --git a/src/praisonai-agents/praisonaiagents/agent/agent.py b/src/praisonai-agents/praisonaiagents/agent/agent.py
@@ -1622,6 +1622,8 @@ def __init__(
         # Token budget guard (zero overhead when _max_budget is None)
         self._max_budget = _max_budget
         self._on_budget_exceeded = _on_budget_exceeded
+        # Thread-safe cost/token tracking (Gap 1a fix)
+        self._cost_lock = threading.Lock()
         self._total_cost = 0.0
         self._total_tokens_in = 0
         self._total_tokens_out = 0
@@ -1965,7 +1967,9 @@ def thinking_budget(self, value: Optional[int]) -> None:
     @property
     def total_cost(self) -> float:
         """Cumulative USD cost of all LLM calls in this agent run."""
-        return self._total_cost
+        # Thread-safe cost reading (Gap 1a fix)
+        with self._cost_lock:
+            return self._total_cost
 
     @property
     def cost_summary(self) -> dict:
@@ -1974,12 +1978,14 @@ def cost_summary(self) -> dict:
         Returns:
             dict with keys: tokens_in, tokens_out, cost, llm_calls
         """
-        return {
-            "tokens_in": self._total_tokens_in,
-            "tokens_out": self._total_tokens_out,
-            "cost": self._total_cost,
-            "llm_calls": self._llm_call_count,
-        }
+        # Thread-safe cost reading (Gap 1a fix)
+        with self._cost_lock:
+            return {
+                "tokens_in": self._total_tokens_in,
+                "tokens_out": self._total_tokens_out,
+                "cost": self._total_cost,
+                "llm_calls": self._llm_call_count,
+            }
 
     @property
     def context_manager(self) -> Optional[Any]:
diff --git a/src/praisonai-agents/praisonaiagents/agent/chat_mixin.py b/src/praisonai-agents/praisonaiagents/agent/chat_mixin.py
@@ -653,26 +653,31 @@ def _chat_completion(self, messages, temperature=1.0, tools=None, stream=True, r
             )
 
             # Budget tracking & enforcement (zero overhead when _max_budget is None)
-            self._total_cost += _cost_usd
-            self._total_tokens_in += _prompt_tokens
-            self._total_tokens_out += _completion_tokens
-            self._llm_call_count += 1
-            if self._max_budget and self._total_cost >= self._max_budget:
+            # Thread-safe cost tracking (Gap 1a fix)
+            with self._cost_lock:
+                self._total_cost += _cost_usd
+                self._total_tokens_in += _prompt_tokens
+                self._total_tokens_out += _completion_tokens
+                self._llm_call_count += 1
+                budget_exceeded = self._max_budget and self._total_cost >= self._max_budget
+                current_cost = self._total_cost
+            
+            if budget_exceeded:
                 if self._on_budget_exceeded == "stop":
                     raise BudgetExceededError(
-                        f"Agent '{self.name}' exceeded budget: ${self._total_cost:.4f} >= ${self._max_budget:.4f}",
+                        f"Agent '{self.name}' exceeded budget: ${current_cost:.4f} >= ${self._max_budget:.4f}",
                         budget_type="cost",
                         limit=self._max_budget,
-                        used=self._total_cost,
+                        used=current_cost,
                         agent_id=self.name
                     )
                 elif self._on_budget_exceeded == "warn":
                     logging.warning(
-                        f"[budget] {self.name}: ${self._total_cost:.4f} exceeded "
+                        f"[budget] {self.name}: ${current_cost:.4f} exceeded "
                         f"${self._max_budget:.4f} budget"
                     )
                 elif callable(self._on_budget_exceeded):
-                    self._on_budget_exceeded(self._total_cost, self._max_budget)
+                    self._on_budget_exceeded(current_cost, self._max_budget)
             
             # Trigger AFTER_LLM hook
             from ..hooks import HookEvent, AfterLLMInput
diff --git a/src/praisonai-agents/praisonaiagents/agent/tool_execution.py b/src/praisonai-agents/praisonaiagents/agent/tool_execution.py
@@ -15,6 +15,7 @@
 import contextvars
 import concurrent.futures
 from typing import List, Optional, Any, Dict, Union, TYPE_CHECKING
+from ..errors import ToolExecutionError
 
 if TYPE_CHECKING:
     pass
@@ -310,8 +311,14 @@ def execute_with_context():
             _duration_ms = (_time.time() - _tool_start_time) * 1000
             _trace_emitter.tool_call_end(self.name, function_name, None, _duration_ms, str(e))
             
-            # Trigger OnError hook if needed (optional future step)
-            raise
+            # Gap 3a fix: Wrap exceptions in ToolExecutionError for better observability
+            is_retryable = not isinstance(e, (ValueError, TypeError, AttributeError))
+            raise ToolExecutionError(
+                f"Tool '{function_name}' failed: {e}",
+                tool_name=function_name,
+                agent_id=self.name,
+                is_retryable=is_retryable,
+            ) from e
 
     def _trigger_after_agent_hook(self, prompt, response, start_time, tools_used=None):
         """Trigger AFTER_AGENT hook and return response."""
diff --git a/src/praisonai-agents/praisonaiagents/session.py b/src/praisonai-agents/praisonaiagents/session.py
@@ -52,7 +52,8 @@ def __init__(
         agent_url: Optional[str] = None,
         memory_config: Optional[Dict[str, Any]] = None,
         knowledge_config: Optional[Dict[str, Any]] = None,
-        timeout: int = 30
+        timeout: int = 30,
+        session_ttl: Optional[int] = None  # Gap 2b: TTL in seconds
     ):
         """
         Initialize a new session with optional persistence or remote agent connectivity.
@@ -64,6 +65,7 @@ def __init__(
             memory_config: Configuration for memory system (defaults to RAG)
             knowledge_config: Configuration for knowledge base system  
             timeout: HTTP timeout for remote agent calls (default: 30 seconds)
+            session_ttl: Time-to-live in seconds after which session expires (Gap 2b)
         """
         self.session_id = session_id or str(uuid.uuid4())[:8]
         self.user_id = user_id or "default_user"
@@ -112,6 +114,10 @@ def __init__(
             self._agents_instance = None
             self._agents = {}  # Track agents and their chat histories
 
+        # Gap 2b: Session TTL and cleanup support  
+        self.session_ttl = session_ttl
+        self._created_at = time.time()  # Track creation time for TTL
+
     def _get_session_dir(self):
         """Return session-specific directory using paths.py."""
         from pathlib import Path
@@ -188,8 +194,8 @@ def Agent(
 
         agent = Agent(**agent_kwargs)
         
-        # Create a unique key for this agent (using name and role)
-        agent_key = f"{name}:{role}"
+        # Create a unique key for this agent (Gap 2a fix: include session_id for proper isolation)
+        agent_key = f"{self.session_id}:{name}:{role}"
         
         # Restore chat history if it exists from previous sessions
         if agent_key in self._agents:
@@ -272,7 +278,7 @@ def restore_state(self) -> Dict[str, Any]:
 
     def _restore_agent_chat_history(self, agent_key: str) -> List[Dict[str, Any]]:
         """
-        Restore agent chat history from memory.
+        Restore agent chat history from SessionStore first, then memory fallback (Gap 2c fix).
         
         Args:
             agent_key: Unique identifier for the agent
@@ -283,7 +289,21 @@ def _restore_agent_chat_history(self, agent_key: str) -> List[Dict[str, Any]]:
         if self.is_remote:
             return []
         
-        # Search for agent chat history in memory
+        # Gap 2c: Try SessionStore first for clean separation
+        try:
+            from .session.store import get_default_session_store
+            session_store = get_default_session_store()
+            chat_history = session_store.get_chat_history(agent_key)
+            if not chat_history:
+                # Backward compatibility: fall back to legacy
+                # "{session_id}_{agent_key}" format for existing stored conversations.
+                chat_history = session_store.get_chat_history(f"{self.session_id}_{agent_key}")
+            if chat_history:
+                return chat_history
+        except ImportError:
+            pass
+        
+        # Fallback: Search for agent chat history in memory (backward compatibility)
         results = self.memory.search_short_term(
             query="Agent chat history for",
             limit=10
@@ -352,21 +372,20 @@ def _save_agent_chat_histories(self) -> None:
                 chat_history = agent_data.get("chat_history")
             
             if chat_history is not None:
-                # G-2 FIX: Try SessionStore first for clean separation
+                # G-2 FIX: Use SessionStore for clean separation (Gap 2c fix)
                 session_store = None
                 try:
-                    from .session import get_default_session_store
+                    from .session.store import get_default_session_store
                     session_store = get_default_session_store()
                 except ImportError:
                     pass
                 
                 if session_store is not None:
                     # Use SessionStore for conversation history
-                    session_id = f"{self.session_id}_{agent_key}"
                     for msg in chat_history:
                         if isinstance(msg, dict):
                             session_store.add_message(
-                                session_id,
+                                agent_key,
                                 role=msg.get("role", "user"),
                                 content=msg.get("content", ""),
                             )
@@ -587,10 +606,48 @@ def send_message(self, message: str, **kwargs) -> str:
         """
         return self.chat(message, **kwargs)
 
+    def is_expired(self) -> bool:
+        """Report whether the session has outlived its TTL; always False when no TTL is set (Gap 2b fix)."""
+        ttl = self.session_ttl
+        age = time.time() - self._created_at
+        return ttl is not None and age > ttl
+
+    def close(self) -> None:
+        """Release what this session holds locally (Gap 2b fix).
+
+        Remote sessions are skipped entirely. For a local session the
+        references to the memory and knowledge objects are dropped and
+        the tracked agents are forgotten. This only detaches the objects
+        from the session: nothing here deletes data from a storage
+        backend, so a backend that needs explicit cleanup must be
+        handled separately.
+
+        Calling close() repeatedly is harmless.
+        """
+        if self.is_remote:
+            return  # No cleanup needed for remote sessions
+
+        # Drop the memory reference. Plain assignment cannot raise, so it
+        # is not wrapped in a try/except that would only hide real bugs.
+        self._memory = None
+
+        # Drop the knowledge reference.
+        self._knowledge = None
+
+        # Clear agents
+        self._agents.clear()
+
+    def time_to_expiry(self) -> Optional[float]:
+        """Seconds until the TTL elapses (0.0 once expired), or None if no TTL is set (Gap 2b)."""
+        if self.session_ttl is None:
+            return None
+        elapsed = time.time() - self._created_at
+        return max(0.0, self.session_ttl - elapsed)  # float zero keeps the return type consistent
+
     def __str__(self) -> str:
         if self.is_remote:
             return f"Session(id='{self.session_id}', user='{self.user_id}', remote_agent='{self.agent_url}')"
         return f"Session(id='{self.session_id}', user='{self.user_id}')"
 
     def __repr__(self) -> str:
-        return self.__str__()
+        return self.__str__()
diff --git a/src/praisonai-agents/praisonaiagents/workflows/workflows.py b/src/praisonai-agents/praisonaiagents/workflows/workflows.py
@@ -22,6 +22,8 @@
 import os
 import re
 import json
+import copy
+import time
 import logging
 from praisonaiagents._logging import get_logger
 from pathlib import Path
@@ -585,6 +587,8 @@ class AgentFlow:
     _reflection_config: Optional[Any] = field(default=None, repr=False)
     # Execution history for debugging (only populated when history=True)
     _execution_history: List[Dict[str, Any]] = field(default_factory=list, repr=False)
+    # Gap 3c: Cross-step handoff cycle detection
+    _handoff_chain: List[str] = field(default_factory=list, repr=False)
     
     def __post_init__(self):
         """Resolve consolidated params to internal values."""
@@ -899,6 +903,30 @@ def from_template(
                 "Install with: pip install praisonai"
             )
     
+    def _check_handoff_cycle(self, step: Any) -> None:
+        """Raise HandoffCycleError when an agent step name repeats in the current chain (Gap 3c fix).
+
+        A step without an agent resets the chain.
+        """
+        step_id = getattr(step, 'name', str(step))
+        if not getattr(step, 'agent', None):
+            # Non-agent step, reset chain
+            self._handoff_chain.clear()
+            return
+
+        chain = self._handoff_chain
+        if step_id in chain:
+            cycle_path = chain[chain.index(step_id):] + [step_id]
+            from ..errors import HandoffCycleError
+            raise HandoffCycleError(
+                f"Cross-step handoff cycle detected: {' -> '.join(cycle_path)}",
+                cycle_path=cycle_path
+            )
+
+        chain.append(step_id)
+        if len(chain) > 100:  # Reasonable limit
+            self._handoff_chain = chain[-50:]  # Keep last 50
+    
     def run(
         self,
         input: str = "",
@@ -933,6 +961,9 @@ def run(
         Returns:
             Dict with 'output' (final result) and 'steps' (all step results)
         """
+        # Gap 3c: Clear handoff chain at start of new workflow run  
+        self._handoff_chain.clear()
+        
         # Use default LLM if not specified
         model = llm or self.llm or "gpt-4o-mini"
         logger.debug(f"Workflow using model: {model} (llm={llm}, default_llm={self.llm})")
@@ -1096,6 +1127,9 @@ def run(
                 except Exception as e:
                     logger.error(f"should_run failed for {step.name}: {e}")
             
+            # Gap 3c: Check for cross-step handoff cycles
+            self._check_handoff_cycle(step)
+            
             # Execute step with retry and guardrail support
             output = None
             stop = False
@@ -1273,6 +1307,24 @@ def run(
                 except Exception as e:
                     step_error = e
                     output = f"Error: {e}"
+                    
+                    # Gap 3b fix: Check if error is retryable and implement exponential backoff
+                    is_retryable = getattr(e, 'is_retryable', True)  # Default to retryable
+                    if not is_retryable:
+                        # Non-retryable error - break out of retry loop immediately
+                        if verbose:
+                            print(f"❌ {step.name} failed with non-retryable error: {e}")
+                        break
+                    
+                    retry_count += 1
+                    if retry_count <= max_retries:
+                        # Exponential backoff: wait 2^(retry_count-1) seconds
+                        backoff_seconds = 2 ** (retry_count - 1)
+                        if verbose:
+                            print(f"🔄 {step.name} failed (attempt {retry_count}/{max_retries}), retrying in {backoff_seconds}s: {e}")
+                        time.sleep(backoff_seconds)
+                        continue  # Retry
+                    
                     if self.on_step_error:
                         try:
                             self.on_step_error(step.name, e)
@@ -1826,7 +1878,6 @@ def _execute_specialized_agent(
                     # Only retry on transient errors (network, timeout, fetch failures)
                     if any(x in error_str for x in ['fetch', 'timeout', 'connection', 'network']):
                         if attempt < max_retries:
-                            import time
                             time.sleep(1 * (attempt + 1))  # Exponential backoff
                             continue
                     # Non-retryable error, raise immediately
@@ -2306,7 +2357,7 @@ def execute_with_branch(step=step, idx=idx, opt_prev=optimized_previous):
                     emitter.set_branch(f"parallel_{idx}")
                     try:
                         return self._execute_single_step_internal(
-                            step, opt_prev, input, all_variables.copy(), model, False, idx, stream, depth=depth+1
+                            step, opt_prev, input, copy.deepcopy(all_variables), model, False, idx, stream, depth=depth+1
                         )
                     finally:
                         emitter.clear_branch()
@@ -4290,7 +4341,6 @@ def _save_checkpoint(
             Path to checkpoint file
         """
         import json
-        import time
         from datetime import datetime
         
         checkpoint_file = self._get_checkpoints_dir() / f"{name}.json"
diff --git a/src/praisonai-agents/tests/unit/workflows/test_workflow_context.py b/src/praisonai-agents/tests/unit/workflows/test_workflow_context.py