Skip to content
This repository was archived by the owner on Jan 30, 2026. It is now read-only.

Commit c3807ca

Browse files
committed
Adding starter code with agent and benchmark
1 parent 671f762 commit c3807ca

File tree

11 files changed

+875
-0
lines changed

11 files changed

+875
-0
lines changed

codeconflict/agent_conversation.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
import os
2+
from openai import OpenAI
3+
import sys
4+
5+
class ConversationAgent:
    """A streaming chat agent backed by the OpenAI API.

    Each agent keeps its own conversation history and prints responses
    token-by-token as they arrive.
    """

    def __init__(self, name, system_prompt):
        self.name = name
        self.system_prompt = system_prompt + "\nKeep your responses concise and conversational, around 2-3 sentences."
        # BUG FIX: validate the key *before* constructing the client. The
        # original built OpenAI(api_key=None) first, so the SDK raised its own
        # error and this explicit ValueError was unreachable.
        if not os.getenv('OPENAI_API_KEY'):
            raise ValueError('OPENAI_API_KEY environment variable is not set')
        self.client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
        # List of {"role": ..., "content": ...} dicts in OpenAI chat format.
        self.conversation_history = []

    def respond(self, message):
        """Generate a streaming response based on the conversation history and new message.

        Args:
            message: text to append as a user turn; falsy values are skipped
                (the model replies from history alone).

        Returns:
            The complete response text, or None if the API call failed.
        """
        # Add the received message to conversation history
        if message:
            self.conversation_history.append({"role": "user", "content": message})

        # Prepend the system prompt for every call; history carries the rest.
        messages = [
            {"role": "system", "content": self.system_prompt}
        ] + self.conversation_history

        try:
            stream = self.client.chat.completions.create(
                model="gpt-4o-mini",
                messages=messages,
                stream=True
            )

            # Accumulate the full text while echoing chunks as they arrive.
            content = ""
            print(f"\n{self.name} is typing: ", end="", flush=True)

            for chunk in stream:
                if chunk.choices[0].delta.content is not None:
                    chunk_content = chunk.choices[0].delta.content
                    content += chunk_content
                    print(chunk_content, end="", flush=True)

            print()  # New line after response

            # Record the assistant turn so later calls keep full context.
            self.conversation_history.append({"role": "assistant", "content": content})
            return content
        except Exception as e:
            # Best-effort: report and signal failure to the caller via None.
            print(f'Error generating response: {e}')
            return None
51+
52+
def simulate_conversation(turns=2):
    """Drive a planning -> coding -> review exchange between two LLM agents.

    Args:
        turns: number of planning round-trips (agent1 then agent2) before
            the coding phase begins.
    """
    # GENERALIZATION: the planning prompt previously hard-coded "for 2 turns"
    # even though ``turns`` is a parameter; interpolate the real value so the
    # instruction matches the loop below.
    planning_template = (
        "You are a software engineer in the planning phase. Your goal is to write code "
        "that outputs '{goal}'. First, discuss your approach with the other engineer "
        f"for {turns} turns to ensure both individual goals can be achieved in the final merged code."
    )
    agent1 = ConversationAgent("SWE1", planning_template.format(goal="hello I am agent1"))
    agent2 = ConversationAgent("SWE2", planning_template.format(goal="hello I am agent2"))

    # Planning Phase
    print("\n=== Planning Phase ===")
    current_message = "Let's discuss how we can structure our code to output both our messages when merged. What's your approach?"
    print(f"\nInitial Question: {current_message}\n")

    # Planning discussion for the specified number of turns.
    for i in range(turns):
        print(f"\n--- Planning Turn {i + 1} ---")

        # Agent 1's turn
        response1 = agent1.respond(current_message)
        if not response1:
            print("[Failed to generate response]")
            break
        current_message = response1

        # Agent 2's turn
        response2 = agent2.respond(current_message)
        if not response2:
            print("[Failed to generate response]")
            break
        current_message = response2

    # Switch both agents to coding-phase prompts.
    agent1.system_prompt = (
        "You are now in the coding phase. Based on the previous discussion, implement code "
        "that outputs 'hello I am agent1' in a way that will work when merged with your colleague's code."
    )
    agent2.system_prompt = (
        "You are now in the coding phase. Based on the previous discussion, implement code "
        "that outputs 'hello I am agent2' in a way that will work when merged with your colleague's code."
    )

    # Coding Phase
    print("\n=== Coding Phase ===")
    current_message = "Now, let's implement our code based on our discussion. Please share your implementation."

    print("\n--- Agent 1's Implementation ---")
    agent1_code = agent1.respond(current_message)
    # ROBUSTNESS: the original passed a possibly-None implementation onward
    # and kept making API calls; stop early on failure, as the planning loop does.
    if not agent1_code:
        print("[Failed to generate response]")
        return

    print("\n--- Agent 2's Implementation ---")
    agent2_code = agent2.respond(agent1_code)
    if not agent2_code:
        print("[Failed to generate response]")
        return

    # Code Comparison Phase
    print("\n=== Code Comparison Phase ===")

    # Switch both agents to review prompts (identical for both reviewers).
    agent1.system_prompt = (
        "You are now in the code review phase. Review both implementations and ensure they will work "
        "together to achieve both goals. Suggest any necessary modifications."
    )
    agent2.system_prompt = (
        "You are now in the code review phase. Review both implementations and ensure they will work "
        "together to achieve both goals. Suggest any necessary modifications."
    )

    # Final review and agreement
    review_message = "Let's review our implementations and ensure they will work together. Any suggestions for modifications?"

    print("\n--- Final Review ---")
    agent1_review = agent1.respond(review_message)
    agent2_review = agent2.respond(agent1_review)

codeconflict/codeagent/agent.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
import os
2+
from openai import OpenAI
3+
from pydantic import BaseModel
4+
from typing import Optional, Type, Any, Dict, List, Union
5+
from typing_extensions import TypeAlias
6+
import json
7+
import logging
8+
from rich.console import Console
9+
from rich.panel import Panel
10+
from rich.text import Text
11+
12+
from .models import AgentMessage
13+
from .utils import format_agent_response
14+
15+
# # Configure logging
16+
# logging.basicConfig(level=logging.INFO)
17+
# logger = logging.getLogger(__name__)
18+
19+
class CodeWeaverAgent:
    """Chat agent whose replies are parsed into a structured pydantic model.

    Responses come back as instances of ``response_format`` (default
    ``AgentMessage``) via the OpenAI structured-output endpoint.
    """

    def __init__(self, name: str, system_prompt: str,
                 response_format: Type[BaseModel] = AgentMessage,
                 action_descriptions: str = """
Available actions:
- speak: Use this action to communicate your thoughts, suggestions, or responses. Always include meaningful content when using this action.
- leave: Use this action to exit the conversation when you've reached a conclusion or consensus. No content is required for this action.
""") -> None:
        self.console = Console()
        self.name = name
        self.response_format = response_format
        self.system_prompt = "You are " + self.name + ". " + system_prompt
        if action_descriptions:
            self.system_prompt += action_descriptions
        self.system_prompt += "\nKeep your responses concise and conversational."
        # BUG FIX: validate the key *before* constructing the client. The
        # original built OpenAI(api_key=None) first, so the SDK raised its own
        # error and this explicit ValueError was unreachable.
        if not os.getenv('OPENAI_API_KEY'):
            raise ValueError('OPENAI_API_KEY environment variable is not set')
        self.client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
        # List of {"role": ..., "content": ...} dicts in OpenAI chat format.
        self.conversation_history = []

    def respond(self, message: Optional[str]) -> Optional[BaseModel]:
        """Generate a structured response based on the conversation history and new message.

        Args:
            message: text appended as a user turn; None/empty is skipped.

        Returns:
            The parsed ``response_format`` instance, or None on API failure.
        """
        if message:
            self.conversation_history.append({"role": "user", "content": message})

        # Prepend the system prompt for every call; history carries the rest.
        messages = [
            {"role": "system", "content": self.system_prompt}
        ] + self.conversation_history

        try:
            response = self.client.beta.chat.completions.parse(
                model="gpt-4o-mini",
                messages=messages,
                response_format=self.response_format
            )

            # Get the parsed (structured) response object.
            parsed_response = response.choices[0].message.parsed

            # The transcript stores a plain-string rendering of the response.
            display_content = format_agent_response(self.name, parsed_response)
            # NOTE: the original built a styled rich Text/Panel here but the
            # print was commented out; the dead styling code was removed.

            self.conversation_history.append({
                "role": "assistant",
                "content": display_content
            })
            return parsed_response
        except Exception as e:
            self.console.print(f"[red bold]Error generating response:[/red bold] {str(e)}")
            return None

    def _get_agent_color(self) -> str:
        """Return a consistent color for the agent based on its name."""
        # Simple deterministic hash of the name selects from a fixed palette.
        colors = ["blue", "green", "yellow", "magenta", "cyan", "red"]
        color_index = sum(ord(c) for c in self.name) % len(colors)
        return colors[color_index]

codeconflict/codeagent/evaluate.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
from typing import List, Tuple, Dict, Any
2+
import sys
3+
import os
4+
from multiprocessing import Pool
5+
from statistics import mean, stdev
6+
from datetime import datetime
7+
from tqdm import tqdm
8+
from functools import partial
9+
10+
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
11+
12+
from codeconflict.codeagent.simulate import simulate_conversation
13+
14+
import numpy as np
15+
16+
def run_simulation_batch(max_attempts: int) -> Tuple[bool, int, float]:
    """Run one simulation (turns fixed at 2) with the given attempt budget.

    Args:
        max_attempts: Maximum number of attempts allowed for conflict resolution

    Returns:
        Tuple of (success, turns_taken, reward_score); (False, 0, 0.0) when
        the simulation raises.
    """
    try:
        outcome = simulate_conversation(turns=2, max_attempts=max_attempts)
        return (
            outcome['success'],
            outcome['turns_taken'],
            outcome.get('reward_score', 0.0),
        )
    except Exception as exc:
        print(f"Error in simulation: {str(exc)}")
        return (False, 0, 0.0)
31+
32+
def run_parallel_simulations(num_runs: int, max_attempts: int, num_workers: int) -> List[Tuple[bool, int, float]]:
    """Fan out simulations across a worker pool.

    Args:
        num_runs: Number of simulations to run
        max_attempts: Maximum attempts for each simulation
        num_workers: Number of parallel workers

    Returns:
        List of (success, turns_taken, reward_score) tuples, one per run.
    """
    # Every run gets the same attempt budget.
    jobs = [max_attempts] * num_runs
    with Pool(processes=num_workers) as pool:
        progress = tqdm(
            pool.imap(run_simulation_batch, jobs),
            total=num_runs,
            desc=f"Runs (max_attempts={max_attempts})",
            position=1,
            leave=False,
        )
        # Materialize inside the pool context so workers stay alive
        # until every result has been collected.
        return list(progress)
53+
54+
def evaluate_simulations() -> Dict[str, Any]:
    """Run simulations with fixed turns=2 and evaluate performance at different max_attempts.

    Each configuration is run multiple times to calculate mean and standard deviation.

    Returns:
        dict: per-configuration statistics keyed 'max_attempts_<k>' plus a
        'metadata' entry; on failure, a dict with 'error' and 'metadata'.
    """
    results = {}
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    num_runs = 4      # Number of runs per configuration
    max_turns = 2     # Fixed number of turns
    num_workers = 4   # Number of parallel workers
    # Hoisted so the sweep is defined once and metadata can record all of it.
    attempt_configs = [1, 2, 3]

    try:
        # Main progress bar over max_attempts configurations.
        with tqdm(total=len(attempt_configs), desc="Evaluating configurations", position=0) as pbar:
            for max_attempts in attempt_configs:
                print(f"\n=== Evaluating with max_attempts={max_attempts} ===\n")

                # Run simulations in parallel
                simulation_results = run_parallel_simulations(num_runs, max_attempts, num_workers)

                # Turns/rewards are only meaningful for successful runs.
                successes = []
                turns = []
                rewards = []
                for success, turns_taken, reward_score in simulation_results:
                    successes.append(1 if success else 0)
                    if success:
                        turns.append(turns_taken)
                        rewards.append(reward_score)

                # Calculate statistics (stdev needs at least two samples).
                success_rate = mean(successes) if successes else 0
                success_std = stdev(successes) if len(successes) > 1 else 0
                mean_turns = mean(turns) if turns else 0
                turns_std = stdev(turns) if len(turns) > 1 else 0
                mean_reward = mean(rewards) if rewards else 0
                reward_std = stdev(rewards) if len(rewards) > 1 else 0

                results[f'max_attempts_{max_attempts}'] = {
                    'success_rate': success_rate,
                    'success_std': success_std,
                    'mean_turns': mean_turns,
                    'turns_std': turns_std,
                    'mean_reward': mean_reward,
                    'reward_std': reward_std,
                    'successes': successes,
                    'turns': turns,
                    'rewards': rewards
                }

                # Print results
                print(f"\n=== Results for max_attempts={max_attempts} ===")
                print(f"Success Rate: {success_rate * 100:.1f}% ± {success_std * 100:.1f}%")
                if turns:
                    print(f"Average Turns to Success: {mean_turns:.1f} ± {turns_std:.1f}")
                    print(f"Average Reward Score: {mean_reward:.1f} ± {reward_std:.1f}")
                else:
                    print("No successful runs recorded")
                print()

                pbar.update(1)

        # BUG FIX: the original stored 'max_retries': max_attempts, i.e. only
        # the leaked final loop value (3) under a mismatched name; record the
        # whole sweep instead.
        results['metadata'] = {
            'timestamp': timestamp,
            'num_runs': num_runs,
            'max_turns': max_turns,
            'max_attempts_values': attempt_configs,
            'num_workers': num_workers
        }

        return results

    except Exception as e:
        print(f"Error during evaluation: {str(e)}")
        return {'error': str(e), 'metadata': {'timestamp': timestamp}}
133+
134+
def main():
    """Entry point: run the evaluation sweep.

    Returns the results dict so programmatic callers can inspect it
    (the original discarded it; returning is backward compatible).
    """
    return evaluate_simulations()


if __name__ == '__main__':
    main()

codeconflict/codeagent/models.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
from typing import Literal, Optional
2+
from pydantic import BaseModel
3+
4+
class AgentMessage(BaseModel):
    """Structured conversational turn emitted by an agent."""
    # 'speak' communicates content; 'leave' ends the conversation.
    action: Literal['speak', 'leave']
    # Message text; None is expected for 'leave' actions.
    content: Optional[str] = None
7+
8+
class AgentAction(BaseModel):
    """Structured tool-style action requested by an agent.

    Fields are optional because each action kind uses a different subset —
    presumably 'read'/'write' use ``path`` (and ``content`` for writes) while
    'execute' uses ``command``; TODO confirm against the action handler.
    """
    action: Literal['read', 'write', 'execute']
    # CONSISTENCY: use Optional[...] like the sibling AgentMessage model; the
    # original mixed `str | None` (3.10+ only) with Optional in the same file.
    command: Optional[str] = None
    path: Optional[str] = None
    content: Optional[str] = None
    continue_action: Optional[bool] = None  # Indicates if the agent needs to continue with more actions

0 commit comments

Comments
 (0)