Skip to content
This repository was archived by the owner on Jan 30, 2026. It is now read-only.

Commit c3807ca

Browse files
committed
Adding starter code with agent and benchmark
1 parent 671f762 commit c3807ca

File tree

11 files changed

+875
-0
lines changed

11 files changed

+875
-0
lines changed

codeconflict/agent_conversation.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
import os
2+
from openai import OpenAI
3+
import sys
4+
5+
class ConversationAgent:
    """A streaming chat agent backed by the OpenAI API.

    Each agent keeps its own conversation history and prints responses
    token-by-token as they arrive.
    """

    def __init__(self, name, system_prompt):
        self.name = name
        self.system_prompt = system_prompt + "\nKeep your responses concise and conversational, around 2-3 sentences."
        # BUG FIX: validate the key *before* constructing the client. The
        # original built OpenAI(api_key=None) first, so the SDK raised its own
        # error and this explicit ValueError was unreachable.
        if not os.getenv('OPENAI_API_KEY'):
            raise ValueError('OPENAI_API_KEY environment variable is not set')
        self.client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
        # List of {"role": ..., "content": ...} dicts in OpenAI chat format.
        self.conversation_history = []

    def respond(self, message):
        """Generate a streaming response based on the conversation history and new message.

        Args:
            message: text to append as a user turn; falsy values are skipped
                (the model replies from history alone).

        Returns:
            The complete response text, or None if the API call failed.
        """
        # Add the received message to conversation history
        if message:
            self.conversation_history.append({"role": "user", "content": message})

        # Prepend the system prompt for every call; history carries the rest.
        messages = [
            {"role": "system", "content": self.system_prompt}
        ] + self.conversation_history

        try:
            stream = self.client.chat.completions.create(
                model="gpt-4o-mini",
                messages=messages,
                stream=True
            )

            # Accumulate the full text while echoing chunks as they arrive.
            content = ""
            print(f"\n{self.name} is typing: ", end="", flush=True)

            for chunk in stream:
                if chunk.choices[0].delta.content is not None:
                    chunk_content = chunk.choices[0].delta.content
                    content += chunk_content
                    print(chunk_content, end="", flush=True)

            print()  # New line after response

            # Record the assistant turn so later calls keep full context.
            self.conversation_history.append({"role": "assistant", "content": content})
            return content
        except Exception as e:
            # Best-effort: report and signal failure to the caller via None.
            print(f'Error generating response: {e}')
            return None
51+
52+
def simulate_conversation(turns=2):
    """Drive a planning -> coding -> review exchange between two LLM agents.

    Args:
        turns: number of planning round-trips (agent1 then agent2) before
            the coding phase begins.
    """
    # GENERALIZATION: the planning prompt previously hard-coded "for 2 turns"
    # even though ``turns`` is a parameter; interpolate the real value so the
    # instruction matches the loop below.
    planning_template = (
        "You are a software engineer in the planning phase. Your goal is to write code "
        "that outputs '{goal}'. First, discuss your approach with the other engineer "
        f"for {turns} turns to ensure both individual goals can be achieved in the final merged code."
    )
    agent1 = ConversationAgent("SWE1", planning_template.format(goal="hello I am agent1"))
    agent2 = ConversationAgent("SWE2", planning_template.format(goal="hello I am agent2"))

    # Planning Phase
    print("\n=== Planning Phase ===")
    current_message = "Let's discuss how we can structure our code to output both our messages when merged. What's your approach?"
    print(f"\nInitial Question: {current_message}\n")

    # Planning discussion for the specified number of turns.
    for i in range(turns):
        print(f"\n--- Planning Turn {i + 1} ---")

        # Agent 1's turn
        response1 = agent1.respond(current_message)
        if not response1:
            print("[Failed to generate response]")
            break
        current_message = response1

        # Agent 2's turn
        response2 = agent2.respond(current_message)
        if not response2:
            print("[Failed to generate response]")
            break
        current_message = response2

    # Switch both agents to coding-phase prompts.
    agent1.system_prompt = (
        "You are now in the coding phase. Based on the previous discussion, implement code "
        "that outputs 'hello I am agent1' in a way that will work when merged with your colleague's code."
    )
    agent2.system_prompt = (
        "You are now in the coding phase. Based on the previous discussion, implement code "
        "that outputs 'hello I am agent2' in a way that will work when merged with your colleague's code."
    )

    # Coding Phase
    print("\n=== Coding Phase ===")
    current_message = "Now, let's implement our code based on our discussion. Please share your implementation."

    print("\n--- Agent 1's Implementation ---")
    agent1_code = agent1.respond(current_message)
    # ROBUSTNESS: the original passed a possibly-None implementation onward
    # and kept making API calls; stop early on failure, as the planning loop does.
    if not agent1_code:
        print("[Failed to generate response]")
        return

    print("\n--- Agent 2's Implementation ---")
    agent2_code = agent2.respond(agent1_code)
    if not agent2_code:
        print("[Failed to generate response]")
        return

    # Code Comparison Phase
    print("\n=== Code Comparison Phase ===")

    # Switch both agents to review prompts (identical for both reviewers).
    agent1.system_prompt = (
        "You are now in the code review phase. Review both implementations and ensure they will work "
        "together to achieve both goals. Suggest any necessary modifications."
    )
    agent2.system_prompt = (
        "You are now in the code review phase. Review both implementations and ensure they will work "
        "together to achieve both goals. Suggest any necessary modifications."
    )

    # Final review and agreement
    review_message = "Let's review our implementations and ensure they will work together. Any suggestions for modifications?"

    print("\n--- Final Review ---")
    agent1_review = agent1.respond(review_message)
    agent2_review = agent2.respond(agent1_review)

codeconflict/codeagent/agent.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
import os
2+
from openai import OpenAI
3+
from pydantic import BaseModel
4+
from typing import Optional, Type, Any, Dict, List, Union
5+
from typing_extensions import TypeAlias
6+
import json
7+
import logging
8+
from rich.console import Console
9+
from rich.panel import Panel
10+
from rich.text import Text
11+
12+
from .models import AgentMessage
13+
from .utils import format_agent_response
14+
15+
# # Configure logging
16+
# logging.basicConfig(level=logging.INFO)
17+
# logger = logging.getLogger(__name__)
18+
19+
class CodeWeaverAgent:
    """Chat agent whose replies are parsed into a structured pydantic model.

    Responses come back as instances of ``response_format`` (default
    ``AgentMessage``) via the OpenAI structured-output endpoint.
    """

    def __init__(self, name: str, system_prompt: str,
                 response_format: Type[BaseModel] = AgentMessage,
                 action_descriptions: str = """
Available actions:
- speak: Use this action to communicate your thoughts, suggestions, or responses. Always include meaningful content when using this action.
- leave: Use this action to exit the conversation when you've reached a conclusion or consensus. No content is required for this action.
""") -> None:
        self.console = Console()
        self.name = name
        self.response_format = response_format
        self.system_prompt = "You are " + self.name + ". " + system_prompt
        if action_descriptions:
            self.system_prompt += action_descriptions
        self.system_prompt += "\nKeep your responses concise and conversational."
        # BUG FIX: validate the key *before* constructing the client. The
        # original built OpenAI(api_key=None) first, so the SDK raised its own
        # error and this explicit ValueError was unreachable.
        if not os.getenv('OPENAI_API_KEY'):
            raise ValueError('OPENAI_API_KEY environment variable is not set')
        self.client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
        # List of {"role": ..., "content": ...} dicts in OpenAI chat format.
        self.conversation_history = []

    def respond(self, message: Optional[str]) -> Optional[BaseModel]:
        """Generate a structured response based on the conversation history and new message.

        Args:
            message: text appended as a user turn; None/empty is skipped.

        Returns:
            The parsed ``response_format`` instance, or None on API failure.
        """
        if message:
            self.conversation_history.append({"role": "user", "content": message})

        # Prepend the system prompt for every call; history carries the rest.
        messages = [
            {"role": "system", "content": self.system_prompt}
        ] + self.conversation_history

        try:
            response = self.client.beta.chat.completions.parse(
                model="gpt-4o-mini",
                messages=messages,
                response_format=self.response_format
            )

            # Get the parsed (structured) response object.
            parsed_response = response.choices[0].message.parsed

            # The transcript stores a plain-string rendering of the response.
            display_content = format_agent_response(self.name, parsed_response)
            # NOTE: the original built a styled rich Text/Panel here but the
            # print was commented out; the dead styling code was removed.

            self.conversation_history.append({
                "role": "assistant",
                "content": display_content
            })
            return parsed_response
        except Exception as e:
            self.console.print(f"[red bold]Error generating response:[/red bold] {str(e)}")
            return None

    def _get_agent_color(self) -> str:
        """Return a consistent color for the agent based on its name."""
        # Simple deterministic hash of the name selects from a fixed palette.
        colors = ["blue", "green", "yellow", "magenta", "cyan", "red"]
        color_index = sum(ord(c) for c in self.name) % len(colors)
        return colors[color_index]

codeconflict/codeagent/evaluate.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
from typing import List, Tuple, Dict, Any
2+
import sys
3+
import os
4+
from multiprocessing import Pool
5+
from statistics import mean, stdev
6+
from datetime import datetime
7+
from tqdm import tqdm
8+
from functools import partial
9+
10+
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
11+
12+
from codeconflict.codeagent.simulate import simulate_conversation
13+
14+
import numpy as np
15+
16+
def run_simulation_batch(max_attempts: int) -> Tuple[bool, int, float]:
    """Run one simulation (turns fixed at 2) with the given attempt budget.

    Args:
        max_attempts: Maximum number of attempts allowed for conflict resolution

    Returns:
        Tuple of (success, turns_taken, reward_score); (False, 0, 0.0) when
        the simulation raises.
    """
    try:
        outcome = simulate_conversation(turns=2, max_attempts=max_attempts)
        return (
            outcome['success'],
            outcome['turns_taken'],
            outcome.get('reward_score', 0.0),
        )
    except Exception as exc:
        print(f"Error in simulation: {str(exc)}")
        return (False, 0, 0.0)
31+
32+
def run_parallel_simulations(num_runs: int, max_attempts: int, num_workers: int) -> List[Tuple[bool, int, float]]:
    """Fan out simulations across a worker pool.

    Args:
        num_runs: Number of simulations to run
        max_attempts: Maximum attempts for each simulation
        num_workers: Number of parallel workers

    Returns:
        List of (success, turns_taken, reward_score) tuples, one per run.
    """
    # Every run gets the same attempt budget.
    jobs = [max_attempts] * num_runs
    with Pool(processes=num_workers) as pool:
        progress = tqdm(
            pool.imap(run_simulation_batch, jobs),
            total=num_runs,
            desc=f"Runs (max_attempts={max_attempts})",
            position=1,
            leave=False,
        )
        # Materialize inside the pool context so workers stay alive
        # until every result has been collected.
        return list(progress)
53+
54+
def evaluate_simulations() -> Dict[str, Any]:
    """Run simulations with fixed turns=2 and evaluate performance at different max_attempts.

    Each configuration is run multiple times to calculate mean and standard deviation.

    Returns:
        dict: per-configuration statistics keyed 'max_attempts_<k>' plus a
        'metadata' entry; on failure, a dict with 'error' and 'metadata'.
    """
    results = {}
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    num_runs = 4      # Number of runs per configuration
    max_turns = 2     # Fixed number of turns
    num_workers = 4   # Number of parallel workers
    # Hoisted so the sweep is defined once and metadata can record all of it.
    attempt_configs = [1, 2, 3]

    try:
        # Main progress bar over max_attempts configurations.
        with tqdm(total=len(attempt_configs), desc="Evaluating configurations", position=0) as pbar:
            for max_attempts in attempt_configs:
                print(f"\n=== Evaluating with max_attempts={max_attempts} ===\n")

                # Run simulations in parallel
                simulation_results = run_parallel_simulations(num_runs, max_attempts, num_workers)

                # Turns/rewards are only meaningful for successful runs.
                successes = []
                turns = []
                rewards = []
                for success, turns_taken, reward_score in simulation_results:
                    successes.append(1 if success else 0)
                    if success:
                        turns.append(turns_taken)
                        rewards.append(reward_score)

                # Calculate statistics (stdev needs at least two samples).
                success_rate = mean(successes) if successes else 0
                success_std = stdev(successes) if len(successes) > 1 else 0
                mean_turns = mean(turns) if turns else 0
                turns_std = stdev(turns) if len(turns) > 1 else 0
                mean_reward = mean(rewards) if rewards else 0
                reward_std = stdev(rewards) if len(rewards) > 1 else 0

                results[f'max_attempts_{max_attempts}'] = {
                    'success_rate': success_rate,
                    'success_std': success_std,
                    'mean_turns': mean_turns,
                    'turns_std': turns_std,
                    'mean_reward': mean_reward,
                    'reward_std': reward_std,
                    'successes': successes,
                    'turns': turns,
                    'rewards': rewards
                }

                # Print results
                print(f"\n=== Results for max_attempts={max_attempts} ===")
                print(f"Success Rate: {success_rate * 100:.1f}% ± {success_std * 100:.1f}%")
                if turns:
                    print(f"Average Turns to Success: {mean_turns:.1f} ± {turns_std:.1f}")
                    print(f"Average Reward Score: {mean_reward:.1f} ± {reward_std:.1f}")
                else:
                    print("No successful runs recorded")
                print()

                pbar.update(1)

        # BUG FIX: the original stored 'max_retries': max_attempts, i.e. only
        # the leaked final loop value (3) under a mismatched name; record the
        # whole sweep instead.
        results['metadata'] = {
            'timestamp': timestamp,
            'num_runs': num_runs,
            'max_turns': max_turns,
            'max_attempts_values': attempt_configs,
            'num_workers': num_workers
        }

        return results

    except Exception as e:
        print(f"Error during evaluation: {str(e)}")
        return {'error': str(e), 'metadata': {'timestamp': timestamp}}
133+
134+
def main():
    """Entry point: run the evaluation sweep.

    Returns the results dict so programmatic callers can inspect it
    (the original discarded it; returning is backward compatible).
    """
    return evaluate_simulations()


if __name__ == '__main__':
    main()

codeconflict/codeagent/models.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
from typing import Literal, Optional
2+
from pydantic import BaseModel
3+
4+
class AgentMessage(BaseModel):
    """Structured conversational turn emitted by an agent."""
    # 'speak' communicates content; 'leave' ends the conversation.
    action: Literal['speak', 'leave']
    # Message text; None is expected for 'leave' actions.
    content: Optional[str] = None
7+
8+
class AgentAction(BaseModel):
    """Structured tool-style action requested by an agent.

    Fields are optional because each action kind uses a different subset —
    presumably 'read'/'write' use ``path`` (and ``content`` for writes) while
    'execute' uses ``command``; TODO confirm against the action handler.
    """
    action: Literal['read', 'write', 'execute']
    # CONSISTENCY: use Optional[...] like the sibling AgentMessage model; the
    # original mixed `str | None` (3.10+ only) with Optional in the same file.
    command: Optional[str] = None
    path: Optional[str] = None
    content: Optional[str] = None
    continue_action: Optional[bool] = None  # Indicates if the agent needs to continue with more actions

0 commit comments

Comments
 (0)