This scenario evaluates agents on the [tau2-bench](https://github.com/sierra-research/tau2-bench) benchmark, which tests customer service agents on realistic tasks.
## Setup
1. **Download the tau2-bench data** (required once):

   ```bash
   ./scenarios/tau2/setup.sh
   ```
2. **Set your API key** in `.env`:

   ```
   OPENAI_API_KEY=your-key-here
   ```
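If you prefer not to keep the key in a file, exporting it in your shell before running should also work; this assumes the scenario reads `OPENAI_API_KEY` from the environment, which is the standard behaviour for OpenAI clients.

```bash
# Alternative to the .env file: export the key for the current shell session.
export OPENAI_API_KEY=your-key-here
```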
## Running the Benchmark
```bash
TAU2_DATA_DIR=./scenarios/tau2/tau2-bench/data uv run agentbeats-run scenarios/tau2/scenario.toml
```
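The `TAU2_DATA_DIR` value above should match the location where `setup.sh` placed the benchmark data; an optional sanity check before running:

```bash
# Optional: confirm the benchmark data directory exists before running.
ls ./scenarios/tau2/tau2-bench/data
```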
## Configuration
Edit `scenario.toml` to configure the benchmark:
```toml
[config]
domain = "airline"           # airline, retail, telecom, or mock
num_tasks = 5                # number of tasks to run
user_llm = "openai/gpt-4.1"  # LLM for user simulator (optional, defaults to gpt-4.1)
```
The agent LLM defaults to `openai/gpt-4.1` and can be configured via the `--agent-llm` CLI argument in `tau2_agent.py`.
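For example, a direct launch of the purple agent with a different model might look like the sketch below; the launch command and path are assumptions about a typical layout, `--agent-llm` is the flag named above, and the model string is only an illustrative value.

```bash
# Hypothetical example: start the purple agent with a non-default model.
# Adjust the command to match how your setup actually launches tau2_agent.py.
uv run python scenarios/tau2/tau2_agent.py --agent-llm openai/gpt-4o
```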
## Architecture
- **tau2_evaluator.py** (Green Agent): Runs the tau2 Orchestrator, which coordinates the user simulator, environment, and agent
- **tau2_agent.py** (Purple Agent): The agent being tested; it receives task descriptions and responds with tool calls or user responses