diff --git a/docs/cookbooks/codex-coding.mdx b/docs/cookbooks/codex-coding.mdx
new file mode 100644
index 00000000..7903eb9d
--- /dev/null
+++ b/docs/cookbooks/codex-coding.mdx
@@ -0,0 +1,378 @@
+---
+title: "Codex Agent"
+description: "Build coding agents with OpenAI's native shell and apply_patch tools"
+icon: "code"
+---
+
+HUD provides native support for OpenAI's coding tools (`shell` and `apply_patch`), enabling you to build powerful coding agents that can create, modify, and execute code.
+
+
+ Follow along with the full working example on GitHub.
+
+
+## Overview
+
+OpenAI's Responses API includes specialized tools for coding tasks:
+
+| Tool | Purpose | HUD Implementation |
+| ------------- | ------------------------------------------------------ | -------------------------------------- |
+| `shell` | Execute shell commands in a persistent bash session | `hud.tools.shell.ShellTool` |
+| `apply_patch` | Create, update, and delete files using V4A diff format | `hud.tools.apply_patch.ApplyPatchTool` |
+
+When you register tools named `shell` or `apply_patch` in your environment, the `OpenAIAgent` automatically converts them to OpenAI's native tool types for optimal performance.
+
+## Two Modes
+
+HUD supports two execution modes for coding agents:
+
+| Mode | Tools Run On | Inference Via | API Keys Required |
+| --------------------- | ------------ | --------------- | ----------------- |
+| **Local** (`--local`) | Your machine | OpenAI directly | `OPENAI_API_KEY` |
+| **Hub** (default) | HUD Cloud | HUD Gateway | `HUD_API_KEY` |
+
+Both modes support traces on hud.ai if `HUD_API_KEY` is set.
+
+## Quick Start
+
+### Local Mode (No Docker)
+
+Run coding agents directly on your machine without any infrastructure:
+
+```python
+import hud
+from hud.agents.openai import OpenAIAgent
+from hud.tools.shell import ShellTool
+from hud.tools.apply_patch import ApplyPatchTool
+
+# Create environment with coding tools
+env = hud.Environment("coding")
+shell_tool = ShellTool()
+apply_patch_tool = ApplyPatchTool(base_path="/path/to/workspace")
+
+@env.tool()
+async def shell(commands: list[str], timeout_ms: int | None = None):
+ """Execute shell commands."""
+ result = await shell_tool(commands=commands, timeout_ms=timeout_ms)
+ return result.to_dict()
+
+@env.tool()
+async def apply_patch(type: str, path: str, diff: str | None = None):
+ """Apply file patches."""
+ result = await apply_patch_tool(type=type, path=path, diff=diff)
+ return result.to_dict()
+
+# Run with OpenAI agent (calls OpenAI directly)
+agent = OpenAIAgent.create(model="gpt-5.1")
+
+async with hud.eval(env(), name="coding-task") as ctx:
+ result = await agent.run(ctx, max_steps=20)
+```
+
+### Hub Mode (Cloud Execution)
+
+
+ **Prerequisites**: You must create the `codex_environment_sandbox` environment
+ in [hud.ai](https://hud.ai) first before using hub mode. Go to
+ [hud.ai](https://hud.ai) → **New** → **Environment** → Import from
+ [hud-evals/codex_environment_sandbox](https://github.com/hud-evals/codex_environment_sandbox).
+ Once deployed, your environment will be accessible via `connect_hub()`.
+
+
+Connect to HUD Hub for full cloud execution and telemetry:
+
+```python
+import hud
+from hud.agents.openai import OpenAIAgent
+from hud.settings import settings
+from openai import AsyncOpenAI
+
+# Connect to HUD Hub environment
+env = hud.Environment()
+env.connect_hub("codex_environment_sandbox")
+
+# Use HUD Gateway for inference (full telemetry)
+model_client = AsyncOpenAI(
+ base_url=settings.hud_gateway_url,
+ api_key=settings.api_key,
+)
+agent = OpenAIAgent.create(
+ model="gpt-5.1",
+ model_client=model_client,
+ validate_api_key=False,
+)
+
+async with hud.eval(env(), name="coding-task") as ctx:
+ result = await agent.run(ctx, max_steps=20)
+```
+
+
+ The first request may take a few seconds while the environment spins up in the
+ cloud. Subsequent requests will be faster.
+
+
+## Tool Specifications
+
+### Shell Tool
+
+The `ShellTool` provides a persistent bash session for executing commands.
+
+**Features:**
+
+- Auto-restart on error (session automatically restarts if needed)
+- Dynamic timeout via `timeout_ms` parameter
+- Persistent environment (exported variables, working directory)
+- Concurrent command execution support
+
+**Input Schema:**
+
+```python
+{
+ "commands": ["ls -la", "cat file.py"], # List of commands
+ "timeout_ms": 30000, # Optional timeout per command
+ "max_output_length": 10000 # Optional output limit
+}
+```
+
+**Output Format:**
+
+```python
+{
+ "output": [
+ {
+ "stdout": "file1.py\nfile2.py",
+ "stderr": "",
+ "outcome": {"type": "exit", "exit_code": 0}
+ }
+ ]
+}
+```
+
+### Apply Patch Tool
+
+The `ApplyPatchTool` creates, updates, and deletes files using OpenAI's V4A diff format.
+
+**Operations:**
+
+| Operation | Description | Diff Required |
+| ------------- | -------------------- | ------------- |
+| `create_file` | Create a new file | Yes |
+| `update_file` | Modify existing file | Yes |
+| `delete_file` | Remove a file | No |
+
+**Input Schema:**
+
+```python
+{
+ "type": "update_file",
+ "path": "src/main.py",
+ "diff": "..." # V4A diff content
+}
+```
+
+**V4A Diff Format Example:**
+
+```diff
+@@ def hello():
+- print("Hello")
++ print("Hello, World!")
+```
+
+**Output Format:**
+
+```python
+{
+ "status": "completed", # or "failed"
+ "output": "Updated src/main.py"
+}
+```
+
+## Agent Integration
+
+The `OpenAIAgent` automatically detects `shell` and `apply_patch` tools and converts them to OpenAI's native types:
+
+```python
+# In hud/agents/openai.py
+def _to_openai_tool(self, tool):
+ if tool.name == "shell":
+ return FunctionShellToolParam(type="shell")
+ if tool.name == "apply_patch":
+ return ApplyPatchToolParam(type="apply_patch")
+ # ... regular function tools
+```
+
+This means:
+
+1. The model sees native `shell` and `apply_patch` tools
+2. Responses include `shell_call` and `apply_patch_call` output types
+3. The agent routes these back to your registered tools
+
+## Complete Example
+
+Here's the full local mode example with a working directory:
+
+```python
+import asyncio
+import os
+import tempfile
+
+from dotenv import load_dotenv
+from openai import AsyncOpenAI
+
+load_dotenv() # Load .env file
+
+import hud
+from hud.agents.openai import OpenAIAgent
+from hud.tools.shell import ShellTool
+from hud.tools.apply_patch import ApplyPatchTool
+
+
+async def main():
+ # Set up working directory
+ work_dir = "./codex_output"
+ os.makedirs(work_dir, exist_ok=True)
+ base_path = os.path.abspath(work_dir)
+
+ # Initialize tools
+ shell_tool = ShellTool()
+ apply_patch_tool = ApplyPatchTool(base_path=base_path)
+
+ # Create environment with local tools
+ env = hud.Environment("local-codex")
+
+ @env.tool()
+ async def shell(
+ commands: list[str],
+ timeout_ms: int | None = None,
+ max_output_length: int | None = None,
+ ) -> dict:
+ """Execute shell commands in a bash session."""
+ import shlex
+ # Change to working directory before executing
+ safe_path = shlex.quote(base_path)
+ prefixed_commands = [f"cd {safe_path} && {cmd}" for cmd in commands]
+ result = await shell_tool(
+ commands=prefixed_commands,
+ timeout_ms=timeout_ms,
+ max_output_length=max_output_length,
+ )
+ return result.to_dict()
+
+ @env.tool()
+ async def apply_patch(
+ type: str,
+ path: str,
+ diff: str | None = None,
+ ) -> dict:
+ """Apply file operations using V4A diff format."""
+ result = await apply_patch_tool(type=type, path=path, diff=diff)
+ return result.to_dict()
+
+ # Define scenario
+ @env.scenario("coding_task")
+ async def coding_task_scenario(task_description: str):
+ yield f"""You are a skilled software developer. Complete the following task:
+
+{task_description}
+
+Use the available tools:
+- `shell` to run commands (ls, cat, python, etc.)
+- `apply_patch` to create or modify files
+
+Work in the current directory. When done, verify your work runs correctly."""
+
+ yield 1.0
+
+ # Create agent
+ agent = OpenAIAgent.create(model="gpt-5.1", verbose=True)
+
+ # Run the task
+ task = "Create a Python script called main.py that prints Hello World"
+ eval_task = env("coding_task", task_description=task)
+
+ async with hud.eval(eval_task, name="codex-coding-local") as ctx:
+ await agent.run(ctx, max_steps=20)
+
+ print(f"Reward: {ctx.reward}")
+ print(f"Files created in: {base_path}")
+
+ # Show created files
+ for f in os.listdir(base_path):
+ print(f" - {f}")
+
+
+asyncio.run(main())
+```
+
+## CLI Usage
+
+### Setting Up API Keys
+
+Create a `.env` file in your project root:
+
+```bash
+# For local mode (calls OpenAI directly)
+OPENAI_API_KEY=sk-...
+
+# For hub mode OR traces (recommended)
+HUD_API_KEY=sk-hud-...
+```
+
+Get your keys:
+
+- **HUD_API_KEY**: [hud.ai/project/api-keys](https://hud.ai/project/api-keys)
+- **OPENAI_API_KEY**: [platform.openai.com/api-keys](https://platform.openai.com/api-keys)
+
+
+ If you have both keys set, you get local execution with cloud traces - the
+ best of both worlds!
+
+
+### Running the Example
+
+```bash
+# Local mode - tools run on your machine
+uv run python examples/06_codex_coding_agent.py --local
+
+# Local mode with persistent output directory
+uv run python examples/06_codex_coding_agent.py --local --work-dir ./codex_output
+
+# Hub mode - full cloud execution (default)
+uv run python examples/06_codex_coding_agent.py
+
+# Custom task
+uv run python examples/06_codex_coding_agent.py --local \
+ --task "Create a Python script that prints the Fibonacci sequence up to 10 numbers"
+
+# Verbose output
+uv run python examples/06_codex_coding_agent.py --local --verbose
+```
+
+### CLI Options
+
+| Flag | Default | Description |
+| ------------- | ------------------ | -------------------------------------------------- |
+| `--local` | Off | Run locally (tools on your machine, OpenAI direct) |
+| `--task` | Hello World script | The coding task to complete |
+| `--model` | `gpt-5.1` | Codex-capable model (`gpt-5.1`, `gpt-5.1-codex`) |
+| `--work-dir` | Temp directory | Working directory (local mode only) |
+| `--max-steps` | `20` | Maximum agent steps |
+| `--verbose` | Off | Enable verbose output |
+
+## Security Considerations
+
+
+ The shell and apply_patch tools can execute arbitrary commands and modify
+ files. Use them in sandboxed environments for untrusted tasks.
+
+
+## See Also
+
+- [Codex-capable models](https://platform.openai.com/docs/guides/tools-shell#supported-models) - OpenAI models that support native shell and apply_patch tools
+- [Tools Reference](/reference/tools) - Complete tool documentation
+- [OpenAI Agent](/reference/agents#openaiagent) - Agent configuration options
+- [Integrations](/guides/integrations) - Using HUD with other frameworks
+- [Sandboxing](/guides/sandboxing) - Running agents safely
diff --git a/docs/docs.json b/docs/docs.json
index 71e69e11..114ba090 100644
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -60,6 +60,12 @@
"migration"
]
},
+ {
+ "group": "Cookbooks",
+ "pages": [
+ "cookbooks/codex-coding"
+ ]
+ },
{
"group": "Advanced",
"pages": [
@@ -231,4 +237,4 @@
"twitter:description": "OSS Evaluations and RL Environments SDK"
}
}
-}
\ No newline at end of file
+}
diff --git a/examples/06_codex_coding_agent.py b/examples/06_codex_coding_agent.py
new file mode 100644
index 00000000..1873e7dd
--- /dev/null
+++ b/examples/06_codex_coding_agent.py
@@ -0,0 +1,383 @@
+#!/usr/bin/env python3
+"""
+Codex Coding Agent Example
+
+This example demonstrates how to use OpenAI's **Codex-capable** models with
+native `shell` and `apply_patch` tools via the HUD SDK.
+
+What this shows:
+- **Local mode**: Run locally without Docker - tools execute on your machine
+- **Hub mode**: Connect to HUD Hub for full telemetry and cloud execution
+- OpenAIAgent automatically converts tools to OpenAI's native tool types
+
+Usage:
+    # Local mode (no Docker or HUD_API_KEY required; only OPENAI_API_KEY is needed)
+ uv run python examples/06_codex_coding_agent.py --local
+
+ # Hub mode (requires HUD_API_KEY)
+ export HUD_API_KEY="sk-hud-..."
+ uv run python examples/06_codex_coding_agent.py
+
+ # Custom task
+ uv run python examples/06_codex_coding_agent.py --local \\
+ --task "Create a Python script that prints the Fibonacci sequence"
+
+Requirements:
+ - Install deps: `uv sync`
+ - For local mode: OPENAI_API_KEY environment variable
+ - For hub mode: HUD_API_KEY environment variable
+ - For traces (hud.eval): HUD_API_KEY environment variable
+"""
+
+import argparse
+import asyncio
+import os
+import shlex
+
+from dotenv import load_dotenv
+from openai import AsyncOpenAI
+
+# Load .env file from current directory or parent directories
+load_dotenv()
+
+import hud
+from hud.agents.openai import OpenAIAgent
+from hud.settings import settings
+from hud.tools.apply_patch import ApplyPatchTool
+from hud.tools.shell import ShellTool
+
+# =============================================================================
+# Configuration
+# =============================================================================
+
+# Default hub environment name
+DEFAULT_HUB = "codex_environment_sandbox"
+
+# Codex-capable models that support native shell/apply_patch tools
+CODEX_MODELS = {
+ "gpt-5.1-codex",
+ "gpt-5.1",
+}
+
+
+# =============================================================================
+# Run Coding Task Locally (No Docker)
+# =============================================================================
+
+
+async def run_coding_task_local(
+ task: str,
+ model: str = "gpt-5.1",
+ max_steps: int = 20,
+ verbose: bool = False,
+ work_dir: str | None = None,
+) -> None:
+ """
+ Run a coding task locally without Docker.
+
+ Uses ShellTool and ApplyPatchTool running on your local machine.
+ Files are created in a temporary directory (or specified work_dir).
+
+ Args:
+ task: Description of the coding task
+ model: OpenAI model to use (default: gpt-5.1)
+ max_steps: Maximum agent steps (default: 20)
+ verbose: Enable verbose output
+ work_dir: Working directory for file operations (default: temp dir)
+ """
+ # Validate model is Codex-capable
+ if model not in CODEX_MODELS:
+ raise ValueError(
+ f"Model '{model}' is not in the Codex-capable list {sorted(CODEX_MODELS)}.\n"
+ "Use a model that supports native shell/apply_patch tools."
+ )
+
+ # Create working directory
+ if work_dir:
+ os.makedirs(work_dir, exist_ok=True)
+ base_path = os.path.abspath(work_dir)
+ else:
+ # Default to ./codex_output
+ work_dir = "./codex_output"
+ os.makedirs(work_dir, exist_ok=True)
+ base_path = os.path.abspath(work_dir)
+
+ print(f"📁 Working directory: {base_path}")
+
+ # Initialize tools
+ shell_tool = ShellTool()
+ apply_patch_tool = ApplyPatchTool(base_path=base_path)
+
+ # Create environment with local tools
+ env = hud.Environment("local-codex")
+
+ @env.tool()
+ async def shell(
+ commands: list[str],
+ timeout_ms: int | None = None,
+ max_output_length: int | None = None,
+ ) -> dict:
+ """Execute shell commands in a bash session.
+
+ Args:
+ commands: List of shell commands to execute
+ timeout_ms: Optional timeout in milliseconds for each command
+ max_output_length: Optional max output length hint
+ """
+ # Change to working directory before executing
+ # Use shlex.quote to safely handle paths with spaces or special characters
+ safe_path = shlex.quote(base_path)
+ prefixed_commands = [f"cd {safe_path} && {cmd}" for cmd in commands]
+ result = await shell_tool(
+ commands=prefixed_commands,
+ timeout_ms=timeout_ms,
+ max_output_length=max_output_length,
+ )
+ return result.to_dict()
+
+ @env.tool()
+ async def apply_patch(
+ type: str,
+ path: str,
+ diff: str | None = None,
+ ) -> dict:
+ """Apply file operations using V4A diff format.
+
+ Args:
+ type: Operation type - "create_file", "update_file", or "delete_file"
+ path: The file path to operate on
+ diff: The diff content (required for create_file and update_file)
+ """
+ result = await apply_patch_tool(type=type, path=path, diff=diff)
+ return result.to_dict()
+
+ # Create OpenAI client
+ model_client = AsyncOpenAI()
+ agent = OpenAIAgent.create(
+ model=model,
+ model_client=model_client,
+ verbose=verbose,
+ )
+
+ print(f"🤖 Model: {model}")
+ print(f"📋 Task: {task}")
+ print("=" * 60)
+
+ # Define a scenario for the coding task
+ @env.scenario("coding_task")
+ async def coding_task_scenario(task_description: str):
+ yield f"""You are a skilled software developer. Complete the following task:
+
+{task_description}
+
+Use the available tools:
+- `shell` to run commands (ls, cat, python, etc.)
+- `apply_patch` to create or modify files
+
+Work in the current directory. When done, verify your work runs correctly."""
+
+ # Simple success - task completed
+ yield 1.0
+
+ # Run the agent
+ eval_task = env("coding_task", task_description=task)
+
+ async with hud.eval(eval_task, name="codex-coding-local") as ctx:
+ await agent.run(ctx, max_steps=max_steps)
+
+ print("=" * 60)
+ print("✅ Task completed!")
+ print(f"📊 Reward: {ctx.reward}")
+ print(f"📁 Files created in: {base_path}")
+
+ # List created files
+ if os.path.exists(base_path):
+ files = os.listdir(base_path)
+ if files:
+ print(f"📄 Files: {', '.join(files)}")
+
+
+# =============================================================================
+# Run Coding Task via HUD Hub
+# =============================================================================
+
+
+async def run_coding_task_hub(
+ task: str,
+ model: str = "gpt-5.1",
+ max_steps: int = 20,
+ hub_name: str = DEFAULT_HUB,
+ verbose: bool = False,
+) -> None:
+ """
+ Run a coding task against the codex_environment_sandbox via HUD Hub.
+
+ Uses connect_hub() to route through HUD's infrastructure, enabling
+ full telemetry (both inference and environment steps visible in trace).
+
+ Note: You must create the codex_environment_sandbox environment in hud.ai
+ first before using this function.
+
+ Args:
+ task: Description of the coding task
+ model: OpenAI model to use (default: gpt-5.1)
+ max_steps: Maximum agent steps (default: 20)
+ hub_name: Hub environment name (default: codex_environment_sandbox)
+ verbose: Enable verbose output
+ """
+ # Require HUD_API_KEY for hub mode
+ if not settings.api_key:
+ raise ValueError(
+ "HUD_API_KEY is required for hub mode.\n"
+ "Get yours at: https://hud.ai/project/api-keys\n"
+ "Then: export HUD_API_KEY='sk-hud-...'\n\n"
+ "Or use --local flag to run without HUD infrastructure."
+ )
+
+ print(f"🌐 Connecting to hub: {hub_name}")
+
+ # Create environment and connect via HUD Hub (full telemetry)
+ env = hud.Environment()
+ env.connect_hub(hub_name)
+
+ # Validate model is Codex-capable
+ if model not in CODEX_MODELS:
+ raise ValueError(
+ f"Model '{model}' is not in the Codex-capable list {sorted(CODEX_MODELS)}.\n"
+ "Use a model that supports native shell/apply_patch tools."
+ )
+
+ # Create agent with HUD Gateway for inference telemetry
+ model_client = AsyncOpenAI(
+ base_url=settings.hud_gateway_url,
+ api_key=settings.api_key,
+ )
+ agent = OpenAIAgent.create(
+ model=model,
+ model_client=model_client,
+ validate_api_key=False, # HUD key won't validate against OpenAI
+ verbose=verbose,
+ )
+ print("🌐 Using HUD Gateway for inference")
+
+ print(f"🤖 Model: {model}")
+ print(f"📋 Task: {task}")
+ print("=" * 60)
+
+ # Define a scenario for the coding task
+ @env.scenario("coding_task")
+ async def coding_task_scenario(task_description: str):
+ yield f"""You are a skilled software developer. Complete the following task:
+
+{task_description}
+
+Use the available tools:
+- `shell` to run commands (ls, cat, python, etc.)
+- `apply_patch` to create or modify files
+
+Work in the current directory. When done, verify your work runs correctly."""
+
+        # Fixed success reward (mirrors local mode); swap in real grading if the hub environment provides it
+ yield 1.0
+
+ # Run the agent
+ eval_task = env("coding_task", task_description=task)
+
+ async with hud.eval(eval_task, name="codex-coding") as ctx:
+ await agent.run(ctx, max_steps=max_steps)
+
+ print("=" * 60)
+ print("✅ Task completed!")
+ print(f"📊 Reward: {ctx.reward}")
+
+
+# =============================================================================
+# CLI
+# =============================================================================
+
+
+def _parse_args() -> argparse.Namespace:
+ parser = argparse.ArgumentParser(
+ description="Run coding tasks with OpenAI's native shell and apply_patch tools",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog="""
+Examples:
+ # Local mode (no Docker, no HUD_API_KEY required)
+ uv run python examples/06_codex_coding_agent.py --local
+
+ # Local mode with custom working directory
+ uv run python examples/06_codex_coding_agent.py --local --work-dir ./codex_output
+
+ # Hub mode (full telemetry, requires HUD_API_KEY)
+ uv run python examples/06_codex_coding_agent.py
+
+ # Custom task
+ uv run python examples/06_codex_coding_agent.py --local \\
+ --task "Create a Python script that prints the Fibonacci sequence up to 10 numbers"
+
+ # Verbose output
+ uv run python examples/06_codex_coding_agent.py --local --verbose
+
+ # Use a different Codex model
+ uv run python examples/06_codex_coding_agent.py --local --model gpt-5.1-codex
+""",
+ )
+ parser.add_argument(
+ "--local",
+ action="store_true",
+ help="Run locally without Docker (tools execute on your machine)",
+ )
+ parser.add_argument(
+ "--task",
+ type=str,
+ default="Create a Python script called main.py that prints 'Hello, World!' and the current date/time",
+ help="The coding task to complete",
+ )
+ parser.add_argument(
+ "--model",
+ type=str,
+ default="gpt-5.1",
+ help="Codex-capable OpenAI model (default: gpt-5.1)",
+ )
+ parser.add_argument(
+ "--max-steps",
+ type=int,
+ default=20,
+ help="Maximum agent steps (default: 20)",
+ )
+ parser.add_argument(
+ "--work-dir",
+ type=str,
+ default=None,
+ help="Working directory for file operations (local mode only, default: ./codex_output)",
+ )
+ parser.add_argument(
+ "--verbose",
+ action="store_true",
+ help="Enable verbose output",
+ )
+ return parser.parse_args()
+
+
+async def main() -> None:
+ args = _parse_args()
+
+ if args.local:
+ await run_coding_task_local(
+ task=args.task,
+ model=args.model,
+ max_steps=args.max_steps,
+ verbose=args.verbose,
+ work_dir=args.work_dir,
+ )
+ else:
+ await run_coding_task_hub(
+ task=args.task,
+ model=args.model,
+ max_steps=args.max_steps,
+ verbose=args.verbose,
+ )
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/hud/tools/shell.py b/hud/tools/shell.py
index fe6a7efa..f357c593 100644
--- a/hud/tools/shell.py
+++ b/hud/tools/shell.py
@@ -82,10 +82,10 @@ async def start(self) -> None:
await asyncio.sleep(0)
return
- # preexec_fn and user demotion only available on Unix
+ # preexec_fn and user demotion only available on Unix when running as root
preexec_fn = None
- if sys.platform != "win32":
-
+ if sys.platform != "win32" and os.getuid() == 0:
+ # Only demote when running as root (e.g., inside Docker containers)
def demote() -> None:
# This only runs in the child process (Unix only)
os.setsid() # type: ignore[attr-defined]