hud-evals
diff --git a/‎hud/cli/__init__.py‎
Lines changed: 69 additions & 12 deletions b/‎hud/cli/__init__.py‎
Lines changed: 69 additions & 12 deletions
diff --git a/‎hud/cli/analyze_metadata.py‎
Lines changed: 4 additions & 1 deletion b/‎hud/cli/analyze_metadata.py‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎hud/cli/env_utils.py‎
Lines changed: 133 additions & 0 deletions b/‎hud/cli/env_utils.py‎
Lines changed: 133 additions & 0 deletions
@@ -29,7 +29,6 @@
 from .push import push_command
 from .remove import remove_command
 from .utils import CaptureLogger
-from .eval import eval_command
 
 # Create the main Typer app
 app = typer.Typer(
@@ -132,7 +131,7 @@ def analyze(
 def debug(
     params: list[str] = typer.Argument(  # type: ignore[arg-type]  # noqa: B008
         None,
-        help="Docker image followed by optional Docker run arguments (e.g., 'hud-image:latest -e KEY=value')",  # noqa: E501
+        help="Docker image, environment directory, or config file followed by optional Docker arguments",  # noqa: E501
     ),
     config: Path = typer.Option(  # noqa: B008
         None,
@@ -148,6 +147,12 @@ def debug(
         "--cursor",
         help="Debug a server from Cursor config",
     ),
+    build: bool = typer.Option(
+        False,
+        "--build",
+        "-b",
+        help="Build image before debugging (for directory mode)",
+    ),
     max_phase: int = typer.Option(
         5,
         "--max-phase",
@@ -160,15 +165,24 @@ def debug(
     """🐛 Debug MCP environment - test initialization, tools, and readiness.
 
     Examples:
-        hud debug hud-text-2048:latest
-        hud debug my-mcp-server:v1 -e API_KEY=xxx -p 8080:8080
+        hud debug .                              # Debug current directory
+        hud debug environments/browser           # Debug specific directory
+        hud debug . --build                      # Build then debug
+        hud debug hud-text-2048:latest          # Debug Docker image
+        hud debug my-mcp-server:v1 -e API_KEY=xxx
         hud debug --config mcp-config.json
         hud debug --cursor text-2048-dev
-        hud debug hud-browser:dev --max-phase 3
+        hud debug . --max-phase 3               # Stop after phase 3
     """
-
+    # Import here to avoid circular imports
+    from .env_utils import get_image_name, is_environment_directory, build_environment, image_exists
+    from hud.utils.design import HUDDesign
+    
+    design = HUDDesign()
+    
     # Determine the command to run
     command = None
+    docker_args = []
 
     if config:
         # Load config from JSON file
@@ -186,13 +200,44 @@ def debug(
             console.print(f"[red]❌ {error or 'Failed to parse cursor config'}[/red]")
             raise typer.Exit(1)
     elif params:
-        image, *docker_args = params
-        # Build Docker command
-        command = ["docker", "run", "--rm", "-i", *docker_args, image]
+        first_param = params[0]
+        docker_args = params[1:] if len(params) > 1 else []
+        
+        # Check if it's a directory
+        if Path(first_param).exists() and is_environment_directory(first_param):
+            # Directory mode - like hud dev
+            directory = first_param
+            
+            # Get or generate image name
+            image_name, source = get_image_name(directory)
+            
+            if source == "auto":
+                design.info(f"Auto-generated image name: {image_name}")
+            
+            # Build if requested or if image doesn't exist
+            if build or not image_exists(image_name):
+                if not build and not image_exists(image_name):
+                    if typer.confirm(f"Image {image_name} not found. Build it now?"):
+                        build = True
+                    else:
+                        raise typer.Exit(1)
+                
+                if build:
+                    if not build_environment(directory, image_name):
+                        raise typer.Exit(1)
+            
+            # Build Docker command
+            command = ["docker", "run", "--rm", "-i", *docker_args, image_name]
+        else:
+            # Assume it's an image name
+            image = first_param
+            command = ["docker", "run", "--rm", "-i", *docker_args, image]
     else:
-        console.print("[red]Error: Must specify either a Docker image, --config, or --cursor[/red]")
+        console.print("[red]Error: Must specify a directory, Docker image, --config, or --cursor[/red]")
         console.print("\nExamples:")
-        console.print("  hud debug hud-text-2048:latest")
+        console.print("  hud debug .                      # Debug current directory")
+        console.print("  hud debug environments/browser   # Debug specific directory")
+        console.print("  hud debug hud-text-2048:latest  # Debug Docker image")
         console.print("  hud debug --config mcp-config.json")
         console.print("  hud debug --cursor my-server")
         raise typer.Exit(1)
@@ -699,7 +744,19 @@ def eval(
         design.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}")
         raise typer.Exit(1)
 
-    # Import and run the command
+    # Import eval_command lazily to avoid importing agent dependencies
+    try:
+        from .eval import eval_command
+    except ImportError as e:
+        from hud.utils.design import HUDDesign
+        design = HUDDesign()
+        design.error(
+            "Evaluation dependencies are not installed. "
+            "Please install with: pip install 'hud-python[agent]'"
+        )
+        raise typer.Exit(1) from e
+    
+    # Run the command
     eval_command(
         source=source,
         full=full,
 
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 from pathlib import Path
+from urllib.parse import quote
 
 import requests
 import yaml
@@ -26,7 +27,9 @@ def fetch_lock_from_registry(reference: str) -> dict | None:
         if "/" in reference and ":" not in reference:
             reference = f"{reference}:latest"
 
-        registry_url = f"{settings.hud_telemetry_url.rstrip('/')}/registry/envs/{reference}"
+        # URL-encode the path segments to handle special characters in tags
+        url_safe_path = "/".join(quote(part, safe="") for part in reference.split("/"))
+        registry_url = f"{settings.hud_telemetry_url.rstrip('/')}/registry/envs/{url_safe_path}"
 
         headers = {}
         if settings.api_key:
 
@@ -0,0 +1,133 @@
+"""Shared utilities for environment directory handling."""
+
+from __future__ import annotations
+
+import subprocess
+from pathlib import Path
+from typing import Any
+
+import toml
+
+from hud.utils.design import HUDDesign
+
+# Shared HUDDesign instance the helpers in this module use for user-facing status output.
+design = HUDDesign()
+
+
+def get_image_name(directory: str | Path, image_override: str | None = None) -> tuple[str, str]:
+    """Resolve the Docker image name for an environment directory, with source tracking.
+
+    Resolution order:
+      1. ``image_override`` when it is truthy.
+      2. ``[tool.hud].image`` from ``<directory>/pyproject.toml`` when present and readable.
+      3. An auto-generated ``hud-<directory-name>:dev`` reference.
+
+    Args:
+        directory: Environment directory to inspect.
+        image_override: Explicit image name that bypasses every lookup.
+
+    Returns:
+        Tuple of (image_name, source) where source is "override", "cache", or "auto"
+    """
+    if image_override:
+        return image_override, "override"
+
+    # Check pyproject.toml
+    pyproject_path = Path(directory) / "pyproject.toml"
+    if pyproject_path.exists():
+        try:
+            # TOML documents are UTF-8 by specification; do not rely on the locale default.
+            with open(pyproject_path, encoding="utf-8") as f:
+                config = toml.load(f)
+            cached_image = config.get("tool", {}).get("hud", {}).get("image")
+            if cached_image:
+                return cached_image, "cache"
+        except Exception:
+            # Deliberate best effort: an unreadable or malformed pyproject.toml must not
+            # block the caller, so fall through to the auto-generated name.
+            pass
+
+    # Auto-generate with :dev tag
+    dir_path = Path(directory).resolve()  # Absolute path, so "." becomes a real name
+    # Only a filesystem root has an empty name (and is its own parent); use a fixed
+    # placeholder there instead of producing the invalid reference "hud-:dev".
+    dir_name = dir_path.name or dir_path.parent.name or "env"
+
+    # Docker repository names must be lowercase and may only contain a limited set of
+    # characters; map everything else (underscores, spaces, ...) to "-" one-for-one.
+    allowed_chars = "abcdefghijklmnopqrstuvwxyz0123456789.-"
+    clean_name = "".join(ch if ch in allowed_chars else "-" for ch in dir_name.lower())
+    return f"hud-{clean_name}:dev", "auto"
+
+
+def update_pyproject_toml(directory: str | Path, image_name: str, silent: bool = False) -> None:
+    """Record ``image_name`` under ``[tool.hud].image`` in the directory's pyproject.toml.
+
+    Does nothing when ``<directory>/pyproject.toml`` does not exist. Any failure while
+    reading, updating or writing the file is reported as a warning instead of raised.
+
+    Args:
+        directory: Environment directory that holds the pyproject.toml.
+        image_name: Image reference to store.
+        silent: When True, suppress both the success and the warning message.
+    """
+    pyproject_path = Path(directory) / "pyproject.toml"
+    if not pyproject_path.exists():
+        return
+
+    try:
+        # TOML documents are UTF-8 by specification; do not rely on the locale default.
+        with open(pyproject_path, encoding="utf-8") as f:
+            config = toml.load(f)
+
+        # Ensure [tool.hud] exists, then update the image name
+        config.setdefault("tool", {}).setdefault("hud", {})["image"] = image_name
+
+        # Serialize BEFORE opening for writing: mode "w" truncates the file immediately,
+        # so a serialization error after that point would wipe the existing content.
+        # NOTE(review): the file is regenerated from the parsed dict, so comments and
+        # formatting of the original are presumably not preserved - confirm if that matters.
+        rendered = toml.dumps(config)
+        with open(pyproject_path, "w", encoding="utf-8") as f:
+            f.write(rendered)
+
+        if not silent:
+            design.success(f"Updated pyproject.toml with image: {image_name}")
+    except Exception as e:
+        if not silent:
+            design.warning(f"Could not update pyproject.toml: {e}")
+
+
+def build_environment(directory: str | Path, image_name: str, no_cache: bool = False) -> bool:
+    """Build Docker image for an environment.
+
+    Runs ``docker build -t <image_name> [--no-cache] <directory>`` with Docker's own
+    output passed straight through to the terminal. On success the image name is also
+    written to the directory's pyproject.toml.
+
+    Args:
+        directory: Build context passed to ``docker build``.
+        image_name: Tag to apply to the built image.
+        no_cache: When True, pass ``--no-cache`` to Docker.
+
+    Returns:
+        True if build succeeded, False otherwise (including when the ``docker``
+        executable cannot be launched at all).
+    """
+    build_cmd = ["docker", "build", "-t", image_name]
+    if no_cache:
+        build_cmd.append("--no-cache")
+    build_cmd.append(str(directory))
+
+    design.info(f"🔨 Building image: {image_name}{' (no cache)' if no_cache else ''}")
+    design.info("")  # Empty line before Docker output
+
+    # Just run Docker build directly - it has its own nice live display
+    try:
+        result = subprocess.run(build_cmd, check=False)  # noqa: S603
+    except OSError as e:
+        # Docker CLI missing or not executable: honour the bool contract instead of
+        # surfacing a traceback to the CLI user.
+        design.error(f"Build failed! Could not run docker: {e}")
+        return False
+
+    if result.returncode != 0:
+        design.error("Build failed!")
+        return False
+
+    design.info("")  # Empty line after Docker output
+    design.success(f"Build successful! Image: {image_name}")
+    # Update pyproject.toml (silently since we already showed success)
+    update_pyproject_toml(directory, image_name, silent=True)
+    return True
+
+
+def image_exists(image_name: str) -> bool:
+    """Check if a Docker image exists locally.
+
+    Runs ``docker image inspect <image_name>`` with its output discarded and reports
+    whether it exited with status 0.
+
+    Returns:
+        True when the inspect command succeeds; False when it fails or when the
+        ``docker`` executable cannot be launched.
+    """
+    try:
+        result = subprocess.run(  # noqa: S603
+            ["docker", "image", "inspect", image_name],  # noqa: S607
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+            check=False,
+        )
+    except OSError:
+        # No usable docker CLI means no local image can be found either; a boolean
+        # probe should not raise for that.
+        return False
+    return result.returncode == 0
+
+
+def is_environment_directory(path: str | Path) -> bool:
+    """Tell whether ``path`` looks like an environment directory.
+
+    The path qualifies when it is a directory that contains both a ``Dockerfile``
+    and a ``pyproject.toml`` entry. Nothing else (such as a src directory) is
+    inspected.
+    """
+    candidate = Path(path)
+    required_entries = ("Dockerfile", "pyproject.toml")
+    return candidate.is_dir() and all((candidate / entry).exists() for entry in required_entries)