Merge pull request #120 from WecoAI/dev

aliroberts · web-flow · commit 6f2430a6c12b · 2026-03-17T15:14:30.000Z
Merge Dev - Add support for observability + external optimizer
diff --git a/pyproject.toml b/pyproject.toml
@@ -8,7 +8,7 @@ name = "weco"
 authors = [{ name = "Weco AI Team", email = "contact@weco.ai" }]
 description = "Documentation for `weco`, a CLI for using Weco AI's code optimizer."
 readme = "README.md"
-version = "0.3.18"
+version = "0.3.19"
 license = { file = "LICENSE" }
 requires-python = ">=3.9"
 dependencies = [
diff --git a/tests/langsmith/wizard/test_server.py b/tests/langsmith/wizard/test_server.py
@@ -206,7 +206,7 @@ def test_set_key_failure_clears_key(self, mock_os, mock_client_prop, wizard_serv
         resp, data = post_json(conn, "/api/set-key", {"key": "bad-key"})
         assert resp.status == 200
         assert data["connected"] is False
-        assert data["error"] == "Connection failed. Check that your API key is valid."
+        assert data["error"] == "Connection failed: Check that your API key is valid."
         mock_os.environ.pop.assert_called_once_with("LANGCHAIN_API_KEY", None)
 
     @patch.object(WizardServer, "client", new_callable=PropertyMock)
diff --git a/weco/cli.py b/weco/cli.py
@@ -6,6 +6,7 @@
 
 from .auth import perform_login
 from .config import clear_api_key, load_weco_api_key
+from .observe.cli import configure_observe_parser, execute_observe_command
 from .constants import DEFAULT_MODELS
 from .events import (
     send_event,
@@ -468,6 +469,14 @@ def _main() -> None:
     setup_parser = subparsers.add_parser("setup", help="Set up Weco for use with AI tools")
     configure_setup_parser(setup_parser)
 
+    # --- Observe Command Parser Setup ---
+    observe_parser = subparsers.add_parser(
+        "observe",
+        help="Track external optimization runs (init, log, complete, fail)",
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+    configure_observe_parser(observe_parser)
+
     args = parser.parse_args()
 
     # Create event context with via_skill flag
@@ -513,6 +522,9 @@ def _main() -> None:
 
         handle_setup_command(args, console)
         sys.exit(0)
+    elif args.command == "observe":
+        execute_observe_command(args)
+        sys.exit(0)
     else:
         # This case should be hit if 'weco' is run alone and chatbot logic didn't catch it,
         # or if an invalid command is provided.
diff --git a/weco/config.py b/weco/config.py
@@ -43,7 +43,17 @@ def save_api_key(api_key: str):
 
 
 def load_weco_api_key() -> str | None:
-    """Loads the Weco API key."""
+    """Loads the Weco API key.
+
+    Resolution order:
+      1. WECO_API_KEY environment variable
+      2. Credentials file (~/.config/weco/credentials.json)
+    """
+    # Environment variable takes precedence
+    env_key = os.environ.get("WECO_API_KEY")
+    if env_key:
+        return env_key
+
     if not CREDENTIALS_FILE.exists():
         return None
     try:
diff --git a/weco/integrations/langsmith/wizard/server.py b/weco/integrations/langsmith/wizard/server.py
@@ -137,6 +137,7 @@ def handle_status(self):
                 list(self.server.client.list_datasets(limit=1))
                 connected = True
             except Exception:
+                raise
                 pass
 
         state = self.server.initial_state
@@ -179,10 +180,21 @@ def handle_set_key(self):
         try:
             list(self.server.client.list_datasets(limit=1))
             self.send_json({"connected": True, "error": None})
-        except Exception:
+        except Exception as e:
             os.environ.pop("LANGCHAIN_API_KEY", None)
             self.server.reset_client()
-            self.send_json({"connected": False, "error": "Connection failed. Check that your API key is valid."})
+            # Show error type/status without leaking the full exception (which may contain the key)
+            error_type = type(e).__name__
+            detail = ""
+            if "401" in str(e) or "403" in str(e):
+                detail = "API key was rejected (check it's valid and for the correct workspace)."
+            elif "404" in str(e):
+                detail = "LangSmith API endpoint not found (check LANGCHAIN_ENDPOINT)."
+            elif "ConnectionError" in error_type or "timeout" in str(e).lower():
+                detail = "Could not reach LangSmith API (check your network connection)."
+            else:
+                detail = "Check that your API key is valid."
+            self.send_json({"connected": False, "error": f"Connection failed: {detail}"})
 
     def handle_list_datasets(self):
         try:
diff --git a/weco/observe/__init__.py b/weco/observe/__init__.py
@@ -0,0 +1,26 @@
+"""Weco Observe — observability SDK for external optimization loops.
+
+Usage:
+    from weco.observe import WecoObserver
+
+    obs = WecoObserver()
+    run = obs.create_run(
+        name="val_bpb sweep v3",
+        source_code={"train.py": open("train.py").read()},
+        primary_metric="val_bpb",
+        maximize=False,
+    )
+
+    run.log_step(
+        step=i,
+        status="completed",
+        description="Added RMSNorm",
+        metrics={"val_bpb": 1.03, "memory_gb": 34.5},
+        code={"train.py": open("train.py").read()},
+    )
+
+"""
+
+from .observer import WecoObserver, ObserveRun
+
+__all__ = ["WecoObserver", "ObserveRun"]
diff --git a/weco/observe/api.py b/weco/observe/api.py
@@ -0,0 +1,76 @@
+"""HTTP client for external run API endpoints.
+
+All functions are synchronous (using requests) and never raise exceptions.
+On failure they emit a warning and return None, so the caller keeps running.
+"""
+
+import warnings
+from typing import Any, Optional
+
+import requests
+
+from weco import __base_url__
+
+
+def create_run(
+    *,
+    source_code: dict[str, str],
+    metric_name: str,
+    maximize: bool,
+    name: Optional[str] = None,
+    additional_instructions: Optional[str] = None,
+    metadata: Optional[dict[str, Any]] = None,
+    auth_headers: dict[str, str],
+) -> Optional[dict]:
+    """Create an external run.
+
+    POSTs to ``{__base_url__}/external/runs``. ``name`` and
+    ``additional_instructions`` are included only when they are not ``None``;
+    ``metadata`` is included only when it is truthy.
+
+    Args:
+        source_code: Mapping of file path to file contents.
+        metric_name: Name of the primary metric.
+        maximize: Whether the metric should be maximized.
+        name: Optional run name.
+        additional_instructions: Optional free-form instructions for the run.
+        metadata: Optional extra metadata to attach to the run.
+        auth_headers: HTTP headers sent with the request.
+
+    Returns:
+        The decoded JSON response, or ``None`` on any failure (a warning is
+        emitted instead of raising).
+    """
+    # NOTE: annotations use typing.Optional rather than ``X | None`` because
+    # they are evaluated when the function is defined, and PEP 604 unions only
+    # work at runtime on Python 3.10+.
+    try:
+        payload: dict[str, Any] = {"source_code": source_code, "metric_name": metric_name, "maximize": maximize}
+        if name is not None:
+            payload["name"] = name
+        if additional_instructions is not None:
+            payload["additional_instructions"] = additional_instructions
+        if metadata:
+            payload["metadata"] = metadata
+
+        response = requests.post(f"{__base_url__}/external/runs", json=payload, headers=auth_headers, timeout=(5, 30))
+        response.raise_for_status()
+        return response.json()
+    except Exception as e:
+        # Deliberately broad: this API is best-effort and must never raise.
+        warnings.warn(f"weco observe: failed to create run: {e}", stacklevel=2)
+        return None
+
+
+def log_step(
+    *,
+    run_id: str,
+    step: int,
+    status: str = "completed",
+    description: Optional[str] = None,
+    metrics: Optional[dict[str, float]] = None,
+    code: Optional[dict[str, str]] = None,
+    parent_step: Optional[int] = None,
+    metadata: Optional[dict[str, Any]] = None,
+    auth_headers: dict[str, str],
+) -> Optional[dict]:
+    """Log a step for an external run.
+
+    POSTs to ``{__base_url__}/external/runs/{run_id}/steps``. ``step`` and
+    ``status`` are always sent. ``description``, ``code`` and ``parent_step``
+    are sent when they are not ``None``; ``metrics`` and ``metadata`` are sent
+    only when truthy.
+
+    Args:
+        run_id: Identifier of the run the step belongs to.
+        step: Step number.
+        status: Step status string.
+        description: Optional description of the step.
+        metrics: Optional mapping of metric name to value.
+        code: Optional mapping of file path to file contents.
+        parent_step: Optional step number of the parent step.
+        metadata: Optional extra metadata to attach to the step.
+        auth_headers: HTTP headers sent with the request.
+
+    Returns:
+        The decoded JSON response, or ``None`` on any failure (a warning is
+        emitted instead of raising).
+    """
+    # NOTE: annotations use typing.Optional rather than ``X | None`` because
+    # they are evaluated when the function is defined, and PEP 604 unions only
+    # work at runtime on Python 3.10+.
+    try:
+        payload: dict[str, Any] = {"step": step, "status": status}
+        if description is not None:
+            payload["description"] = description
+        if metrics:
+            payload["metrics"] = metrics
+        if code is not None:
+            payload["code"] = code
+        if parent_step is not None:
+            payload["parent_step"] = parent_step
+        if metadata:
+            payload["metadata"] = metadata
+
+        response = requests.post(
+            f"{__base_url__}/external/runs/{run_id}/steps", json=payload, headers=auth_headers, timeout=(5, 30)
+        )
+        response.raise_for_status()
+        return response.json()
+    except Exception as e:
+        # Deliberately broad: this API is best-effort and must never raise.
+        warnings.warn(f"weco observe: failed to log step {step}: {e}", stacklevel=2)
+        return None
diff --git a/weco/observe/cli.py b/weco/observe/cli.py
@@ -0,0 +1,152 @@
+"""CLI commands for weco observe.
+
+All commands follow the fire-and-forget pattern: they print warnings to
+stderr on failure but always exit 0 so they never crash an agent's loop.
+"""
+
+import argparse
+import json
+import sys
+import warnings
+
+from weco.auth import handle_authentication
+from weco.observe import api
+
+
+def configure_observe_parser(observe_parser: argparse.ArgumentParser) -> None:
+    """Register the ``init`` and ``log`` subcommands on the observe parser.
+
+    The selected subcommand name is stored on the parsed namespace as
+    ``observe_command``.
+    """
+    commands = observe_parser.add_subparsers(dest="observe_command", help="Observe commands")
+    goal_choices = ["maximize", "max", "minimize", "min"]
+    status_choices = ["completed", "failed"]
+
+    # --- init ---
+    init_cmd = commands.add_parser("init", help="Initialize an external run for tracking")
+    init_cmd.add_argument("--name", type=str, default=None, help="Run name")
+    init_cmd.add_argument("--metric", type=str, required=True, help="Primary metric name (e.g. val_bpb)")
+    init_cmd.add_argument(
+        "-g",
+        "--goal",
+        type=str,
+        choices=goal_choices,
+        default="minimize",
+        help="Specify 'maximize'/'max' or 'minimize'/'min' (default: minimize)",
+    )
+    # Exactly one of -s/--source or --sources must be supplied for init.
+    init_files = init_cmd.add_mutually_exclusive_group(required=True)
+    init_files.add_argument(
+        "-s",
+        "--source",
+        type=str,
+        help="Path to a single source code file to track (e.g. train.py)",
+    )
+    init_files.add_argument(
+        "--sources",
+        nargs="+",
+        type=str,
+        help="Paths to multiple source code files to track (e.g. train.py prepare.py)",
+    )
+    init_cmd.add_argument(
+        "-i",
+        "--additional-instructions",
+        type=str,
+        default=None,
+        help="Additional instructions for the run",
+    )
+
+    # --- log ---
+    log_cmd = commands.add_parser("log", help="Log a step for an external run")
+    log_cmd.add_argument("--run-id", type=str, required=True, help="Run ID (from weco observe init)")
+    log_cmd.add_argument("--step", type=int, required=True, help="Step number")
+    log_cmd.add_argument(
+        "--status",
+        type=str,
+        default="completed",
+        choices=status_choices,
+        help="Step status (default: completed)",
+    )
+    log_cmd.add_argument("--description", type=str, default=None, help="Description of what was tried")
+    log_cmd.add_argument("--metrics", type=str, default=None, help="Metrics as JSON (e.g. '{\"val_bpb\": 1.03}')")
+    # A code snapshot is optional for log, but the two spellings still exclude each other.
+    log_files = log_cmd.add_mutually_exclusive_group()
+    log_files.add_argument("-s", "--source", type=str, default=None, help="Single source code file to snapshot")
+    log_files.add_argument(
+        "--sources",
+        nargs="+",
+        type=str,
+        default=None,
+        help="Multiple source code files to snapshot",
+    )
+    log_cmd.add_argument("--parent-step", type=int, default=None, help="Parent step number for tree lineage")
+
+    # --- complete/fail are no longer needed ---
+    # External run lifecycle is managed by the dashboard, not the CLI.
+    # Logging a step to a closed run will silently reopen it.
+
+
+def _read_code_files(paths: list[str]) -> dict[str, str]:
+    """Read source code files from disk.
+
+    Files are decoded as UTF-8 rather than with the platform's locale
+    encoding. A file that cannot be read is reported with a warning and left
+    out of the result, so the returned mapping may be smaller than ``paths``
+    (or empty).
+
+    Args:
+        paths: File paths to read.
+
+    Returns:
+        Mapping of each successfully read path (exactly as given) to its
+        text contents.
+    """
+    source_code: dict[str, str] = {}
+    for path in paths:
+        try:
+            with open(path, encoding="utf-8") as f:
+                source_code[path] = f.read()
+        except FileNotFoundError:
+            warnings.warn(f"weco observe: file not found: {path}", stacklevel=2)
+        except Exception as e:
+            # Any other read problem (permissions, a directory, undecodable
+            # bytes, ...) is also downgraded to a warning: best-effort only.
+            warnings.warn(f"weco observe: error reading {path}: {e}", stacklevel=2)
+    return source_code
+
+
+def execute_observe_command(args: argparse.Namespace) -> None:
+    """Execute an observe subcommand.
+
+    Every early-exit path (no subcommand given, authentication error, not
+    logged in) prints a message to stderr and calls ``sys.exit(0)``. Otherwise
+    the matching handler is invoked and this function returns normally.
+
+    Args:
+        args: Parsed CLI namespace; ``observe_command`` selects the subcommand.
+    """
+    if not args.observe_command:
+        # List only the subcommands that configure_observe_parser registers.
+        print("Usage: weco observe {init,log}", file=sys.stderr)
+        sys.exit(0)
+
+    # Authenticate. Only the call that can fail sits inside the try block.
+    try:
+        _, auth_headers = handle_authentication(None)
+    except Exception as e:
+        print(f"weco observe: authentication failed: {e}", file=sys.stderr)
+        sys.exit(0)
+    if not auth_headers:
+        print("weco observe: not logged in. Run `weco login` first.", file=sys.stderr)
+        sys.exit(0)
+
+    if args.observe_command == "init":
+        _handle_init(args, auth_headers)
+    elif args.observe_command == "log":
+        _handle_log(args, auth_headers)
+
+
+def _handle_init(args: argparse.Namespace, auth_headers: dict) -> None:
+    """Handle `weco observe init`.
+
+    Reads the requested source files, asks the API to create a run and, on
+    success, prints the new run id on stdout. Failures are reported on stderr.
+    """
+    # --sources wins when given; otherwise wrap the single --source path in a list.
+    paths = [args.source] if args.sources is None else args.sources
+    files = _read_code_files(paths)
+    if not files:
+        print("weco observe: no source files could be read", file=sys.stderr)
+        sys.exit(0)
+
+    response = api.create_run(
+        source_code=files,
+        metric_name=args.metric,
+        maximize=args.goal in {"maximize", "max"},
+        name=args.name,
+        additional_instructions=args.additional_instructions,
+        auth_headers=auth_headers,
+    )
+
+    run_id = response.get("run_id") if response else None
+    if not run_id:
+        print("weco observe: failed to create run", file=sys.stderr)
+        return
+
+    # stdout carries nothing but the run id so it can be captured by $(...)
+    print(run_id)
+
+
+def _handle_log(args: argparse.Namespace, auth_headers: dict) -> None:
+    """Handle `weco observe log`.
+
+    Parses ``--metrics`` as JSON, optionally snapshots the requested source
+    files, and forwards everything to ``api.log_step``. If ``--metrics`` is
+    not valid JSON, or is valid JSON but not an object, a message is printed
+    to stderr and the process exits with status 0 without logging the step.
+
+    Args:
+        args: Parsed CLI namespace for the ``log`` subcommand.
+        auth_headers: HTTP headers passed through to the API call.
+    """
+    # Parse metrics JSON
+    metrics = {}
+    if args.metrics:
+        try:
+            parsed = json.loads(args.metrics)
+        except json.JSONDecodeError as e:
+            print(f"weco observe: invalid metrics JSON: {e}", file=sys.stderr)
+            sys.exit(0)
+        # `null` keeps meaning "no metrics". Any other non-object value (list,
+        # number, string, ...) cannot be a name -> value mapping, so reject it
+        # here with a clear message instead of sending it to the API.
+        if parsed is not None and not isinstance(parsed, dict):
+            print(
+                f"weco observe: --metrics must be a JSON object, got {type(parsed).__name__}",
+                file=sys.stderr,
+            )
+            sys.exit(0)
+        metrics = parsed or {}
+
+    # Read source files if specified
+    code = None
+    source_arg = args.sources if args.sources is not None else ([args.source] if args.source else None)
+    if source_arg:
+        code = _read_code_files(source_arg)
+
+    api.log_step(
+        run_id=args.run_id,
+        step=args.step,
+        status=args.status,
+        description=args.description,
+        metrics=metrics,
+        code=code,
+        parent_step=args.parent_step,
+        auth_headers=auth_headers,
+    )
diff --git a/weco/observe/observer.py b/weco/observe/observer.py