Skip to content

Commit 4263fc6

Browse files
authored
Merge pull request #93 from WecoAI/feature/byok
Add --api-key parameter that allows users to bring their own API keys for calling models. This can be used with the run and resume commands. Multiple keys can be provided, e.g.: --api-key openai=<key> gemini=<key>
2 parents 5faf6fb + b50c495 commit 4263fc6

File tree

8 files changed

+373
-34
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ For more advanced examples, including [Triton](/examples/triton/README.md), [CUD
9595
| `--eval-timeout` | Timeout in seconds for each step in evaluation. | No timeout (unlimited) | `--eval-timeout 3600` |
9696
| `--save-logs` | Save execution output from each optimization step to disk. Creates timestamped directories with raw output files and a JSONL index for tracking execution history. | `False` | `--save-logs` |
9797
| `--apply-change` | Automatically apply the best solution to the source file without prompting. | `False` | `--apply-change` |
98+
| `--api-key` | API keys for LLM providers (BYOK). Format: `provider=key`. Can specify multiple providers. | `None` | `--api-key openai=sk-xxx` |
9899

99100
---
100101

@@ -149,6 +150,7 @@ Arguments for `weco resume`:
149150
|----------|-------------|---------|
150151
| `run-id` | The UUID of the run to resume (shown at the start of each run) | `0002e071-1b67-411f-a514-36947f0c4b31` |
151152
| `--apply-change` | Automatically apply the best solution to the source file without prompting | `--apply-change` |
153+
| `--api-key` | (Optional) API keys for LLM providers (BYOK). Format: `provider=key` | `--api-key openai=sk-xxx` |
152154

153155
Notes:
154156
- Works only for interrupted runs (status: `error`, `terminated`, etc.).

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ dependencies = [
1818
"gitingest",
1919
"fastapi",
2020
"slowapi",
21-
"psutil",
21+
"psutil"
2222
]
2323
keywords = ["AI", "Code Optimization", "Code Generation"]
2424
classifiers = [
@@ -34,7 +34,7 @@ weco = "weco.cli:main"
3434
Homepage = "https://github.com/WecoAI/weco-cli"
3535

3636
[project.optional-dependencies]
37-
dev = ["ruff", "build", "setuptools_scm"]
37+
dev = ["ruff", "build", "setuptools_scm", "pytest>=7.0.0"]
3838

3939
[tool.setuptools]
4040
packages = ["weco"]

tests/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"""Tests for weco CLI."""

tests/test_byok.py

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
"""Tests to verify API keys are correctly passed through the system and sent to the API."""
2+
3+
import pytest
4+
from unittest.mock import patch, MagicMock
5+
from rich.console import Console
6+
7+
from weco.api import start_optimization_run, evaluate_feedback_then_suggest_next_solution
8+
9+
10+
class TestApiKeysInStartOptimizationRun:
    """Test that api_keys are correctly included in start_optimization_run requests."""

    @staticmethod
    def _stub_response():
        """Build a mock HTTP response mimicking a successful /runs/ reply.

        Shared by all tests in this class; the original duplicated this
        setup verbatim in every test.
        """
        mock_response = MagicMock()
        mock_response.json.return_value = {
            "run_id": "test-run-id",
            "solution_id": "test-solution-id",
            "code": "print('hello')",
            "plan": "test plan",
        }
        mock_response.raise_for_status = MagicMock()
        return mock_response

    @pytest.fixture
    def mock_console(self):
        """Create a mock console for testing."""
        return MagicMock(spec=Console)

    @pytest.fixture
    def base_params(self, mock_console):
        """Base parameters for start_optimization_run."""
        return {
            "console": mock_console,
            "source_code": "print('hello')",
            "source_path": "test.py",
            "evaluation_command": "python test.py",
            "metric_name": "accuracy",
            "maximize": True,
            "steps": 10,
            "code_generator_config": {"model": "o4-mini"},
            "evaluator_config": {"model": "o4-mini"},
            "search_policy_config": {"num_drafts": 2},
        }

    @patch("weco.api.requests.post")
    def test_api_keys_included_in_request(self, mock_post, base_params):
        """Test that api_keys are included in the request JSON when provided."""
        mock_post.return_value = self._stub_response()

        api_keys = {"openai": "sk-test-key", "anthropic": "sk-ant-test"}
        start_optimization_run(**base_params, api_keys=api_keys)

        # Verify the request was made with api_keys in the JSON payload
        mock_post.assert_called_once()
        request_json = mock_post.call_args.kwargs["json"]
        assert "api_keys" in request_json
        assert request_json["api_keys"] == {"openai": "sk-test-key", "anthropic": "sk-ant-test"}

    @patch("weco.api.requests.post")
    def test_api_keys_not_included_when_none(self, mock_post, base_params):
        """Test that api_keys field is not included when api_keys is None."""
        mock_post.return_value = self._stub_response()

        start_optimization_run(**base_params, api_keys=None)

        # Verify the request was made without api_keys
        mock_post.assert_called_once()
        request_json = mock_post.call_args.kwargs["json"]
        assert "api_keys" not in request_json

    @patch("weco.api.requests.post")
    def test_api_keys_not_included_when_empty_dict(self, mock_post, base_params):
        """Test that api_keys field is not included when api_keys is an empty dict."""
        mock_post.return_value = self._stub_response()

        # Empty dict is falsy, so api_keys should not be included
        start_optimization_run(**base_params, api_keys={})

        mock_post.assert_called_once()
        request_json = mock_post.call_args.kwargs["json"]
        assert "api_keys" not in request_json
99+
100+
class TestApiKeysInEvaluateFeedbackThenSuggest:
    """Test that api_keys are correctly included in evaluate_feedback_then_suggest_next_solution requests."""

    @staticmethod
    def _stub_response():
        """Build a mock HTTP response mimicking a successful /suggest reply.

        Shared by all tests in this class; the original duplicated this
        setup verbatim in every test.
        """
        mock_response = MagicMock()
        mock_response.json.return_value = {
            "run_id": "test-run-id",
            "solution_id": "new-solution-id",
            "code": "print('improved')",
            "plan": "improvement plan",
            "is_done": False,
        }
        mock_response.raise_for_status = MagicMock()
        return mock_response

    @staticmethod
    def _call_suggest(api_keys):
        """Invoke the function under test with fixed arguments and the given api_keys."""
        return evaluate_feedback_then_suggest_next_solution(
            console=MagicMock(spec=Console),
            run_id="test-run-id",
            step=1,
            execution_output="accuracy: 0.95",
            auth_headers={"Authorization": "Bearer test-token"},
            api_keys=api_keys,
        )

    @patch("weco.api.requests.post")
    def test_api_keys_included_in_suggest_request(self, mock_post):
        """Test that api_keys are included in the suggest request JSON when provided."""
        mock_post.return_value = self._stub_response()

        self._call_suggest(api_keys={"openai": "sk-test-key"})

        mock_post.assert_called_once()
        request_json = mock_post.call_args.kwargs["json"]
        assert "api_keys" in request_json
        assert request_json["api_keys"] == {"openai": "sk-test-key"}

    @patch("weco.api.requests.post")
    def test_api_keys_not_included_in_suggest_when_none(self, mock_post):
        """Test that api_keys field is not included in suggest request when api_keys is None."""
        mock_post.return_value = self._stub_response()

        self._call_suggest(api_keys=None)

        mock_post.assert_called_once()
        request_json = mock_post.call_args.kwargs["json"]
        assert "api_keys" not in request_json

    @patch("weco.api.requests.post")
    def test_api_keys_not_included_in_suggest_when_empty_dict(self, mock_post):
        # Docstring fixed: the original said "when api_keys is None" here,
        # a copy-paste error -- this case exercises the empty dict.
        """Test that api_keys field is not included in suggest request when api_keys is an empty dict."""
        mock_post.return_value = self._stub_response()

        self._call_suggest(api_keys={})

        mock_post.assert_called_once()
        request_json = mock_post.call_args.kwargs["json"]
        assert "api_keys" not in request_json

tests/test_cli.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
"""Tests for CLI functions, particularly parse_api_keys."""
2+
3+
import pytest
4+
from weco.cli import parse_api_keys
5+
6+
7+
class TestParseApiKeys:
    """Test cases for parse_api_keys function."""

    def test_parse_api_keys_none(self):
        """Test that None input returns empty dict."""
        result = parse_api_keys(None)
        assert result == {}
        assert isinstance(result, dict)

    def test_parse_api_keys_empty_list(self):
        """Test that empty list returns empty dict."""
        result = parse_api_keys([])
        assert result == {}
        assert isinstance(result, dict)

    def test_parse_api_keys_single_key(self):
        """Test parsing a single API key."""
        assert parse_api_keys(["openai=sk-xxx"]) == {"openai": "sk-xxx"}

    def test_parse_api_keys_multiple_keys(self):
        """Test parsing multiple API keys."""
        result = parse_api_keys(["openai=sk-xxx", "anthropic=sk-ant-yyy"])
        assert result == {"openai": "sk-xxx", "anthropic": "sk-ant-yyy"}

    def test_parse_api_keys_whitespace_handling(self):
        """Test that whitespace is stripped from provider and key."""
        result = parse_api_keys([" openai = sk-xxx ", " anthropic = sk-ant-yyy "])
        assert result == {"openai": "sk-xxx", "anthropic": "sk-ant-yyy"}

    def test_parse_api_keys_key_contains_equals(self):
        """Test that keys containing '=' are handled correctly (split on first '=' only)."""
        assert parse_api_keys(["openai=sk-xxx=extra=more"]) == {"openai": "sk-xxx=extra=more"}

    def test_parse_api_keys_no_equals(self):
        """Test that missing '=' raises ValueError."""
        with pytest.raises(ValueError, match="Invalid API key format.*Expected format: 'provider=key'"):
            parse_api_keys(["openai"])

    def test_parse_api_keys_empty_provider(self):
        """Test that empty provider raises ValueError."""
        with pytest.raises(ValueError, match="Provider and key must be non-empty"):
            parse_api_keys(["=sk-xxx"])

    def test_parse_api_keys_empty_key(self):
        """Test that empty key raises ValueError."""
        with pytest.raises(ValueError, match="Provider and key must be non-empty"):
            parse_api_keys(["openai="])

    def test_parse_api_keys_both_empty(self):
        """Test that both empty provider and key raises ValueError."""
        with pytest.raises(ValueError, match="Provider and key must be non-empty"):
            parse_api_keys(["="])

    def test_parse_api_keys_duplicate_provider(self):
        """Test that duplicate providers overwrite previous value."""
        assert parse_api_keys(["openai=sk-xxx", "openai=sk-yyy"]) == {"openai": "sk-yyy"}

    def test_parse_api_keys_mixed_case_provider(self):
        """Test that mixed case providers are normalized correctly (lowercased)."""
        result = parse_api_keys(["OpenAI=sk-xxx", "ANTHROPIC=sk-ant-yyy"])
        assert result == {"openai": "sk-xxx", "anthropic": "sk-ant-yyy"}

weco/api.py

Lines changed: 29 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -109,31 +109,31 @@ def start_optimization_run(
109109
log_dir: str = ".runs",
110110
auth_headers: dict = {},
111111
timeout: Union[int, Tuple[int, int]] = (10, 3650),
112+
api_keys: Optional[Dict[str, str]] = None,
112113
) -> Optional[Dict[str, Any]]:
113114
"""Start the optimization run."""
114115
with console.status("[bold green]Starting Optimization..."):
115116
try:
116-
response = requests.post(
117-
f"{__base_url__}/runs/",
118-
json={
119-
"source_code": source_code,
120-
"source_path": source_path,
121-
"additional_instructions": additional_instructions,
122-
"objective": {"evaluation_command": evaluation_command, "metric_name": metric_name, "maximize": maximize},
123-
"optimizer": {
124-
"steps": steps,
125-
"code_generator": code_generator_config,
126-
"evaluator": evaluator_config,
127-
"search_policy": search_policy_config,
128-
},
129-
"eval_timeout": eval_timeout,
130-
"save_logs": save_logs,
131-
"log_dir": log_dir,
132-
"metadata": {"client_name": "cli", "client_version": __pkg_version__},
117+
request_json = {
118+
"source_code": source_code,
119+
"source_path": source_path,
120+
"additional_instructions": additional_instructions,
121+
"objective": {"evaluation_command": evaluation_command, "metric_name": metric_name, "maximize": maximize},
122+
"optimizer": {
123+
"steps": steps,
124+
"code_generator": code_generator_config,
125+
"evaluator": evaluator_config,
126+
"search_policy": search_policy_config,
133127
},
134-
headers=auth_headers,
135-
timeout=timeout,
136-
)
128+
"eval_timeout": eval_timeout,
129+
"save_logs": save_logs,
130+
"log_dir": log_dir,
131+
"metadata": {"client_name": "cli", "client_version": __pkg_version__},
132+
}
133+
if api_keys:
134+
request_json["api_keys"] = api_keys
135+
136+
response = requests.post(f"{__base_url__}/runs/", json=request_json, headers=auth_headers, timeout=timeout)
137137
response.raise_for_status()
138138
result = response.json()
139139
# Handle None values for code and plan fields
@@ -156,11 +156,10 @@ def resume_optimization_run(
156156
"""Request the backend to resume an interrupted run."""
157157
with console.status("[bold green]Resuming run..."):
158158
try:
159+
request_json = {"metadata": {"client_name": "cli", "client_version": __pkg_version__}}
160+
159161
response = requests.post(
160-
f"{__base_url__}/runs/{run_id}/resume",
161-
json={"metadata": {"client_name": "cli", "client_version": __pkg_version__}},
162-
headers=auth_headers,
163-
timeout=timeout,
162+
f"{__base_url__}/runs/{run_id}/resume", json=request_json, headers=auth_headers, timeout=timeout
164163
)
165164
response.raise_for_status()
166165
result = response.json()
@@ -180,17 +179,19 @@ def evaluate_feedback_then_suggest_next_solution(
180179
execution_output: str,
181180
auth_headers: dict = {},
182181
timeout: Union[int, Tuple[int, int]] = (10, 3650),
182+
api_keys: Optional[Dict[str, str]] = None,
183183
) -> Dict[str, Any]:
184184
"""Evaluate the feedback and suggest the next solution."""
185185
try:
186186
# Truncate the execution output before sending to backend
187187
truncated_output = truncate_output(execution_output)
188188

189+
request_json = {"execution_output": truncated_output, "metadata": {}}
190+
if api_keys:
191+
request_json["api_keys"] = api_keys
192+
189193
response = requests.post(
190-
f"{__base_url__}/runs/{run_id}/suggest",
191-
json={"execution_output": truncated_output, "metadata": {}},
192-
headers=auth_headers,
193-
timeout=timeout,
194+
f"{__base_url__}/runs/{run_id}/suggest", json=request_json, headers=auth_headers, timeout=timeout
194195
)
195196
response.raise_for_status()
196197
result = response.json()

0 commit comments

Comments
 (0)