ci: test-e2e reworked to use TestPypi (#105)

tarilabs · claude · web-flow · commit 3bc6c621cb3a · 2026-03-31T13:56:12.000+02:00
* ci: test-e2e reworked to use TestPypi

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: tarilabs <matteo.mortari@gmail.com>

* chore: linting

Signed-off-by: tarilabs <matteo.mortari@gmail.com>

* chore: impl code review feedback

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: tarilabs <matteo.mortari@gmail.com>

* chore: impl review feedback

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: tarilabs <matteo.mortari@gmail.com>

---------

Signed-off-by: tarilabs <matteo.mortari@gmail.com>
Co-authored-by: Claude <noreply@anthropic.com>
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -49,12 +49,9 @@ jobs:
       run: uv run mypy src/evalhub
 
   e2e:
-    name: "Optional: E2E Tests (latest_eval_hub_server=${{ matrix.latest_eval_hub_server }})"
+    name: "Optional: E2E Tests"
     runs-on: ubuntu-latest
     continue-on-error: true
-    strategy:
-      matrix:
-        latest_eval_hub_server: [true, false]
     steps:
     - uses: actions/checkout@v4
     - name: Set up uv
@@ -66,13 +63,6 @@ jobs:
       run: uv python install ${{ env.MIN_PY_VER }}
     - name: Install dependencies
       run: uv sync --all-extras
-    - name: Fetch latest eval-hub server from eval-hub repo workflow
-      if: matrix.latest_eval_hub_server == true
-      env:
-        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      run: |
-        uv run ./scripts/fetch_latest_eval_hub_server.py
-        tree .venv/lib/python${{ env.MIN_PY_VER }}/site-packages/evalhub_server
     - name: Create kind cluster
       uses: helm/kind-action@v1
     - name: Deploy distribution registry
diff --git a/Makefile b/Makefile
@@ -8,9 +8,8 @@ test:
 
 .PHONY: test-e2e
 test-e2e:
-	@echo "*** WARN: Running E2E with uv run --no-sync so not to override any replacement for eval-hub-server ***"
-	uv run --no-sync uv pip show eval-hub-server
-	uv run --no-sync pytest --e2e -s -x --color=yes -ra
+	uv run uv pip show eval-hub-server
+	uv run pytest --e2e --e2e-debug -s -x --color=yes -ra
 
 .PHONY: ruff
 ruff:
diff --git a/pyproject.toml b/pyproject.toml
@@ -194,7 +194,7 @@ tag_format = "v$version"
 
 [dependency-groups]
 dev = [
-    "eval-hub-server @ git+https://github.com/eval-hub/eval-hub#subdirectory=python-server",  # TODO: this should be moved to a pypi release
+    "eval-hub-server",
     "fastapi>=0.124.4",
     "httpx>=0.28.1",
     "mypy>=1.19.1",
@@ -209,3 +209,11 @@ dev = [
     "types-pyyaml>=6.0.12.20250915",
     "uvicorn>=0.38.0",
 ]
+
+[[tool.uv.index]]
+name = "testpypi"
+url = "https://test.pypi.org/simple/"
+explicit = true  # uv will only use TestPyPI for packages explicitly configured to use it
+
+[tool.uv.sources]
+eval-hub-server = { index = "testpypi" }
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,5 +1,6 @@
 """Pytest configuration for eval-hub-sdk tests."""
 
+import logging
 from typing import Any
 
 import pytest
@@ -13,13 +14,26 @@ def pytest_addoption(parser: Any) -> None:
         default=False,
         help="Run only E2E tests",
     )
+    parser.addoption(
+        "--e2e-debug",
+        action="store_true",
+        default=False,
+        help="Enable DEBUG logging for E2E test fixtures",
+    )
 
 
 def pytest_configure(config: Any) -> None:
-    """Register custom markers."""
+    """Register custom markers and configure logging."""
     config.addinivalue_line(
         "markers", "e2e: mark test as end-to-end test (run with --e2e flag)"
     )
+    if config.getoption("--e2e-debug", default=False):
+        e2e_logger = logging.getLogger("tests.e2e.conftest")
+        e2e_logger.setLevel(logging.DEBUG)
+        handler = logging.StreamHandler()
+        handler.setLevel(logging.DEBUG)
+        handler.setFormatter(logging.Formatter("%(name)s %(levelname)s: %(message)s"))
+        e2e_logger.addHandler(handler)
 
 
 def pytest_collection_modifyitems(config: Any, items: list[Any]) -> None:
diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py
@@ -1,5 +1,6 @@
 """Shared fixtures and utilities for E2E tests."""
 
+import logging
 import platform
 import shutil
 import subprocess
@@ -11,6 +12,8 @@
 import httpx
 import pytest
 
+logger = logging.getLogger(__name__)
+
 
 def _kill_process_on_port(port: int) -> bool:
     """
@@ -46,67 +49,12 @@ def _kill_process_on_port(port: int) -> bool:
     return False
 
 
-def _run_server(working_dir: str) -> None:
-    """
-    Run the eval-hub server binary in the specified working directory.
-
-    This function is intended to be used as a target for multiprocessing.Process.
-
-    Args:
-        working_dir: Directory containing the config subdirectory
-    """
-    from evalhub_server import get_binary_path
-
-    binary_path = get_binary_path()
-    subprocess.run([binary_path], cwd=working_dir, check=False)
-
-
 def _ensure_server_binary() -> bool:
-    """
-    TODO: this should be REMOVED when eval-hub-server is moved to a pypi release
-    TODO: this is temporary until eval-hub-server is release'd on Pypi because we need the binary(ies)
-    """
     try:
         from evalhub_server import get_binary_path
 
-        # Check if binary already exists
-        try:
-            binary_path = get_binary_path()
-            return Path(binary_path).exists()
-        except FileNotFoundError:
-            pass
-
-        # Try to copy from local eval-hub repo
-        system = platform.system().lower()
-        machine = platform.machine().lower()
-
-        if system == "darwin":
-            binary_name = (
-                f"eval-hub-darwin-{'arm64' if machine == 'arm64' else 'amd64'}"
-            )
-        elif system == "linux":
-            binary_name = f"eval-hub-linux-{'arm64' if 'aarch64' in machine or 'arm64' in machine else 'amd64'}"
-        else:
-            return False
-
-        # Look for eval-hub repo (assume it's a sibling directory)
-        eval_hub_repo = Path(__file__).parent.parent.parent.parent / "eval-hub"
-        binary_source = eval_hub_repo / "bin" / binary_name
-
-        if binary_source.exists():
-            # Copy to evalhub_server package
-            import evalhub_server
-
-            pkg_dir = Path(evalhub_server.__file__).parent
-            binaries_dir = pkg_dir / "binaries"
-            binaries_dir.mkdir(exist_ok=True)
-
-            binary_dest = binaries_dir / binary_name
-            shutil.copy2(binary_source, binary_dest)
-            binary_dest.chmod(0o755)
-            return True
-
-        return False
+        binary_path = get_binary_path()
+        return Path(binary_path).exists()
     except Exception:
         return False
 
@@ -147,29 +95,33 @@ def evalhub_server_with_real_config() -> Generator[str, None, None]:
             "Please ensure the config directory is properly set up."
         )
 
-    # Create temporary directory for server files
-    with tempfile.TemporaryDirectory() as tmpdir:
+    # Create temporary directory for server files (preserved after run for debugging of server logfiles, etc)
+    tmpdir = tempfile.mkdtemp(prefix="evalhub-e2e-")
+    server_process = None
+    try:
+        logger.debug(f"\nTemp directory for this run: {tmpdir}")
         # Copy entire config directory to temp location (including providers subdirectory)
         config_dir = Path(tmpdir) / "config"
         shutil.copytree(config_source_dir, config_dir)
 
         # Debug: print directory structure
-        print("\n\n===== SERVER DIRECTORY STRUCTURE =====")
-        print(f"Working dir will be: {tmpdir}")
-        for item in sorted(Path(tmpdir).rglob("*")):
-            rel = item.relative_to(tmpdir)
-            print(f"  {rel}{'/' if item.is_dir() else ''}")
-        print("=" * 50)
+        dir_listing = "\n".join(
+            f"  {item.relative_to(tmpdir)}{'/' if item.is_dir() else ''}"
+            for item in sorted(Path(tmpdir).rglob("*"))
+        )
+        logger.debug(
+            "Server directory structure (working dir: %s):\n%s", tmpdir, dir_listing
+        )
 
         # Create log file for server output
         log_file = Path(tmpdir) / "server.log"
 
         # Kill any process already using port 8080
         port = 8080
         if _kill_process_on_port(port):
-            print(f"\n⚠️  WARNING: Killed existing process on port {port}")
-            print(
-                "    (This is normal if a previous test run didn't clean up properly)\n"
+            logger.warning(
+                "Killed existing process on port %d (normal if a previous test run didn't clean up properly)",
+                port,
             )
             # Give the OS a moment to release the port
             time.sleep(0.5)
@@ -208,22 +160,19 @@ def evalhub_server_with_real_config() -> Generator[str, None, None]:
 
         # Debug: Print server logs
         if log_file.exists():
-            print("\n\n===== SERVER LOGS =====")
             with open(log_file) as f:
                 logs = f.read()
-                # Only print first 3000 chars to avoid flooding output
-                if len(logs) > 3000:
-                    print(logs[:3000] + f"\n... ({len(logs) - 3000} more chars)")
-                else:
-                    print(logs)
-            print("=" * 50)
+            if len(logs) > 3000:
+                logs = logs[:3000] + f"\n... ({len(logs) - 3000} more chars)"
+            logger.debug("Server log file: %s\n%s", log_file.resolve(), logs)
 
         yield base_url
-
+    finally:
         # Cleanup: terminate the server subprocess
-        try:
-            server_process.terminate()
-            server_process.wait(timeout=5)
-        except subprocess.TimeoutExpired:
-            server_process.kill()
-            server_process.wait()
+        if server_process is not None:
+            try:
+                server_process.terminate()
+                server_process.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                server_process.kill()
+                server_process.wait()
diff --git a/uv.lock b/uv.lock