Phase 2.1: Implement warm pool infrastructure

haasonsaas · ampcode-com · haasonsaas · commit e95db34b919f · 2025-10-19T01:32:53.000-07:00
Features: - PoolManager with configurable min/max pool sizes per executor - Automatic pool maintenance with health checks and cleanup - Warm instance lifecycle: prepare -> reserve -> release -> cleanup - FirecrackerExecutor extended with warm instance support - Pre-allocated network resources for warm Firecracker instances - Configurable pool settings via environment variables Architecture: - WarmInstance dataclass tracks instance state and context - PoolConfig defines per-executor pool parameters - Automatic pool scaling based on demand and limits - Health monitoring with configurable check intervals - Graceful cleanup on agent shutdown Performance impact: - Network pre-allocation reduces Firecracker startup overhead - Pool maintenance runs asynchronously without blocking jobs - Ready for significant latency improvements with snapshot support Settings: - NIMBUS_WARM_POOLS_ENABLE=true (default) - NIMBUS_FIRECRACKER_MIN_WARM=1, MAX_WARM=3 - NIMBUS_DOCKER_MIN_WARM=0, MAX_WARM=2 Co-authored-by: Amp <amp@ampcode.com> Amp-Thread-ID: https://ampcode.com/threads/T-f299797b-2a6a-4934-81b2-17bf534552a8
diff --git a/src/nimbus/common/settings.py b/src/nimbus/common/settings.py
@@ -232,6 +232,13 @@ class HostAgentSettings(BaseSettings):
     docker_network_name: str = env_field("nimbus", "NIMBUS_DOCKER_NETWORK") 
     docker_workspace_path: Path = env_field(Path("/tmp/nimbus-workspaces"), "NIMBUS_DOCKER_WORKSPACE")
     docker_default_image: str = env_field("ubuntu:22.04", "NIMBUS_DOCKER_DEFAULT_IMAGE")
+    
+    # Warm pool settings
+    enable_warm_pools: bool = env_field(True, "NIMBUS_WARM_POOLS_ENABLE")
+    firecracker_min_warm: int = env_field(1, "NIMBUS_FIRECRACKER_MIN_WARM")
+    firecracker_max_warm: int = env_field(3, "NIMBUS_FIRECRACKER_MAX_WARM")
+    docker_min_warm: int = env_field(0, "NIMBUS_DOCKER_MIN_WARM")  
+    docker_max_warm: int = env_field(2, "NIMBUS_DOCKER_MAX_WARM")
 
     @field_validator("cpu_affinity", mode="before")
     @classmethod
diff --git a/src/nimbus/host_agent/agent.py b/src/nimbus/host_agent/agent.py
@@ -30,6 +30,7 @@
 )
 from .firecracker import FirecrackerError, FirecrackerLauncher, FirecrackerResult, MicroVMNetwork
 from ..runners import EXECUTORS, Executor, RunResult
+from ..runners.pool_manager import PoolManager, PoolConfig
 from .ssh import ActiveSSHSession, apply_port_forward, remove_port_forward
 from .state import AgentStateStore, StoredJobNetwork
 from ..optional.ssh_dns import SSHSessionConfig
@@ -63,6 +64,28 @@ def __init__(self, settings: HostAgentSettings) -> None:
         for executor in self._executors.values():
             if hasattr(executor, 'initialize'):
                 executor.initialize(settings)
+        
+        # Initialize pool manager
+        self._pool_manager = None
+        if settings.enable_warm_pools:
+            self._pool_manager = PoolManager(settings, self._executors)
+            # Configure pools based on settings
+            if "firecracker" in self._executors:
+                self._pool_manager._pool_configs["firecracker"] = PoolConfig(
+                    executor_name="firecracker",
+                    min_warm=settings.firecracker_min_warm,
+                    max_warm=settings.firecracker_max_warm,
+                    max_idle_seconds=600,
+                    health_check_interval=30,
+                )
+            if "docker" in self._executors:
+                self._pool_manager._pool_configs["docker"] = PoolConfig(
+                    executor_name="docker", 
+                    min_warm=settings.docker_min_warm,
+                    max_warm=settings.docker_max_warm,
+                    max_idle_seconds=180,
+                    health_check_interval=60,
+                )
         allowed_registries = list(settings.artifact_registry_allow_list)
         parsed = urlparse(str(settings.control_plane_base_url))
         if parsed.hostname:
@@ -106,6 +129,10 @@ async def run(self) -> None:
         await self._state_store.open()
         await self._recover_state()
         await self._ensure_metrics_server()
+        
+        # Start pool manager
+        if self._pool_manager:
+            await self._pool_manager.start()
 
         if self._sbom_output_path:
             try:
@@ -143,6 +170,11 @@ async def run(self) -> None:
 
     async def stop(self) -> None:
         self._running = False
+        
+        # Stop pool manager first
+        if self._pool_manager:
+            await self._pool_manager.stop()
+        
         await self._http.aclose()
         if self._log_http:
             await self._log_http.aclose()
@@ -320,6 +352,7 @@ async def _process_job(
                 )
 
             timeout_seconds = self._settings.job_timeout_seconds
+            warm_instance = None  # Track warm instance for cleanup
             
             # Get the appropriate executor
             executor_name = getattr(assignment, 'executor', 'firecracker')
@@ -328,8 +361,26 @@ async def _process_job(
                 raise RuntimeError(f"Unknown executor: {executor_name}")
             
             try:
-                # Use the executor interface
-                await executor.prepare(assignment)
+                # Try to get a warm instance first
+                if self._pool_manager:
+                    warm_instance = await self._pool_manager.get_warm_instance(
+                        executor_name, assignment
+                    )
+                
+                if warm_instance:
+                    LOGGER.info("Using warm instance", 
+                               job_id=assignment.job_id,
+                               instance_id=warm_instance.instance_id,
+                               executor=executor_name)
+                    # For warm instances, prepare might be lighter/different
+                    if hasattr(executor, 'prepare_job_with_warm_instance'):
+                        await executor.prepare_job_with_warm_instance(assignment, warm_instance)
+                    else:
+                        await executor.prepare(assignment)
+                else:
+                    LOGGER.info("Using cold start", job_id=assignment.job_id, executor=executor_name)
+                    await executor.prepare(assignment)
+                
                 result = await executor.run(assignment, timeout_seconds=timeout_seconds)
                 
                 # Convert RunResult to FirecrackerResult for compatibility
@@ -390,8 +441,17 @@ async def _process_job(
                     except asyncio.CancelledError:
                         pass
                 
-                # Cleanup executor resources
+                # Release warm instance or cleanup executor resources
                 executor_name = getattr(assignment, 'executor', 'firecracker')
+                if self._pool_manager and warm_instance:
+                    try:
+                        await self._pool_manager.release_instance(warm_instance, assignment.job_id)
+                    except Exception as exc:  # pragma: no cover - defensive
+                        LOGGER.debug("Warm instance release failed", 
+                                   job_id=assignment.job_id, 
+                                   instance_id=warm_instance.instance_id,
+                                   error=str(exc))
+                
                 executor = self._executors.get(executor_name)
                 if executor:
                     try:
diff --git a/src/nimbus/runners/firecracker.py b/src/nimbus/runners/firecracker.py
@@ -5,11 +5,15 @@
 from datetime import datetime, timezone
 from typing import Optional
 
+import structlog
+
 from ..common.schemas import JobAssignment
 from ..common.settings import HostAgentSettings
 from ..host_agent.firecracker import FirecrackerError, FirecrackerLauncher, MicroVMNetwork
 from .base import Executor, RunResult
 
+LOGGER = structlog.get_logger("nimbus.runners.firecracker")
+
 
 class FirecrackerExecutor:
     """Executor that runs jobs in Firecracker microVMs."""
@@ -108,3 +112,70 @@ async def cleanup(self, job_id: int) -> None:
             except Exception:
                 # Log but don't fail cleanup
                 pass
+    
+    async def prepare_warm_instance(self, instance_id: str) -> dict:
+        """Prepare a warm Firecracker instance ready for job assignment."""
+        if not self._launcher:
+            raise RuntimeError("FirecrackerExecutor not initialized")
+        
+        # For warm instances, we pre-allocate network but don't start VM yet
+        # The VM will be started when a job is assigned
+        mock_job_id = hash(instance_id) % 100000  # Generate pseudo job ID
+        network = self._launcher.network_for_job(mock_job_id)
+        
+        try:
+            # Pre-setup network resources
+            await self._launcher._prepare_network_resources(network)
+            
+            LOGGER.info("Prepared warm Firecracker instance", 
+                       instance_id=instance_id,
+                       tap=network.tap_name,
+                       network=f"{network.host_ip}-{network.guest_ip}")
+            
+            return {
+                "network": network,
+                "mock_job_id": mock_job_id,
+                "prepared_at": datetime.now(timezone.utc).isoformat(),
+            }
+            
+        except Exception as exc:
+            LOGGER.error("Failed to prepare warm instance", 
+                        instance_id=instance_id, 
+                        error=str(exc))
+            raise
+    
+    async def cleanup_warm_instance(self, instance_id: str, context: dict) -> None:
+        """Clean up a warm Firecracker instance."""
+        if not self._launcher:
+            return
+            
+        network = context.get("network")
+        if network:
+            try:
+                await self._launcher.cleanup_network(network)
+                LOGGER.debug("Cleaned up warm instance network", 
+                           instance_id=instance_id,
+                           tap=network.tap_name)
+            except Exception as exc:
+                LOGGER.warning("Warm instance network cleanup failed", 
+                              instance_id=instance_id,
+                              error=str(exc))
+    
+    async def health_check_warm_instance(self, instance_id: str, context: dict) -> bool:
+        """Health check a warm Firecracker instance."""
+        # For Firecracker warm instances, we just check if network is still available
+        network = context.get("network")
+        if not network:
+            return False
+            
+        # Simple check - verify tap device exists
+        import os
+        tap_path = f"/sys/class/net/{network.tap_name}"
+        exists = os.path.exists(tap_path)
+        
+        if not exists:
+            LOGGER.warning("Warm instance network missing", 
+                          instance_id=instance_id,
+                          tap=network.tap_name)
+        
+        return exists
diff --git a/src/nimbus/runners/pool_manager.py b/src/nimbus/runners/pool_manager.py