marin-community
diff --git a/lib/iris/AGENTS.md b/lib/iris/AGENTS.md
Lines changed: 8 additions & 2 deletions
diff --git a/lib/iris/examples/coreweave.yaml b/lib/iris/examples/coreweave.yaml
Lines changed: 3 additions & 3 deletions
diff --git a/lib/iris/examples/demo.yaml b/lib/iris/examples/demo.yaml
Lines changed: 3 additions & 3 deletions
diff --git a/lib/iris/examples/marin.yaml b/lib/iris/examples/marin.yaml
Lines changed: 3 additions & 3 deletions
diff --git a/lib/iris/examples/smoke.yaml b/lib/iris/examples/smoke.yaml
Lines changed: 3 additions & 3 deletions
diff --git a/lib/iris/scripts/smoke-test.py b/lib/iris/scripts/smoke-test.py
Lines changed: 16 additions & 13 deletions
diff --git a/lib/iris/src/iris/cli/cluster.py b/lib/iris/src/iris/cli/cluster.py
Lines changed: 6 additions & 6 deletions
diff --git a/lib/iris/src/iris/cluster/config.py b/lib/iris/src/iris/cluster/config.py
Lines changed: 35 additions & 35 deletions
@@ -232,7 +232,7 @@ for GHCR. See `docs/image-push.md` for full details.
 image tags to the AR remote repo for the VM's continent:
 - `ghcr.io/org/image:v1` → `us-docker.pkg.dev/project/ghcr-mirror/org/image:v1`
 
-Set `defaults.bootstrap.docker_image` to a `ghcr.io/...` tag. Non-GHCR tags
+Set `defaults.worker.docker_image` to a `ghcr.io/...` tag. Non-GHCR tags
 (`docker.io`, existing AR tags) pass through unchanged.
 
 **Bundle storage** (`controller.bundle_prefix`) is a GCS URI with no zone
@@ -251,8 +251,14 @@ Iris follows a clean layering architecture:
 - Owns autoscaling logic and scaling group state
 
 **Platform layer** (`cluster/platform/`): Platform abstractions for managing VMs
-- Provides VM lifecycle management (GCP, manual, local, CoreWeave)
 - Does NOT depend on controller layer
+- Four platform implementations with independent launch/teardown paths:
+  - `gcp.py` — GCP TPU/VM slices, SSH bootstrap
+  - `coreweave.py` — CoreWeave CKS, Kubernetes Pods on shared NodePools
+  - `manual.py` — Pre-existing hosts, SSH bootstrap
+  - `local.py` — Local development, in-process workers
+- Changes to shared interfaces (worker CLI, bootstrap flow, proto schemas)
+  must be applied to all four platforms
 
 **Cluster layer** (`cluster/`): High-level orchestration
 - `connect_cluster()` and `stop_all()` free functions for cluster lifecycle
 
@@ -58,12 +58,12 @@ defaults:
       milliseconds: 300000
     startup_grace_period:
       milliseconds: 2400000    # 40 min — covers autoscaler node provisioning + Pod startup
-  default_task_image: ghcr.io/marin-community/iris-task:latest
-  bootstrap:
+  worker:
     docker_image: ghcr.io/marin-community/iris-worker:latest
-    worker_port: 10001
+    port: 10001
     cache_dir: /mnt/local/iris-cache
     runtime: kubernetes
+    default_task_image: ghcr.io/marin-community/iris-task:latest
 
 scale_groups:
   # CPU general-purpose — used for data processing, orchestration, etc.
 
@@ -8,17 +8,17 @@ platform:
     project_id: hai-gcp-models
 
 defaults:
-  default_task_image: ghcr.io/marin-community/iris-task:latest
   autoscaler:
     evaluation_interval:
       milliseconds: 10000
     scale_up_delay:
       milliseconds: 60000
     scale_down_delay:
       milliseconds: 300000
-  bootstrap:
+  worker:
     docker_image: ghcr.io/marin-community/iris-worker:latest
-    worker_port: 10001
+    default_task_image: ghcr.io/marin-community/iris-task:latest
+    port: 10001
     controller_address: "${IRIS_CONTROLLER_ADDRESS}"
 
 storage:
 
@@ -7,17 +7,17 @@ platform:
     project_id: hai-gcp-models
 
 defaults:
-  default_task_image: ghcr.io/marin-community/iris-task:latest
   autoscaler:
     evaluation_interval:
       milliseconds: 10000
     scale_up_delay:
       milliseconds: 60000
     scale_down_delay:
       milliseconds: 300000
-  bootstrap:
+  worker:
     docker_image: ghcr.io/marin-community/iris-worker:latest
-    worker_port: 10001
+    default_task_image: ghcr.io/marin-community/iris-task:latest
+    port: 10001
 
 storage:
   bundle_prefix: gs://marin-us-central2/tmp/iris/bundles
 
@@ -7,17 +7,17 @@ platform:
     project_id: hai-gcp-models
 
 defaults:
-  default_task_image: ghcr.io/marin-community/iris-task:latest
   autoscaler:
     evaluation_interval:
       milliseconds: 10000
     scale_up_delay:
       milliseconds: 60000
     scale_down_delay:
       milliseconds: 300000
-  bootstrap:
+  worker:
     docker_image: ghcr.io/marin-community/iris-worker:latest
-    worker_port: 10001
+    default_task_image: ghcr.io/marin-community/iris-task:latest
+    port: 10001
 
 storage:
   bundle_prefix: gs://marin-us-central2/tmp/iris/bundles
 
@@ -27,6 +27,9 @@
     # Custom per-job timeout
     uv run python scripts/smoke-test.py --job-timeout 120
 
+    # Local mode: in-process controller and workers (no cloud VMs)
+    uv run python scripts/smoke-test.py --mode local
+
     # Keep cluster running on failure for debugging
     uv run python scripts/smoke-test.py --mode keep
 
@@ -697,8 +700,11 @@ class SmokeTestConfig:
     accelerator: AcceleratorConfig
     boot_timeout_seconds: int = DEFAULT_BOOT_TIMEOUT
     job_timeout_seconds: int = DEFAULT_JOB_TIMEOUT
-    local: bool = False  # Run locally without GCP
-    mode: Literal["full", "keep", "redeploy"] = "full"
+    mode: Literal["full", "keep", "redeploy", "local"] = "full"
+
+    @property
+    def local(self) -> bool:
+        return self.mode == "local"
 
 
 # =============================================================================
@@ -769,7 +775,7 @@ def run(self) -> bool:
             controller_url: str | None = None
 
             if self.config.mode != "redeploy":
-                if self.config.mode in ("full", "keep") and not self.config.local:
+                if self.config.mode in ("full", "keep"):
                     _log_section("PHASE 0: Clean Start")
                     self._cleanup_existing()
                     if self._interrupted:
@@ -837,10 +843,10 @@ def _print_header(self):
         logger.info("=" * 60)
         logger.info("")
         logger.info("Config: %s", self.config.config_path)
+        logger.info("Mode: %s", self.config.mode)
         logger.info("Boot timeout: %ds", self.config.boot_timeout_seconds)
         logger.info("Job timeout: %ds", self.config.job_timeout_seconds)
         logger.info("Accelerator: %s (%s)", self._accel.label(), self._accel.device_type)
-        logger.info("Local: %s", self.config.local)
 
     # ----- Cluster lifecycle via CLI -----
 
@@ -1479,22 +1485,17 @@ def _cleanup(self):
 )
 @click.option(
     "--mode",
-    type=click.Choice(["full", "keep", "redeploy"]),
+    type=click.Choice(["full", "keep", "redeploy", "local"]),
     default="full",
     show_default=True,
-    help="Execution mode: 'full' (clean start + teardown), 'keep' (clean start + keep VMs), 'redeploy' (reuse VMs)",
-)
-@click.option(
-    "--local",
-    is_flag=True,
-    help="Run locally without GCP (in-process controller and workers)",
+    help="Execution mode: 'full' (clean start + teardown), 'keep' (clean start + keep VMs), "
+    "'redeploy' (reuse VMs), 'local' (in-process controller and workers, no cloud VMs)",
 )
 def main(
     config_path: Path,
     boot_timeout_seconds: int,
     job_timeout_seconds: int,
     mode: str,
-    local: bool,
 ):
     """Run Iris cluster autoscaling smoke test.
 
@@ -1507,6 +1508,9 @@ def main(
         # Basic smoke test (uses examples/smoke.yaml by default)
         uv run python scripts/smoke-test.py
 
+        # Local mode: in-process controller and workers
+        uv run python scripts/smoke-test.py --mode local
+
         # CoreWeave GPU smoke test
         uv run python scripts/smoke-test.py --config examples/coreweave.yaml
 
@@ -1527,7 +1531,6 @@ def main(
         boot_timeout_seconds=boot_timeout_seconds,
         job_timeout_seconds=job_timeout_seconds,
         mode=mode,  # type: ignore
-        local=local,
     )
 
     runner = SmokeTestRunner(config)
 
@@ -150,12 +150,12 @@ def _build_and_push_task_image(task_tag: str, verbose: bool = False) -> None:
 def _build_cluster_images(config, verbose: bool = False) -> dict[str, str]:
     built: dict[str, str] = {}
 
-    for tag, typ in [(config.defaults.bootstrap.docker_image, "worker"), (config.controller.image, "controller")]:
+    for tag, typ in [(config.defaults.worker.docker_image, "worker"), (config.controller.image, "controller")]:
         if tag:
             _build_and_push_for_tag(tag, typ, verbose=verbose)
             built[typ] = tag
 
-    task_tag = config.defaults.default_task_image
+    task_tag = config.defaults.worker.default_task_image
     if task_tag:
         _build_and_push_task_image(task_tag, verbose=verbose)
         built["task"] = task_tag
@@ -175,8 +175,8 @@ def _pin_tag(tag: str | None, git_sha: str) -> str | None:
 
     tags = {
         "controller": config.controller.image,
-        "worker": config.defaults.bootstrap.docker_image,
-        "task": config.defaults.default_task_image,
+        "worker": config.defaults.worker.docker_image,
+        "task": config.defaults.worker.default_task_image,
     }
     needs_pin = any(tag.endswith(":latest") for tag in tags.values() if tag)
     if not needs_pin:
@@ -188,9 +188,9 @@ def _pin_tag(tag: str | None, git_sha: str) -> str | None:
     if pinned["controller"]:
         config.controller.image = pinned["controller"]
     if pinned["worker"]:
-        config.defaults.bootstrap.docker_image = pinned["worker"]
+        config.defaults.worker.docker_image = pinned["worker"]
     if pinned["task"]:
-        config.defaults.default_task_image = pinned["task"]
+        config.defaults.worker.default_task_image = pinned["task"]
 
     click.echo("Pinning :latest image tags to git SHA for this run:")
     for name, tag in pinned.items():
 
@@ -45,9 +45,11 @@
         scale_up_delay=Duration.from_seconds(60).to_proto(),
         scale_down_delay=Duration.from_seconds(300).to_proto(),
     ),
-    bootstrap=config_pb2.BootstrapConfig(
-        worker_port=10001,
+    worker=config_pb2.WorkerConfig(
+        port=10001,
         cache_dir="/var/cache/iris",
+        host="0.0.0.0",
+        port_range="30000-40000",
     ),
 )
 
@@ -248,33 +250,31 @@ def validate_config(config: config_pb2.IrisClusterConfig) -> None:
     _validate_scale_group_resources(config)
     _validate_slice_templates(config)
     _validate_worker_settings(config)
-    _validate_bootstrap_defaults(config)
+    _validate_worker_defaults(config)
 
 
-def _validate_bootstrap_defaults(config: config_pb2.IrisClusterConfig) -> None:
-    """Validate bootstrap defaults required for worker-based platforms.
+def _validate_worker_defaults(config: config_pb2.IrisClusterConfig) -> None:
+    """Validate worker defaults required for worker-based platforms.
 
-    Local platform runs workers in-process and does not require bootstrap image/runtime.
+    Local platform runs workers in-process and does not require a docker image/runtime.
     GCP/manual/CoreWeave create remote worker processes and must provide a worker image.
     """
     # Some unit tests validate partial proto configs directly (without load_config/apply_defaults).
-    # Only enforce bootstrap image checks once defaults/platform are explicitly present.
+    # Only enforce worker image checks once defaults/platform are explicitly present.
     if not config.HasField("defaults"):
         return
 
     platform_kind = config.platform.WhichOneof("platform")
     if platform_kind in (None, "local"):
         return
 
-    docker_image = config.defaults.bootstrap.docker_image.strip()
+    docker_image = config.defaults.worker.docker_image.strip()
     if not docker_image:
-        raise ValueError(
-            "defaults.bootstrap.docker_image is required for non-local platforms " "(gcp/manual/coreweave)."
-        )
+        raise ValueError("defaults.worker.docker_image is required for non-local platforms (gcp/manual/coreweave).")
 
-    runtime = config.defaults.bootstrap.runtime.strip()
+    runtime = config.defaults.worker.runtime.strip()
     if runtime and runtime not in {"docker", "kubernetes"}:
-        raise ValueError(f"defaults.bootstrap.runtime must be one of docker/kubernetes, got {runtime!r}.")
+        raise ValueError(f"defaults.worker.runtime must be one of docker/kubernetes, got {runtime!r}.")
 
 
 def _scale_groups_to_config(scale_groups: dict[str, config_pb2.ScaleGroupConfig]) -> config_pb2.IrisClusterConfig:
@@ -326,16 +326,16 @@ def _merge_proto_fields(target, source) -> None:
 def _deep_merge_defaults(target: config_pb2.DefaultsConfig, source: config_pb2.DefaultsConfig) -> None:
     """Deep merge source defaults into target, field by field.
 
-    Sub-messages (timeouts, ssh, autoscaler, bootstrap) are merged field-by-field
+    Sub-messages (timeouts, ssh, autoscaler, worker) are merged field-by-field
     so that partially-specified user configs overlay hardcoded defaults without
-    wiping unset siblings. Top-level scalar fields (e.g. default_task_image) are
-    merged via _merge_proto_fields which copies any explicitly-set value.
+    wiping unset siblings. Top-level scalar fields are merged via
+    _merge_proto_fields which copies any explicitly-set value.
 
     Args:
         target: DefaultsConfig to merge into (modified in place)
         source: DefaultsConfig to merge from
     """
-    # Merge top-level scalar fields (e.g. default_task_image).
+    # Merge top-level scalar fields.
     # We skip message fields here since sub-messages need deep merging below.
     for field_desc in source.DESCRIPTOR.fields:
         if field_desc.message_type is not None:
@@ -348,11 +348,13 @@ def _deep_merge_defaults(target: config_pb2.DefaultsConfig, source: config_pb2.D
         _merge_proto_fields(target.ssh, source.ssh)
     if source.HasField("autoscaler"):
         _merge_proto_fields(target.autoscaler, source.autoscaler)
-    if source.HasField("bootstrap"):
-        _merge_proto_fields(target.bootstrap, source.bootstrap)
-        # Merge env_vars map separately (map fields don't use HasField)
-        for key, value in source.bootstrap.env_vars.items():
-            target.bootstrap.env_vars[key] = value
+    if source.HasField("worker"):
+        _merge_proto_fields(target.worker, source.worker)
+        # Merge map fields separately (map fields don't support HasField)
+        for key, value in source.worker.default_task_env.items():
+            target.worker.default_task_env[key] = value
+        for key, value in source.worker.worker_attributes.items():
+            target.worker.worker_attributes[key] = value
 
 
 def _validate_autoscaler_config(config: config_pb2.AutoscalerConfig, context: str = "autoscaler") -> None:
@@ -619,12 +621,10 @@ def load_config(config_path: Path | str) -> config_pb2.IrisClusterConfig:
     # Expand environment variables in controller_address only.
     # Other fields (e.g., docker_image, ssh.key_file) are used as-is.
     # This is intentional - controller_address often needs $IRIS_CONTROLLER_ADDRESS for dynamic discovery.
-    if "bootstrap" in data and "controller_address" in data["bootstrap"]:
-        data["bootstrap"]["controller_address"] = os.path.expandvars(data["bootstrap"]["controller_address"])
-    if "defaults" in data and "bootstrap" in data["defaults"]:
-        defaults_bootstrap = data["defaults"]["bootstrap"]
-        if "controller_address" in defaults_bootstrap:
-            defaults_bootstrap["controller_address"] = os.path.expandvars(defaults_bootstrap["controller_address"])
+    if "defaults" in data and "worker" in data["defaults"]:
+        defaults_worker = data["defaults"]["worker"]
+        if "controller_address" in defaults_worker:
+            defaults_worker["controller_address"] = os.path.expandvars(defaults_worker["controller_address"])
 
     _normalize_scale_group_resources(data)
     _expand_multi_zone_groups(data)
@@ -898,15 +898,15 @@ def as_local(self) -> IrisConfig:
         return IrisConfig(local_proto)
 
     def controller_address(self) -> str:
-        """Get controller address from bootstrap config, if set.
+        """Get controller address from worker config, if set.
 
         Returns:
             Controller address string, or empty string if not configured
         """
         # TODO: Derive controller address from controller.manual/local when unset.
-        bootstrap = self._proto.defaults.bootstrap
-        if bootstrap.HasField("controller_address"):
-            return bootstrap.controller_address
+        worker = self._proto.defaults.worker
+        if worker.HasField("controller_address"):
+            return worker.controller_address
         return ""
 
 
@@ -915,7 +915,7 @@ def create_autoscaler(
     autoscaler_config: config_pb2.AutoscalerConfig,
     scale_groups: dict[str, config_pb2.ScaleGroupConfig],
     label_prefix: str,
-    bootstrap_config: config_pb2.BootstrapConfig | None = None,
+    base_worker_config: config_pb2.WorkerConfig | None = None,
     threads: ThreadContainer | None = None,
 ):
     """Create autoscaler from Platform and explicit config.
@@ -925,7 +925,7 @@ def create_autoscaler(
         autoscaler_config: Autoscaler settings (already resolved with defaults)
         scale_groups: Map of scale group name to config
         label_prefix: Prefix for labels on managed resources
-        bootstrap_config: Worker bootstrap settings passed through to platform.create_slice().
+        base_worker_config: Base worker configuration passed through to platform.create_slice().
             None disables bootstrap (test/local mode).
         threads: Thread container for background threads. Uses global default if not provided.
 
@@ -975,5 +975,5 @@ def create_autoscaler(
         scale_groups=scaling_groups,
         config=autoscaler_config,
         platform=platform,
-        bootstrap_config=bootstrap_config,
+        base_worker_config=base_worker_config,
     )