feat(trainer): address reviewer feedback for initializer support

HKanoje · HKanoje · commit 0dbb6b6411a2 · 2026-01-05T08:49:38.000-08:00
- Make initializer image configurable via ContainerBackendConfig
- Make initializer timeout configurable (default 600 seconds)
- Implement wait API in adapters instead of polling
- Clean up successful initializer containers after completion
- Clean up network on initializer failure
- Raise ValueError for unsupported initializer types (no datacache fallback)

All tests passing (173/173). Addresses all feedback from PR #188.

Signed-off-by: HKanoje <hrithik.kanoje@gmail.com>
diff --git a/kubeflow/trainer/backends/container/adapters/base.py b/kubeflow/trainer/backends/container/adapters/base.py
@@ -193,3 +193,22 @@ def get_network(self, network_id: str) -> Optional[dict]:
             Dictionary with network info including labels, or None if not found
         """
         raise NotImplementedError()
+
+    @abc.abstractmethod
+    def wait_for_container(self, container_id: str, timeout: Optional[int] = None) -> int:
+        """
+        Wait for a container to exit and return its exit code.
+
+        This is a blocking call that waits until the container stops.
+
+        Args:
+            container_id: Container ID
+            timeout: Maximum time to wait in seconds, or None to wait indefinitely
+
+        Returns:
+            Container exit code
+
+        Raises:
+            TimeoutError: If timeout is reached before container exits
+        """
+        raise NotImplementedError()
diff --git a/kubeflow/trainer/backends/container/adapters/docker.py b/kubeflow/trainer/backends/container/adapters/docker.py
@@ -227,3 +227,31 @@ def get_network(self, network_id: str) -> Optional[dict]:
             }
         except Exception:
             return None
+
+    def wait_for_container(self, container_id: str, timeout: Optional[int] = None) -> int:
+        """
+        Wait for a Docker container to exit and return its exit code.
+
+        Args:
+            container_id: Container ID
+            timeout: Maximum time to wait in seconds, or None to wait indefinitely
+
+        Returns:
+            Container exit code
+
+        Raises:
+            TimeoutError: If timeout is reached before container exits
+        """
+        try:
+            container = self.get_container(container_id)
+            result = container.wait(timeout=timeout)
+            # Docker wait() returns a dict with 'StatusCode' key
+            if isinstance(result, dict):
+                return result.get("StatusCode", 0)
+            return int(result)
+        except Exception as e:
+            if "timeout" in str(e).lower():
+                raise TimeoutError(
+                    f"Container {container_id} did not exit within {timeout} seconds"
+                ) from e
+            raise
diff --git a/kubeflow/trainer/backends/container/adapters/podman.py b/kubeflow/trainer/backends/container/adapters/podman.py
@@ -254,3 +254,29 @@ def get_network(self, network_id: str) -> Optional[dict]:
             }
         except Exception:
             return None
+
+    def wait_for_container(self, container_id: str, timeout: Optional[int] = None) -> int:
+        """
+        Wait for a Podman container to exit and return its exit code.
+
+        Args:
+            container_id: Container ID
+            timeout: Maximum time to wait in seconds, or None to wait indefinitely
+
+        Returns:
+            Container exit code
+
+        Raises:
+            TimeoutError: If timeout is reached before container exits
+        """
+        try:
+            container = self.get_container(container_id)
+            result = container.wait(timeout=timeout)
+            # Podman wait() returns exit code directly
+            return int(result)
+        except Exception as e:
+            if "timeout" in str(e).lower():
+                raise TimeoutError(
+                    f"Container {container_id} did not exit within {timeout} seconds"
+                ) from e
+            raise
diff --git a/kubeflow/trainer/backends/container/backend.py b/kubeflow/trainer/backends/container/backend.py
@@ -274,8 +274,17 @@ def train(
             # Run initializers if configured
             if initializer:
                 logger.debug("Running initializers")
-                self._run_initializers(trainjob_name, initializer, workdir, network_id)
-                logger.debug("Initializers completed successfully")
+                try:
+                    self._run_initializers(trainjob_name, initializer, workdir, network_id)
+                    logger.debug("Initializers completed successfully")
+                except Exception as e:
+                    # Clean up network if initializers fail
+                    logger.error(f"Initializer failed, cleaning up network: {e}")
+                    from contextlib import suppress
+
+                    with suppress(Exception):
+                        self._adapter.delete_network(network_id)
+                    raise
 
             # Generate training script code (inline, not written to disk)
             training_script_code = container_utils.get_training_script_code(trainer)
@@ -493,7 +502,7 @@ def _run_initializers(
             RuntimeError: If initializer fails to complete successfully.
         """
         # Get initializer image
-        init_image = container_utils.get_initializer_image()
+        init_image = container_utils.get_initializer_image(self.cfg)
 
         # Pull initializer image if needed
         container_utils.maybe_pull_image(self._adapter, init_image, self.cfg.pull_policy)
@@ -586,32 +595,40 @@ def _run_single_initializer(
 
         # Wait for the initializer to complete
         try:
-            import time
+            # Use the wait API for efficient waiting
+            exit_code = self._adapter.wait_for_container(
+                container_id, timeout=self.cfg.initializer_timeout
+            )
 
-            timeout = 600  # 10 minutes timeout for initialization
-            polling_interval = 2
-            elapsed = 0
+            if exit_code == 0:
+                logger.debug(f"{init_type} initializer completed successfully")
+                # Clean up the successful container
+                from contextlib import suppress
+
+                with suppress(Exception):
+                    self._adapter.remove_container(container_id, force=True)
+                return
+            else:
+                # Get logs for debugging
+                logs = list(self._adapter.container_logs(container_id, follow=False))
+                error_msg = (
+                    f"{init_type} initializer failed with exit code {exit_code}. "
+                    f"Logs: {' '.join(logs[-10:]) if logs else 'No logs available'}"
+                )
+                raise RuntimeError(error_msg)
 
-            while elapsed < timeout:
-                status, exit_code = self._adapter.container_status(container_id)
+        except TimeoutError:
+            logger.error(
+                f"{init_type} initializer did not complete within "
+                f"{self.cfg.initializer_timeout} seconds"
+            )
+            # Clean up the timed-out container
+            from contextlib import suppress
 
-                if status == "exited":
-                    if exit_code == 0:
-                        logger.debug(f"{init_type} initializer completed successfully")
-                        return
-                    else:
-                        # Get logs for debugging
-                        logs = list(self._adapter.container_logs(container_id, follow=False))
-                        error_msg = (
-                            f"{init_type} initializer failed with exit code {exit_code}. "
-                            f"Logs: {' '.join(logs[-10:]) if logs else 'No logs available'}"
-                        )
-                        raise RuntimeError(error_msg)
-
-                time.sleep(polling_interval)
-                elapsed += polling_interval
-
-            raise TimeoutError(f"{init_type} initializer did not complete within {timeout} seconds")
+            with suppress(Exception):
+                self._adapter.stop_container(container_id, timeout=5)
+                self._adapter.remove_container(container_id, force=True)
+            raise
 
         except Exception as e:
             logger.error(f"Error running {init_type} initializer: {e}")
diff --git a/kubeflow/trainer/backends/container/backend_test.py b/kubeflow/trainer/backends/container/backend_test.py
@@ -197,6 +197,31 @@ def get_network(self, network_id: str) -> Optional[dict]:
                 }
         return None
 
+    def wait_for_container(self, container_id: str, timeout: Optional[int] = None) -> int:
+        """
+        Wait for a container to exit and return its exit code.
+
+        For testing, immediately returns the container's exit code if it has exited,
+        or raises TimeoutError if the container is still running.
+
+        Args:
+            container_id: Container ID
+            timeout: Maximum time to wait in seconds (not used in mock)
+
+        Returns:
+            Container exit code
+
+        Raises:
+            TimeoutError: If container is still running
+        """
+        for container in self.containers_created:
+            if container["id"] == container_id:
+                if container["status"] == "exited":
+                    return container.get("exit_code", 0)
+                # In mock, if not exited, simulate timeout
+                raise TimeoutError(f"Container {container_id} did not exit within timeout")
+        raise RuntimeError(f"Container {container_id} not found")
+
 
 # Fixtures
 @pytest.fixture
diff --git a/kubeflow/trainer/backends/container/types.py b/kubeflow/trainer/backends/container/types.py
@@ -65,3 +65,11 @@ class ContainerBackendConfig(BaseModel):
         default_factory=TrainingRuntimeSource,
         description="Configuration for training runtime sources",
     )
+    initializer_image: str = Field(
+        default="kubeflow/training-operator:latest",
+        description="Container image for dataset and model initializers",
+    )
+    initializer_timeout: int = Field(
+        default=600,
+        description="Timeout in seconds for initializer containers (default 10 minutes)",
+    )
diff --git a/kubeflow/trainer/backends/container/utils.py b/kubeflow/trainer/backends/container/utils.py
@@ -223,18 +223,26 @@ def build_initializer_command(initializer: types.BaseInitializer, init_type: str
 
     Returns:
         Command list for the initializer container.
+
+    Raises:
+        ValueError: If the initializer type is not supported.
     """
     # Use the training-operator initializer script
     # The initializer script is expected to be available in the image
-    python_cmd = (
-        "python -m kubeflow.storage_initializer.s3 "
-        if isinstance(initializer, (types.S3DatasetInitializer, types.S3ModelInitializer))
-        else "python -m kubeflow.storage_initializer.hugging_face "
-        if isinstance(
-            initializer, (types.HuggingFaceDatasetInitializer, types.HuggingFaceModelInitializer)
+    if isinstance(initializer, (types.S3DatasetInitializer, types.S3ModelInitializer)):
+        python_cmd = "python -m kubeflow.storage_initializer.s3 "
+    elif isinstance(
+        initializer, (types.HuggingFaceDatasetInitializer, types.HuggingFaceModelInitializer)
+    ):
+        python_cmd = "python -m kubeflow.storage_initializer.hugging_face "
+    elif isinstance(initializer, types.DataCacheInitializer):
+        python_cmd = "python -m kubeflow.storage_initializer.datacache "
+    else:
+        raise ValueError(
+            f"Unsupported initializer type: {type(initializer).__name__}. "
+            "Supported types: HuggingFaceDatasetInitializer, HuggingFaceModelInitializer, "
+            "S3DatasetInitializer, S3ModelInitializer, DataCacheInitializer"
         )
-        else "python -m kubeflow.storage_initializer.datacache "
-    )
 
     return ["bash", "-c", python_cmd]
 
@@ -300,13 +308,14 @@ def build_initializer_env(initializer: types.BaseInitializer, init_type: str) ->
     return env
 
 
-def get_initializer_image() -> str:
+def get_initializer_image(config) -> str:
     """
-    Get the container image for initializers.
+    Get the container image for initializers from backend config.
+
+    Args:
+        config: ContainerBackendConfig with initializer_image setting.
 
     Returns:
         Container image name for initializers.
     """
-    # Use the training-operator image which contains initializer scripts
-    # This can be made configurable via backend config in the future
-    return "kubeflow/training-operator:latest"
+    return config.initializer_image