Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions kubeflow/trainer/backends/container/adapters/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,3 +193,22 @@ def get_network(self, network_id: str) -> Optional[dict]:
Dictionary with network info including labels, or None if not found
"""
raise NotImplementedError()

@abc.abstractmethod
def wait_for_container(self, container_id: str, timeout: Optional[int] = None) -> int:
    """
    Block until the given container has exited, then report its exit code.

    Concrete adapters must override this method; the base implementation
    only raises NotImplementedError.

    Args:
        container_id: Container ID
        timeout: Maximum time to wait in seconds, or None to wait indefinitely

    Returns:
        Container exit code

    Raises:
        TimeoutError: If timeout is reached before container exits
    """
    raise NotImplementedError
28 changes: 28 additions & 0 deletions kubeflow/trainer/backends/container/adapters/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,3 +227,31 @@ def get_network(self, network_id: str) -> Optional[dict]:
}
except Exception:
return None

def wait_for_container(self, container_id: str, timeout: Optional[int] = None) -> int:
    """
    Wait for a Docker container to exit and return its exit code.

    This call blocks until the container stops or the wait times out.

    Args:
        container_id: Container ID
        timeout: Maximum time to wait in seconds, or None to wait indefinitely

    Returns:
        Container exit code. When the client returns a dict, the value of
        its 'StatusCode' key is used (0 if the key is absent); any other
        result is converted with int().

    Raises:
        TimeoutError: If timeout is reached before container exits
        Exception: Errors raised while looking up the container, and
            non-timeout errors raised while waiting, propagate unchanged.
    """
    # Resolve the container before entering the timeout handler: a lookup
    # failure whose message merely mentions "timeout" (for example a
    # container named "timeout-job") must not be reported as a wait timeout.
    container = self.get_container(container_id)
    try:
        result = container.wait(timeout=timeout)
    except Exception as e:
        # Timeouts are recognised by message text rather than exception
        # class (presumably because the client library can raise more than
        # one exception type for a timeout -- TODO confirm).
        if "timeout" in str(e).lower():
            raise TimeoutError(
                f"Container {container_id} did not exit within {timeout} seconds"
            ) from e
        raise
    # Docker wait() returns a dict with 'StatusCode' key
    if isinstance(result, dict):
        return result.get("StatusCode", 0)
    return int(result)
26 changes: 26 additions & 0 deletions kubeflow/trainer/backends/container/adapters/podman.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,3 +254,29 @@ def get_network(self, network_id: str) -> Optional[dict]:
}
except Exception:
return None

def wait_for_container(self, container_id: str, timeout: Optional[int] = None) -> int:
    """
    Wait for a Podman container to exit and return its exit code.

    This call blocks until the container stops or the wait times out.

    Args:
        container_id: Container ID
        timeout: Maximum time to wait in seconds, or None to wait indefinitely

    Returns:
        Container exit code, converted with int() from the client's result

    Raises:
        TimeoutError: If timeout is reached before container exits
        Exception: Errors raised while looking up the container, and
            non-timeout errors raised while waiting, propagate unchanged.
    """
    # Resolve the container before entering the timeout handler: a lookup
    # failure whose message merely mentions "timeout" (for example a
    # container named "timeout-job") must not be reported as a wait timeout.
    container = self.get_container(container_id)
    try:
        # NOTE(review): confirm that the Podman client's wait() actually
        # honours the `timeout` keyword; this is not visible from here.
        result = container.wait(timeout=timeout)
    except Exception as e:
        # Timeouts are recognised by message text rather than exception
        # class (presumably because the client library can raise more than
        # one exception type for a timeout -- TODO confirm).
        if "timeout" in str(e).lower():
            raise TimeoutError(
                f"Container {container_id} did not exit within {timeout} seconds"
            ) from e
        raise
    # Podman wait() returns exit code directly
    return int(result)
Loading
Loading