Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@

.PHONY: help clean clean-env dev dev-http docs install bdist sdist test release check_dists \
clean-images clean-enterprise-gateway clean-demo-base clean-kernel-images clean-enterprise-gateway \
clean-kernel-py clean-kernel-spark-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala clean-kernel-tf-py \
clean-kernel-py clean-kernel-spark-py clean-kernel-ray-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala clean-kernel-tf-py \
clean-kernel-tf-gpu-py clean-kernel-image-puller push-images push-enterprise-gateway-demo push-demo-base \
push-kernel-images push-enterprise-gateway push-kernel-py push-kernel-spark-py push-kernel-r push-kernel-spark-r \
push-kernel-images push-enterprise-gateway push-kernel-py push-kernel-spark-py push-kernel-ray-py push-kernel-r push-kernel-spark-r \
push-kernel-scala push-kernel-tf-py push-kernel-tf-gpu-py push-kernel-image-puller publish helm-chart

SA?=source activate
Expand Down Expand Up @@ -155,9 +155,9 @@ docker-images: ## Build docker images (includes kernel-based images)
kernel-images: ## Build kernel-based docker images

# Actual working targets...
docker-images: demo-base enterprise-gateway-demo kernel-images enterprise-gateway kernel-py kernel-spark-py kernel-r kernel-spark-r kernel-scala kernel-tf-py kernel-tf-gpu-py kernel-image-puller
docker-images: demo-base enterprise-gateway-demo kernel-images enterprise-gateway kernel-py kernel-spark-py kernel-ray-py kernel-r kernel-spark-r kernel-scala kernel-tf-py kernel-tf-gpu-py kernel-image-puller

enterprise-gateway-demo kernel-images enterprise-gateway kernel-py kernel-spark-py kernel-r kernel-spark-r kernel-scala kernel-tf-py kernel-tf-gpu-py kernel-image-puller:
enterprise-gateway-demo kernel-images enterprise-gateway kernel-py kernel-spark-py kernel-ray-py kernel-r kernel-spark-r kernel-scala kernel-tf-py kernel-tf-gpu-py kernel-image-puller:
make WHEEL_FILE=$(WHEEL_FILE) VERSION=$(VERSION) NO_CACHE=$(NO_CACHE) TAG=$(TAG) SPARK_VERSION=$(SPARK_VERSION) MULTIARCH_BUILD=$(MULTIARCH_BUILD) TARGET_ARCH=$(TARGET_ARCH) -C etc $@

demo-base:
Expand All @@ -167,14 +167,14 @@ demo-base:
clean-images: clean-demo-base ## Remove docker images (includes kernel-based images)
clean-kernel-images: ## Remove kernel-based images

clean-images clean-enterprise-gateway-demo clean-kernel-images clean-enterprise-gateway clean-kernel-py clean-kernel-spark-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala clean-kernel-tf-py clean-kernel-tf-gpu-py clean-kernel-image-puller:
clean-images clean-enterprise-gateway-demo clean-kernel-images clean-enterprise-gateway clean-kernel-py clean-kernel-spark-py clean-kernel-ray-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala clean-kernel-tf-py clean-kernel-tf-gpu-py clean-kernel-image-puller:
make WHEEL_FILE=$(WHEEL_FILE) VERSION=$(VERSION) TAG=$(TAG) -C etc $@

clean-demo-base:
make WHEEL_FILE=$(WHEEL_FILE) VERSION=$(VERSION) TAG=$(SPARK_VERSION) -C etc $@

push-images: push-demo-base
push-images push-enterprise-gateway-demo push-kernel-images push-enterprise-gateway push-kernel-py push-kernel-spark-py push-kernel-r push-kernel-spark-r push-kernel-scala push-kernel-tf-py push-kernel-tf-gpu-py push-kernel-image-puller:
push-images push-enterprise-gateway-demo push-kernel-images push-enterprise-gateway push-kernel-py push-kernel-spark-py push-kernel-ray-py push-kernel-r push-kernel-spark-r push-kernel-scala push-kernel-tf-py push-kernel-tf-gpu-py push-kernel-image-puller:
make WHEEL_FILE=$(WHEEL_FILE) VERSION=$(VERSION) TAG=$(TAG) -C etc $@

push-demo-base:
Expand Down
16 changes: 16 additions & 0 deletions enterprise_gateway/services/processproxies/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@ def poll(self) -> bool | None:
# See https://github.com/jupyter-server/enterprise_gateway/issues/827
if container_status in self.get_initial_states():
result = None

self.log.debug(f">>> container.poll(): {container_status} --> {result}")
return result

def send_signal(self, signum: int) -> bool | None:
Expand Down Expand Up @@ -188,6 +190,7 @@ def shutdown_listener(self):

async def confirm_remote_startup(self) -> None:
"""Confirms the container has started and returned necessary connection information."""
self.log.debug(">>> container.confirm_remote_startup()")
self.log.debug("Trying to confirm kernel container startup status")
self.start_time = RemoteProcessProxy.get_current_time()
i = 0
Expand All @@ -197,21 +200,34 @@ async def confirm_remote_startup(self) -> None:
await self.handle_timeout()

container_status = self.get_container_status(i)
self.log.debug(
f">>> container.confirm_remote_startup() - container_status: {container_status}"
)
if container_status:
if container_status in self.get_error_states():
self.log_and_raise(
http_status_code=500,
reason=f"Error starting kernel container; status: '{container_status}'.",
)
else:
self.log.debug(
f">>> container.confirm_remote_startup(): is hosted assigned => {self.assigned_host}"
)
self.log.debug(">>> should call receive_connection_info()")
if self.assigned_host:
ready_to_connect = await self.receive_connection_info()
self.log.debug(
f">>> container.confirm_remote_startup(): ready to connect => {ready_to_connect}"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm cool with leaving these debug statements in - perhaps we can remove some of them later. However, can we avoid having two statements display the same message? This statement is the same as that on L229, so it would be good to differentiate these.

)
self.pid = (
0 # We won't send process signals for kubernetes lifecycle management
)
self.pgid = 0
else:
self.detect_launch_failure()
self.log.debug(
f">>> container.confirm_remote_startup(): ready to connect => {ready_to_connect}"
)

def get_process_info(self) -> dict[str, Any]:
"""Captures the base information necessary for kernel persistence relative to containers."""
Expand Down
8 changes: 6 additions & 2 deletions enterprise_gateway/services/processproxies/crd.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,15 @@ def get_container_status(self, iteration: int | None) -> str:
)

if custom_resource:
application_state = custom_resource['status']['applicationState']['state'].lower()
application_state = custom_resource.get("status", {}).get("state", "").lower()

self.log.debug(f">>> crd.get_container_status: {application_state}")

if application_state in self.get_error_states():
exception_text = self._get_exception_text(
custom_resource['status']['applicationState']['errorMessage']
custom_resource.get("status", {})
.get("applicationState", {})
.get("errorMessage")
)
error_message = (
f"CRD submission for kernel {self.kernel_id} failed: {exception_text}"
Expand Down
2 changes: 2 additions & 0 deletions enterprise_gateway/services/processproxies/k8s.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ def get_container_status(self, iteration: int | None) -> str:
self.container_name = pod_info.metadata.name
if pod_info.status:
pod_status = pod_info.status.phase.lower()
self.log.debug(f">>> k8s.get_container_status: {pod_status}")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment here - see L132 below - let's differentiate these.

if pod_status == "running" and not self.assigned_host:
# Pod is running, capture IP
self.assigned_ip = pod_info.status.pod_ip
Expand All @@ -128,6 +129,7 @@ def get_container_status(self, iteration: int | None) -> str:
f"Status: '{pod_status}', Pod IP: '{self.assigned_ip}', KernelID: '{self.kernel_id}'"
)

self.log.debug(f">>> k8s.get_container_status: {pod_status}")
return pod_status

def delete_managed_object(self, termination_stati: list[str]) -> bool:
Expand Down
10 changes: 9 additions & 1 deletion enterprise_gateway/services/processproxies/processproxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ def register_event(self, kernel_id: str) -> None:

async def get_connection_info(self, kernel_id: str) -> dict:
"""Performs a timeout wait on the event, returning the conenction information on completion."""
self.log.debug(f">>> processproxy.get_connection_info() for kernel_id {kernel_id}")
await asyncio.wait_for(self._response_registry[kernel_id].wait(), connection_interval)
return self._response_registry.pop(kernel_id).response

Expand Down Expand Up @@ -1300,9 +1301,13 @@ async def receive_connection_info(self) -> bool:
"""
# Polls the socket using accept. When data is found, returns ready indicator and encrypted data.
ready_to_connect = False

self.log.debug(
f">>> processproxy.receive_connection_info(): initializing ready to connect as {ready_to_connect}"
)
try:
connect_info = await self.response_manager.get_connection_info(self.kernel_id)
self.log.debug(">>> processproxy.receive_connection_info(): connect info received")
self.log.debug(connect_info)
self._setup_connection_info(connect_info)
ready_to_connect = True
except Exception as e:
Expand All @@ -1320,6 +1325,9 @@ async def receive_connection_info(self) -> bool:
self.kill()
self.log_and_raise(http_status_code=500, reason=error_message)

self.log.debug(
f">>> processproxy.receive_connection_info(): returning ready to connect {ready_to_connect}"
)
return ready_to_connect

def _setup_connection_info(self, connect_info: dict) -> None:
Expand Down
211 changes: 211 additions & 0 deletions enterprise_gateway/services/processproxies/ray_operator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
"""A Ray operator process proxy."""

# Internal implementation at Apple
from __future__ import annotations

from typing import Any

from kubernetes import client

from ..kernels.remotemanager import RemoteKernelManager
from .k8s import KubernetesProcessProxy


class RayOperatorProcessProxy(KubernetesProcessProxy):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this derive from CustomResourceProcessProxy?

"""Ray operator process proxy."""

object_kind = "RayCluster"

def __init__(self, kernel_manager: RemoteKernelManager, proxy_config: dict):
"""Initialize the proxy."""
super().__init__(kernel_manager, proxy_config)
self.group = "ray.io"
self.version = "v1alpha1"
self.plural = "rayclusters"

async def launch_process(
self, kernel_cmd: str, **kwargs: dict[str, Any] | None
) -> RayOperatorProcessProxy:
"""Launch the process for a kernel."""
self.kernel_resource_name = self._determine_kernel_pod_name(**kwargs)
kwargs["env"]["KERNEL_RESOURCE_NAME"] = self.kernel_resource_name
kwargs["env"]["KERNEL_CRD_GROUP"] = self.group
kwargs["env"]["KERNEL_CRD_VERSION"] = self.version
kwargs["env"]["KERNEL_CRD_PLURAL"] = self.plural

await super().launch_process(kernel_cmd, **kwargs)
return self

def get_container_status(self, iteration: int | None) -> str:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code here looks very generic and feels like it belongs on the superclass. In speaking with you offline - it sounds like the RayOperator CRD expects different status values for running and ready, etc. Perhaps we can introduce override methods to account for CRD's with different status requirements in their lifecycle.

We would need to be aware of the Spark operator, but, actually, the default implementations on the base class would match what that operator expects anyway - so perhaps its not too risky.

"""Determines submitted Ray application status and returns unified pod state.
This method returns the pod status (not CRD status) to maintain compatibility
with the base class lifecycle management. The RayCluster CRD state is checked
first to ensure the cluster is healthy, but we return pod states that the
base class understands: 'pending', 'running', 'failed', etc.
"""
application_state = None
head_pod_status = None
application_state = self._get_application_state()
if application_state:
self.log.debug(
f">>> ray_operator.get_container_status: application_state {application_state}"
)

# Check for CRD-level errors first
if application_state in self.get_error_states():
error_message = (
f"CRD submission for kernel {self.kernel_id} failed with state: {application_state}"
)
self.log.error(error_message)
return "failed" # Return pod state, not CRD state

# If CRD is not ready yet, return "pending" to indicate still launching
if application_state != "ready":
self.log.debug(
f">>> ray_operator.get_container_status: CRD not ready yet, state={application_state}"
)
return "pending"

# CRD is ready, now check the actual pod status
kernel_label_selector = "kernel_id=" + self.kernel_id + ",component=kernel"
ret = None
try:
ret = client.CoreV1Api().list_namespaced_pod(
namespace=self.kernel_namespace, label_selector=kernel_label_selector
)
except client.rest.ApiException as e:
if e.status == 404:
self.log.debug("Resetting cluster connection info as cluster deleted")
self._reset_connection_info()
return None

if ret and ret.items:
pod_info = ret.items[0]
self.log.debug(
f"Cluster status {application_state}, pod status {pod_info.status.phase.lower()}"
)
if pod_info.status:
head_pod_status = pod_info.status.phase.lower()
self.log.debug(
f">>> ray_operator.get_container_status: pod_status {head_pod_status}"
)
if head_pod_status == "running":
self.log.debug(
f"Pod Info name:{pod_info.metadata.name}, pod ip {pod_info.status.pod_ip}, host {self.container_name}"
)
self.container_name = pod_info.metadata.name
self.assigned_ip = pod_info.status.pod_ip
self.assigned_host = self.container_name
self.assigned_node_ip = pod_info.status.host_ip

# only log if iteration is not None (otherwise poll() is too noisy)
# check for running state to avoid double logging with superclass
if iteration and head_pod_status != 'running':
self.log.debug(
f"{iteration}: Waiting from CRD status from resource manager {self.object_kind.lower()} in "
f"namespace '{self.kernel_namespace}'. Name: '{self.kernel_resource_name}', "
f"Status: CRD='{application_state}', Pod='{head_pod_status}', KernelID: '{self.kernel_id}'"
)

# KEY FIX: Return pod status (not CRD state) so base class poll() works correctly
final_status = head_pod_status if head_pod_status else "pending"
self.log.debug(
f">>> ray_operator.get_container_status: returning pod_status={final_status} "
f"(CRD state was {application_state})"
)
return final_status

def delete_managed_object(self, termination_stati: list[str]) -> bool:
"""Deletes the object managed by this process-proxy
A return value of True indicates the object is considered deleted,
otherwise a False or None value is returned.
Note: the caller is responsible for handling exceptions.
"""
delete_status = client.CustomObjectsApi().delete_namespaced_custom_object(
self.group,
self.version,
self.kernel_namespace,
self.plural,
self.kernel_resource_name,
grace_period_seconds=0,
propagation_policy="Background",
)

result = delete_status and delete_status.get("status", None) in termination_stati
if result:
self._reset_connection_info()
Comment on lines +138 to +139
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These seems useful in the superclass, then this method could be removed. Is there harm in moving _reset_connection_info() there as well? If so, perhaps the default implementation of "reset" could be a no-op and let this process proxx override.

return result

def get_initial_states(self) -> set:
"""Return list of states indicating container is starting (includes running).
Note: We return pod states (not CRD states) to maintain compatibility
with the base class poll() implementation, which checks if the status
returned by get_container_status() is in this set.
"""
return ["pending", "running"]

def get_error_states(self) -> set:
"""Return list of states indicating RayCluster has failed."""
# Ray doesn't typically use "failed" state, but we'll include common error states
return {"failed", "error", "unhealthy"}

def _get_ray_cluster_status(self) -> dict:
try:
return client.CustomObjectsApi().get_namespaced_custom_object(
self.group,
self.version,
self.kernel_namespace,
self.plural,
self.kernel_resource_name,
)
except client.rest.ApiException as e:
if e.status == 404:
self.log.debug("Resetting cluster connection info as cluster deleted")
self._reset_connection_info()
return None

def _get_application_state(self):
custom_resource = self._get_ray_cluster_status()

if custom_resource is None:
return None

if 'status' not in custom_resource or 'state' not in custom_resource['status']:
return None

return custom_resource['status']['state'].lower()

def _get_pod_status(self) -> str:
"""Get the current status of the kernel pod.
Returns
-------
str
The pod status in lowercase (e.g., 'pending', 'running', 'failed', 'unknown').
"""
pod_status = "unknown"
kernel_label_selector = "kernel_id=" + self.kernel_id + ",component=kernel"
ret = client.CoreV1Api().list_namespaced_pod(
namespace=self.kernel_namespace, label_selector=kernel_label_selector
)
if ret and ret.items:
pod_info = ret.items[0]
self.container_name = pod_info.metadata.name
if pod_info.status:
pod_status = pod_info.status.phase.lower()
self.log.debug(f">>> k8s._get_pod_status: {pod_status}")

return pod_status

def _reset_connection_info(self):
"""Reset all connection-related attributes to their initial state.
This is typically called when a cluster is deleted or connection is lost.
"""

self.assigned_host = None
self.container_name = ""
self.assigned_node_ip = None
self.assigned_ip = None
4 changes: 4 additions & 0 deletions enterprise_gateway/services/sessions/kernelsessionmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def create_session(self, kernel_id: str, **kwargs) -> None:
Information used for the launch of the kernel

"""
self.log.debug(f">>> Creating new session for kernel {kernel_id}")
km = self.kernel_manager.get_kernel(kernel_id)

# Compose the kernel_session entry
Expand All @@ -103,11 +104,14 @@ def create_session(self, kernel_id: str, **kwargs) -> None:
kernel_session["kernel_name"] = km.kernel_name

# Build the inner dictionaries: connection_info, process_proxy and add to kernel_session
self.log.debug(f">>> Getting connection info for kernel {kernel_id}")
kernel_session["connection_info"] = km.get_connection_info()
kernel_session["launch_args"] = kwargs.copy()
self.log.debug(f">>> Getting process info for kernel {kernel_id}")
kernel_session["process_info"] = (
km.process_proxy.get_process_info() if km.process_proxy else {}
)
self.log.debug(f">>> Saving session {kernel_session}")
self._save_session(kernel_id, kernel_session)

def refresh_session(self, kernel_id: str) -> None:
Expand Down
Loading
Loading