jupyter-server · lresende · Oct 19, 2025 · Oct 17, 2025 · kevin-bates · Nov 5, 2025
diff --git a/Makefile b/Makefile
@@ -3,9 +3,9 @@
 
 .PHONY: help clean clean-env dev dev-http docs install bdist sdist test release check_dists \
     clean-images clean-enterprise-gateway clean-demo-base clean-kernel-images clean-enterprise-gateway \
-    clean-kernel-py clean-kernel-spark-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala clean-kernel-tf-py \
+    clean-kernel-py clean-kernel-spark-py clean-kernel-ray-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala clean-kernel-tf-py \
     clean-kernel-tf-gpu-py clean-kernel-image-puller push-images push-enterprise-gateway-demo push-demo-base \
-    push-kernel-images push-enterprise-gateway push-kernel-py push-kernel-spark-py push-kernel-r push-kernel-spark-r \
+    push-kernel-images push-enterprise-gateway push-kernel-py push-kernel-spark-py push-kernel-ray-py push-kernel-r push-kernel-spark-r \
     push-kernel-scala push-kernel-tf-py push-kernel-tf-gpu-py push-kernel-image-puller publish helm-chart
 
 SA?=source activate
@@ -155,9 +155,9 @@ docker-images:  ## Build docker images (includes kernel-based images)
 kernel-images: ## Build kernel-based docker images
 
 # Actual working targets...
-docker-images: demo-base enterprise-gateway-demo kernel-images enterprise-gateway kernel-py kernel-spark-py kernel-r kernel-spark-r kernel-scala kernel-tf-py kernel-tf-gpu-py kernel-image-puller
+docker-images: demo-base enterprise-gateway-demo kernel-images enterprise-gateway kernel-py kernel-spark-py kernel-ray-py kernel-r kernel-spark-r kernel-scala kernel-tf-py kernel-tf-gpu-py kernel-image-puller
 
-enterprise-gateway-demo kernel-images enterprise-gateway kernel-py kernel-spark-py kernel-r kernel-spark-r kernel-scala kernel-tf-py kernel-tf-gpu-py kernel-image-puller:
+enterprise-gateway-demo kernel-images enterprise-gateway kernel-py kernel-spark-py kernel-ray-py kernel-r kernel-spark-r kernel-scala kernel-tf-py kernel-tf-gpu-py kernel-image-puller:
 	make WHEEL_FILE=$(WHEEL_FILE) VERSION=$(VERSION) NO_CACHE=$(NO_CACHE) TAG=$(TAG) SPARK_VERSION=$(SPARK_VERSION) MULTIARCH_BUILD=$(MULTIARCH_BUILD) TARGET_ARCH=$(TARGET_ARCH) -C etc $@
 
 demo-base:
@@ -167,14 +167,14 @@ demo-base:
 clean-images: clean-demo-base ## Remove docker images (includes kernel-based images)
 clean-kernel-images: ## Remove kernel-based images
 
-clean-images clean-enterprise-gateway-demo clean-kernel-images clean-enterprise-gateway clean-kernel-py clean-kernel-spark-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala clean-kernel-tf-py clean-kernel-tf-gpu-py clean-kernel-image-puller:
+clean-images clean-enterprise-gateway-demo clean-kernel-images clean-enterprise-gateway clean-kernel-py clean-kernel-spark-py clean-kernel-ray-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala clean-kernel-tf-py clean-kernel-tf-gpu-py clean-kernel-image-puller:
 	make WHEEL_FILE=$(WHEEL_FILE) VERSION=$(VERSION) TAG=$(TAG) -C etc $@
 
 clean-demo-base:
 	make WHEEL_FILE=$(WHEEL_FILE) VERSION=$(VERSION) TAG=$(SPARK_VERSION) -C etc $@
 
 push-images: push-demo-base
-push-images push-enterprise-gateway-demo push-kernel-images push-enterprise-gateway push-kernel-py push-kernel-spark-py push-kernel-r push-kernel-spark-r push-kernel-scala push-kernel-tf-py push-kernel-tf-gpu-py push-kernel-image-puller:
+push-images push-enterprise-gateway-demo push-kernel-images push-enterprise-gateway push-kernel-py push-kernel-spark-py push-kernel-ray-py push-kernel-r push-kernel-spark-r push-kernel-scala push-kernel-tf-py push-kernel-tf-gpu-py push-kernel-image-puller:
 	make WHEEL_FILE=$(WHEEL_FILE) VERSION=$(VERSION) TAG=$(TAG) -C etc $@
 
 push-demo-base:

diff --git a/enterprise_gateway/services/processproxies/container.py b/enterprise_gateway/services/processproxies/container.py
@@ -147,6 +147,8 @@ def poll(self) -> bool | None:
         # See https://github.com/jupyter-server/enterprise_gateway/issues/827
         if container_status in self.get_initial_states():
             result = None
+
+        self.log.debug(f">>> container.poll(): {container_status} --> {result}")
         return result
 
     def send_signal(self, signum: int) -> bool | None:
@@ -188,6 +190,7 @@ def shutdown_listener(self):
 
     async def confirm_remote_startup(self) -> None:
         """Confirms the container has started and returned necessary connection information."""
+        self.log.debug(">>> container.confirm_remote_startup()")
         self.log.debug("Trying to confirm kernel container startup status")
         self.start_time = RemoteProcessProxy.get_current_time()
         i = 0
@@ -197,21 +200,34 @@ async def confirm_remote_startup(self) -> None:
             await self.handle_timeout()
 
             container_status = self.get_container_status(i)
+            self.log.debug(
+                f">>> container.confirm_remote_startup() - container_status: {container_status}"
+            )
             if container_status:
                 if container_status in self.get_error_states():
                     self.log_and_raise(
                         http_status_code=500,
                         reason=f"Error starting kernel container; status: '{container_status}'.",
                     )
                 else:
+                    self.log.debug(
+                        f">>> container.confirm_remote_startup(): is hosted assigned => {self.assigned_host}"
+                    )
+                    self.log.debug(">>> should call receive_connection_info()")
                     if self.assigned_host:
                         ready_to_connect = await self.receive_connection_info()
+                        self.log.debug(
+                            f">>> container.confirm_remote_startup(): ready to connect => {ready_to_connect}"
+                        )
                         self.pid = (
                             0  # We won't send process signals for kubernetes lifecycle management
                         )
                         self.pgid = 0
             else:
                 self.detect_launch_failure()
+        self.log.debug(
+            f">>> container.confirm_remote_startup(): ready to connect => {ready_to_connect}"
+        )
 
     def get_process_info(self) -> dict[str, Any]:
         """Captures the base information necessary for kernel persistence relative to containers."""

diff --git a/enterprise_gateway/services/processproxies/crd.py b/enterprise_gateway/services/processproxies/crd.py
@@ -74,11 +74,15 @@ def get_container_status(self, iteration: int | None) -> str:
             )
 
             if custom_resource:
-                application_state = custom_resource['status']['applicationState']['state'].lower()
+                application_state = custom_resource.get("status", {}).get("state", "").lower()
+
+                self.log.debug(f">>> crd.get_container_status: {application_state}")
 
                 if application_state in self.get_error_states():
                     exception_text = self._get_exception_text(
-                        custom_resource['status']['applicationState']['errorMessage']
+                        custom_resource.get("status", {})
+                        .get("applicationState", {})
+                        .get("errorMessage")
                     )
                     error_message = (
                         f"CRD submission for kernel {self.kernel_id} failed: {exception_text}"

diff --git a/enterprise_gateway/services/processproxies/k8s.py b/enterprise_gateway/services/processproxies/k8s.py
@@ -115,6 +115,7 @@ def get_container_status(self, iteration: int | None) -> str:
             self.container_name = pod_info.metadata.name
             if pod_info.status:
                 pod_status = pod_info.status.phase.lower()
+                self.log.debug(f">>> k8s.get_container_status: {pod_status}")
                 if pod_status == "running" and not self.assigned_host:
                     # Pod is running, capture IP
                     self.assigned_ip = pod_info.status.pod_ip
@@ -128,6 +129,7 @@ def get_container_status(self, iteration: int | None) -> str:
                 f"Status: '{pod_status}', Pod IP: '{self.assigned_ip}', KernelID: '{self.kernel_id}'"
             )
 
+        self.log.debug(f">>> k8s.get_container_status: {pod_status}")
         return pod_status
 
     def delete_managed_object(self, termination_stati: list[str]) -> bool:

diff --git a/enterprise_gateway/services/processproxies/processproxy.py b/enterprise_gateway/services/processproxies/processproxy.py
@@ -201,6 +201,7 @@ def register_event(self, kernel_id: str) -> None:
 
     async def get_connection_info(self, kernel_id: str) -> dict:
         """Performs a timeout wait on the event, returning the conenction information on completion."""
+        self.log.debug(f">>> processproxy.get_connection_info() for kernel_id {kernel_id}")
         await asyncio.wait_for(self._response_registry[kernel_id].wait(), connection_interval)
         return self._response_registry.pop(kernel_id).response
 
@@ -1300,9 +1301,13 @@ async def receive_connection_info(self) -> bool:
         """
         # Polls the socket using accept.  When data is found, returns ready indicator and encrypted data.
         ready_to_connect = False
-
+        self.log.debug(
+            f">>> processproxy.receive_connection_info(): initializing ready to connect as {ready_to_connect}"
+        )
         try:
             connect_info = await self.response_manager.get_connection_info(self.kernel_id)
+            self.log.debug(">>> processproxy.receive_connection_info(): connect info received")
+            self.log.debug(connect_info)
             self._setup_connection_info(connect_info)
             ready_to_connect = True
         except Exception as e:
@@ -1320,6 +1325,9 @@ async def receive_connection_info(self) -> bool:
                 self.kill()
                 self.log_and_raise(http_status_code=500, reason=error_message)
 
+        self.log.debug(
+            f">>> processproxy.receive_connection_info(): returning ready to connect {ready_to_connect}"
+        )
         return ready_to_connect
 
     def _setup_connection_info(self, connect_info: dict) -> None:

diff --git a/enterprise_gateway/services/processproxies/ray_operator.py b/enterprise_gateway/services/processproxies/ray_operator.py
@@ -0,0 +1,211 @@
+"""A Ray operator process proxy."""
+
+# Internal implementation at Apple
+from __future__ import annotations
+
+from typing import Any
+
+from kubernetes import client
+
+from ..kernels.remotemanager import RemoteKernelManager
+from .k8s import KubernetesProcessProxy
+
+
+class RayOperatorProcessProxy(KubernetesProcessProxy):
+    """Ray operator process proxy."""
+
+    object_kind = "RayCluster"
+
+    def __init__(self, kernel_manager: RemoteKernelManager, proxy_config: dict):
+        """Initialize the proxy."""
+        super().__init__(kernel_manager, proxy_config)
+        self.group = "ray.io"
+        self.version = "v1alpha1"
+        self.plural = "rayclusters"
+
+    async def launch_process(
+        self, kernel_cmd: str, **kwargs: dict[str, Any] | None
+    ) -> RayOperatorProcessProxy:
+        """Launch the process for a kernel."""
+        self.kernel_resource_name = self._determine_kernel_pod_name(**kwargs)
+        kwargs["env"]["KERNEL_RESOURCE_NAME"] = self.kernel_resource_name
+        kwargs["env"]["KERNEL_CRD_GROUP"] = self.group
+        kwargs["env"]["KERNEL_CRD_VERSION"] = self.version
+        kwargs["env"]["KERNEL_CRD_PLURAL"] = self.plural
+
+        await super().launch_process(kernel_cmd, **kwargs)
+        return self
+
+    def get_container_status(self, iteration: int | None) -> str:
+        """Determines submitted Ray application status and returns unified pod state.
+
+        This method returns the pod status (not CRD status) to maintain compatibility
+        with the base class lifecycle management. The RayCluster CRD state is checked
+        first to ensure the cluster is healthy, but we return pod states that the
+        base class understands: 'pending', 'running', 'failed', etc.
+        """
+        application_state = None
+        head_pod_status = None
+        application_state = self._get_application_state()
+        if application_state:
+            self.log.debug(
+                f">>> ray_operator.get_container_status: application_state {application_state}"
+            )
+
+        # Check for CRD-level errors first
+        if application_state in self.get_error_states():
+            error_message = (
+                f"CRD submission for kernel {self.kernel_id} failed with state: {application_state}"
+            )
+            self.log.error(error_message)
+            return "failed"  # Return pod state, not CRD state
+
+        # If CRD is not ready yet, return "pending" to indicate still launching
+        if application_state != "ready":
+            self.log.debug(
+                f">>> ray_operator.get_container_status: CRD not ready yet, state={application_state}"
+            )
+            return "pending"
+
+        # CRD is ready, now check the actual pod status
+        kernel_label_selector = "kernel_id=" + self.kernel_id + ",component=kernel"
+        ret = None
+        try:
+            ret = client.CoreV1Api().list_namespaced_pod(
+                namespace=self.kernel_namespace, label_selector=kernel_label_selector
+            )
+        except client.rest.ApiException as e:
+            if e.status == 404:
+                self.log.debug("Resetting cluster connection info as cluster deleted")
+                self._reset_connection_info()
+            return None
+
+        if ret and ret.items:
+            pod_info = ret.items[0]
+            self.log.debug(
+                f"Cluster status {application_state}, pod status {pod_info.status.phase.lower()}"
+            )
+            if pod_info.status:
+                head_pod_status = pod_info.status.phase.lower()
+                self.log.debug(
+                    f">>> ray_operator.get_container_status: pod_status {head_pod_status}"
+                )
+                if head_pod_status == "running":
+                    self.log.debug(
+                        f"Pod Info name:{pod_info.metadata.name}, pod ip {pod_info.status.pod_ip}, host {self.container_name}"
+                    )
+                    self.container_name = pod_info.metadata.name
+                    self.assigned_ip = pod_info.status.pod_ip
+                    self.assigned_host = self.container_name
+                    self.assigned_node_ip = pod_info.status.host_ip
+
+        # only log if iteration is not None (otherwise poll() is too noisy)
+        # check for running state to avoid double logging with superclass
+        if iteration and head_pod_status != 'running':
+            self.log.debug(
+                f"{iteration}: Waiting from CRD status from resource manager {self.object_kind.lower()} in "
+                f"namespace '{self.kernel_namespace}'. Name: '{self.kernel_resource_name}', "
+                f"Status: CRD='{application_state}', Pod='{head_pod_status}', KernelID: '{self.kernel_id}'"
+            )
+
+        # KEY FIX: Return pod status (not CRD state) so base class poll() works correctly
+        final_status = head_pod_status if head_pod_status else "pending"
+        self.log.debug(
+            f">>> ray_operator.get_container_status: returning pod_status={final_status} "
+            f"(CRD state was {application_state})"
+        )
+        return final_status
+
+    def delete_managed_object(self, termination_stati: list[str]) -> bool:
+        """Deletes the object managed by this process-proxy
+
+        A return value of True indicates the object is considered deleted,
+        otherwise a False or None value is returned.
+
+        Note: the caller is responsible for handling exceptions.
+        """
+        delete_status = client.CustomObjectsApi().delete_namespaced_custom_object(
+            self.group,
+            self.version,
+            self.kernel_namespace,
+            self.plural,
+            self.kernel_resource_name,
+            grace_period_seconds=0,
+            propagation_policy="Background",
+        )
+
+        result = delete_status and delete_status.get("status", None) in termination_stati
+        if result:
+            self._reset_connection_info()
+        return result
+
+    def get_initial_states(self) -> set:
+        """Return list of states indicating container is starting (includes running).
+
+        Note: We return pod states (not CRD states) to maintain compatibility
+        with the base class poll() implementation, which checks if the status
+        returned by get_container_status() is in this set.
+        """
+        return ["pending", "running"]
+
+    def get_error_states(self) -> set:
+        """Return list of states indicating RayCluster has failed."""
+        # Ray doesn't typically use "failed" state, but we'll include common error states
+        return {"failed", "error", "unhealthy"}
+
+    def _get_ray_cluster_status(self) -> dict:
+        try:
+            return client.CustomObjectsApi().get_namespaced_custom_object(
+                self.group,
+                self.version,
+                self.kernel_namespace,
+                self.plural,
+                self.kernel_resource_name,
+            )
+        except client.rest.ApiException as e:
+            if e.status == 404:
+                self.log.debug("Resetting cluster connection info as cluster deleted")
+                self._reset_connection_info()
+            return None
+
+    def _get_application_state(self):
+        custom_resource = self._get_ray_cluster_status()
+
+        if custom_resource is None:
+            return None
+
+        if 'status' not in custom_resource or 'state' not in custom_resource['status']:
+            return None
+
+        return custom_resource['status']['state'].lower()
+
+    def _get_pod_status(self) -> str:
+        """Get the current status of the kernel pod.
+        Returns
+        -------
+        str
+            The pod status in lowercase (e.g., 'pending', 'running', 'failed', 'unknown').
+        """
+        pod_status = "unknown"
+        kernel_label_selector = "kernel_id=" + self.kernel_id + ",component=kernel"
+        ret = client.CoreV1Api().list_namespaced_pod(
+            namespace=self.kernel_namespace, label_selector=kernel_label_selector
+        )
+        if ret and ret.items:
+            pod_info = ret.items[0]
+            self.container_name = pod_info.metadata.name
+            if pod_info.status:
+                pod_status = pod_info.status.phase.lower()
+                self.log.debug(f">>> k8s._get_pod_status: {pod_status}")
+
+        return pod_status
+
+    def _reset_connection_info(self):
+        """Reset all connection-related attributes to their initial state.
+        This is typically called when a cluster is deleted or connection is lost.
+        """
+
+        self.assigned_host = None
+        self.container_name = ""
+        self.assigned_node_ip = None
+        self.assigned_ip = None
diff --git a/enterprise_gateway/services/sessions/kernelsessionmanager.py b/enterprise_gateway/services/sessions/kernelsessionmanager.py
@@ -94,6 +94,7 @@ def create_session(self, kernel_id: str, **kwargs) -> None:
             Information used for the launch of the kernel
 
         """
+        self.log.debug(f">>> Creating new session for kernel {kernel_id}")
         km = self.kernel_manager.get_kernel(kernel_id)
 
         # Compose the kernel_session entry
@@ -103,11 +104,14 @@ def create_session(self, kernel_id: str, **kwargs) -> None:
         kernel_session["kernel_name"] = km.kernel_name
 
         # Build the inner dictionaries: connection_info, process_proxy and add to kernel_session
+        self.log.debug(f">>> Getting connection info for kernel {kernel_id}")
         kernel_session["connection_info"] = km.get_connection_info()
         kernel_session["launch_args"] = kwargs.copy()
+        self.log.debug(f">>> Getting process info for kernel {kernel_id}")
         kernel_session["process_info"] = (
             km.process_proxy.get_process_info() if km.process_proxy else {}
         )
+        self.log.debug(f">>> Saving session {kernel_session}")
         self._save_session(kernel_id, kernel_session)
 
     def refresh_session(self, kernel_id: str) -> None: