diff --git a/Makefile b/Makefile index 33259cc38..8ab2f2917 100644 --- a/Makefile +++ b/Makefile @@ -3,9 +3,9 @@ .PHONY: help clean clean-env dev dev-http docs install bdist sdist test release check_dists \ clean-images clean-enterprise-gateway-demo clean-demo-base clean-kernel-images clean-enterprise-gateway \ - clean-kernel-py clean-kernel-spark-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala clean-kernel-tf-py \ + clean-kernel-py clean-kernel-spark-py clean-kernel-ray-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala clean-kernel-tf-py \ clean-kernel-tf-gpu-py clean-kernel-image-puller push-images push-enterprise-gateway-demo push-demo-base \ - push-kernel-images push-enterprise-gateway push-kernel-py push-kernel-spark-py push-kernel-r push-kernel-spark-r \ + push-kernel-images push-enterprise-gateway push-kernel-py push-kernel-spark-py push-kernel-ray-py push-kernel-r push-kernel-spark-r \ push-kernel-scala push-kernel-tf-py push-kernel-tf-gpu-py push-kernel-image-puller publish helm-chart SA?=source activate @@ -155,9 +155,9 @@ docker-images: ## Build docker images (includes kernel-based images) kernel-images: ## Build kernel-based docker images # Actual working targets... -docker-images: demo-base enterprise-gateway-demo kernel-images enterprise-gateway kernel-py kernel-spark-py kernel-r kernel-spark-r kernel-scala kernel-tf-py kernel-tf-gpu-py kernel-image-puller +docker-images: demo-base enterprise-gateway-demo kernel-images enterprise-gateway kernel-py kernel-spark-py kernel-ray-py kernel-r kernel-spark-r kernel-scala kernel-tf-py kernel-tf-gpu-py kernel-image-puller -enterprise-gateway-demo kernel-images enterprise-gateway kernel-py kernel-spark-py kernel-r kernel-spark-r kernel-scala kernel-tf-py kernel-tf-gpu-py kernel-image-puller: +enterprise-gateway-demo kernel-images enterprise-gateway kernel-py kernel-spark-py kernel-ray-py kernel-r kernel-spark-r kernel-scala kernel-tf-py kernel-tf-gpu-py kernel-image-puller: make WHEEL_FILE=$(WHEEL_FILE) VERSION=$(VERSION) NO_CACHE=$(NO_CACHE) TAG=$(TAG) SPARK_VERSION=$(SPARK_VERSION) MULTIARCH_BUILD=$(MULTIARCH_BUILD) TARGET_ARCH=$(TARGET_ARCH) -C etc $@ demo-base: @@ -167,14 +167,14 @@ demo-base: clean-images: clean-demo-base ## Remove docker images (includes kernel-based images) clean-kernel-images: ## Remove kernel-based images -clean-images clean-enterprise-gateway-demo clean-kernel-images clean-enterprise-gateway clean-kernel-py clean-kernel-spark-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala clean-kernel-tf-py clean-kernel-tf-gpu-py clean-kernel-image-puller: +clean-images clean-enterprise-gateway-demo clean-kernel-images clean-enterprise-gateway clean-kernel-py clean-kernel-spark-py clean-kernel-ray-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala clean-kernel-tf-py clean-kernel-tf-gpu-py clean-kernel-image-puller: make WHEEL_FILE=$(WHEEL_FILE) VERSION=$(VERSION) TAG=$(TAG) -C etc $@ clean-demo-base: make WHEEL_FILE=$(WHEEL_FILE) VERSION=$(VERSION) TAG=$(SPARK_VERSION) -C etc $@ push-images: push-demo-base -push-images push-enterprise-gateway-demo push-kernel-images push-enterprise-gateway push-kernel-py push-kernel-spark-py push-kernel-r push-kernel-spark-r push-kernel-scala push-kernel-tf-py push-kernel-tf-gpu-py push-kernel-image-puller: +push-images push-enterprise-gateway-demo push-kernel-images push-enterprise-gateway push-kernel-py push-kernel-spark-py push-kernel-ray-py push-kernel-r push-kernel-spark-r push-kernel-scala push-kernel-tf-py push-kernel-tf-gpu-py push-kernel-image-puller:
make WHEEL_FILE=$(WHEEL_FILE) VERSION=$(VERSION) TAG=$(TAG) -C etc $@ push-demo-base: diff --git a/enterprise_gateway/services/processproxies/container.py b/enterprise_gateway/services/processproxies/container.py index 6378b633f..24ce5ee2d 100644 --- a/enterprise_gateway/services/processproxies/container.py +++ b/enterprise_gateway/services/processproxies/container.py @@ -147,6 +147,8 @@ def poll(self) -> bool | None: # See https://github.com/jupyter-server/enterprise_gateway/issues/827 if container_status in self.get_initial_states(): result = None + + self.log.debug(f">>> container.poll(): {container_status} --> {result}") return result def send_signal(self, signum: int) -> bool | None: @@ -188,6 +190,7 @@ def shutdown_listener(self): async def confirm_remote_startup(self) -> None: """Confirms the container has started and returned necessary connection information.""" + self.log.debug(">>> container.confirm_remote_startup()") self.log.debug("Trying to confirm kernel container startup status") self.start_time = RemoteProcessProxy.get_current_time() i = 0 @@ -197,6 +200,9 @@ async def confirm_remote_startup(self) -> None: await self.handle_timeout() container_status = self.get_container_status(i) + self.log.debug( + f">>> container.confirm_remote_startup() - container_status: {container_status}" + ) if container_status: if container_status in self.get_error_states(): self.log_and_raise( @@ -204,14 +210,24 @@ async def confirm_remote_startup(self) -> None: reason=f"Error starting kernel container; status: '{container_status}'.", ) else: + self.log.debug( + f">>> container.confirm_remote_startup(): is host assigned => {self.assigned_host}" + ) + self.log.debug(">>> should call receive_connection_info()") if self.assigned_host: ready_to_connect = await self.receive_connection_info() + self.log.debug( + f">>> container.confirm_remote_startup(): ready to connect => {ready_to_connect}" + ) self.pid = ( 0 # We won't send process signals for kubernetes lifecycle management ) self.pgid = 0 else: self.detect_launch_failure() + self.log.debug( + f">>> container.confirm_remote_startup(): ready to connect => {ready_to_connect}" + ) def get_process_info(self) -> dict[str, Any]: """Captures the base information necessary for kernel persistence relative to containers.""" diff --git a/enterprise_gateway/services/processproxies/crd.py b/enterprise_gateway/services/processproxies/crd.py index 54f24b5ca..4962c8cc1 100644 --- a/enterprise_gateway/services/processproxies/crd.py +++ b/enterprise_gateway/services/processproxies/crd.py @@ -74,11 +74,15 @@ def get_container_status(self, iteration: int | None) -> str: ) if custom_resource: - application_state = custom_resource['status']['applicationState']['state'].lower() + application_state = custom_resource.get("status", {}).get("state", "").lower() + + self.log.debug(f">>> crd.get_container_status: {application_state}") if application_state in self.get_error_states(): exception_text = self._get_exception_text( - custom_resource['status']['applicationState']['errorMessage'] + custom_resource.get("status", {}) + .get("applicationState", {}) + .get("errorMessage") ) error_message = ( f"CRD submission for kernel {self.kernel_id} failed: {exception_text}" diff --git a/enterprise_gateway/services/processproxies/k8s.py b/enterprise_gateway/services/processproxies/k8s.py index ea4a1822b..14404df2a 100644 --- a/enterprise_gateway/services/processproxies/k8s.py +++ b/enterprise_gateway/services/processproxies/k8s.py @@ -115,6 +115,7 @@ def get_container_status(self, iteration: int | None) -> str: self.container_name = pod_info.metadata.name if pod_info.status: pod_status = pod_info.status.phase.lower() + self.log.debug(f">>> k8s.get_container_status: {pod_status}") if pod_status == "running" and not self.assigned_host: # Pod is running, capture IP self.assigned_ip = pod_info.status.pod_ip @@ -128,6 +129,7 @@ def get_container_status(self, iteration: int | None) -> str: f"Status: '{pod_status}', Pod IP: '{self.assigned_ip}', KernelID: '{self.kernel_id}'" ) + self.log.debug(f">>> k8s.get_container_status: {pod_status}") return pod_status def delete_managed_object(self, termination_stati: list[str]) -> bool: diff --git a/enterprise_gateway/services/processproxies/processproxy.py b/enterprise_gateway/services/processproxies/processproxy.py index 405adfbca..22e01c486 100644 --- a/enterprise_gateway/services/processproxies/processproxy.py +++ b/enterprise_gateway/services/processproxies/processproxy.py @@ -201,6 +201,7 @@ def register_event(self, kernel_id: str) -> None: async def get_connection_info(self, kernel_id: str) -> dict: """Performs a timeout wait on the event, returning the connection information on completion.""" + self.log.debug(f">>> processproxy.get_connection_info() for kernel_id {kernel_id}") await asyncio.wait_for(self._response_registry[kernel_id].wait(), connection_interval) return self._response_registry.pop(kernel_id).response @@ -1300,9 +1301,13 @@ async def receive_connection_info(self) -> bool: """ # Polls the socket using accept. When data is found, returns ready indicator and encrypted data. ready_to_connect = False - + self.log.debug( + f">>> processproxy.receive_connection_info(): initializing ready to connect as {ready_to_connect}" + ) try: connect_info = await self.response_manager.get_connection_info(self.kernel_id) + self.log.debug(">>> processproxy.receive_connection_info(): connect info received") + self.log.debug(connect_info) self._setup_connection_info(connect_info) ready_to_connect = True except Exception as e: @@ -1320,6 +1325,9 @@ async def receive_connection_info(self) -> bool: self.kill() self.log_and_raise(http_status_code=500, reason=error_message) + self.log.debug( + f">>> processproxy.receive_connection_info(): returning ready to connect {ready_to_connect}" + ) return ready_to_connect def _setup_connection_info(self, connect_info: dict) -> None: diff --git a/enterprise_gateway/services/processproxies/ray_operator.py b/enterprise_gateway/services/processproxies/ray_operator.py new file mode 100644 index 000000000..41cea3c6a --- /dev/null +++ b/enterprise_gateway/services/processproxies/ray_operator.py @@ -0,0 +1,211 @@ +"""A Ray operator process proxy.""" + +from __future__ import annotations + +from typing import Any + +from kubernetes import client + +from ..kernels.remotemanager import RemoteKernelManager +from .k8s import KubernetesProcessProxy + + +class RayOperatorProcessProxy(KubernetesProcessProxy): + """Ray operator process proxy.""" + + object_kind = "RayCluster" + + def __init__(self, kernel_manager: RemoteKernelManager, proxy_config: dict): + """Initialize the proxy.""" + super().__init__(kernel_manager, proxy_config) + self.group = "ray.io" + self.version = "v1alpha1" + self.plural = "rayclusters" + + async def launch_process( + self, kernel_cmd: str, **kwargs: dict[str, Any] | None + ) -> RayOperatorProcessProxy: + """Launch the process for a kernel.""" + self.kernel_resource_name = self._determine_kernel_pod_name(**kwargs) + kwargs["env"]["KERNEL_RESOURCE_NAME"]
= self.kernel_resource_name + kwargs["env"]["KERNEL_CRD_GROUP"] = self.group + kwargs["env"]["KERNEL_CRD_VERSION"] = self.version + kwargs["env"]["KERNEL_CRD_PLURAL"] = self.plural + + await super().launch_process(kernel_cmd, **kwargs) + return self + + def get_container_status(self, iteration: int | None) -> str | None: + """Determines submitted Ray application status and returns unified pod state. + + This method returns the pod status (not CRD status) to maintain compatibility + with the base class lifecycle management. The RayCluster CRD state is checked + first to ensure the cluster is healthy, but we return pod states that the + base class understands: 'pending', 'running', 'failed', etc. + """ + head_pod_status = None + application_state = self._get_application_state() + if application_state: + self.log.debug( + f">>> ray_operator.get_container_status: application_state {application_state}" + ) + + # Check for CRD-level errors first + if application_state in self.get_error_states(): + error_message = ( + f"CRD submission for kernel {self.kernel_id} failed with state: {application_state}" + ) + self.log.error(error_message) + return "failed" # Return pod state, not CRD state + + # If CRD is not ready yet, return "pending" to indicate still launching + if application_state != "ready": + self.log.debug( + f">>> ray_operator.get_container_status: CRD not ready yet, state={application_state}" + ) + return "pending" + + # CRD is ready, now check the actual pod status + kernel_label_selector = "kernel_id=" + self.kernel_id + ",component=kernel" + ret = None + try: + ret = client.CoreV1Api().list_namespaced_pod( + namespace=self.kernel_namespace, label_selector=kernel_label_selector + ) + except client.rest.ApiException as e: + if e.status == 404: + self.log.debug("Resetting cluster connection info as cluster deleted") + self._reset_connection_info() + return None + raise + + if ret and ret.items: + pod_info = ret.items[0] + if pod_info.status: + head_pod_status = pod_info.status.phase.lower() + self.log.debug( + f"Cluster status {application_state}, pod status {head_pod_status}" + ) + self.log.debug( + f">>> ray_operator.get_container_status: pod_status {head_pod_status}" + ) + if head_pod_status == "running": + self.log.debug( + f"Pod Info name:{pod_info.metadata.name}, pod ip {pod_info.status.pod_ip}, host {self.container_name}" + ) + self.container_name = pod_info.metadata.name + self.assigned_ip = pod_info.status.pod_ip + self.assigned_host = self.container_name + self.assigned_node_ip = pod_info.status.host_ip + + # only log if iteration is not None (otherwise poll() is too noisy) + # check for running state to avoid double logging with superclass + if iteration and head_pod_status != "running": + self.log.debug( + f"{iteration}: Waiting for CRD status from resource manager {self.object_kind.lower()} in " + f"namespace '{self.kernel_namespace}'. Name: '{self.kernel_resource_name}', " + f"Status: CRD='{application_state}', Pod='{head_pod_status}', KernelID: '{self.kernel_id}'" + ) + + # KEY FIX: Return pod status (not CRD state) so base class poll() works correctly + final_status = head_pod_status if head_pod_status else "pending" + self.log.debug( + f">>> ray_operator.get_container_status: returning pod_status={final_status} " + f"(CRD state was {application_state})" + ) + return final_status + + def delete_managed_object(self, termination_stati: list[str]) -> bool: + """Deletes the object managed by this process-proxy + + A return value of True indicates the object is considered deleted, + otherwise a False or None value is returned. + + Note: the caller is responsible for handling exceptions. + """ + delete_status = client.CustomObjectsApi().delete_namespaced_custom_object( + self.group, + self.version, + self.kernel_namespace, + self.plural, + self.kernel_resource_name, + grace_period_seconds=0, + propagation_policy="Background", + ) + + result = delete_status and delete_status.get("status", None) in termination_stati + if result: + self._reset_connection_info() + return result + + def get_initial_states(self) -> set: + """Return the set of states indicating the container is starting (includes running). + + Note: We return pod states (not CRD states) to maintain compatibility + with the base class poll() implementation, which checks if the status + returned by get_container_status() is in this set. + """ + return {"pending", "running"} + + def get_error_states(self) -> set: + """Return the set of states indicating the RayCluster has failed.""" + # Ray doesn't typically use "failed" state, but we'll include common error states + return {"failed", "error", "unhealthy"} + + def _get_ray_cluster_status(self) -> dict | None: + try: + return client.CustomObjectsApi().get_namespaced_custom_object( + self.group, + self.version, + self.kernel_namespace, + self.plural, + self.kernel_resource_name, + ) + except client.rest.ApiException as e: + if e.status == 404: + self.log.debug("Resetting cluster connection info as cluster deleted") + self._reset_connection_info() + return None + raise + + def _get_application_state(self) -> str | None: + custom_resource = self._get_ray_cluster_status() + + if custom_resource is None: + return None + + if 'status' not in custom_resource or 'state' not in custom_resource['status']: + return None + + return custom_resource['status']['state'].lower() + + def _get_pod_status(self) -> str: + """Get the current status of the kernel pod. + Returns + ------- + str + The pod status in lowercase (e.g., 'pending', 'running', 'failed', 'unknown'). + """ + pod_status = "unknown" + kernel_label_selector = "kernel_id=" + self.kernel_id + ",component=kernel" + ret = client.CoreV1Api().list_namespaced_pod( + namespace=self.kernel_namespace, label_selector=kernel_label_selector + ) + if ret and ret.items: + pod_info = ret.items[0] + self.container_name = pod_info.metadata.name + if pod_info.status: + pod_status = pod_info.status.phase.lower() + self.log.debug(f">>> ray_operator._get_pod_status: {pod_status}") + + return pod_status + + def _reset_connection_info(self): + """Reset all connection-related attributes to their initial state. + This is typically called when a cluster is deleted or connection is lost.
+ """ + + self.assigned_host = None + self.container_name = "" + self.assigned_node_ip = None + self.assigned_ip = None diff --git a/enterprise_gateway/services/sessions/kernelsessionmanager.py b/enterprise_gateway/services/sessions/kernelsessionmanager.py index f4e73ca93..f44622bce 100644 --- a/enterprise_gateway/services/sessions/kernelsessionmanager.py +++ b/enterprise_gateway/services/sessions/kernelsessionmanager.py @@ -94,6 +94,7 @@ def create_session(self, kernel_id: str, **kwargs) -> None: Information used for the launch of the kernel """ + self.log.debug(f">>> Creating new session for kernel {kernel_id}") km = self.kernel_manager.get_kernel(kernel_id) # Compose the kernel_session entry @@ -103,11 +104,14 @@ def create_session(self, kernel_id: str, **kwargs) -> None: kernel_session["kernel_name"] = km.kernel_name # Build the inner dictionaries: connection_info, process_proxy and add to kernel_session + self.log.debug(f">>> Getting connection info for kernel {kernel_id}") kernel_session["connection_info"] = km.get_connection_info() kernel_session["launch_args"] = kwargs.copy() + self.log.debug(f">>> Getting process info for kernel {kernel_id}") kernel_session["process_info"] = ( km.process_proxy.get_process_info() if km.process_proxy else {} ) + self.log.debug(f">>> Saving session {kernel_session}") self._save_session(kernel_id, kernel_session) def refresh_session(self, kernel_id: str) -> None: diff --git a/etc/Makefile b/etc/Makefile index 08b54ecb6..9a23c7718 100644 --- a/etc/Makefile +++ b/etc/Makefile @@ -58,7 +58,6 @@ TOREE_LAUNCHER_FILES:=$(shell find kernel-launchers/scala/toree-launcher/src -ty @echo ../build/kernelspecs/{python,R,scala,python_tf,python_tf_gpu}_kubernetes | xargs -t -n 1 cp -r kernel-launchers/kubernetes/* @echo ../build/kernelspecs/spark_{python,R,scala}_kubernetes | xargs -t -n 1 cp -r kernel-launchers/kubernetes/* @echo ../build/kernelspecs/{python,R,scala,python_tf,python_tf_gpu}_docker | xargs -t -n 1 cp -r kernel-launchers/docker/* - @echo ../build/kernelspecs/spark_python_operator | xargs -t -n 1 cp -r kernel-launchers/operators/* # Populate kernel resources. Because tensorflow is also python, it should be last. @echo ../build/kernelspecs/*R* | xargs -t -n 1 cp -r kernel-resources/ir/* @echo ../build/kernelspecs/*scala* | xargs -t -n 1 cp -r kernel-resources/apache_toree/* @@ -66,6 +65,12 @@ TOREE_LAUNCHER_FILES:=$(shell find kernel-launchers/scala/toree-launcher/src -ty @echo ../build/kernelspecs/*tf* | xargs -t -n 1 cp -r kernel-resources/tensorflow/* # Perform the copy again to enable local, per-kernel, overrides cp -r kernelspecs ../build + # Operator kernelspecs get launcher files after the override to preserve scripts + @echo ../build/kernelspecs/spark_python_operator | xargs -t -n 1 cp -r kernel-launchers/operators/* + @rm -f ../build/kernelspecs/spark_python_operator/scripts/ray.io-v1alpha1.yaml.j2 + @echo ../build/kernelspecs/ray_python_operator | xargs -t -n 1 cp -r kernel-launchers/operators/* + @rm -f ../build/kernelspecs/ray_python_operator/scripts/sparkoperator.k8s.io-v1beta2.yaml.j2 + @echo ../build/kernelspecs/ray_python_operator | xargs -t -n 1 cp -r kernel-resources/ray/* @(cd ../build/kernelspecs; find . -name 'kernel.json' -print0 | xargs -0 sed -i.bak "s/VERSION/$(TAG)/g"; find . 
-name *.bak -print0 | xargs -0 rm -f) @mkdir -p ../dist @@ -105,31 +110,31 @@ kernel_image_files: ../build/kernel_image_files # Docker image build section *********************************************** # -KERNEL_IMAGES := kernel-py kernel-spark-py kernel-r kernel-spark-r kernel-scala kernel-tf-py kernel-tf-gpu-py +KERNEL_IMAGES := kernel-py kernel-spark-py kernel-ray-py kernel-r kernel-spark-r kernel-scala kernel-tf-py kernel-tf-gpu-py DOCKER_IMAGES := demo-base enterprise-gateway-demo enterprise-gateway kernel-image-puller $(KERNEL_IMAGES) PUSHED_IMAGES := demo-base enterprise-gateway-demo enterprise-gateway kernel-image-puller $(KERNEL_IMAGES) docker-images: $(DOCKER_IMAGES) kernel-images: $(KERNEL_IMAGES) -push-images: push-enterprise-gateway-demo push-enterprise-gateway push-kernel-py push-kernel-spark-py push-kernel-tf-py push-kernel-r push-kernel-spark-r push-kernel-scala push-kernel-image-puller +push-images: push-enterprise-gateway-demo push-enterprise-gateway push-kernel-py push-kernel-spark-py push-kernel-ray-py push-kernel-tf-py push-kernel-r push-kernel-spark-r push-kernel-scala push-kernel-image-puller clean-images: clean-enterprise-gateway-demo clean-demo-base clean-enterprise-gateway clean-kernel-image-puller clean-kernel-images -clean-kernel-images: clean-kernel-py clean-kernel-spark-py clean-kernel-tf-py clean-kernel-tf-gpu-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala +clean-kernel-images: clean-kernel-py clean-kernel-spark-py clean-kernel-ray-py clean-kernel-tf-py clean-kernel-tf-gpu-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala # Extra dependencies for each docker image... DEPENDS_demo-base: DEPENDS_enterprise-gateway-demo: $(FILE_kernelspecs_all) DEPENDS_enterprise-gateway: $(FILE_kernelspecs_all) DEPENDS_kernel-image-puller: -DEPENDS_kernel-py DEPENDS_kernel-spark-py DEPENDS_kernel-r DEPENDS_kernel-spark-r DEPENDS_kernel-scala DEPENDS_kernel-tf-py DEPENDS_kernel-tf-gpu-py: $(FILE_kernelspecs_kubernetes) $(FILE_kernelspecs_docker) +DEPENDS_kernel-py DEPENDS_kernel-spark-py DEPENDS_kernel-ray-py DEPENDS_kernel-r DEPENDS_kernel-spark-r DEPENDS_kernel-scala DEPENDS_kernel-tf-py DEPENDS_kernel-tf-gpu-py: $(FILE_kernelspecs_kubernetes) $(FILE_kernelspecs_docker) # Extra targets for each docker image... TARGETS_demo-base: TARGETS_kernel-image-puller: TARGETS_enterprise-gateway TARGETS_enterprise-gateway-demo: kernelspecs @make -C .. bdist -TARGETS_kernel-py TARGETS_kernel-spark-py TARGETS_kernel-r TARGETS_kernel-spark-r TARGETS_kernel-scala TARGETS_kernel-tf-py TARGETS_kernel-tf-gpu-py: kernelspecs +TARGETS_kernel-py TARGETS_kernel-spark-py TARGETS_kernel-ray-py TARGETS_kernel-r TARGETS_kernel-spark-r TARGETS_kernel-scala TARGETS_kernel-tf-py TARGETS_kernel-tf-gpu-py: kernelspecs # Extra files for each docker image... 
FILES_demo-base := @@ -138,6 +143,7 @@ FILES_enterprise-gateway-demo := ../dist/jupyter_enterprise_gateway_kernelspecs- FILES_enterprise-gateway := ../dist/jupyter_enterprise_gateway_kernel_image_files* ../dist/jupyter_enterprise_gateway_kernelspecs-* ../dist/jupyter_enterprise_gateway*.whl FILES_kernel-py := ../dist/jupyter_enterprise_gateway_kernel_image_files* FILES_kernel-spark-py := ../dist/jupyter_enterprise_gateway_kernel_image_files* +FILES_kernel-ray-py := ../dist/jupyter_enterprise_gateway_kernel_image_files* FILES_kernel-tf-py := ../dist/jupyter_enterprise_gateway_kernel_image_files* FILES_kernel-tf-gpu-py := ../dist/jupyter_enterprise_gateway_kernel_image_files* FILES_kernel-r := ../dist/jupyter_enterprise_gateway_kernel_image_files* diff --git a/etc/docker/demo-base/Dockerfile b/etc/docker/demo-base/Dockerfile index 9b484c507..47c3aa8a4 100644 --- a/etc/docker/demo-base/Dockerfile +++ b/etc/docker/demo-base/Dockerfile @@ -27,8 +27,8 @@ ENV SHELL=/bin/bash \ ENV HOME=/home/$NB_USER \ PATH=$JAVA_HOME/bin:$ANACONDA_HOME/bin:$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH -ENV SPARK_VER $SPARK_VERSION -ENV HADOOP_VER 3.3.1 +ENV SPARK_VER=$SPARK_VERSION +ENV HADOOP_VER=3.3.1 # INSTALL / DOWNLOAD ALL NEEDED PACKAGES RUN dpkg --purge --force-depends ca-certificates-java \ diff --git a/etc/docker/enterprise-gateway/Dockerfile b/etc/docker/enterprise-gateway/Dockerfile index 08b640771..5341fe714 100644 --- a/etc/docker/enterprise-gateway/Dockerfile +++ b/etc/docker/enterprise-gateway/Dockerfile @@ -4,8 +4,8 @@ FROM $BASE_CONTAINER ARG SPARK_VERSION -ENV SPARK_VER $SPARK_VERSION -ENV SPARK_HOME /opt/spark +ENV SPARK_VER=$SPARK_VERSION +ENV SPARK_HOME=/opt/spark RUN mamba install --quiet --yes \ @@ -20,9 +20,9 @@ RUN mamba install --quiet --yes \ USER root -RUN apt update && apt install -yq curl openjdk-8-jdk +RUN apt update && apt install -yq curl openjdk-8-jdk iputils-ping telnet netcat-openbsd net-tools iproute2 dnsutils -ENV JAVA_HOME /usr/lib/jvm/java +ENV JAVA_HOME=/usr/lib/jvm/java RUN ln -s $(readlink -f /usr/bin/javac | sed "s:/bin/javac::") ${JAVA_HOME} # Download and install Spark @@ -53,6 +53,6 @@ USER jovyan CMD ["/usr/local/bin/start-enterprise-gateway.sh"] -EXPOSE 8888 +EXPOSE 8888 8877 WORKDIR /usr/local/bin diff --git a/etc/docker/kernel-image-puller/Dockerfile b/etc/docker/kernel-image-puller/Dockerfile index 271e60df5..1683aae70 100644 --- a/etc/docker/kernel-image-puller/Dockerfile +++ b/etc/docker/kernel-image-puller/Dockerfile @@ -17,11 +17,11 @@ RUN apt-get update && apt-get install cri-tools RUN echo $PATH # The following environment variables are supported - defaults provided. Override as needed. -ENV KIP_GATEWAY_HOST http://localhost:8888 -ENV KIP_INTERVAL 300 -ENV KIP_LOG_LEVEL INFO -ENV KIP_NUM_PULLERS 2 -ENV KIP_NUM_RETRIES 3 -ENV KIP_PULL_POLICY 'IfNotPresent' +ENV KIP_GATEWAY_HOST=http://localhost:8888 +ENV KIP_INTERVAL=300 +ENV KIP_LOG_LEVEL=INFO +ENV KIP_NUM_PULLERS=2 +ENV KIP_NUM_RETRIES=3 +ENV KIP_PULL_POLICY='IfNotPresent' CMD [ "python", "./kernel_image_puller.py" ] diff --git a/etc/docker/kernel-py/Dockerfile b/etc/docker/kernel-py/Dockerfile index e967509bb..3f8a6fed1 100644 --- a/etc/docker/kernel-py/Dockerfile +++ b/etc/docker/kernel-py/Dockerfile @@ -5,7 +5,7 @@ FROM $BASE_CONTAINER ENV PATH=$PATH:$CONDA_DIR/bin # Add debugger support -RUN pip install --upgrade ipykernel +RUN pip install --upgrade --no-cache-dir ipykernel RUN conda install --quiet --yes \ cffi \ @@ -29,7 +29,7 @@ RUN chown jovyan:users /usr/local/bin/bootstrap-kernel.sh && \ USER jovyan -ENV KERNEL_LANGUAGE python +ENV KERNEL_LANGUAGE=python # Disable healthcheck inherited from notebook image HEALTHCHECK NONE diff --git a/etc/docker/kernel-r/Dockerfile b/etc/docker/kernel-r/Dockerfile index c615674d5..be8b376df 100644 --- a/etc/docker/kernel-r/Dockerfile +++ b/etc/docker/kernel-r/Dockerfile @@ -25,7 +25,7 @@ RUN chown jovyan:users /usr/local/bin/bootstrap-kernel.sh && \ USER jovyan -ENV KERNEL_LANGUAGE R +ENV KERNEL_LANGUAGE=R # Disable healthcheck inherited from notebook image HEALTHCHECK NONE diff --git a/etc/docker/kernel-ray-py/Dockerfile b/etc/docker/kernel-ray-py/Dockerfile new file mode 100644 index 000000000..cb3dad0bf --- /dev/null +++ b/etc/docker/kernel-ray-py/Dockerfile @@ -0,0 +1,51 @@ +# Ray 2.50.0 with Python 3.11 +# rayproject/ray:2.50.0.714bc0-extra-py311-cpu +ARG BASE_CONTAINER=rayproject/ray:2.50.0.714bc0-extra-py311-cpu +FROM $BASE_CONTAINER + +# Add debugger support +RUN pip install --upgrade --no-cache-dir ipykernel + +RUN pip install --upgrade --no-cache-dir \ + "jupyter_client>=6.1,<7" \ + "jupyter_server>=1.7,<2" \ + "pyzmq>=20.0.0,<25" \ + "ray[data]==2.50.0" \ + ipykernel \ + cffi \ + future \ + pycryptodomex + +ADD jupyter_enterprise_gateway_kernel_image_files*.tar.gz /usr/local/bin/ + +USER root + +RUN apt-get update && apt-get install -yq --no-install-recommends \ + libkrb5-dev \ + iputils-ping \ + telnet \ + netcat-openbsd \ + net-tools \ + iproute2 \ + dnsutils \ + curl \ + less \ + && rm -rf /var/lib/apt/lists/* + +# Set up permissions for ray user (Ray base image uses 'ray' user) +RUN chown ray:users /usr/local/bin/bootstrap-kernel.sh && \ + chmod 0755 /usr/local/bin/bootstrap-kernel.sh && \ + chown -R ray:users /usr/local/bin/kernel-launchers + +USER ray + +ENV KERNEL_LANGUAGE=python +ENV RAY_HOME=/home/ray + +WORKDIR /home/ray + +# Disable any healthcheck inherited from the base image +HEALTHCHECK NONE + + +CMD /usr/local/bin/bootstrap-kernel.sh diff --git a/etc/docker/kernel-ray-py/README.md b/etc/docker/kernel-ray-py/README.md new file mode 100644 index 000000000..da9a403ac --- /dev/null +++ b/etc/docker/kernel-ray-py/README.md @@ -0,0 +1,16 @@ +This image enables the use of an IPython kernel launched from [Jupyter Enterprise Gateway](https://jupyter-enterprise-gateway.readthedocs.io/en/latest/) within a Kubernetes cluster. It is built on the base image [rayproject/ray:2.50.0.714bc0-extra-py311-cpu](https://hub.docker.com/r/rayproject/ray/), and provides [Ray 2.50.0](https://docs.ray.io/) for distributed Python computing. + +# What it Gives You + +- IPython kernel support (with debugger) +- Ray 2.50.0 for distributed computing +- Python 3.11 environment +- Ray on Kubernetes support from within a Jupyter Notebook + +# Basic Use + +Deploy [enterprise-gateway](https://hub.docker.com/r/elyra/enterprise-gateway/) per its instructions, configured for the appropriate environment. + +Launch a gateway-enabled Jupyter Notebook application against the Enterprise Gateway instance and pick the Ray kernel to use in your notebook. + +For more information, check our [repo](https://github.com/jupyter-server/enterprise_gateway) and [docs](https://jupyter-enterprise-gateway.readthedocs.io/en/latest/). diff --git a/etc/docker/kernel-scala/Dockerfile b/etc/docker/kernel-scala/Dockerfile index d3146da48..e26a91d46 100644 --- a/etc/docker/kernel-scala/Dockerfile +++ b/etc/docker/kernel-scala/Dockerfile @@ -17,5 +17,5 @@ RUN adduser --system -uid 1000 jovyan --ingroup users && \ chown -R jovyan:users /usr/local/bin/kernel-launchers USER jovyan -ENV KERNEL_LANGUAGE scala +ENV KERNEL_LANGUAGE=scala CMD /usr/local/bin/bootstrap-kernel.sh diff --git a/etc/docker/kernel-spark-py/Dockerfile b/etc/docker/kernel-spark-py/Dockerfile index ed6f1a3d0..86ac97193 100644 --- a/etc/docker/kernel-spark-py/Dockerfile +++ b/etc/docker/kernel-spark-py/Dockerfile @@ -7,11 +7,11 @@ FROM $BASE_CONTAINER ARG SPARK_VERSION -ENV SPARK_VER $SPARK_VERSION -ENV SPARK_HOME /opt/spark -ENV KERNEL_LANGUAGE python -ENV R_LIBS_USER $R_LIBS_USER:${SPARK_HOME}/R/lib -ENV PATH $PATH:$SPARK_HOME/bin +ENV SPARK_VER=$SPARK_VERSION +ENV SPARK_HOME=/opt/spark +ENV KERNEL_LANGUAGE=python +ENV R_LIBS_USER=$R_LIBS_USER:${SPARK_HOME}/R/lib +ENV PATH=$PATH:$SPARK_HOME/bin USER root @@ -26,7 +26,7 @@ RUN dpkg --purge --force-depends ca-certificates-java \ libssl-dev \ && rm -rf /var/lib/apt/lists/* -ENV JAVA_HOME /usr/lib/jvm/java +ENV JAVA_HOME=/usr/lib/jvm/java RUN ln -s $(readlink -f /usr/bin/javac | sed "s:/bin/javac::") ${JAVA_HOME} # Download and install Spark diff --git a/etc/docker/kernel-spark-r/Dockerfile b/etc/docker/kernel-spark-r/Dockerfile index 5e92caeaa..df1f6a0ed 100644 --- a/etc/docker/kernel-spark-r/Dockerfile +++ b/etc/docker/kernel-spark-r/Dockerfile @@ -8,11 +8,11 @@ ARG SPARK_VERSION USER root -ENV SPARK_VER $SPARK_VERSION -ENV SPARK_HOME /opt/spark +ENV SPARK_VER=$SPARK_VERSION +ENV SPARK_HOME=/opt/spark ENV KERNEL_LANGUAGE=R -ENV R_LIBS_USER $R_LIBS_USER:${R_HOME}/library:${SPARK_HOME}/R/lib -ENV PATH $PATH:$SPARK_HOME/bin +ENV R_LIBS_USER=$R_LIBS_USER:${R_HOME}/library:${SPARK_HOME}/R/lib +ENV PATH=$PATH:$SPARK_HOME/bin RUN dpkg --purge --force-depends ca-certificates-java \ && apt-get update \ @@ -23,7 +23,7 @@ RUN dpkg --purge --force-depends ca-certificates-java \ libssl-dev \ && rm -rf /var/lib/apt/lists/* -ENV JAVA_HOME /usr/lib/jvm/java +ENV JAVA_HOME=/usr/lib/jvm/java RUN ln -s $(readlink -f /usr/bin/javac | sed "s:/bin/javac::") ${JAVA_HOME} # Download and install Spark diff --git a/etc/docker/kernel-tf-gpu-py/Dockerfile b/etc/docker/kernel-tf-gpu-py/Dockerfile index d6b6c5d27..354465299 100644 --- a/etc/docker/kernel-tf-gpu-py/Dockerfile +++ b/etc/docker/kernel-tf-gpu-py/Dockerfile @@ -27,5 +27,5 @@ RUN adduser --system --uid 1000 --gid 100 jovyan && \ USER jovyan -ENV KERNEL_LANGUAGE python +ENV KERNEL_LANGUAGE=python CMD /usr/local/bin/bootstrap-kernel.sh diff --git a/etc/docker/kernel-tf-py/Dockerfile b/etc/docker/kernel-tf-py/Dockerfile index b6b7e225e..783faa594 100644 --- a/etc/docker/kernel-tf-py/Dockerfile +++
b/etc/docker/kernel-tf-py/Dockerfile @@ -4,7 +4,7 @@ ARG BASE_CONTAINER=jupyter/tensorflow-notebook:2023-10-20 FROM $BASE_CONTAINER -ENV KERNEL_LANGUAGE python +ENV KERNEL_LANGUAGE=python ADD jupyter_enterprise_gateway_kernel_image_files*.tar.gz /usr/local/bin/ diff --git a/etc/kernel-launchers/operators/scripts/launch_custom_resource.py b/etc/kernel-launchers/operators/scripts/launch_custom_resource.py index 371d18b2d..9a6e0379a 100644 --- a/etc/kernel-launchers/operators/scripts/launch_custom_resource.py +++ b/etc/kernel-launchers/operators/scripts/launch_custom_resource.py @@ -76,6 +76,7 @@ def launch_custom_resource_kernel( kernel_crd_template = keywords["kernel_crd_group"] + "-" + keywords["kernel_crd_version"] custom_resource_yaml = generate_kernel_custom_resource_yaml(kernel_crd_template, keywords) + print(f">>> Generated YAML:\n{custom_resource_yaml}") kernel_namespace = keywords["kernel_namespace"] group = keywords["kernel_crd_group"] diff --git a/etc/kernel-launchers/operators/scripts/ray.io-v1alpha1.yaml.j2 b/etc/kernel-launchers/operators/scripts/ray.io-v1alpha1.yaml.j2 new file mode 100644 index 000000000..28cd9333e --- /dev/null +++ b/etc/kernel-launchers/operators/scripts/ray.io-v1alpha1.yaml.j2 @@ -0,0 +1,203 @@ +apiVersion: ray.io/v1alpha1 +kind: RayCluster +metadata: + labels: + controller-tools.k8s.io: "1.0" + ray.io/cluster-name: "{{ kernel_resource_name }}" + annotations: + ray.io/ft-enabled: "false" # Disable GCS FT for faster startup + name: {{ kernel_resource_name }} +spec: + enableInTreeAutoscaling: true + autoscalerOptions: + upscalingMode: Aggressive + idleTimeoutSeconds: 3600 + imagePullPolicy: Always + resources: + limits: + cpu: 1 + memory: "1Gi" + requests: + cpu: 1 + memory: "1Gi" +########################################## +## HEAD Node group spec +########################################## + headGroupSpec: + serviceType: ClusterIP # optional + # the following params are used to complete the ray start: ray start --head --block --port=6379 ...
+ rayStartParams: + disable-usage-stats: 'true' + dashboard-host: '0.0.0.0' + block: 'true' + template: + metadata: + labels: + kernel_id: "{{ kernel_id }}" + app: enterprise-gateway + component: kernel + annotations: + cluster-autoscaler.kubernetes.io/safe-to-evict: "false" + spec: + restartPolicy: OnFailure + serviceAccountName: "{{ kernel_service_account_name }}" +# nodeSelector: +# node.kubernetes.io/instance-type: m5d.8xlarge + containers: + # The Ray head container + - name: ray-head + image: {{ kernel_image }} + imagePullPolicy: Always + securityContext: + allowPrivilegeEscalation: false + runAsUser: 0 + resources: + limits: + cpu: {{ kernel_head_num_cpu_limit | default(2)}} + memory: {{ kernel_head_memory_limit | default("4Gi")}} + requests: + cpu: {{ kernel_head_num_cpu_request | default(2)}} + memory: {{ kernel_head_memory_request | default("4Gi")}} + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8000 + name: serve + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 44217 + name: as-metrics # autoscaler + - containerPort: 44227 + name: dash-metrics # dashboard + startupProbe: + httpGet: + path: / + port: 8265 + initialDelaySeconds: 15 + periodSeconds: 5 + timeoutSeconds: 5 + failureThreshold: 24 + successThreshold: 1 + readinessProbe: + httpGet: + path: / + port: 8265 + initialDelaySeconds: 0 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 3 + successThreshold: 1 + livenessProbe: + httpGet: + path: / + port: 8265 + initialDelaySeconds: 30 + periodSeconds: 20 + timeoutSeconds: 5 + failureThreshold: 3 + successThreshold: 1 + - name: ray-kernel + image: {{ kernel_image }} + imagePullPolicy: Always + securityContext: + allowPrivilegeEscalation: false + runAsUser: 0 + env: + - name: RAY_ADDRESS + value: "127.0.0.1:6379" + - name: RAY_PORT + value: "6379" + - name: SERVE_URI + value: "{{ kernel_serve_url }}" + - name: BUILD_URI + value: "{{ kernel_build_url }}" + - name: EG_LOG_LEVEL + value: "0" +# - name: PIP_INDEX_URL +# value: https://pypi.org + command: + - "/bin/sh" + - "-c" + - "python /usr/local/bin/kernel-launchers/python/scripts/launch_ipykernel.py --kernel-id {{ kernel_id }} --response-address {{ eg_response_address }} --port-range {{ eg_port_range }} --public-key {{ eg_public_key }}" +########################################## +## CPU Workers group specs +########################################## + workerGroupSpecs: + - replicas: 0 + minReplicas: 0 + maxReplicas: {{ kernel_num_cpu_worker or 1 }} + groupName: cpu-group + rayStartParams: + block: 'true' + template: + spec: + serviceAccountName: "{{ kernel_service_account_name }}" +# nodeSelector: +# node.kubernetes.io/instance-type: {{ kernel_cpu_instance_type | default("m5d.8xlarge")}} + initContainers: + - name: init + image: docker.io/busybox:1.28 + command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"] + containers: + - name: ray-cpu-worker + image: {{ kernel_image }} + imagePullPolicy: Always +# env: +# - name: PIP_INDEX_URL +# value: https://pypi.org + resources: + limits: + cpu: {{ kernel_cpu_worker_num_cpu_limit | default(1)}} + memory: {{ kernel_cpu_worker_num_memory_limit | default("1Gi")}} + requests: + cpu: {{ kernel_cpu_worker_num_cpu_request | default("500m")}} + memory: {{ kernel_cpu_worker_num_memory_request | default("1Gi")}} +# volumeMounts: +# - name: ray-logs +# mountPath: /tmp/ray + securityContext: + allowPrivilegeEscalation: false + runAsUser: 0 + # volumes: + # - name: ray-logs + # hostPath: + # path: "/mnt/data" +########################################## +## GPU Workers node groups +########################################## +# - replicas: 0 +# minReplicas: 0 +# maxReplicas: {{ kernel_num_gpu_worker or 0 }} +# groupName: gpu-group +# rayStartParams: +# block: 'true' +# template: +# spec: +# serviceAccountName: "{{ kernel_service_account_name }}" +## nodeSelector: +## node.kubernetes.io/instance-type: {{ kernel_gpu_instance_type | default("g5.4xlarge")}} +# initContainers: +# - name: init +# image: docker.io/busybox:1.28 +# command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"] +# containers: +# - name: ray-gpu-worker +# image: {{ kernel_image }} +# imagePullPolicy: Always +# env: +# - name: PIP_INDEX_URL +# value: https://pypi.org +# resources: +# limits: +# cpu: {{ kernel_gpu_worker_num_cpu_limit | default(1)}} +# memory: {{ kernel_gpu_worker_num_memory_limit | default("1Gi")}} +# nvidia.com/gpu: {{ kernel_gpu_worker_num_gpu | default(0)}} +# requests: +# cpu: {{kernel_gpu_worker_num_cpu_request | default(0)}} +# memory: {{ kernel_gpu_worker_num_memory_request | default("512Mi")}} +# nvidia.com/gpu: {{ kernel_gpu_worker_num_gpu | default(0)}} +# securityContext: +# allowPrivilegeEscalation: false +# runAsUser: 0 diff --git a/etc/kernel-resources/ray/logo-64x64.png b/etc/kernel-resources/ray/logo-64x64.png new file mode 100644 index 000000000..9917a4c0d Binary files /dev/null and b/etc/kernel-resources/ray/logo-64x64.png differ diff --git a/etc/kernelspecs/ray_python_operator/kernel.json b/etc/kernelspecs/ray_python_operator/kernel.json new file mode 100644 index 000000000..129c5ec7f --- /dev/null +++ b/etc/kernelspecs/ray_python_operator/kernel.json @@ -0,0 +1,25 @@ +{ + "language": "python", + "display_name": "Ray Operator (Python)", + "metadata": { + "process_proxy": { + "class_name": "enterprise_gateway.services.processproxies.ray_operator.RayOperatorProcessProxy", + "config": { + "image_name": "lresende/kernel-ray-py:VERSION", + "executor_image_name": "lresende/kernel-ray-py:VERSION" + } + } + }, + "argv": [ + "python", + "/usr/local/share/jupyter/kernels/ray_python_operator/scripts/launch_custom_resource.py", + "--RemoteProcessProxy.kernel-id", + "{kernel_id}", + "--RemoteProcessProxy.port-range", + "{port_range}", + "--RemoteProcessProxy.response-address", + "{response_address}", + "--RemoteProcessProxy.public-key", + "{public_key}" + ] +} diff --git a/etc/kubernetes/helm/enterprise-gateway/templates/deployment.yaml b/etc/kubernetes/helm/enterprise-gateway/templates/deployment.yaml index 5edef9bf4..03e676965 100644 --- a/etc/kubernetes/helm/enterprise-gateway/templates/deployment.yaml +++ b/etc/kubernetes/helm/enterprise-gateway/templates/deployment.yaml @@ -73,6 +73,8 @@ spec: value: !!str {{ .Values.kernel.launchTimeout }} - name: EG_KERNEL_INFO_TIMEOUT value: !!str {{ .Values.kernel.infoTimeout }} + - name: EG_REQUEST_TIMEOUT + value: !!str {{ .Values.kernel.requestTimeout }} - name: EG_ALLOWED_KERNELS value: {{ toJson .Values.kernel.allowedKernels | squote }} - name: EG_DEFAULT_KERNEL_NAME diff --git a/etc/kubernetes/helm/enterprise-gateway/templates/eg-clusterrole.yaml b/etc/kubernetes/helm/enterprise-gateway/templates/eg-clusterrole.yaml index 11a0abac5..be06575fb 100644 --- a/etc/kubernetes/helm/enterprise-gateway/templates/eg-clusterrole.yaml +++ b/etc/kubernetes/helm/enterprise-gateway/templates/eg-clusterrole.yaml @@ -23,6 +23,9 @@ rules: - apiGroups: ["sparkoperator.k8s.io"] resources: ["sparkapplications", "sparkapplications/status", "scheduledsparkapplications", "scheduledsparkapplications/status"] verbs: ["get", "watch", "list", "create", "delete"] + - apiGroups: ["ray.io"] + resources: ["rayclusters", "rayclusters/status", "rayjobs", "rayjobs/status", "rayservices", "rayservices/status"] + verbs: ["get", "watch", "list", "create", "delete"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole @@ -43,6 +46,6 @@ rules: resources: ["configmaps"] verbs: ["list", "create"] - apiGroups: [""] - resources: ["services", "persistentvolumeclaims"] + resources: ["services", "persistentvolumes", "persistentvolumeclaims"] verbs: ["list"] {{- end }} diff --git a/etc/kubernetes/helm/enterprise-gateway/values.yaml b/etc/kubernetes/helm/enterprise-gateway/values.yaml index 493bb3ebf..b65c95be1 100644 --- a/etc/kubernetes/helm/enterprise-gateway/values.yaml +++ b/etc/kubernetes/helm/enterprise-gateway/values.yaml @@ -89,6 +89,8 @@ kernel: shareGatewayNamespace: false # Timeout for kernel launching in seconds. launchTimeout: 60 + # Timeout for kernel info requests in seconds (maps to EG_KERNEL_INFO_TIMEOUT). + infoTimeout: 60 + # Timeout for kernel requests in seconds (maps to EG_REQUEST_TIMEOUT). + requestTimeout: 60 # Timeout for an idle kernel before it's culled in seconds. Default is 1 hour. cullIdleTimeout: 3600 # Whether to cull idle kernels with connecting clients diff --git a/pyproject.toml b/pyproject.toml index 829c6be41..424a25e1e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ dependencies = [ "pycryptodomex>=3.9.7", "pyzmq>=20.0,<25.0", # Pyzmq 25 removes deprecated code that jupyter_client 6 uses, remove if v6 gets updated "requests>=2.14.2", - "tornado>=6.1", + "tornado>=6.5.2", "traitlets>=5.3.0", "watchdog>=2.1.3", "yarn-api-client>=1.0"
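A minimal, dependency-free sketch of the status-unification rule that `RayOperatorProcessProxy.get_container_status` encodes (CRD state is checked first, but a pod-style status is always returned so the base class `poll()` keeps working). The helper name `unified_status` and the inline assertions are illustrative only, not code from this change:

```python
from __future__ import annotations

ERROR_STATES = {"failed", "error", "unhealthy"}  # mirrors get_error_states()


def unified_status(crd_state: str | None, pod_phase: str | None) -> str | None:
    """Map (RayCluster CRD state, head-pod phase) to a pod-style status."""
    if crd_state is None:            # CRD not found, e.g. cluster deleted
        return None
    if crd_state in ERROR_STATES:    # CRD-level failure wins over pod phase
        return "failed"
    if crd_state != "ready":         # cluster still coming up
        return "pending"
    # CRD is ready: report the head pod's phase, defaulting to "pending"
    return pod_phase or "pending"


assert unified_status("unhealthy", "running") == "failed"
assert unified_status("ready", None) == "pending"
assert unified_status("ready", "running") == "running"
```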
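The new debug line in `get_connection_info` instruments a wait-then-pop pattern: block on a per-kernel event until the launcher posts connection info, then remove and return the registered entry. A toy sketch under assumed names (`PendingResponse`, `registry`); Enterprise Gateway's actual response registry and payload decryption differ:

```python
import asyncio


class PendingResponse:
    def __init__(self) -> None:
        self.event = asyncio.Event()
        self.response: dict = {}


async def get_connection_info(registry: dict, kernel_id: str, timeout: float) -> dict:
    # Wait (up to `timeout`) for the launcher to post connection info,
    # then pop the entry so the registry doesn't grow unboundedly.
    await asyncio.wait_for(registry[kernel_id].event.wait(), timeout)
    return registry.pop(kernel_id).response


async def main() -> None:
    registry = {"k1": PendingResponse()}

    async def launcher() -> None:  # simulates the remote kernel launcher
        await asyncio.sleep(0.1)
        registry["k1"].response = {"shell_port": 12345}
        registry["k1"].event.set()

    asyncio.ensure_future(launcher())
    print(await get_connection_info(registry, "k1", timeout=5.0))


asyncio.run(main())
```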
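Once the `ray_python_operator` kernelspec is selected, notebook code can attach to the co-located RayCluster. A hedged usage sketch, assuming the `RAY_ADDRESS` environment variable that the CR template above sets on the kernel container (outside such a container, `ray.init()` would instead start a local Ray instance):

```python
# Run inside a notebook backed by the "Ray Operator (Python)" kernel.
import ray

ray.init()  # picks up RAY_ADDRESS and connects to the existing cluster


@ray.remote
def square(x: int) -> int:
    return x * x


# Fan the work out to the cluster and gather the results.
print(ray.get([square.remote(i) for i in range(4)]))  # [0, 1, 4, 9]
```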