diff --git a/.github/workflows/rayjob_e2e_tests.yaml b/.github/workflows/rayjob_e2e_tests.yaml index c4856fd3..ba0659c0 100644 --- a/.github/workflows/rayjob_e2e_tests.yaml +++ b/.github/workflows/rayjob_e2e_tests.yaml @@ -115,6 +115,10 @@ jobs: kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user + kubectl create clusterrole secret-manager --verb=get,list,create,delete,update,patch --resource=secrets + kubectl create clusterrolebinding sdk-user-secret-manager --clusterrole=secret-manager --user=sdk-user + kubectl create clusterrole workload-reader --verb=get,list,watch --resource=workloads + kubectl create clusterrolebinding sdk-user-workload-reader --clusterrole=workload-reader --user=sdk-user kubectl config use-context sdk-user - name: Run RayJob E2E tests @@ -126,7 +130,7 @@ jobs: pip install poetry poetry install --with test,docs echo "Running RayJob e2e tests..." 
- poetry run pytest -v -s ./tests/e2e/rayjob/rayjob_lifecycled_cluster_test.py > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output_rayjob.log 2>&1 + poetry run pytest -v -s ./tests/e2e/rayjob/ > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output_rayjob.log 2>&1 - name: Switch to kind-cluster context to print logs if: always() && steps.deploy.outcome == 'success' diff --git a/poetry.lock b/poetry.lock index aa315f21..88ceb3cb 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4508,14 +4508,14 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "virtualenv" -version = "20.31.2" +version = "20.35.1" description = "Virtual Python Environment builder" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "virtualenv-20.31.2-py3-none-any.whl", hash = "sha256:36efd0d9650ee985f0cad72065001e66d49a6f24eb44d98980f630686243cf11"}, - {file = "virtualenv-20.31.2.tar.gz", hash = "sha256:e10c0a9d02835e592521be48b332b6caee6887f332c111aa79a09b9e79efc2af"}, + {file = "virtualenv-20.35.1-py3-none-any.whl", hash = "sha256:1d9d93cd01d35b785476e2fa7af711a98d40d227a078941695bbae394f8737e2"}, + {file = "virtualenv-20.35.1.tar.gz", hash = "sha256:041dac43b6899858a91838b616599e80000e545dee01a21172a6a46746472cb2"}, ] [package.dependencies] diff --git a/src/codeflare_sdk/common/utils/constants.py b/src/codeflare_sdk/common/utils/constants.py index 7e6147f6..00559d2e 100644 --- a/src/codeflare_sdk/common/utils/constants.py +++ b/src/codeflare_sdk/common/utils/constants.py @@ -12,4 +12,4 @@ "3.11": CUDA_PY311_RUNTIME_IMAGE, "3.12": CUDA_PY312_RUNTIME_IMAGE, } -MOUNT_PATH = "/home/ray/scripts" +MOUNT_PATH = "/home/ray/files" diff --git a/src/codeflare_sdk/common/utils/test_utils.py b/src/codeflare_sdk/common/utils/test_utils.py new file mode 100644 index 00000000..d330bc4d --- /dev/null +++ b/src/codeflare_sdk/common/utils/test_utils.py @@ -0,0 +1,209 @@ +# Copyright 2025 IBM, Red Hat +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file 
except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Tests for common/utils/utils.py +""" + +import pytest +from collections import namedtuple +from codeflare_sdk.common.utils.utils import ( + update_image, + get_ray_image_for_python_version, +) +from codeflare_sdk.common.utils.constants import ( + SUPPORTED_PYTHON_VERSIONS, + CUDA_PY311_RUNTIME_IMAGE, + CUDA_PY312_RUNTIME_IMAGE, +) + + +def test_update_image_with_empty_string_python_311(mocker): + """Test that update_image() with empty string returns default image for Python 3.11.""" + # Mock sys.version_info to simulate Python 3.11 + VersionInfo = namedtuple( + "version_info", ["major", "minor", "micro", "releaselevel", "serial"] + ) + mocker.patch("sys.version_info", VersionInfo(3, 11, 0, "final", 0)) + + # Test with empty image (should use default for Python 3.11) + image = update_image("") + assert image == CUDA_PY311_RUNTIME_IMAGE + assert image == SUPPORTED_PYTHON_VERSIONS["3.11"] + + +def test_update_image_with_empty_string_python_312(mocker): + """Test that update_image() with empty string returns default image for Python 3.12.""" + # Mock sys.version_info to simulate Python 3.12 + VersionInfo = namedtuple( + "version_info", ["major", "minor", "micro", "releaselevel", "serial"] + ) + mocker.patch("sys.version_info", VersionInfo(3, 12, 0, "final", 0)) + + # Test with empty image (should use default for Python 3.12) + image = update_image("") + assert image == CUDA_PY312_RUNTIME_IMAGE + assert image == SUPPORTED_PYTHON_VERSIONS["3.12"] + + +def test_update_image_with_none_python_311(mocker): + """Test that 
update_image() with None returns default image for Python 3.11.""" + # Mock sys.version_info to simulate Python 3.11 + VersionInfo = namedtuple( + "version_info", ["major", "minor", "micro", "releaselevel", "serial"] + ) + mocker.patch("sys.version_info", VersionInfo(3, 11, 0, "final", 0)) + + # Test with None image (should use default for Python 3.11) + image = update_image(None) + assert image == CUDA_PY311_RUNTIME_IMAGE + + +def test_update_image_with_none_python_312(mocker): + """Test that update_image() with None returns default image for Python 3.12.""" + # Mock sys.version_info to simulate Python 3.12 + VersionInfo = namedtuple( + "version_info", ["major", "minor", "micro", "releaselevel", "serial"] + ) + mocker.patch("sys.version_info", VersionInfo(3, 12, 0, "final", 0)) + + # Test with None image (should use default for Python 3.12) + image = update_image(None) + assert image == CUDA_PY312_RUNTIME_IMAGE + + +def test_update_image_with_unsupported_python_version(mocker): + """Test update_image() warning for unsupported Python versions.""" + # Mock sys.version_info to simulate Python 3.8 (unsupported) + VersionInfo = namedtuple( + "version_info", ["major", "minor", "micro", "releaselevel", "serial"] + ) + mocker.patch("sys.version_info", VersionInfo(3, 8, 0, "final", 0)) + + # Mock warnings.warn to check if it gets called + warn_mock = mocker.patch("warnings.warn") + + # Call update_image with empty image + image = update_image("") + + # Assert that the warning was called with the expected message + warn_mock.assert_called_once() + assert "No default Ray image defined for 3.8" in warn_mock.call_args[0][0] + assert "3.11, 3.12" in warn_mock.call_args[0][0] + + # Assert that no image was set since the Python version is not supported + assert image is None + + +def test_update_image_with_provided_custom_image(): + """Test that providing a custom image bypasses auto-detection.""" + custom_image = "my-custom-ray:latest" + image = update_image(custom_image) + + # 
Should return the provided image unchanged + assert image == custom_image + + +def test_update_image_with_provided_image_empty_string(): + """Test update_image() with provided custom image as a non-empty string.""" + custom_image = "docker.io/rayproject/ray:2.40.0" + image = update_image(custom_image) + + # Should return the provided image unchanged + assert image == custom_image + + +def test_get_ray_image_for_python_version_explicit_311(): + """Test get_ray_image_for_python_version() with explicit Python 3.11.""" + image = get_ray_image_for_python_version("3.11") + assert image == CUDA_PY311_RUNTIME_IMAGE + + +def test_get_ray_image_for_python_version_explicit_312(): + """Test get_ray_image_for_python_version() with explicit Python 3.12.""" + image = get_ray_image_for_python_version("3.12") + assert image == CUDA_PY312_RUNTIME_IMAGE + + +def test_get_ray_image_for_python_version_auto_detect_311(mocker): + """Test get_ray_image_for_python_version() auto-detects Python 3.11.""" + # Mock sys.version_info to simulate Python 3.11 + VersionInfo = namedtuple( + "version_info", ["major", "minor", "micro", "releaselevel", "serial"] + ) + mocker.patch("sys.version_info", VersionInfo(3, 11, 0, "final", 0)) + + # Test with None (should auto-detect) + image = get_ray_image_for_python_version() + assert image == CUDA_PY311_RUNTIME_IMAGE + + +def test_get_ray_image_for_python_version_auto_detect_312(mocker): + """Test get_ray_image_for_python_version() auto-detects Python 3.12.""" + # Mock sys.version_info to simulate Python 3.12 + VersionInfo = namedtuple( + "version_info", ["major", "minor", "micro", "releaselevel", "serial"] + ) + mocker.patch("sys.version_info", VersionInfo(3, 12, 0, "final", 0)) + + # Test with None (should auto-detect) + image = get_ray_image_for_python_version() + assert image == CUDA_PY312_RUNTIME_IMAGE + + +def test_get_ray_image_for_python_version_unsupported_with_warning(mocker): + """Test get_ray_image_for_python_version() warns for unsupported 
versions.""" + warn_mock = mocker.patch("warnings.warn") + + # Test with unsupported version and warn_on_unsupported=True (default) + image = get_ray_image_for_python_version("3.9", warn_on_unsupported=True) + + # Should have warned + warn_mock.assert_called_once() + assert "No default Ray image defined for 3.9" in warn_mock.call_args[0][0] + + # Should return None + assert image is None + + +def test_get_ray_image_for_python_version_unsupported_without_warning(): + """Test get_ray_image_for_python_version() falls back to 3.12 without warning.""" + # Test with unsupported version and warn_on_unsupported=False + image = get_ray_image_for_python_version("3.10", warn_on_unsupported=False) + + # Should fall back to Python 3.12 image + assert image == CUDA_PY312_RUNTIME_IMAGE + + +def test_get_ray_image_for_python_version_unsupported_silent_fallback(): + """Test get_ray_image_for_python_version() silently falls back for old versions.""" + # Test with Python 3.8 and warn_on_unsupported=False + image = get_ray_image_for_python_version("3.8", warn_on_unsupported=False) + + # Should fall back to Python 3.12 image without warning + assert image == CUDA_PY312_RUNTIME_IMAGE + + +def test_get_ray_image_for_python_version_none_defaults_to_current(mocker): + """Test that passing None to get_ray_image_for_python_version() uses current Python.""" + # Mock sys.version_info to simulate Python 3.11 + VersionInfo = namedtuple( + "version_info", ["major", "minor", "micro", "releaselevel", "serial"] + ) + mocker.patch("sys.version_info", VersionInfo(3, 11, 5, "final", 0)) + + # Passing None should detect the mocked version + image = get_ray_image_for_python_version(None, warn_on_unsupported=True) + + assert image == CUDA_PY311_RUNTIME_IMAGE diff --git a/src/codeflare_sdk/ray/rayjobs/config.py b/src/codeflare_sdk/ray/rayjobs/config.py index 02ced875..c1fe0daa 100644 --- a/src/codeflare_sdk/ray/rayjobs/config.py +++ b/src/codeflare_sdk/ray/rayjobs/config.py @@ -24,6 +24,8 @@ from 
kubernetes.client import ( V1ConfigMapVolumeSource, V1KeyToPath, + V1LocalObjectReference, + V1SecretVolumeSource, V1Toleration, V1Volume, V1VolumeMount, @@ -247,6 +249,7 @@ def build_ray_cluster_spec(self, cluster_name: str) -> Dict[str, Any]: """ ray_cluster_spec = { "rayVersion": RAY_VERSION, + "enableInTreeAutoscaling": False, # Required for Kueue-managed jobs "headGroupSpec": self._build_head_group_spec(), "workerGroupSpecs": [self._build_worker_group_spec(cluster_name)], } @@ -414,8 +417,6 @@ def _build_pod_spec(self, container: V1Container, is_head: bool) -> V1PodSpec: # Add image pull secrets if specified if hasattr(self, "image_pull_secrets") and self.image_pull_secrets: - from kubernetes.client import V1LocalObjectReference - pod_spec.image_pull_secrets = [ V1LocalObjectReference(name=secret) for secret in self.image_pull_secrets @@ -447,100 +448,101 @@ def _build_env_vars(self) -> list: """Build environment variables list.""" return [V1EnvVar(name=key, value=value) for key, value in self.envs.items()] - def add_script_volumes(self, configmap_name: str, mount_path: str = MOUNT_PATH): + def add_file_volumes(self, secret_name: str, mount_path: str = MOUNT_PATH): """ - Add script volume and mount references to cluster configuration. + Add file volume and mount references to cluster configuration. 
Args: - configmap_name: Name of the ConfigMap containing scripts - mount_path: Where to mount scripts in containers (default: /home/ray/scripts) + secret_name: Name of the Secret containing files + mount_path: Where to mount files in containers (default: /home/ray/files) """ - # Check if script volume already exists - volume_name = "ray-job-scripts" + # Check if file volume already exists + volume_name = "ray-job-files" existing_volume = next( (v for v in self.volumes if getattr(v, "name", None) == volume_name), None ) if existing_volume: - logger.debug(f"Script volume '{volume_name}' already exists, skipping...") + logger.debug(f"File volume '{volume_name}' already exists, skipping...") return - # Check if script mount already exists + # Check if file mount already exists existing_mount = next( (m for m in self.volume_mounts if getattr(m, "name", None) == volume_name), None, ) if existing_mount: logger.debug( - f"Script volume mount '{volume_name}' already exists, skipping..." + f"File volume mount '{volume_name}' already exists, skipping..."
) return - # Add script volume to cluster configuration - script_volume = V1Volume( - name=volume_name, config_map=V1ConfigMapVolumeSource(name=configmap_name) + # Add file volume to cluster configuration + file_volume = V1Volume( + name=volume_name, secret=V1SecretVolumeSource(secret_name=secret_name) ) - self.volumes.append(script_volume) + self.volumes.append(file_volume) - # Add script volume mount to cluster configuration - script_mount = V1VolumeMount(name=volume_name, mount_path=mount_path) - self.volume_mounts.append(script_mount) + # Add file volume mount to cluster configuration + file_mount = V1VolumeMount(name=volume_name, mount_path=mount_path) + self.volume_mounts.append(file_mount) logger.info( - f"Added script volume '{configmap_name}' to cluster config: mount_path={mount_path}" + f"Added file volume '{secret_name}' to cluster config: mount_path={mount_path}" ) - def validate_configmap_size(self, scripts: Dict[str, str]) -> None: - total_size = sum(len(content.encode("utf-8")) for content in scripts.values()) + def validate_secret_size(self, files: Dict[str, str]) -> None: + total_size = sum(len(content.encode("utf-8")) for content in files.values()) if total_size > 1024 * 1024: # 1MB raise ValueError( - f"ConfigMap size exceeds 1MB limit. Total size: {total_size} bytes" + f"Secret size exceeds 1MB limit. 
Total size: {total_size} bytes" ) - def build_script_configmap_spec( - self, job_name: str, namespace: str, scripts: Dict[str, str] + def build_file_secret_spec( + self, job_name: str, namespace: str, files: Dict[str, str] ) -> Dict[str, Any]: """ - Build ConfigMap specification for scripts + Build Secret specification for files Args: - job_name: Name of the RayJob (used for ConfigMap naming) + job_name: Name of the RayJob (used for Secret naming) namespace: Kubernetes namespace - scripts: Dictionary of script_name -> script_content + files: Dictionary of file_name -> file_content Returns: - Dict: ConfigMap specification ready for Kubernetes API + Dict: Secret specification ready for Kubernetes API """ - configmap_name = f"{job_name}-scripts" + secret_name = f"{job_name}-files" return { "apiVersion": "v1", - "kind": "ConfigMap", + "kind": "Secret", + "type": "Opaque", "metadata": { - "name": configmap_name, + "name": secret_name, "namespace": namespace, "labels": { "ray.io/job-name": job_name, "app.kubernetes.io/managed-by": "codeflare-sdk", - "app.kubernetes.io/component": "rayjob-scripts", + "app.kubernetes.io/component": "rayjob-files", }, }, - "data": scripts, + "data": files, } - def build_script_volume_specs( - self, configmap_name: str, mount_path: str = MOUNT_PATH + def build_file_volume_specs( + self, secret_name: str, mount_path: str = MOUNT_PATH ) -> Tuple[Dict[str, Any], Dict[str, Any]]: """ - Build volume and mount specifications for scripts + Build volume and mount specifications for files Args: - configmap_name: Name of the ConfigMap containing scripts - mount_path: Where to mount scripts in containers + secret_name: Name of the Secret containing files + mount_path: Where to mount files in containers Returns: Tuple of (volume_spec, mount_spec) as dictionaries """ - volume_spec = {"name": "ray-job-scripts", "configMap": {"name": configmap_name}} + volume_spec = {"name": "ray-job-files", "secret": {"secretName": secret_name}} - mount_spec = {"name": 
"ray-job-scripts", "mountPath": mount_path} + mount_spec = {"name": "ray-job-files", "mountPath": mount_path} return volume_spec, mount_spec diff --git a/src/codeflare_sdk/ray/rayjobs/rayjob.py b/src/codeflare_sdk/ray/rayjobs/rayjob.py index 228f9bb0..76e396ee 100644 --- a/src/codeflare_sdk/ray/rayjobs/rayjob.py +++ b/src/codeflare_sdk/ray/rayjobs/rayjob.py @@ -17,18 +17,24 @@ """ import logging -import warnings import os import re -import ast -from typing import Dict, Any, Optional, Tuple +import warnings +from typing import Dict, Any, Optional, Tuple, Union + +from ray.runtime_env import RuntimeEnv from codeflare_sdk.common.kueue.kueue import get_default_kueue_name from codeflare_sdk.common.utils.constants import MOUNT_PATH -from kubernetes import client -from ...common.kubernetes_cluster.auth import get_api_client + +from codeflare_sdk.common.utils.utils import get_ray_image_for_python_version from python_client.kuberay_job_api import RayjobApi from python_client.kuberay_cluster_api import RayClusterApi from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig +from codeflare_sdk.ray.rayjobs.runtime_env import ( + create_file_secret, + extract_all_local_files, + process_runtime_env, +) from ...common.utils import get_current_namespace from ...common.utils.validation import validate_ray_version_compatibility @@ -59,7 +65,7 @@ def __init__( cluster_name: Optional[str] = None, cluster_config: Optional[ManagedClusterConfig] = None, namespace: Optional[str] = None, - runtime_env: Optional[Dict[str, Any]] = None, + runtime_env: Optional[Union[RuntimeEnv, Dict[str, Any]]] = None, ttl_seconds_after_finished: int = 0, active_deadline_seconds: Optional[int] = None, local_queue: Optional[str] = None, @@ -73,7 +79,10 @@ def __init__( cluster_name: The name of an existing Ray cluster (optional if cluster_config provided) cluster_config: Configuration for creating a new cluster (optional if cluster_name provided) namespace: The Kubernetes namespace (auto-detected if 
not specified) - runtime_env: Ray runtime environment configuration (optional) + runtime_env: Ray runtime environment configuration. Can be: + - RuntimeEnv object from ray.runtime_env + - Dict with keys like 'working_dir', 'pip', 'env_vars', etc. + Example: {"working_dir": "./my-scripts", "pip": ["requests"]} ttl_seconds_after_finished: Seconds to wait before cleanup after job finishes (default: 0) active_deadline_seconds: Maximum time the job can run before being terminated (optional) local_queue: The Kueue LocalQueue to submit the job to (optional) @@ -105,7 +114,13 @@ def __init__( self.name = job_name self.entrypoint = entrypoint - self.runtime_env = runtime_env + + # Convert dict to RuntimeEnv if needed for user convenience + if isinstance(runtime_env, dict): + self.runtime_env = RuntimeEnv(**runtime_env) + else: + self.runtime_env = runtime_env + self.ttl_seconds_after_finished = ttl_seconds_after_finished self.active_deadline_seconds = active_deadline_seconds self.local_queue = local_queue @@ -147,7 +162,17 @@ def submit(self) -> str: if not self.entrypoint: raise ValueError("Entrypoint must be provided to submit a RayJob") + # Validate configuration before submitting self._validate_ray_version_compatibility() + self._validate_working_dir_entrypoint() + + # Extract files from entrypoint and runtime_env working_dir + files = extract_all_local_files(self) + + # Create Secret for files (will be mounted to submitter pod) + secret_name = None + if files: + secret_name = f"{self.name}-files" rayjob_cr = self._build_rayjob_cr() @@ -157,15 +182,9 @@ def submit(self) -> str: if result: logger.info(f"Successfully submitted RayJob {self.name}") - # Handle script files after RayJob creation so we can set owner reference - if self._cluster_config is not None: - scripts = self._extract_script_files_from_entrypoint() - if scripts: - self._handle_script_volumes_for_new_cluster(scripts, result) - elif self._cluster_name: - scripts = 
self._extract_script_files_from_entrypoint() - if scripts: - self._handle_script_volumes_for_existing_cluster(scripts, result) + # Create Secret with owner reference after RayJob exists + if files: + create_file_secret(self, files, result) return self.name else: @@ -195,13 +214,18 @@ def resubmit(self): def delete(self): """ Delete the Ray job. + Returns True if deleted successfully or if already deleted. """ deleted = self._api.delete_job(name=self.name, k8s_namespace=self.namespace) if deleted: logger.info(f"Successfully deleted the RayJob {self.name}") return True else: - raise RuntimeError(f"Failed to delete the RayJob {self.name}") + # The python client logs "rayjob custom resource already deleted" + # and returns False when the job doesn't exist. + # This is not an error - treat it as successful deletion. + logger.info(f"RayJob {self.name} already deleted or does not exist") + return True def _build_rayjob_cr(self) -> Dict[str, Any]: """ @@ -221,6 +245,9 @@ def _build_rayjob_cr(self) -> Dict[str, Any]: }, } + # Extract files once and use for both runtime_env and submitter pod + files = extract_all_local_files(self) + labels = {} # If cluster_config is provided, use the local_queue from the cluster_config if self._cluster_config is not None: @@ -251,9 +278,17 @@ def _build_rayjob_cr(self) -> Dict[str, Any]: if self.active_deadline_seconds: rayjob_cr["spec"]["activeDeadlineSeconds"] = self.active_deadline_seconds - # Add runtime environment if specified - if self.runtime_env: - rayjob_cr["spec"]["runtimeEnvYAML"] = str(self.runtime_env) + # Add runtime environment (can be inferred even if not explicitly specified) + processed_runtime_env = process_runtime_env(self, files) + if processed_runtime_env: + rayjob_cr["spec"]["runtimeEnvYAML"] = processed_runtime_env + + # Add submitterPodTemplate if we have files to mount + if files: + secret_name = f"{self.name}-files" + rayjob_cr["spec"][ + "submitterPodTemplate" + ] = self._build_submitter_pod_template(files, 
secret_name) # Configure cluster: either use existing or create new if self._cluster_config is not None: @@ -275,6 +310,115 @@ def _build_rayjob_cr(self) -> Dict[str, Any]: return rayjob_cr + def _build_submitter_pod_template( + self, files: Dict[str, str], secret_name: str + ) -> Dict[str, Any]: + """ + Build submitterPodTemplate with Secret volume mount for local files. + + If files contain working_dir.zip, an init container will unzip it before + the main submitter container runs. + + Args: + files: Dict of file_name -> file_content + secret_name: Name of the Secret containing the files + + Returns: + submitterPodTemplate specification + """ + from codeflare_sdk.ray.rayjobs.runtime_env import UNZIP_PATH + + # Image has to be hard coded for the job submitter + image = get_ray_image_for_python_version() + if ( + self._cluster_config + and hasattr(self._cluster_config, "image") + and self._cluster_config.image + ): + image = self._cluster_config.image + + # Build Secret items for each file + secret_items = [] + entrypoint_path = files.get( + "__entrypoint_path__" + ) # Metadata for single file case + + for file_name in files.keys(): + if file_name == "__entrypoint_path__": + continue # Skip metadata key + + # For single file case, use the preserved path structure + if entrypoint_path: + secret_items.append({"key": file_name, "path": entrypoint_path}) + else: + secret_items.append({"key": file_name, "path": file_name}) + + # Check if we need to unzip working_dir + has_working_dir_zip = "working_dir.zip" in files + + # Base volume mounts for main container + volume_mounts = [{"name": "ray-job-files", "mountPath": MOUNT_PATH}] + + # If we have a zip file, we need shared volume for unzipped content + if has_working_dir_zip: + volume_mounts.append( + {"name": "unzipped-working-dir", "mountPath": UNZIP_PATH} + ) + + submitter_pod_template = { + "spec": { + "restartPolicy": "Never", + "containers": [ + { + "name": "ray-job-submitter", + "image": image, + "volumeMounts": 
volume_mounts, + } + ], + "volumes": [ + { + "name": "ray-job-files", + "secret": { + "secretName": secret_name, + "items": secret_items, + }, + } + ], + } + } + + # Add init container and volume for unzipping if needed + if has_working_dir_zip: + # Add emptyDir volume for unzipped content + submitter_pod_template["spec"]["volumes"].append( + {"name": "unzipped-working-dir", "emptyDir": {}} + ) + + # Add init container to unzip before KubeRay's submitter runs + submitter_pod_template["spec"]["initContainers"] = [ + { + "name": "unzip-working-dir", + "image": image, + "command": ["/bin/sh", "-c"], + "args": [ + # Decode base64 zip, save to temp file, extract, cleanup + f"mkdir -p {UNZIP_PATH} && " + f"python3 -m base64 -d {MOUNT_PATH}/working_dir.zip > /tmp/working_dir.zip && " + f"python3 -m zipfile -e /tmp/working_dir.zip {UNZIP_PATH}/ && " + f"rm /tmp/working_dir.zip && " + f"echo 'Successfully unzipped working_dir to {UNZIP_PATH}' && " + f"ls -la {UNZIP_PATH}" + ], + "volumeMounts": volume_mounts, + } + ] + logger.info(f"Added init container to unzip working_dir to {UNZIP_PATH}") + + logger.info( + f"Built submitterPodTemplate with {len(files)} files mounted at {MOUNT_PATH}, using image: {image}" + ) + return submitter_pod_template + def _validate_ray_version_compatibility(self): """ Validate Ray version compatibility for cluster_config image. @@ -311,6 +455,100 @@ def _validate_cluster_config_image(self): elif is_warning: warnings.warn(f"Cluster config image: {message}") + def _validate_working_dir_entrypoint(self): + """ + Validate entrypoint file configuration. + + Checks: + 1. Entrypoint doesn't redundantly reference working_dir + 2. Local files exist before submission + + Raises ValueError if validation fails. + """ + # Skip validation for inline commands (python -c, etc.) 
+ if re.search(r"\s+-c\s+", self.entrypoint): + return + + # Match Python file references only + file_pattern = r"(?:python\d?\s+)?([./\w/-]+\.py)" + matches = re.findall(file_pattern, self.entrypoint) + + if not matches: + return + + entrypoint_path = matches[0] + + # Get working_dir from runtime_env + runtime_env_dict = None + working_dir = None + + if self.runtime_env: + runtime_env_dict = ( + self.runtime_env.to_dict() + if hasattr(self.runtime_env, "to_dict") + else self.runtime_env + ) + if runtime_env_dict and "working_dir" in runtime_env_dict: + working_dir = runtime_env_dict["working_dir"] + + # Skip all validation for remote working_dir + if working_dir and not os.path.isdir(working_dir): + return + + # Case 1: Local working_dir - check redundancy and file existence + if working_dir: + normalized_working_dir = os.path.normpath(working_dir) + normalized_entrypoint = os.path.normpath(entrypoint_path) + + # Check for redundant directory reference + if normalized_entrypoint.startswith(normalized_working_dir + os.sep): + relative_to_working_dir = os.path.relpath( + normalized_entrypoint, normalized_working_dir + ) + working_dir_basename = os.path.basename(normalized_working_dir) + redundant_nested_path = os.path.join( + normalized_working_dir, + working_dir_basename, + relative_to_working_dir, + ) + + if not os.path.exists(redundant_nested_path): + raise ValueError( + f"❌ Working directory conflict detected:\n" + f" working_dir: '{working_dir}'\n" + f" entrypoint references: '{entrypoint_path}'\n" + f"\n" + f"This will fail because the entrypoint runs from within working_dir.\n" + f"It would look for: '{redundant_nested_path}' (which doesn't exist)\n" + f"\n" + f"Fix: Remove the directory prefix from your entrypoint:\n" + f' entrypoint = "python {relative_to_working_dir}"' + ) + + # Check file exists within working_dir + if not normalized_entrypoint.startswith(normalized_working_dir + os.sep): + # Use normalized_working_dir (absolute path) for proper file 
existence check + full_entrypoint_path = os.path.join( + normalized_working_dir, entrypoint_path + ) + if not os.path.isfile(full_entrypoint_path): + raise ValueError( + f"❌ Entrypoint file not found:\n" + f" Looking for: '{full_entrypoint_path}'\n" + f" (working_dir: '{working_dir}', entrypoint file: '{entrypoint_path}')\n" + f"\n" + f"Please ensure the file exists at the expected location." + ) + + # Case 2: No working_dir - validate local file exists + else: + if not os.path.isfile(entrypoint_path): + raise ValueError( + f"❌ Entrypoint file not found: '{entrypoint_path}'\n" + f"\n" + f"Please ensure the file exists at the specified path." + ) + def status( self, print_to_console: bool = True ) -> Tuple[CodeflareRayJobStatus, bool]: @@ -380,295 +618,3 @@ def _map_to_codeflare_status( return status_mapping.get( deployment_status, (CodeflareRayJobStatus.UNKNOWN, False) ) - - def _extract_script_files_from_entrypoint(self) -> Optional[Dict[str, str]]: - """ - Extract local Python script files from entrypoint command, plus their dependencies. 
- - Returns: - Dict of {script_name: script_content} if local scripts found, None otherwise - """ - if not self.entrypoint: - return None - - scripts = {} - processed_files = set() # Avoid infinite loops - - # Look for Python file patterns in entrypoint (e.g., "python script.py", "python /path/to/script.py") - python_file_pattern = r"(?:python\s+)?([./\w/]+\.py)" - matches = re.findall(python_file_pattern, self.entrypoint) - - # Process main scripts from entrypoint files - for script_path in matches: - self._process_script_and_imports( - script_path, scripts, MOUNT_PATH, processed_files - ) - - # Update entrypoint paths to use mounted locations - for script_path in matches: - if script_path in [os.path.basename(s) for s in processed_files]: - old_path = script_path - new_path = f"{MOUNT_PATH}/{os.path.basename(script_path)}" - self.entrypoint = self.entrypoint.replace(old_path, new_path) - - return scripts if scripts else None - - def _process_script_and_imports( - self, - script_path: str, - scripts: Dict[str, str], - mount_path: str, - processed_files: set, - ): - """Recursively process a script and its local imports""" - if script_path in processed_files: - return - - # Check if it's a local file (not already a container path) - if script_path.startswith("/home/ray/") or not os.path.isfile(script_path): - return - - processed_files.add(script_path) - - try: - with open(script_path, "r") as f: - script_content = f.read() - - script_name = os.path.basename(script_path) - scripts[script_name] = script_content - - logger.info( - f"Found local script: {script_path} -> will mount at {mount_path}/{script_name}" - ) - - # Parse imports in this script to find dependencies - self._find_local_imports( - script_content, - script_path, - lambda path: self._process_script_and_imports( - path, scripts, mount_path, processed_files - ), - ) - - except (IOError, OSError) as e: - logger.warning(f"Could not read script file {script_path}: {e}") - - def _find_local_imports( - self, 
script_content: str, script_path: str, process_callback - ): - """ - Find local Python imports in script content and process them. - - Args: - script_content: The content of the Python script - script_path: Path to the current script (for relative imports) - process_callback: Function to call for each found local import - """ - - try: - # Parse the Python AST to find imports - tree = ast.parse(script_content) - script_dir = os.path.dirname(os.path.abspath(script_path)) - - for node in ast.walk(tree): - if isinstance(node, ast.Import): - # Handle: import module_name - for alias in node.names: - potential_file = os.path.join(script_dir, f"{alias.name}.py") - if os.path.isfile(potential_file): - process_callback(potential_file) - - elif isinstance(node, ast.ImportFrom): - # Handle: from module_name import something - if node.module: - potential_file = os.path.join(script_dir, f"{node.module}.py") - if os.path.isfile(potential_file): - process_callback(potential_file) - - except (SyntaxError, ValueError) as e: - logger.debug(f"Could not parse imports from {script_path}: {e}") - - def _handle_script_volumes_for_new_cluster( - self, scripts: Dict[str, str], rayjob_result: Dict[str, Any] = None - ): - """Handle script volumes for new clusters (uses ManagedClusterConfig).""" - # Validate ConfigMap size before creation - self._cluster_config.validate_configmap_size(scripts) - - # Build ConfigMap spec using config.py - configmap_spec = self._cluster_config.build_script_configmap_spec( - job_name=self.name, namespace=self.namespace, scripts=scripts - ) - - # Create ConfigMap via Kubernetes API with owner reference - configmap_name = self._create_configmap_from_spec(configmap_spec, rayjob_result) - - # Add volumes to cluster config (config.py handles spec building) - self._cluster_config.add_script_volumes( - configmap_name=configmap_name, mount_path=MOUNT_PATH - ) - - def _handle_script_volumes_for_existing_cluster( - self, scripts: Dict[str, str], rayjob_result: Dict[str, 
Any] = None - ): - """Handle script volumes for existing clusters (updates RayCluster CR).""" - # Create config builder for utility methods - config_builder = ManagedClusterConfig() - - # Validate ConfigMap size before creation - config_builder.validate_configmap_size(scripts) - - # Build ConfigMap spec using config.py - configmap_spec = config_builder.build_script_configmap_spec( - job_name=self.name, namespace=self.namespace, scripts=scripts - ) - - # Create ConfigMap via Kubernetes API with owner reference - configmap_name = self._create_configmap_from_spec(configmap_spec, rayjob_result) - - # Update existing RayCluster - self._update_existing_cluster_for_scripts(configmap_name, config_builder) - - def _create_configmap_from_spec( - self, configmap_spec: Dict[str, Any], rayjob_result: Dict[str, Any] = None - ) -> str: - """ - Create ConfigMap from specification via Kubernetes API. - - Args: - configmap_spec: ConfigMap specification dictionary - rayjob_result: The result from RayJob creation containing UID - - Returns: - str: Name of the created ConfigMap - """ - - configmap_name = configmap_spec["metadata"]["name"] - - metadata = client.V1ObjectMeta(**configmap_spec["metadata"]) - - # Add owner reference if we have the RayJob result - if ( - rayjob_result - and isinstance(rayjob_result, dict) - and rayjob_result.get("metadata", {}).get("uid") - ): - logger.info( - f"Adding owner reference to ConfigMap '{configmap_name}' with RayJob UID: {rayjob_result['metadata']['uid']}" - ) - metadata.owner_references = [ - client.V1OwnerReference( - api_version="ray.io/v1", - kind="RayJob", - name=self.name, - uid=rayjob_result["metadata"]["uid"], - controller=True, - block_owner_deletion=True, - ) - ] - else: - logger.warning( - f"No valid RayJob result with UID found, ConfigMap '{configmap_name}' will not have owner reference. 
Result: {rayjob_result}" - ) - - # Convert dict spec to V1ConfigMap - configmap = client.V1ConfigMap( - metadata=metadata, - data=configmap_spec["data"], - ) - - # Create ConfigMap via Kubernetes API - k8s_api = client.CoreV1Api(get_api_client()) - try: - k8s_api.create_namespaced_config_map( - namespace=self.namespace, body=configmap - ) - logger.info( - f"Created ConfigMap '{configmap_name}' with {len(configmap_spec['data'])} scripts" - ) - except client.ApiException as e: - if e.status == 409: # Already exists - logger.info(f"ConfigMap '{configmap_name}' already exists, updating...") - k8s_api.replace_namespaced_config_map( - name=configmap_name, namespace=self.namespace, body=configmap - ) - else: - raise RuntimeError( - f"Failed to create ConfigMap '{configmap_name}': {e}" - ) - - return configmap_name - - # Note: This only works once the pods have been restarted as the configmaps won't be picked up until then :/ - def _update_existing_cluster_for_scripts( - self, configmap_name: str, config_builder: ManagedClusterConfig - ): - """ - Update existing RayCluster to add script volumes and mounts. 
- - Args: - configmap_name: Name of the ConfigMap containing scripts - config_builder: ManagedClusterConfig instance for building specs - """ - - try: - ray_cluster = self._cluster_api.get_ray_cluster( - name=self.cluster_name, - k8s_namespace=self.namespace, - ) - except client.ApiException as e: - raise RuntimeError(f"Failed to get RayCluster '{self.cluster_name}': {e}") - - # Build script volume and mount specifications using config.py - script_volume, script_mount = config_builder.build_script_volume_specs( - configmap_name=configmap_name, mount_path=MOUNT_PATH - ) - - # Helper function to check for duplicate volumes/mounts - def volume_exists(volumes_list, volume_name): - return any(v.get("name") == volume_name for v in volumes_list) - - def mount_exists(mounts_list, mount_name): - return any(m.get("name") == mount_name for m in mounts_list) - - # Add volumes and mounts to head group - head_spec = ray_cluster["spec"]["headGroupSpec"]["template"]["spec"] - if "volumes" not in head_spec: - head_spec["volumes"] = [] - if not volume_exists(head_spec["volumes"], script_volume["name"]): - head_spec["volumes"].append(script_volume) - - head_container = head_spec["containers"][0] # Ray head container - if "volumeMounts" not in head_container: - head_container["volumeMounts"] = [] - if not mount_exists(head_container["volumeMounts"], script_mount["name"]): - head_container["volumeMounts"].append(script_mount) - - # Add volumes and mounts to worker groups - for worker_group in ray_cluster["spec"]["workerGroupSpecs"]: - worker_spec = worker_group["template"]["spec"] - if "volumes" not in worker_spec: - worker_spec["volumes"] = [] - if not volume_exists(worker_spec["volumes"], script_volume["name"]): - worker_spec["volumes"].append(script_volume) - - worker_container = worker_spec["containers"][0] # Ray worker container - if "volumeMounts" not in worker_container: - worker_container["volumeMounts"] = [] - if not mount_exists(worker_container["volumeMounts"], 
def _should_exclude_file(file_path: str) -> bool:
    """
    Decide whether a file must be left out of the working-directory zip.

    Jupyter notebooks (``.ipynb``) are excluded because their cell outputs can
    embed tokens, credentials, and large binary payloads.

    Args:
        file_path: File path relative to the working-directory root.

    Returns:
        True when the file should be skipped, False otherwise.
    """
    # Case-insensitive suffix check; equivalent to matching the
    # JUPYTER_NOTEBOOK_PATTERN regex (\.ipynb$, IGNORECASE) on real paths.
    return file_path.lower().endswith(".ipynb")
+ Currently excludes Jupyter notebook files (.ipynb). + + Args: + file_path: Relative file path within the working directory + + Returns: + True if file should be excluded, False otherwise + """ + return bool(re.search(JUPYTER_NOTEBOOK_PATTERN, file_path, re.IGNORECASE)) + + +def _normalize_runtime_env( + runtime_env: Optional[RuntimeEnv], +) -> Optional[Dict[str, Any]]: + if runtime_env is None: + return None + return runtime_env.to_dict() + + +def extract_all_local_files(job: RayJob) -> Optional[Dict[str, str]]: + """ + Prepare local files for Secret upload. + + - If runtime_env has local working_dir: zip entire directory into single file + - If single entrypoint file (no working_dir): extract that file + - If remote working_dir URL: return None (pass through to Ray) + + Returns: + Dict with either: + - {"working_dir.zip": } for zipped directories + - {"script.py": } for single files + - None for remote working_dir or no files + """ + # Convert RuntimeEnv to dict for processing + runtime_env_dict = _normalize_runtime_env(job.runtime_env) + + # If there's a remote working_dir, don't extract local files + if ( + runtime_env_dict + and "working_dir" in runtime_env_dict + and not os.path.isdir(runtime_env_dict["working_dir"]) + ): + logger.info( + f"Remote working_dir detected: {runtime_env_dict['working_dir']}. " + "Skipping local file extraction - using remote source." 
def extract_all_local_files(job: RayJob) -> Optional[Dict[str, str]]:
    """
    Prepare local files for Secret upload.

    - Local working_dir in runtime_env: the whole directory is zipped.
    - Remote working_dir URI: nothing is extracted (Ray fetches it itself).
    - No working_dir: a single local entrypoint file is extracted, if any.

    Returns:
        One of:
        - ``{"working_dir.zip": <base64 zip>}`` for a zipped local directory
        - ``{"<script>.py": <content>, ...}`` for the single-file case
        - None for a remote working_dir or when no local files were found
    """
    env_dict = _normalize_runtime_env(job.runtime_env)

    if env_dict and "working_dir" in env_dict:
        working_dir = env_dict["working_dir"]

        if not os.path.isdir(working_dir):
            # Remote URI (GitHub, S3, ...) -- leave it for Ray to resolve.
            logger.info(
                f"Remote working_dir detected: {working_dir}. "
                "Skipping local file extraction - using remote source."
            )
            return None

        # Local directory: ship it as a single base64-encoded zip entry
        # (Secrets store strings, so the raw zip bytes are base64-wrapped).
        logger.info(f"Zipping local working_dir: {working_dir}")
        zip_bytes = _zip_directory(working_dir)
        if zip_bytes:
            return {"working_dir.zip": base64.b64encode(zip_bytes).decode("utf-8")}

    # No working_dir (or zipping failed): fall back to a single entrypoint file.
    return _extract_single_entrypoint_file(job)
def _extract_single_entrypoint_file(job: RayJob) -> Optional[Dict[str, str]]:
    """
    Extract a single local Python file referenced by the job entrypoint.

    Used when no working_dir is specified: the first readable local ``.py``
    file found in the entrypoint command is read so it can be mounted via a
    Secret.

    Args:
        job: RayJob instance whose ``entrypoint`` command is scanned.

    Returns:
        Dict of the form ``{"__entrypoint_path__": path, filename: content}``
        -- the metadata key preserves the relative directory structure for the
        Secret item.path (Secret keys themselves cannot contain slashes) --
        or None when the entrypoint references no readable local file.
    """
    if not job.entrypoint:
        return None

    # Find candidate .py paths in the entrypoint command line.
    matches = re.findall(PYTHON_FILE_PATTERN, job.entrypoint)

    for file_path in matches:
        # Only local, existing files are extracted.
        if not os.path.isfile(file_path):
            continue
        try:
            with open(file_path, "r") as f:
                content = f.read()
        except (IOError, OSError) as e:
            logger.warning(f"Could not read entrypoint file {file_path}: {e}")
            continue

        filename = os.path.basename(file_path)
        # BUG FIX: the original used file_path.lstrip("./"), which strips any
        # run of leading '.' and '/' CHARACTERS individually and would mangle
        # paths such as ".env.py" -> "env.py" or "../script.py" -> "script.py".
        # removeprefix drops at most one literal "./" prefix.
        relative_path = file_path.removeprefix("./")

        logger.info(f"Extracted single entrypoint file: {file_path}")
        return {"__entrypoint_path__": relative_path, filename: content}

    return None
def process_runtime_env(
    job: RayJob, files: Optional[Dict[str, str]] = None
) -> Optional[str]:
    """
    Build the runtimeEnvYAML for the RayJob CR from ``job.runtime_env``.

    Handles three concerns:
      - ``env_vars`` are passed through unchanged;
      - ``pip`` dependencies are normalized via :func:`process_pip_dependencies`;
      - ``working_dir`` is rewritten: a local directory becomes ``UNZIP_PATH``
        (the submitter pod unzips the mounted Secret there), a remote URI is
        kept as-is, and a single mounted entrypoint file maps to ``MOUNT_PATH``.

    Args:
        job: RayJob whose runtime_env is being processed.
        files: Files prepared for Secret upload; used to detect the
            single-file case when no working_dir is set.

    Returns:
        Processed runtime environment as a YAML string, or None when there is
        nothing to emit.
    """
    runtime_env_dict = _normalize_runtime_env(job.runtime_env)

    processed_env = {}

    # Pass env_vars through unchanged.
    if runtime_env_dict and "env_vars" in runtime_env_dict:
        processed_env["env_vars"] = runtime_env_dict["env_vars"]
        logger.info(
            f"Added {len(runtime_env_dict['env_vars'])} environment variables to runtime_env"
        )

    # Normalize pip deps (list / requirements.txt path / dict all supported).
    if runtime_env_dict and "pip" in runtime_env_dict:
        pip_deps = process_pip_dependencies(job, runtime_env_dict["pip"])
        if pip_deps:
            processed_env["pip"] = pip_deps

    if runtime_env_dict and "working_dir" in runtime_env_dict:
        working_dir = runtime_env_dict["working_dir"]
        if os.path.isdir(working_dir):
            # Local directory: zipped into a Secret and unzipped on the
            # submitter pod, so Ray must be pointed at the unzip location.
            processed_env["working_dir"] = UNZIP_PATH
            logger.info(
                f"Local working_dir will be zipped, mounted, and unzipped to: {UNZIP_PATH}"
            )
        else:
            # Remote URI (e.g., GitHub, S3) - pass through as-is
            processed_env["working_dir"] = working_dir
            logger.info(f"Using remote working_dir: {working_dir}")
    elif files and "working_dir.zip" not in files:
        # Single entrypoint file case: it is mounted directly at MOUNT_PATH.
        # (The original guarded this with `not runtime_env_dict or
        # "working_dir" not in runtime_env_dict`, which is exactly the
        # negation of the `if` above and therefore always true here; the
        # redundant condition has been folded into this single elif.)
        processed_env["working_dir"] = MOUNT_PATH
        logger.info(f"Single file will be mounted at: {MOUNT_PATH}")

    # Emit YAML only when something was collected.
    if processed_env:
        return yaml.dump(processed_env, default_flow_style=False)

    return None
def parse_requirements_file(requirements_path: str) -> Optional[List[str]]:
    """
    Read a requirements.txt file into a list of pip requirement strings.

    Blank lines and ``#`` comment lines are dropped.

    Args:
        requirements_path: Path to requirements.txt file

    Returns:
        List of requirement strings, or None when the file is missing or
        unreadable.
    """
    if not os.path.isfile(requirements_path):
        logger.warning(f"Requirements file not found: {requirements_path}")
        return None

    try:
        with open(requirements_path, "r") as f:
            raw_lines = f.readlines()

        # Keep only non-empty, non-comment entries.
        requirements = [
            stripped
            for stripped in (line.strip() for line in raw_lines)
            if stripped and not stripped.startswith("#")
        ]

        logger.info(f"Parsed {len(requirements)} dependencies from {requirements_path}")
        return requirements

    except (IOError, OSError) as e:
        logger.warning(f"Could not read requirements file {requirements_path}: {e}")
        return None
def create_secret_from_spec(
    job: RayJob, secret_spec: Dict[str, Any], rayjob_result: Optional[Dict[str, Any]] = None
) -> str:
    """
    Create (or update) a Kubernetes Secret from a prebuilt specification.

    Args:
        job: RayJob the Secret belongs to (supplies name and namespace).
        secret_spec: Secret specification dictionary (metadata/type/data).
        rayjob_result: The result from RayJob creation containing the UID
            used for the owner reference; without it the Secret is orphaned
            (not garbage-collected with the RayJob).

    Returns:
        str: Name of the created Secret

    Raises:
        RuntimeError: If the Secret cannot be created for any reason other
            than already existing (409, which is handled by replacing it).
    """

    secret_name = secret_spec["metadata"]["name"]

    metadata = client.V1ObjectMeta(**secret_spec["metadata"])

    # Add owner reference if we have the RayJob result. The defensive
    # isinstance/get chain tolerates any shape the submit call returned.
    if (
        rayjob_result
        and isinstance(rayjob_result, dict)
        and rayjob_result.get("metadata", {}).get("uid")
    ):
        logger.info(
            f"Adding owner reference to Secret '{secret_name}' with RayJob UID: {rayjob_result['metadata']['uid']}"
        )
        metadata.owner_references = [
            client.V1OwnerReference(
                api_version="ray.io/v1",
                kind="RayJob",
                name=job.name,
                uid=rayjob_result["metadata"]["uid"],
                # controller+blockOwnerDeletion: Secret lifetime is tied to
                # the RayJob and deleted with it.
                controller=True,
                block_owner_deletion=True,
            )
        ]
    else:
        logger.warning(
            f"No valid RayJob result with UID found, Secret '{secret_name}' will not have owner reference. Result: {rayjob_result}"
        )

    # Convert dict spec to V1Secret
    # Use stringData instead of data to avoid double base64 encoding
    # Our zip files are already base64-encoded, so stringData will handle the final encoding
    secret = client.V1Secret(
        metadata=metadata,
        type=secret_spec.get("type", "Opaque"),
        string_data=secret_spec["data"],
    )

    # Create Secret via Kubernetes API; on 409 (already exists) fall back to
    # replacing the existing object so re-submissions refresh the files.
    k8s_api = client.CoreV1Api(get_api_client())
    try:
        k8s_api.create_namespaced_secret(namespace=job.namespace, body=secret)
        logger.info(
            f"Created Secret '{secret_name}' with {len(secret_spec['data'])} files"
        )
    except client.ApiException as e:
        if e.status == 409:  # Already exists
            logger.info(f"Secret '{secret_name}' already exists, updating...")
            k8s_api.replace_namespaced_secret(
                name=secret_name, namespace=job.namespace, body=secret
            )
        else:
            raise RuntimeError(f"Failed to create Secret '{secret_name}': {e}")

    return secret_name
def create_file_secret(
    job: RayJob, files: Dict[str, str], rayjob_result: Dict[str, Any]
):
    """
    Create a Kubernetes Secret holding the job's local files.

    Internal metadata entries (keys beginning with ``__``, e.g.
    ``__entrypoint_path__``) are stripped before upload, and the Secret is
    created with an owner reference so it is garbage-collected with the RayJob.

    Args:
        job: RayJob the Secret belongs to.
        files: Mapping of file name -> content, possibly with ``__`` metadata.
        rayjob_result: RayJob creation response used for the owner reference.
    """
    # A throwaway config builder gives access to the Secret helper methods.
    builder = ManagedClusterConfig()

    # Drop internal metadata keys such as __entrypoint_path__.
    payload = {
        name: content for name, content in files.items() if not name.startswith("__")
    }

    builder.validate_secret_size(payload)
    spec = builder.build_file_secret_spec(
        job_name=job.name, namespace=job.namespace, files=payload
    )

    # Create Secret with owner reference
    # TODO Error handling
    create_secret_from_spec(job, spec, rayjob_result)
# Global test setup that runs automatically for ALL tests
@pytest.fixture(autouse=True)
def auto_mock_setup(mocker):
    """Automatically mock common dependencies for all tests.

    Every patch targets the path where the name is *looked up* at runtime
    (e.g. ``codeflare_sdk.ray.rayjobs.rayjob.RayjobApi``), not where it is
    defined -- patching the definition site would not intercept the SDK's
    already-imported references.

    Returns:
        Dict of mock instances (``rayjob_api``, ``cluster_api``, ``k8s_api``,
        ``get_current_namespace``) so individual tests can configure
        return values and inspect calls.
    """
    # Prevent any real kubeconfig loading during tests.
    mocker.patch("kubernetes.config.load_kube_config")

    # Always mock get_default_kueue_name to prevent K8s API calls
    mocker.patch(
        "codeflare_sdk.ray.rayjobs.rayjob.get_default_kueue_name",
        return_value="default-queue",
    )

    mock_get_ns = mocker.patch(
        "codeflare_sdk.ray.rayjobs.rayjob.get_current_namespace",
        return_value="test-namespace",
    )

    # The class itself is patched and a concrete MagicMock instance is wired
    # as its return value, so RayJob's `RayjobApi()` call yields our mock.
    mock_rayjob_api = mocker.patch("codeflare_sdk.ray.rayjobs.rayjob.RayjobApi")
    mock_rayjob_instance = MagicMock()
    mock_rayjob_api.return_value = mock_rayjob_instance

    mock_cluster_api = mocker.patch("codeflare_sdk.ray.rayjobs.rayjob.RayClusterApi")
    mock_cluster_instance = MagicMock()
    mock_cluster_api.return_value = mock_cluster_instance

    mock_k8s_api = mocker.patch("kubernetes.client.CoreV1Api")
    mock_k8s_instance = MagicMock()
    mock_k8s_api.return_value = mock_k8s_instance

    # Mock get_api_client in runtime_env module where it's actually used
    mocker.patch("codeflare_sdk.ray.rayjobs.runtime_env.get_api_client")

    # Return the mocked instances so tests can configure them as needed
    return {
        "rayjob_api": mock_rayjob_instance,
        "cluster_api": mock_cluster_instance,
        "k8s_api": mock_k8s_instance,
        "get_current_namespace": mock_get_ns,
    }
def test_add_file_volumes_existing_mount_early_return():
    """add_file_volumes is a no-op when a mount with the same name exists."""
    cluster_config = ManagedClusterConfig()

    # Seed a volume mount that collides with the name add_file_volumes uses.
    cluster_config.volume_mounts.append(
        V1VolumeMount(name="ray-job-files", mount_path="/existing/path")
    )

    cluster_config.add_file_volumes(secret_name="new-files")

    # The duplicate must be skipped: still one mount, and no volume added.
    assert len(cluster_config.volumes) == 0
    assert len(cluster_config.volume_mounts) == 1
def test_managed_cluster_config_uses_update_image_for_head(mocker):
    """ManagedClusterConfig routes the head container image through update_image()."""
    # Patch at the point of use (the config module), not the definition site.
    mocked_update_image = mocker.patch(
        "codeflare_sdk.ray.rayjobs.config.update_image",
        return_value="mocked-image:latest",
    )

    cluster_spec = ManagedClusterConfig(image="custom-image:v1").build_ray_cluster_spec(
        "test-cluster"
    )

    # update_image() must have been consulted while building the spec...
    assert mocked_update_image.called
    # ...and the head container carries whatever it returned.
    head = cluster_spec["headGroupSpec"]["template"].spec.containers[0]
    assert head.image == "mocked-image:latest"
def test_build_ray_cluster_spec_has_enable_in_tree_autoscaling_false():
    """build_ray_cluster_spec must disable in-tree autoscaling (Kueue requirement)."""
    cluster_spec = ManagedClusterConfig().build_ray_cluster_spec("test-cluster")

    # Kueue manages quota itself, so the Ray autoscaler must be explicitly off:
    # the key has to be present AND set to False (not merely absent).
    assert cluster_spec.get("enableInTreeAutoscaling") is False
def test_managed_cluster_config_default_image_integration():
    """Unspecified image auto-detects to a supported runtime image (integration)."""
    from codeflare_sdk.common.utils.constants import (
        CUDA_PY311_RUNTIME_IMAGE,
        CUDA_PY312_RUNTIME_IMAGE,
    )

    cluster_spec = ManagedClusterConfig().build_ray_cluster_spec("test-cluster")

    head_image = cluster_spec["headGroupSpec"]["template"].spec.containers[0].image
    worker_image = (
        cluster_spec["workerGroupSpecs"][0]["template"].spec.containers[0].image
    )

    # Auto-detection must yield a non-empty, supported image...
    assert head_image
    assert head_image in [CUDA_PY311_RUNTIME_IMAGE, CUDA_PY312_RUNTIME_IMAGE]
    # ...and head and worker containers must agree on it.
    assert worker_image == head_image
def test_rayjob_submit_success(auto_mock_setup):
    """
    Test successful RayJob submission.

    Stubs RayjobApi.submit_job and verifies both the returned job id and the
    CR handed to KubeRay (metadata, entrypoint, clusterSelector, runtimeEnvYAML).
    """
    mock_api_instance = auto_mock_setup["rayjob_api"]

    # BUG FIX: stub submit_job -- the method the SDK calls and that the
    # assertions below inspect -- not 'submit'. The original stubbed
    # mock_api_instance.submit, which only passed because MagicMock
    # auto-creates a truthy return value for submit_job.
    mock_api_instance.submit_job.return_value = {"metadata": {"name": "test-rayjob"}}

    rayjob = RayJob(
        job_name="test-rayjob",
        cluster_name="test-ray-cluster",
        namespace="test-namespace",
        entrypoint="python -c 'print(\"hello world\")'",
        runtime_env=RuntimeEnv(pip=["requests"]),
    )

    job_id = rayjob.submit()

    assert job_id == "test-rayjob"

    mock_api_instance.submit_job.assert_called_once()
    call_args = mock_api_instance.submit_job.call_args

    assert call_args.kwargs["k8s_namespace"] == "test-namespace"

    # The CR must reference the existing cluster and carry the rendered
    # runtime environment YAML.
    job_cr = call_args.kwargs["job"]
    assert job_cr["metadata"]["name"] == "test-rayjob"
    assert job_cr["metadata"]["namespace"] == "test-namespace"
    assert job_cr["spec"]["entrypoint"] == "python -c 'print(\"hello world\")'"
    assert job_cr["spec"]["clusterSelector"]["ray.io/cluster"] == "test-ray-cluster"
    assert job_cr["spec"]["runtimeEnvYAML"] == "pip:\n- requests\n"
def test_rayjob_init_with_cluster_config(auto_mock_setup):
    """RayJob built from a cluster_config derives its cluster name from the job."""
    config = ClusterConfiguration(
        name="auto-cluster", namespace="test-namespace", num_workers=2
    )

    job = RayJob(
        job_name="test-job",
        cluster_config=config,
        entrypoint="python -c 'print()'",
        namespace="test-namespace",
    )

    # The cluster name is generated from the job name, not taken from config.
    assert (job.name, job.cluster_name) == ("test-job", "test-job-cluster")
    # The config is kept for later cluster creation; no explicit name is set.
    assert job._cluster_config == config
    assert job._cluster_name is None
+ """ + cluster_config = ClusterConfiguration( + name="", # Empty name should trigger generation + namespace="test-namespace", + num_workers=1, + ) + + rayjob = RayJob( + job_name="my-job", + cluster_config=cluster_config, + entrypoint="python -c 'print()'", + namespace="test-namespace", + ) + + assert rayjob.cluster_name == "my-job-cluster" + + +def test_rayjob_cluster_config_namespace_none(auto_mock_setup): + """ + Test that cluster config namespace is set when None. + """ + cluster_config = ClusterConfiguration( + name="test-cluster", + namespace=None, # This should be set to job namespace + num_workers=1, + ) + + rayjob = RayJob( + job_name="test-job", + cluster_config=cluster_config, + namespace="job-namespace", + entrypoint="python -c 'print()'", + ) + + assert rayjob.namespace == "job-namespace" + + +def test_rayjob_with_active_deadline_seconds(auto_mock_setup): + """ + Test RayJob CR generation with active deadline seconds. + """ + rayjob = RayJob( + job_name="test-job", + cluster_name="test-cluster", + namespace="test-namespace", + entrypoint="python main.py", + active_deadline_seconds=30, + ) + + rayjob_cr = rayjob._build_rayjob_cr() + + assert rayjob_cr["spec"]["activeDeadlineSeconds"] == 30 + + +def test_build_ray_cluster_spec_no_config_error(auto_mock_setup): + """ + Test _build_ray_cluster_spec raises error when no cluster config. + """ + rayjob = RayJob( + job_name="test-job", + cluster_name="existing-cluster", + entrypoint="python -c 'print()'", + namespace="test-namespace", + ) + + rayjob_cr = rayjob._build_rayjob_cr() + + assert rayjob_cr["spec"]["clusterSelector"]["ray.io/cluster"] == "existing-cluster" + assert "rayClusterSpec" not in rayjob_cr["spec"] + + +def test_build_ray_cluster_spec(mocker, auto_mock_setup): + """ + Test _build_ray_cluster_spec method. 
+ """ + + mock_ray_cluster = { + "apiVersion": "ray.io/v1", + "kind": "RayCluster", + "metadata": {"name": "test-cluster", "namespace": "test"}, + "spec": { + "rayVersion": RAY_VERSION, + "headGroupSpec": {"replicas": 1}, + "workerGroupSpecs": [{"replicas": 2}], + }, + } + cluster_config = ManagedClusterConfig(num_workers=2) + mocker.patch.object( + cluster_config, "build_ray_cluster_spec", return_value=mock_ray_cluster["spec"] + ) + + rayjob = RayJob( + job_name="test-job", + cluster_config=cluster_config, + entrypoint="python -c 'print()'", + namespace="test-namespace", + ) + + rayjob_cr = rayjob._build_rayjob_cr() + + assert "rayClusterSpec" in rayjob_cr["spec"] + cluster_config.build_ray_cluster_spec.assert_called_once_with( + cluster_name="test-job-cluster" + ) + + +def test_build_rayjob_cr_with_existing_cluster(auto_mock_setup): + """ + Test _build_rayjob_cr method with existing cluster. + """ + + rayjob = RayJob( + job_name="test-job", + cluster_name="existing-cluster", + namespace="test-namespace", + entrypoint="python main.py", + ttl_seconds_after_finished=300, + ) + + rayjob_cr = rayjob._build_rayjob_cr() + + assert rayjob_cr["apiVersion"] == "ray.io/v1" + assert rayjob_cr["kind"] == "RayJob" + assert rayjob_cr["metadata"]["name"] == "test-job" + spec = rayjob_cr["spec"] + assert spec["entrypoint"] == "python main.py" + assert spec["shutdownAfterJobFinishes"] is False + assert spec["ttlSecondsAfterFinished"] == 300 + + assert spec["clusterSelector"]["ray.io/cluster"] == "existing-cluster" + assert "rayClusterSpec" not in spec + + +def test_build_rayjob_cr_with_auto_cluster(mocker, auto_mock_setup): + """ + Test _build_rayjob_cr method with auto-created cluster. 
+ """ + mock_ray_cluster = { + "apiVersion": "ray.io/v1", + "kind": "RayCluster", + "metadata": {"name": "auto-cluster", "namespace": "test"}, + "spec": { + "rayVersion": RAY_VERSION, + "headGroupSpec": {"replicas": 1}, + "workerGroupSpecs": [{"replicas": 2}], + }, + } + cluster_config = ManagedClusterConfig(num_workers=2) + + mocker.patch.object( + cluster_config, "build_ray_cluster_spec", return_value=mock_ray_cluster["spec"] + ) + + rayjob = RayJob( + job_name="test-job", + cluster_config=cluster_config, + entrypoint="python main.py", + namespace="test-namespace", + ) + + rayjob_cr = rayjob._build_rayjob_cr() + assert rayjob_cr["spec"]["rayClusterSpec"] == mock_ray_cluster["spec"] + assert "clusterSelector" not in rayjob_cr["spec"] + + +def test_submit_validation_no_entrypoint(auto_mock_setup): + """ + Test that submit() raises error when entrypoint is None. + """ + rayjob = RayJob( + job_name="test-job", + cluster_name="test-cluster", + entrypoint=None, # No entrypoint provided + namespace="test-namespace", + ) + + with pytest.raises( + ValueError, match="Entrypoint must be provided to submit a RayJob" + ): + rayjob.submit() + + +def test_submit_with_auto_cluster(mocker, auto_mock_setup): + """ + Test successful submission with auto-created cluster. 
+ """ + mock_api_instance = auto_mock_setup["rayjob_api"] + + mock_ray_cluster = { + "apiVersion": "ray.io/v1", + "kind": "RayCluster", + "spec": { + "rayVersion": RAY_VERSION, + "headGroupSpec": {"replicas": 1}, + "workerGroupSpecs": [{"replicas": 1}], + }, + } + mock_api_instance.submit_job.return_value = True + + cluster_config = ManagedClusterConfig(num_workers=1) + mocker.patch.object( + cluster_config, "build_ray_cluster_spec", return_value=mock_ray_cluster["spec"] + ) + + rayjob = RayJob( + job_name="test-job", + cluster_config=cluster_config, + entrypoint="python -c 'print()'", + namespace="test-namespace", + ) + + result = rayjob.submit() + + assert result == "test-job" + + mock_api_instance.submit_job.assert_called_once() + call_args = mock_api_instance.submit_job.call_args + + job_cr = call_args.kwargs["job"] + assert "rayClusterSpec" in job_cr["spec"] + assert job_cr["spec"]["rayClusterSpec"] == mock_ray_cluster["spec"] + + +def test_namespace_auto_detection_success(auto_mock_setup): + """ + Test successful namespace auto-detection. + """ + auto_mock_setup["get_current_namespace"].return_value = "detected-ns" + + rayjob = RayJob( + job_name="test-job", entrypoint="python test.py", cluster_name="test-cluster" + ) + + assert rayjob.namespace == "detected-ns" + + +def test_namespace_auto_detection_fallback(auto_mock_setup): + """ + Test that namespace auto-detection failure raises an error. + """ + auto_mock_setup["get_current_namespace"].return_value = None + + with pytest.raises(ValueError, match="Could not auto-detect Kubernetes namespace"): + RayJob( + job_name="test-job", + entrypoint="python -c 'print()'", + cluster_name="test-cluster", + ) + + +def test_namespace_explicit_override(auto_mock_setup): + """ + Test that explicit namespace overrides auto-detection. 
+ """ + auto_mock_setup["get_current_namespace"].return_value = "detected-ns" + + rayjob = RayJob( + job_name="test-job", + entrypoint="python -c 'print()'", + cluster_name="test-cluster", + namespace="explicit-ns", + ) + + assert rayjob.namespace == "explicit-ns" + + +def test_rayjob_with_rayjob_cluster_config(auto_mock_setup): + """ + Test RayJob with the new ManagedClusterConfig. + """ + cluster_config = ManagedClusterConfig( + num_workers=2, + head_cpu_requests="500m", + head_memory_requests="512Mi", + ) + + rayjob = RayJob( + job_name="test-job", + entrypoint="python -c 'print()'", + cluster_config=cluster_config, + namespace="test-namespace", + ) + + assert rayjob._cluster_config == cluster_config + assert rayjob.cluster_name == "test-job-cluster" # Generated from job name + + +def test_rayjob_cluster_config_validation(auto_mock_setup): + """ + Test validation of ManagedClusterConfig parameters. + """ + cluster_config = ManagedClusterConfig() + + rayjob = RayJob( + job_name="test-job", + entrypoint="python -c 'print()'", + cluster_config=cluster_config, + namespace="test-namespace", + ) + + assert rayjob._cluster_config is not None + + +def test_rayjob_missing_entrypoint_validation(auto_mock_setup): + """ + Test that RayJob requires entrypoint for submission. + """ + with pytest.raises( + TypeError, match="missing 1 required positional argument: 'entrypoint'" + ): + RayJob( + job_name="test-job", + cluster_name="test-cluster", + ) + + +def test_build_ray_cluster_spec_integration(mocker, auto_mock_setup): + """ + Test integration with the new build_ray_cluster_spec method. 
+ """ + cluster_config = ManagedClusterConfig() + mock_spec = {"spec": "test-spec"} + mocker.patch.object( + cluster_config, "build_ray_cluster_spec", return_value=mock_spec + ) + + rayjob = RayJob( + job_name="test-job", + entrypoint="python -c 'print()'", + cluster_config=cluster_config, + namespace="test-namespace", + ) + + rayjob_cr = rayjob._build_rayjob_cr() + + cluster_config.build_ray_cluster_spec.assert_called_once_with( + cluster_name="test-job-cluster" + ) + assert "rayClusterSpec" in rayjob_cr["spec"] + assert rayjob_cr["spec"]["rayClusterSpec"] == mock_spec + + +def test_rayjob_with_runtime_env(auto_mock_setup): + """ + Test RayJob with runtime environment configuration. + """ + runtime_env = RuntimeEnv(pip=["numpy", "pandas"]) + + rayjob = RayJob( + job_name="test-job", + entrypoint="python -c 'print()'", + cluster_name="test-cluster", + runtime_env=runtime_env, + namespace="test-namespace", + ) + + assert rayjob.runtime_env == runtime_env + + rayjob_cr = rayjob._build_rayjob_cr() + assert rayjob_cr["spec"]["runtimeEnvYAML"] == "pip:\n- numpy\n- pandas\n" + + +def test_rayjob_with_runtime_env_dict(auto_mock_setup): + """ + Test RayJob with runtime environment as dict (user convenience). + Users can pass a dict instead of having to import RuntimeEnv. 
+ """ + # User can pass dict instead of RuntimeEnv object + runtime_env_dict = { + "pip": ["numpy", "pandas"], + "env_vars": {"TEST_VAR": "test_value"}, + } + + rayjob = RayJob( + job_name="test-job", + entrypoint="python -c 'print()'", + cluster_name="test-cluster", + runtime_env=runtime_env_dict, + namespace="test-namespace", + ) + + # Should be converted to RuntimeEnv internally + assert isinstance(rayjob.runtime_env, RuntimeEnv) + assert rayjob.runtime_env["env_vars"] == {"TEST_VAR": "test_value"} + + # Verify it generates proper YAML output + rayjob_cr = rayjob._build_rayjob_cr() + assert "runtimeEnvYAML" in rayjob_cr["spec"] + runtime_yaml = rayjob_cr["spec"]["runtimeEnvYAML"] + assert "pip:" in runtime_yaml or "pip_packages:" in runtime_yaml + assert "env_vars:" in runtime_yaml + assert "TEST_VAR" in runtime_yaml + + +def test_rayjob_with_active_deadline_and_ttl(auto_mock_setup): + """ + Test RayJob with both active deadline and TTL settings. + """ + + rayjob = RayJob( + job_name="test-job", + entrypoint="python -c 'print()'", + cluster_name="test-cluster", + active_deadline_seconds=300, + ttl_seconds_after_finished=600, + namespace="test-namespace", + ) + + assert rayjob.active_deadline_seconds == 300 + assert rayjob.ttl_seconds_after_finished == 600 + + rayjob_cr = rayjob._build_rayjob_cr() + assert rayjob_cr["spec"]["activeDeadlineSeconds"] == 300 + assert rayjob_cr["spec"]["ttlSecondsAfterFinished"] == 600 + + +def test_rayjob_cluster_name_generation_with_config(auto_mock_setup): + """ + Test cluster name generation when using cluster_config. 
+ """ + + cluster_config = ManagedClusterConfig() + + rayjob = RayJob( + job_name="my-job", + entrypoint="python -c 'print()'", + cluster_config=cluster_config, + namespace="test-namespace", # Explicitly specify namespace + ) + + assert rayjob.cluster_name == "my-job-cluster" + + +def test_rayjob_namespace_propagation_to_cluster_config(auto_mock_setup): + """ + Test that job namespace is propagated to cluster config when None. + """ + auto_mock_setup["get_current_namespace"].return_value = "detected-ns" + + cluster_config = ManagedClusterConfig() + + rayjob = RayJob( + job_name="test-job", + entrypoint="python -c 'print()'", + cluster_config=cluster_config, + ) + + assert rayjob.namespace == "detected-ns" + + +def test_rayjob_error_handling_invalid_cluster_config(auto_mock_setup): + """ + Test error handling with invalid cluster configuration. + """ + + with pytest.raises(ValueError): + RayJob( + job_name="test-job", + entrypoint="python -c 'print()'", + ) + + +def test_rayjob_constructor_parameter_validation(auto_mock_setup): + """ + Test constructor parameter validation. 
+ """ + rayjob = RayJob( + job_name="test-job", + entrypoint="python -c 'print()'", + cluster_name="test-cluster", + namespace="test-ns", + runtime_env=RuntimeEnv(pip=["numpy"]), + ttl_seconds_after_finished=300, + active_deadline_seconds=600, + ) + + assert rayjob.name == "test-job" + assert rayjob.entrypoint == "python -c 'print()'" + assert rayjob.cluster_name == "test-cluster" + assert rayjob.namespace == "test-ns" + # Check that runtime_env is a RuntimeEnv object and contains pip dependencies + assert isinstance(rayjob.runtime_env, RuntimeEnv) + runtime_env_dict = rayjob.runtime_env.to_dict() + assert "pip" in runtime_env_dict + # Ray transforms pip to dict format with 'packages' key + assert runtime_env_dict["pip"]["packages"] == ["numpy"] + assert rayjob.ttl_seconds_after_finished == 300 + assert rayjob.active_deadline_seconds == 600 + + +def test_build_ray_cluster_spec_function(): + """ + Test the build_ray_cluster_spec method directly. + """ + cluster_config = ManagedClusterConfig( + num_workers=2, + head_cpu_requests="500m", + head_memory_requests="512Mi", + worker_cpu_requests="250m", + worker_memory_requests="256Mi", + ) + + spec = cluster_config.build_ray_cluster_spec("test-cluster") + assert "rayVersion" in spec + assert "enableInTreeAutoscaling" in spec + assert spec["enableInTreeAutoscaling"] is False # Required for Kueue + assert "headGroupSpec" in spec + assert "workerGroupSpecs" in spec + + head_spec = spec["headGroupSpec"] + assert head_spec["serviceType"] == "ClusterIP" + assert head_spec["enableIngress"] is False + assert "rayStartParams" in head_spec + assert "template" in head_spec + worker_specs = spec["workerGroupSpecs"] + assert len(worker_specs) == 1 + worker_spec = worker_specs[0] + assert worker_spec["replicas"] == 2 + assert worker_spec["minReplicas"] == 2 + assert worker_spec["maxReplicas"] == 2 + assert worker_spec["groupName"] == "worker-group-test-cluster" + + +def test_build_ray_cluster_spec_with_accelerators(): + """ + Test 
build_ray_cluster_spec with GPU accelerators. + """ + cluster_config = ManagedClusterConfig( + head_accelerators={"nvidia.com/gpu": 1}, + worker_accelerators={"nvidia.com/gpu": 2}, + ) + + spec = cluster_config.build_ray_cluster_spec("test-cluster") + head_spec = spec["headGroupSpec"] + head_params = head_spec["rayStartParams"] + assert "num-gpus" in head_params + assert head_params["num-gpus"] == "1" + + worker_specs = spec["workerGroupSpecs"] + worker_spec = worker_specs[0] + worker_params = worker_spec["rayStartParams"] + assert "num-gpus" in worker_params + assert worker_params["num-gpus"] == "2" + + +def test_build_ray_cluster_spec_with_custom_volumes(): + """ + Test build_ray_cluster_spec with custom volumes and volume mounts. + """ + custom_volume = V1Volume(name="custom-data", empty_dir={}) + custom_volume_mount = V1VolumeMount(name="custom-data", mount_path="/data") + cluster_config = ManagedClusterConfig( + volumes=[custom_volume], + volume_mounts=[custom_volume_mount], + ) + + spec = cluster_config.build_ray_cluster_spec("test-cluster") + head_spec = spec["headGroupSpec"] + head_pod_spec = head_spec["template"].spec + assert len(head_pod_spec.volumes) > 0 + + head_container = head_pod_spec.containers[0] + assert len(head_container.volume_mounts) > 0 + + +def test_build_ray_cluster_spec_with_environment_variables(): + """ + Test build_ray_cluster_spec with environment variables. 
+ """ + cluster_config = ManagedClusterConfig( + envs={"CUDA_VISIBLE_DEVICES": "0", "RAY_DISABLE_IMPORT_WARNING": "1"}, + ) + + spec = cluster_config.build_ray_cluster_spec("test-cluster") + + head_spec = spec["headGroupSpec"] + head_pod_spec = head_spec["template"].spec + head_container = head_pod_spec.containers[0] + assert hasattr(head_container, "env") + env_vars = {env.name: env.value for env in head_container.env} + assert env_vars["CUDA_VISIBLE_DEVICES"] == "0" + assert env_vars["RAY_DISABLE_IMPORT_WARNING"] == "1" + worker_specs = spec["workerGroupSpecs"] + worker_spec = worker_specs[0] + worker_pod_spec = worker_spec["template"].spec + worker_container = worker_pod_spec.containers[0] + + assert hasattr(worker_container, "env") + worker_env_vars = {env.name: env.value for env in worker_container.env} + assert worker_env_vars["CUDA_VISIBLE_DEVICES"] == "0" + assert worker_env_vars["RAY_DISABLE_IMPORT_WARNING"] == "1" + + +def test_build_ray_cluster_spec_with_tolerations(): + """ + Test build_ray_cluster_spec with tolerations. 
+ """ + head_toleration = V1Toleration( + key="node-role.kubernetes.io/master", operator="Exists", effect="NoSchedule" + ) + worker_toleration = V1Toleration( + key="nvidia.com/gpu", operator="Exists", effect="NoSchedule" + ) + + cluster_config = ManagedClusterConfig( + head_tolerations=[head_toleration], + worker_tolerations=[worker_toleration], + ) + + spec = cluster_config.build_ray_cluster_spec("test-cluster") + head_spec = spec["headGroupSpec"] + head_pod_spec = head_spec["template"].spec + assert hasattr(head_pod_spec, "tolerations") + assert len(head_pod_spec.tolerations) == 1 + assert head_pod_spec.tolerations[0].key == "node-role.kubernetes.io/master" + + worker_specs = spec["workerGroupSpecs"] + worker_spec = worker_specs[0] + worker_pod_spec = worker_spec["template"].spec + assert hasattr(worker_pod_spec, "tolerations") + assert len(worker_pod_spec.tolerations) == 1 + assert worker_pod_spec.tolerations[0].key == "nvidia.com/gpu" + + +def test_build_ray_cluster_spec_with_image_pull_secrets(): + """ + Test build_ray_cluster_spec with image pull secrets. 
+ """ + cluster_config = ManagedClusterConfig( + image_pull_secrets=["my-registry-secret", "another-secret"] + ) + + spec = cluster_config.build_ray_cluster_spec("test-cluster") + + head_spec = spec["headGroupSpec"] + head_pod_spec = head_spec["template"].spec + assert hasattr(head_pod_spec, "image_pull_secrets") + + head_secrets = head_pod_spec.image_pull_secrets + assert len(head_secrets) == 2 + assert head_secrets[0].name == "my-registry-secret" + assert head_secrets[1].name == "another-secret" + + worker_specs = spec["workerGroupSpecs"] + worker_spec = worker_specs[0] + worker_pod_spec = worker_spec["template"].spec + assert hasattr(worker_pod_spec, "image_pull_secrets") + + worker_secrets = worker_pod_spec.image_pull_secrets + assert len(worker_secrets) == 2 + assert worker_secrets[0].name == "my-registry-secret" + assert worker_secrets[1].name == "another-secret" + + +def test_submit_with_cluster_config_compatible_image_passes(auto_mock_setup): + """ + Test that submission passes with compatible cluster_config image. + """ + mock_api_instance = auto_mock_setup["rayjob_api"] + mock_api_instance.submit_job.return_value = True + + cluster_config = ManagedClusterConfig(image=f"ray:{RAY_VERSION}") + + rayjob = RayJob( + job_name="test-job", + cluster_config=cluster_config, + namespace="test-namespace", + entrypoint="python -c 'print()'", + ) + + result = rayjob.submit() + assert result == "test-job" + + +def test_submit_with_cluster_config_incompatible_image_fails(auto_mock_setup): + """ + Test that submission fails with incompatible cluster_config image. 
+ """ + + cluster_config = ManagedClusterConfig(image="ray:2.8.0") # Different version + + rayjob = RayJob( + job_name="test-job", + cluster_config=cluster_config, + namespace="test-namespace", + entrypoint="python -c 'print()'", + ) + + with pytest.raises( + ValueError, match="Cluster config image: Ray version mismatch detected" + ): + rayjob.submit() + + +def test_validate_ray_version_compatibility_method(auto_mock_setup): + """ + Test the _validate_ray_version_compatibility method directly. + """ + + rayjob = RayJob( + job_name="test-job", + cluster_name="test-cluster", + namespace="test-namespace", + entrypoint="python -c 'print()'", + ) + + rayjob._validate_ray_version_compatibility() + rayjob._cluster_config = ManagedClusterConfig(image=f"ray:{RAY_VERSION}") + rayjob._validate_ray_version_compatibility() + rayjob._cluster_config = ManagedClusterConfig(image="ray:2.8.0") + with pytest.raises( + ValueError, match="Cluster config image: Ray version mismatch detected" + ): + rayjob._validate_ray_version_compatibility() + + rayjob._cluster_config = ManagedClusterConfig(image="custom-image:latest") + with pytest.warns( + UserWarning, match="Cluster config image: Cannot determine Ray version" + ): + rayjob._validate_ray_version_compatibility() + + +def test_validate_cluster_config_image_method(auto_mock_setup): + """ + Test the _validate_cluster_config_image method directly. 
+ """ + + rayjob = RayJob( + job_name="test-job", + cluster_config=ManagedClusterConfig(), + namespace="test-namespace", + entrypoint="python -c 'print()'", + ) + + rayjob._validate_cluster_config_image() + rayjob._cluster_config.image = f"ray:{RAY_VERSION}" + rayjob._validate_cluster_config_image() + rayjob._cluster_config.image = "ray:2.8.0" + with pytest.raises( + ValueError, match="Cluster config image: Ray version mismatch detected" + ): + rayjob._validate_cluster_config_image() + + rayjob._cluster_config.image = "custom-image:latest" + with pytest.warns( + UserWarning, match="Cluster config image: Cannot determine Ray version" + ): + rayjob._validate_cluster_config_image() + + +def test_validate_cluster_config_image_edge_cases(auto_mock_setup): + """ + Test edge cases in _validate_cluster_config_image method. + """ + + rayjob = RayJob( + job_name="test-job", + cluster_config=ManagedClusterConfig(), + namespace="test-namespace", + entrypoint="python -c 'print()'", + ) + + rayjob._cluster_config.image = None + rayjob._validate_cluster_config_image() + rayjob._cluster_config.image = "" + rayjob._validate_cluster_config_image() + rayjob._cluster_config.image = 123 + rayjob._validate_cluster_config_image() + + class MockClusterConfig: + pass + + rayjob._cluster_config = MockClusterConfig() + rayjob._validate_cluster_config_image() + + +def test_rayjob_stop_success(auto_mock_setup, caplog): + """ + Test successful RayJob stop operation. 
+ """ + mock_api_instance = auto_mock_setup["rayjob_api"] + + mock_api_instance.suspend_job.return_value = { + "metadata": {"name": "test-rayjob"}, + "spec": {"suspend": True}, + } + + rayjob = RayJob( + job_name="test-rayjob", + cluster_name="test-cluster", + namespace="test-namespace", + entrypoint="python -c 'print()'", + ) + + with caplog.at_level("INFO"): + result = rayjob.stop() + + assert result is True + + mock_api_instance.suspend_job.assert_called_once_with( + name="test-rayjob", k8s_namespace="test-namespace" + ) + + # Verify success message was logged + assert "Successfully stopped the RayJob test-rayjob" in caplog.text + + +def test_rayjob_stop_failure(auto_mock_setup): + """ + Test RayJob stop operation when API call fails. + """ + mock_api_instance = auto_mock_setup["rayjob_api"] + + mock_api_instance.suspend_job.return_value = None + + rayjob = RayJob( + job_name="test-rayjob", + cluster_name="test-cluster", + namespace="test-namespace", + entrypoint="python -c 'print()'", + ) + + with pytest.raises(RuntimeError, match="Failed to stop the RayJob test-rayjob"): + rayjob.stop() + + mock_api_instance.suspend_job.assert_called_once_with( + name="test-rayjob", k8s_namespace="test-namespace" + ) + + +def test_rayjob_resubmit_success(auto_mock_setup): + """ + Test successful RayJob resubmit operation. + """ + mock_api_instance = auto_mock_setup["rayjob_api"] + + mock_api_instance.resubmit_job.return_value = { + "metadata": {"name": "test-rayjob"}, + "spec": {"suspend": False}, + } + + rayjob = RayJob( + job_name="test-rayjob", + cluster_name="test-cluster", + namespace="test-namespace", + entrypoint="python -c 'print()'", + ) + + result = rayjob.resubmit() + + assert result is True + + mock_api_instance.resubmit_job.assert_called_once_with( + name="test-rayjob", k8s_namespace="test-namespace" + ) + + +def test_rayjob_resubmit_failure(auto_mock_setup): + """ + Test RayJob resubmit operation when API call fails. 
+ """ + mock_api_instance = auto_mock_setup["rayjob_api"] + + mock_api_instance.resubmit_job.return_value = None + + rayjob = RayJob( + job_name="test-rayjob", + cluster_name="test-cluster", + namespace="test-namespace", + entrypoint="python -c 'print()'", + ) + + with pytest.raises(RuntimeError, match="Failed to resubmit the RayJob test-rayjob"): + rayjob.resubmit() + + mock_api_instance.resubmit_job.assert_called_once_with( + name="test-rayjob", k8s_namespace="test-namespace" + ) + + +def test_rayjob_delete_success(auto_mock_setup): + """ + Test successful RayJob deletion. + """ + mock_api_instance = auto_mock_setup["rayjob_api"] + + rayjob = RayJob( + job_name="test-rayjob", + entrypoint="python -c 'print()'", + cluster_name="test-cluster", + ) + + mock_api_instance.delete_job.return_value = True + + result = rayjob.delete() + + assert result is True + mock_api_instance.delete_job.assert_called_once_with( + name="test-rayjob", k8s_namespace="test-namespace" + ) + + +def test_rayjob_delete_already_deleted(auto_mock_setup, caplog): + """ + Test RayJob deletion when already deleted (should succeed gracefully). + """ + mock_api_instance = auto_mock_setup["rayjob_api"] + + rayjob = RayJob( + job_name="test-rayjob", + entrypoint="python -c 'print()'", + cluster_name="test-cluster", + ) + + # Python client returns False when job doesn't exist/already deleted + mock_api_instance.delete_job.return_value = False + + with caplog.at_level("INFO"): + result = rayjob.delete() + + # Should succeed (not raise error) when already deleted + assert result is True + assert "already deleted or does not exist" in caplog.text + + mock_api_instance.delete_job.assert_called_once_with( + name="test-rayjob", k8s_namespace="test-namespace" + ) + + +def test_rayjob_init_both_none_error(auto_mock_setup): + """ + Test RayJob initialization error when both cluster_name and cluster_config are None. 
+ """ + with pytest.raises( + ValueError, + match="Configuration Error: You must provide either 'cluster_name' .* or 'cluster_config'", + ): + RayJob( + job_name="test-job", + entrypoint="python -c 'print()'", + cluster_name=None, + cluster_config=None, + ) + + +def test_rayjob_init_missing_cluster_name_with_no_config(auto_mock_setup): + """ + Test RayJob initialization error when cluster_name is None without cluster_config. + """ + with pytest.raises( + ValueError, + match="Configuration Error: a 'cluster_name' is required when not providing 'cluster_config'", + ): + rayjob = RayJob.__new__(RayJob) + rayjob.name = "test-job" + rayjob.entrypoint = "python test.py" + rayjob.runtime_env = None + rayjob.ttl_seconds_after_finished = 0 + rayjob.active_deadline_seconds = None + rayjob.shutdown_after_job_finishes = True + rayjob.namespace = "test-namespace" + rayjob._cluster_name = None + rayjob._cluster_config = None + if rayjob._cluster_config is None and rayjob._cluster_name is None: + raise ValueError( + "❌ Configuration Error: a 'cluster_name' is required when not providing 'cluster_config'" + ) + + +def test_rayjob_kueue_label_no_default_queue(auto_mock_setup, mocker, caplog): + """ + Test RayJob falls back to 'default' queue when no default queue exists. 
+ """ + mocker.patch( + "codeflare_sdk.ray.rayjobs.rayjob.get_default_kueue_name", + return_value=None, + ) + + mock_api_instance = auto_mock_setup["rayjob_api"] + mock_api_instance.submit_job.return_value = {"metadata": {"name": "test-job"}} + + cluster_config = ManagedClusterConfig() + rayjob = RayJob( + job_name="test-job", + cluster_config=cluster_config, + entrypoint="python -c 'print()'", + ) + + with caplog.at_level("WARNING"): + rayjob.submit() + + # Verify the submitted job has the fallback label + call_args = mock_api_instance.submit_job.call_args + submitted_job = call_args.kwargs["job"] + assert submitted_job["metadata"]["labels"]["kueue.x-k8s.io/queue-name"] == "default" + + # Verify warning was logged + assert "No default Kueue LocalQueue found" in caplog.text + + +def test_rayjob_kueue_explicit_local_queue(auto_mock_setup): + """ + Test RayJob uses explicitly specified local queue. + """ + mock_api_instance = auto_mock_setup["rayjob_api"] + mock_api_instance.submit_job.return_value = {"metadata": {"name": "test-job"}} + + cluster_config = ManagedClusterConfig() + rayjob = RayJob( + job_name="test-job", + cluster_config=cluster_config, + entrypoint="python -c 'print()'", + local_queue="custom-queue", + ) + + rayjob.submit() + + # Verify the submitted job has the explicit queue label + call_args = mock_api_instance.submit_job.call_args + submitted_job = call_args.kwargs["job"] + assert ( + submitted_job["metadata"]["labels"]["kueue.x-k8s.io/queue-name"] + == "custom-queue" + ) + + +def test_rayjob_no_kueue_label_for_existing_cluster(auto_mock_setup): + """ + Test RayJob doesn't add Kueue label for existing clusters. 
+ """ + mock_api_instance = auto_mock_setup["rayjob_api"] + mock_api_instance.submit_job.return_value = {"metadata": {"name": "test-job"}} + + # Using existing cluster (no cluster_config) + rayjob = RayJob( + job_name="test-job", + cluster_name="existing-cluster", + entrypoint="python -c 'print()'", + ) + + rayjob.submit() + + # Verify no Kueue label was added + call_args = mock_api_instance.submit_job.call_args + submitted_job = call_args.kwargs["job"] + assert "kueue.x-k8s.io/queue-name" not in submitted_job["metadata"]["labels"] + + +def test_rayjob_with_ttl_and_deadline(auto_mock_setup): + """ + Test RayJob with TTL and active deadline seconds. + """ + mock_api_instance = auto_mock_setup["rayjob_api"] + mock_api_instance.submit_job.return_value = {"metadata": {"name": "test-job"}} + + cluster_config = ManagedClusterConfig() + rayjob = RayJob( + job_name="test-job", + cluster_config=cluster_config, + entrypoint="python -c 'print()'", + ttl_seconds_after_finished=300, + active_deadline_seconds=600, + ) + + rayjob.submit() + + # Verify TTL and deadline were set + call_args = mock_api_instance.submit_job.call_args + submitted_job = call_args.kwargs["job"] + assert submitted_job["spec"]["ttlSecondsAfterFinished"] == 300 + assert submitted_job["spec"]["activeDeadlineSeconds"] == 600 + + +def test_rayjob_shutdown_after_job_finishes(auto_mock_setup): + """ + Test RayJob sets shutdownAfterJobFinishes correctly. 
+ """ + mock_api_instance = auto_mock_setup["rayjob_api"] + mock_api_instance.submit_job.return_value = {"metadata": {"name": "test-job"}} + + # Test with managed cluster (should shutdown) + cluster_config = ManagedClusterConfig() + rayjob = RayJob( + job_name="test-job", + cluster_config=cluster_config, + entrypoint="python -c 'print()'", + ) + + rayjob.submit() + + call_args = mock_api_instance.submit_job.call_args + submitted_job = call_args.kwargs["job"] + assert submitted_job["spec"]["shutdownAfterJobFinishes"] is True + + # Test with existing cluster (should not shutdown) + rayjob2 = RayJob( + job_name="test-job2", + cluster_name="existing-cluster", + entrypoint="python -c 'print()'", + ) + + rayjob2.submit() + + call_args2 = mock_api_instance.submit_job.call_args + submitted_job2 = call_args2.kwargs["job"] + assert submitted_job2["spec"]["shutdownAfterJobFinishes"] is False + + +def test_rayjob_stop_delete_resubmit_logging(auto_mock_setup, caplog): + """ + Test logging for stop, delete, and resubmit operations. 
+ """ + mock_api_instance = auto_mock_setup["rayjob_api"] + + # Test stop with logging + mock_api_instance.suspend_job.return_value = { + "metadata": {"name": "test-rayjob"}, + "spec": {"suspend": True}, + } + + rayjob = RayJob( + job_name="test-rayjob", + cluster_name="test-cluster", + namespace="test-namespace", + entrypoint="python -c 'print()'", + ) + + with caplog.at_level("INFO"): + result = rayjob.stop() + + assert result is True + assert "Successfully stopped the RayJob test-rayjob" in caplog.text + + # Test delete with logging + caplog.clear() + mock_api_instance.delete_job.return_value = True + + with caplog.at_level("INFO"): + result = rayjob.delete() + + assert result is True + assert "Successfully deleted the RayJob test-rayjob" in caplog.text + + # Test resubmit with logging + caplog.clear() + mock_api_instance.resubmit_job.return_value = { + "metadata": {"name": "test-rayjob"}, + "spec": {"suspend": False}, + } + + with caplog.at_level("INFO"): + result = rayjob.resubmit() + + assert result is True + assert "Successfully resubmitted the RayJob test-rayjob" in caplog.text + + +def test_rayjob_initialization_logging(auto_mock_setup, caplog): + """ + Test RayJob initialization logging. + """ + with caplog.at_level("INFO"): + cluster_config = ManagedClusterConfig() + rayjob = RayJob( + job_name="test-job", + cluster_config=cluster_config, + entrypoint="python -c 'print()'", + ) + + assert "Creating new cluster: test-job-cluster" in caplog.text + assert "Initialized RayJob: test-job in namespace: test-namespace" in caplog.text + + +def test_build_submitter_pod_template_uses_default_image(auto_mock_setup, mocker): + """ + Test that _build_submitter_pod_template() uses get_ray_image_for_python_version() for default image. 
+ """ + # Mock get_ray_image_for_python_version to verify it's called + mock_get_image = mocker.patch( + "codeflare_sdk.ray.rayjobs.rayjob.get_ray_image_for_python_version", + return_value="auto-detected-image:py3.12", + ) + + rayjob = RayJob( + job_name="test-job", + cluster_name="existing-cluster", + entrypoint="python -c 'print()'", + namespace="test-namespace", + ) + + files = {"test.py": "print('hello')"} + secret_name = "test-files" + + # Call _build_submitter_pod_template + submitter_template = rayjob._build_submitter_pod_template(files, secret_name) + + # Verify get_ray_image_for_python_version was called + mock_get_image.assert_called_once() + + # Verify the submitter pod uses the auto-detected image + assert ( + submitter_template["spec"]["containers"][0]["image"] + == "auto-detected-image:py3.12" + ) + + +def test_build_submitter_pod_template_uses_cluster_config_image( + auto_mock_setup, mocker +): + """ + Test that _build_submitter_pod_template() uses cluster_config image when provided. 
+ """ + # Mock get_ray_image_for_python_version (should be called but overridden) + mock_get_image = mocker.patch( + "codeflare_sdk.ray.rayjobs.rayjob.get_ray_image_for_python_version", + return_value="auto-detected-image:py3.12", + ) + + cluster_config = ManagedClusterConfig(image="custom-cluster-image:v1") + + rayjob = RayJob( + job_name="test-job", + cluster_config=cluster_config, + entrypoint="python -c 'print()'", + namespace="test-namespace", + ) + + files = {"test.py": "print('hello')"} + secret_name = "test-files" + + # Call _build_submitter_pod_template + submitter_template = rayjob._build_submitter_pod_template(files, secret_name) + + # Verify get_ray_image_for_python_version was called + mock_get_image.assert_called_once() + + # Verify the submitter pod uses the cluster config image (overrides default) + assert ( + submitter_template["spec"]["containers"][0]["image"] + == "custom-cluster-image:v1" + ) + + +def test_build_submitter_pod_template_with_files(auto_mock_setup): + """ + Test that _build_submitter_pod_template() correctly builds Secret items for files. 
+ """ + rayjob = RayJob( + job_name="test-job", + cluster_name="existing-cluster", + entrypoint="python -c 'print()'", + namespace="test-namespace", + ) + + files = {"main.py": "print('main')", "helper.py": "print('helper')"} + secret_name = "test-files" + + # Call _build_submitter_pod_template + submitter_template = rayjob._build_submitter_pod_template(files, secret_name) + + # Verify Secret items are created for each file + secret_items = submitter_template["spec"]["volumes"][0]["secret"]["items"] + assert len(secret_items) == 2 + + # Verify each file has a Secret item + file_names = [item["key"] for item in secret_items] + assert "main.py" in file_names + assert "helper.py" in file_names + + # Verify paths match keys + for item in secret_items: + assert item["key"] == item["path"] + + +def test_validate_working_dir_entrypoint_no_runtime_env(auto_mock_setup, tmp_path): + """ + Test validation checks file exists even when no runtime_env is specified. + """ + # Create the script file + script_file = tmp_path / "script.py" + script_file.write_text("print('hello')") + + rayjob = RayJob( + job_name="test-job", + cluster_name="test-cluster", + entrypoint=f"python {script_file}", + namespace="test-namespace", + ) + + # Should not raise exception (file exists) + rayjob._validate_working_dir_entrypoint() + + +def test_validate_working_dir_entrypoint_no_working_dir(auto_mock_setup, tmp_path): + """ + Test validation checks file when runtime_env has no working_dir. 
+ """ + # Create the script file + script_file = tmp_path / "script.py" + script_file.write_text("print('hello')") + + rayjob = RayJob( + job_name="test-job", + cluster_name="test-cluster", + entrypoint=f"python {script_file}", + namespace="test-namespace", + runtime_env=RuntimeEnv(pip=["numpy"]), + ) + + # Should not raise exception (file exists) + rayjob._validate_working_dir_entrypoint() + + +def test_validate_working_dir_entrypoint_remote_working_dir(auto_mock_setup): + """ + Test validation skips ALL checks for remote working_dir. + """ + rayjob = RayJob( + job_name="test-job", + cluster_name="test-cluster", + entrypoint="python nonexistent_script.py", # File doesn't exist, but should be ignored + namespace="test-namespace", + runtime_env=RuntimeEnv( + working_dir="https://github.com/user/repo/archive/main.zip" + ), + ) + + # Should not raise any exception (remote working_dir skips all validation) + rayjob._validate_working_dir_entrypoint() + + +def test_validate_working_dir_entrypoint_no_python_file(auto_mock_setup): + """ + Test validation passes when entrypoint has no Python file reference. + """ + rayjob = RayJob( + job_name="test-job", + cluster_name="test-cluster", + entrypoint="echo 'hello world'", # No Python file + namespace="test-namespace", + runtime_env=RuntimeEnv(working_dir="."), + ) + + # Should not raise any exception + rayjob._validate_working_dir_entrypoint() + + +def test_validate_working_dir_entrypoint_no_redundancy(auto_mock_setup, tmp_path): + """ + Test validation passes when entrypoint doesn't reference working_dir. 
+ """ + # Create test directory and file + test_dir = tmp_path / "testdir" + test_dir.mkdir() + script_file = test_dir / "script.py" + script_file.write_text("print('hello')") + + rayjob = RayJob( + job_name="test-job", + cluster_name="test-cluster", + entrypoint="python script.py", # No directory prefix + namespace="test-namespace", + runtime_env=RuntimeEnv(working_dir=str(test_dir)), + ) + + # Should not raise any exception + rayjob._validate_working_dir_entrypoint() + + +def test_validate_working_dir_entrypoint_redundant_reference_error( + auto_mock_setup, tmp_path +): + """ + Test validation raises error when entrypoint redundantly references working_dir. + """ + # Create test directory and file + test_dir = tmp_path / "testdir" + test_dir.mkdir() + script_file = test_dir / "script.py" + script_file.write_text("print('hello')") + + rayjob = RayJob( + job_name="test-job", + cluster_name="test-cluster", + entrypoint=f"python {test_dir}/script.py", # Redundant reference + namespace="test-namespace", + runtime_env=RuntimeEnv(working_dir=str(test_dir)), + ) + + # Should raise ValueError with helpful message + with pytest.raises(ValueError, match="Working directory conflict detected"): + rayjob._validate_working_dir_entrypoint() + + +def test_validate_working_dir_entrypoint_with_dot_slash(auto_mock_setup, tmp_path): + """ + Test validation handles paths with ./ prefix correctly. 
+ """ + # Create test directory and file + test_dir = tmp_path / "testdir" + test_dir.mkdir() + script_file = test_dir / "script.py" + script_file.write_text("print('hello')") + + # Change to temp directory so relative paths work + import os + + original_cwd = os.getcwd() + try: + os.chdir(tmp_path) + + rayjob = RayJob( + job_name="test-job", + cluster_name="test-cluster", + entrypoint="python ./testdir/script.py", # With ./ prefix + namespace="test-namespace", + runtime_env=RuntimeEnv(working_dir="./testdir"), # With ./ prefix + ) + + # Should raise ValueError (redundant reference) + with pytest.raises(ValueError, match="Working directory conflict detected"): + rayjob._validate_working_dir_entrypoint() + finally: + os.chdir(original_cwd) + + +def test_validate_working_dir_entrypoint_submit_integration(auto_mock_setup, tmp_path): + """ + Test that validation is called during submit() and blocks submission. + """ + mock_api_instance = auto_mock_setup["rayjob_api"] + mock_api_instance.submit_job.return_value = {"metadata": {"name": "test-job"}} + + # Create test directory and file + test_dir = tmp_path / "testdir" + test_dir.mkdir() + script_file = test_dir / "script.py" + script_file.write_text("print('hello')") + + rayjob = RayJob( + job_name="test-job", + cluster_name="test-cluster", + entrypoint=f"python {test_dir}/script.py", # Redundant reference + namespace="test-namespace", + runtime_env=RuntimeEnv(working_dir=str(test_dir)), + ) + + # Should raise ValueError during submit() before API call + with pytest.raises(ValueError, match="Working directory conflict detected"): + rayjob.submit() + + # Verify submit_job was never called (validation blocked it) + mock_api_instance.submit_job.assert_not_called() + + +def test_validate_working_dir_entrypoint_error_message_format( + auto_mock_setup, tmp_path +): + """ + Test that error message contains helpful information. 
+ """ + # Create test directory and file + test_dir = tmp_path / "testdir" + test_dir.mkdir() + script_file = test_dir / "script.py" + script_file.write_text("print('hello')") + + rayjob = RayJob( + job_name="test-job", + cluster_name="test-cluster", + entrypoint=f"python {test_dir}/script.py", + namespace="test-namespace", + runtime_env=RuntimeEnv(working_dir=str(test_dir)), + ) + + try: + rayjob._validate_working_dir_entrypoint() + assert False, "Should have raised ValueError" + except ValueError as e: + error_msg = str(e) + # Verify error message contains key information + assert "Working directory conflict detected" in error_msg + assert "working_dir:" in error_msg + assert "entrypoint references:" in error_msg + assert "Fix: Remove the directory prefix" in error_msg + assert "python script.py" in error_msg # Suggested fix + + +def test_validate_working_dir_entrypoint_subdirectory_valid(auto_mock_setup, tmp_path): + """ + Test validation passes when entrypoint references subdirectory within working_dir. + """ + # Create test directory structure: testdir/subdir/script.py + test_dir = tmp_path / "testdir" + test_dir.mkdir() + sub_dir = test_dir / "subdir" + sub_dir.mkdir() + script_file = sub_dir / "script.py" + script_file.write_text("print('hello')") + + rayjob = RayJob( + job_name="test-job", + cluster_name="test-cluster", + entrypoint="python subdir/script.py", # Correct: relative to working_dir + namespace="test-namespace", + runtime_env=RuntimeEnv(working_dir=str(test_dir)), + ) + + # Should not raise any exception + rayjob._validate_working_dir_entrypoint() + + +def test_validate_working_dir_entrypoint_runtime_env_as_dict(auto_mock_setup, tmp_path): + """ + Test validation works when runtime_env is passed as dict (not RuntimeEnv object). 
+ """ + # Create test directory and file + test_dir = tmp_path / "testdir" + test_dir.mkdir() + script_file = test_dir / "script.py" + script_file.write_text("print('hello')") + + rayjob = RayJob( + job_name="test-job", + cluster_name="test-cluster", + entrypoint=f"python {test_dir}/script.py", # Redundant reference + namespace="test-namespace", + runtime_env={"working_dir": str(test_dir)}, # Dict instead of RuntimeEnv + ) + + # Should raise ValueError even with dict runtime_env + with pytest.raises(ValueError, match="Working directory conflict detected"): + rayjob._validate_working_dir_entrypoint() + + +def test_validate_file_exists_with_working_dir(auto_mock_setup, tmp_path): + """ + Test validation checks that entrypoint file exists within working_dir. + """ + # Create working directory but NOT the script file + test_dir = tmp_path / "testdir" + test_dir.mkdir() + + rayjob = RayJob( + job_name="test-job", + cluster_name="test-cluster", + entrypoint="python script.py", # File doesn't exist + namespace="test-namespace", + runtime_env=RuntimeEnv(working_dir=str(test_dir)), + ) + + # Should raise ValueError about missing file + with pytest.raises(ValueError, match="Entrypoint file not found"): + rayjob._validate_working_dir_entrypoint() + + +def test_validate_file_exists_without_working_dir(auto_mock_setup, tmp_path): + """ + Test validation checks that entrypoint file exists when no working_dir and using ./ prefix. 
+ """ + # Don't create the script file + script_path = "./missing_script.py" + + rayjob = RayJob( + job_name="test-job", + cluster_name="test-cluster", + entrypoint=f"python {script_path}", # File doesn't exist (local path with ./) + namespace="test-namespace", + ) + + # Should raise ValueError about missing file + with pytest.raises(ValueError, match="Entrypoint file not found"): + rayjob._validate_working_dir_entrypoint() + + +def test_validate_existing_file_with_working_dir_passes(auto_mock_setup, tmp_path): + """ + Test validation passes when file exists in working_dir. + """ + # Create working directory AND the script file + test_dir = tmp_path / "testdir" + test_dir.mkdir() + script_file = test_dir / "script.py" + script_file.write_text("print('hello')") + + rayjob = RayJob( + job_name="test-job", + cluster_name="test-cluster", + entrypoint="python script.py", # File exists + namespace="test-namespace", + runtime_env=RuntimeEnv(working_dir=str(test_dir)), + ) + + # Should not raise any exception + rayjob._validate_working_dir_entrypoint() + + +def test_validate_inline_python_command_skipped(auto_mock_setup): + """ + Test validation skips inline Python commands (no file reference). + """ + rayjob = RayJob( + job_name="test-job", + cluster_name="test-cluster", + entrypoint="python -c 'print(\"hello world\")'", # No file reference + namespace="test-namespace", + ) + + # Should not raise any exception (no file to validate) + rayjob._validate_working_dir_entrypoint() + + +def test_validate_simple_filename_without_working_dir_missing(auto_mock_setup): + """ + Test validation checks simple filenames without working_dir. 
+ """ + rayjob = RayJob( + job_name="test-job", + cluster_name="test-cluster", + entrypoint="python script.py", # File doesn't exist locally + namespace="test-namespace", + ) + + # Should raise ValueError (file will be extracted from local, so must exist) + with pytest.raises(ValueError, match="Entrypoint file not found"): + rayjob._validate_working_dir_entrypoint() + + +def test_validate_simple_filename_without_working_dir_exists(auto_mock_setup, tmp_path): + """ + Test validation passes when simple filename exists locally without working_dir. + """ + import os + + original_cwd = os.getcwd() + try: + os.chdir(tmp_path) + # Create the script file in current directory + script_file = tmp_path / "script.py" + script_file.write_text("print('hello')") + + rayjob = RayJob( + job_name="test-job", + cluster_name="test-cluster", + entrypoint="python script.py", # Simple filename exists locally + namespace="test-namespace", + ) + + # Should not raise exception (file exists) + rayjob._validate_working_dir_entrypoint() + finally: + os.chdir(original_cwd) diff --git a/src/codeflare_sdk/ray/rayjobs/test/test_runtime_env.py b/src/codeflare_sdk/ray/rayjobs/test/test_runtime_env.py new file mode 100644 index 00000000..7a4150e5 --- /dev/null +++ b/src/codeflare_sdk/ray/rayjobs/test/test_runtime_env.py @@ -0,0 +1,1090 @@ +# Copyright 2025 IBM, Red Hat +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pytest +import os +import io +from unittest.mock import MagicMock, patch +from codeflare_sdk.common.utils.constants import MOUNT_PATH, RAY_VERSION +from ray.runtime_env import RuntimeEnv + +from codeflare_sdk.ray.rayjobs.rayjob import RayJob +from codeflare_sdk.ray.cluster.config import ClusterConfiguration +from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig +from kubernetes.client import ( + V1Volume, + V1VolumeMount, + ApiException, +) + +from codeflare_sdk.ray.rayjobs.runtime_env import ( + create_secret_from_spec, + extract_all_local_files, +) + + +def test_rayjob_with_remote_working_dir(auto_mock_setup): + """ + Test RayJob with remote working directory in runtime_env. + Should not extract local files and should pass through remote URL. + """ + runtime_env = RuntimeEnv( + working_dir="https://github.com/org/repo/archive/refs/heads/main.zip", + pip=["numpy", "pandas"], + env_vars={"TEST_VAR": "test_value"}, + ) + + rayjob = RayJob( + job_name="test-job", + entrypoint="python test.py", + cluster_name="test-cluster", + runtime_env=runtime_env, + namespace="test-namespace", + ) + + assert rayjob.runtime_env == runtime_env + + # Should not extract any local files due to remote working_dir + files = extract_all_local_files(rayjob) + assert files is None + + rayjob_cr = rayjob._build_rayjob_cr() + + # Should have runtimeEnvYAML with all fields + expected_runtime_env = ( + "env_vars:\n" + " TEST_VAR: test_value\n" + "pip:\n" + "- numpy\n" + "- pandas\n" + "working_dir: https://github.com/org/repo/archive/refs/heads/main.zip\n" + ) + assert rayjob_cr["spec"]["runtimeEnvYAML"] == expected_runtime_env + + # Should not have submitterPodTemplate since no local files + assert "submitterPodTemplate" not in rayjob_cr["spec"] + + # Entrypoint should be unchanged + assert rayjob_cr["spec"]["entrypoint"] == "python test.py" + + +def test_build_file_secret_spec(): + """ + Test building Secret specification for files. 
+ """ + config = ManagedClusterConfig() + files = {"main.py": "print('main')", "helper.py": "print('helper')"} + + spec = config.build_file_secret_spec( + job_name="test-job", namespace="test-namespace", files=files + ) + + assert spec["apiVersion"] == "v1" + assert spec["kind"] == "Secret" + assert spec["type"] == "Opaque" + assert spec["metadata"]["name"] == "test-job-files" + assert spec["metadata"]["namespace"] == "test-namespace" + assert spec["data"] == files + + +def test_build_file_volume_specs(): + """ + Test building volume and mount specifications for files. + """ + config = ManagedClusterConfig() + + volume_spec, mount_spec = config.build_file_volume_specs( + secret_name="test-files", mount_path="/custom/path" + ) + + assert volume_spec["name"] == "ray-job-files" + assert volume_spec["secret"]["secretName"] == "test-files" + + assert mount_spec["name"] == "ray-job-files" + assert mount_spec["mountPath"] == "/custom/path" + + +def test_add_file_volumes(): + """ + Test adding file volumes to cluster configuration. + """ + config = ManagedClusterConfig() + + # Initially no volumes + assert len(config.volumes) == 0 + assert len(config.volume_mounts) == 0 + + config.add_file_volumes(secret_name="test-files") + + assert len(config.volumes) == 1 + assert len(config.volume_mounts) == 1 + + volume = config.volumes[0] + mount = config.volume_mounts[0] + + assert volume.name == "ray-job-files" + assert volume.secret.secret_name == "test-files" + + assert mount.name == "ray-job-files" + assert mount.mount_path == MOUNT_PATH + + +def test_add_file_volumes_duplicate_prevention(): + """ + Test that adding file volumes twice doesn't create duplicates. 
+ """ + config = ManagedClusterConfig() + + # Add volumes twice + config.add_file_volumes(secret_name="test-files") + config.add_file_volumes(secret_name="test-files") + + assert len(config.volumes) == 1 + assert len(config.volume_mounts) == 1 + + +def test_create_secret_from_spec(auto_mock_setup): + """ + Test creating Secret via Kubernetes API. + """ + mock_api_instance = auto_mock_setup["k8s_api"] + + rayjob = RayJob( + job_name="test-job", + cluster_name="existing-cluster", + entrypoint="python test.py", + namespace="test-namespace", + ) + + secret_spec = { + "apiVersion": "v1", + "kind": "Secret", + "type": "Opaque", + "metadata": {"name": "test-files", "namespace": "test-namespace"}, + "data": {"test.py": "print('test')"}, + } + + result = create_secret_from_spec(rayjob, secret_spec) + + assert result == "test-files" + mock_api_instance.create_namespaced_secret.assert_called_once() + + +def test_create_secret_already_exists(auto_mock_setup): + """ + Test creating Secret when it already exists (409 conflict). + """ + mock_api_instance = auto_mock_setup["k8s_api"] + + mock_api_instance.create_namespaced_secret.side_effect = ApiException(status=409) + + rayjob = RayJob( + job_name="test-job", + cluster_name="existing-cluster", + entrypoint="python test.py", + namespace="test-namespace", + ) + + secret_spec = { + "apiVersion": "v1", + "kind": "Secret", + "type": "Opaque", + "metadata": {"name": "test-files", "namespace": "test-namespace"}, + "data": {"test.py": "print('test')"}, + } + + result = create_secret_from_spec(rayjob, secret_spec) + + assert result == "test-files" + mock_api_instance.create_namespaced_secret.assert_called_once() + mock_api_instance.replace_namespaced_secret.assert_called_once() + + +def test_create_secret_with_owner_reference_basic(mocker, auto_mock_setup, caplog): + """ + Test creating Secret with owner reference from valid RayJob result. 
+ """ + mock_api_instance = auto_mock_setup["k8s_api"] + + # Mock client.V1ObjectMeta and V1Secret + mock_v1_metadata = mocker.patch("kubernetes.client.V1ObjectMeta") + mock_metadata_instance = MagicMock() + mock_v1_metadata.return_value = mock_metadata_instance + + rayjob = RayJob( + job_name="test-job", + cluster_name="existing-cluster", + entrypoint="python test.py", + namespace="test-namespace", + ) + + secret_spec = { + "apiVersion": "v1", + "kind": "Secret", + "type": "Opaque", + "metadata": { + "name": "test-files", + "namespace": "test-namespace", + "labels": { + "ray.io/job-name": "test-job", + "app.kubernetes.io/managed-by": "codeflare-sdk", + "app.kubernetes.io/component": "rayjob-files", + }, + }, + "data": {"test.py": "print('test')"}, + } + + # Valid RayJob result with UID + rayjob_result = { + "metadata": { + "name": "test-job", + "namespace": "test-namespace", + "uid": "a4dd4c5a-ab61-411d-b4d1-4abb5177422a", + } + } + + with caplog.at_level("INFO"): + result = create_secret_from_spec(rayjob, secret_spec, rayjob_result) + + assert result == "test-files" + + # Verify owner reference was set + expected_owner_ref = mocker.ANY # We'll check via the logs + assert ( + "Adding owner reference to Secret 'test-files' with RayJob UID: a4dd4c5a-ab61-411d-b4d1-4abb5177422a" + in caplog.text + ) + + assert mock_metadata_instance.owner_references is not None + mock_api_instance.create_namespaced_secret.assert_called_once() + + +def test_create_secret_without_owner_reference_no_uid(mocker, auto_mock_setup, caplog): + """ + Test creating Secret without owner reference when RayJob has no UID. 
+ """ + mock_api_instance = auto_mock_setup["k8s_api"] + + mock_v1_metadata = mocker.patch("kubernetes.client.V1ObjectMeta") + mock_metadata_instance = MagicMock() + mock_v1_metadata.return_value = mock_metadata_instance + + rayjob = RayJob( + job_name="test-job", + cluster_name="existing-cluster", + entrypoint="python test.py", + namespace="test-namespace", + ) + + secret_spec = { + "apiVersion": "v1", + "kind": "Secret", + "type": "Opaque", + "metadata": {"name": "test-files", "namespace": "test-namespace"}, + "data": {"test.py": "print('test')"}, + } + + # RayJob result without UID + rayjob_result = { + "metadata": { + "name": "test-job", + "namespace": "test-namespace", + # No UID field + } + } + + with caplog.at_level("WARNING"): + result = create_secret_from_spec(rayjob, secret_spec, rayjob_result) + + assert result == "test-files" + + # Verify warning was logged and no owner reference was set + assert ( + "No valid RayJob result with UID found, Secret 'test-files' will not have owner reference" + in caplog.text + ) + + # The important part is that the warning was logged, indicating no owner reference was set + mock_api_instance.create_namespaced_secret.assert_called_once() + + +def test_create_secret_with_invalid_rayjob_result(auto_mock_setup, caplog): + """ + Test creating Secret with None or invalid rayjob_result. 
+ """ + mock_api_instance = auto_mock_setup["k8s_api"] + + rayjob = RayJob( + job_name="test-job", + cluster_name="existing-cluster", + entrypoint="python test.py", + namespace="test-namespace", + ) + + secret_spec = { + "apiVersion": "v1", + "kind": "Secret", + "type": "Opaque", + "metadata": {"name": "test-files", "namespace": "test-namespace"}, + "data": {"test.py": "print('test')"}, + } + + # Test with None + with caplog.at_level("WARNING"): + result = create_secret_from_spec(rayjob, secret_spec, None) + + assert result == "test-files" + assert "No valid RayJob result with UID found" in caplog.text + + # Test with string instead of dict + caplog.clear() + with caplog.at_level("WARNING"): + result = create_secret_from_spec(rayjob, secret_spec, "not-a-dict") + + assert result == "test-files" + assert "No valid RayJob result with UID found" in caplog.text + + +def test_file_handling_kubernetes_best_practice_flow(mocker, tmp_path): + """ + Test the Kubernetes best practice flow: pre-declare volume, submit, create Secret. 
+ """ + mocker.patch("kubernetes.config.load_kube_config") + + mock_api_class = mocker.patch("codeflare_sdk.ray.rayjobs.rayjob.RayjobApi") + mock_api_instance = MagicMock() + mock_api_class.return_value = mock_api_instance + + submit_result = { + "metadata": { + "name": "test-job", + "namespace": "test-namespace", + "uid": "test-uid-12345", + } + } + mock_api_instance.submit_job.return_value = submit_result + + # Mock create_file_secret where it's used (imported into rayjob module) + mock_create_secret = mocker.patch( + "codeflare_sdk.ray.rayjobs.rayjob.create_file_secret" + ) + mock_add_volumes = mocker.patch.object(ManagedClusterConfig, "add_file_volumes") + + # RayClusterApi is already mocked by auto_mock_setup + + test_file = tmp_path / "test.py" + test_file.write_text("print('test')") + + call_order = [] + + def track_add_volumes(*args, **kwargs): + call_order.append("add_volumes") + # Should be called with Secret name + assert args[0] == "test-job-files" + + def track_submit(*args, **kwargs): + call_order.append("submit_job") + return submit_result + + def track_create_secret(*args, **kwargs): + call_order.append("create_secret") + # Args should be: (job, files, rayjob_result) + assert len(args) >= 3, f"Expected 3 args, got {len(args)}: {args}" + assert args[2] == submit_result # rayjob_result should be third arg + + mock_add_volumes.side_effect = track_add_volumes + mock_api_instance.submit_job.side_effect = track_submit + mock_create_secret.side_effect = track_create_secret + + original_cwd = os.getcwd() + try: + os.chdir(tmp_path) + + cluster_config = ManagedClusterConfig() + + rayjob = RayJob( + job_name="test-job", + cluster_config=cluster_config, + entrypoint="python test.py", + namespace="test-namespace", + ) + + rayjob.submit() + finally: + os.chdir(original_cwd) + + # Verify the order: submit → create Secret + assert call_order == ["submit_job", "create_secret"] + + mock_api_instance.submit_job.assert_called_once() + 
mock_create_secret.assert_called_once() + + # Verify create_file_secret was called with: (job, files, rayjob_result) + # Files dict includes metadata key __entrypoint_path__ for single file case + call_args = mock_create_secret.call_args[0] + assert call_args[0] == rayjob + assert call_args[2] == submit_result + # Check that the actual file content is present + assert "test.py" in call_args[1] + assert call_args[1]["test.py"] == "print('test')" + + +def test_rayjob_submit_with_files_new_cluster(auto_mock_setup, tmp_path): + """ + Test RayJob submission with file detection for new cluster. + """ + mock_api_instance = auto_mock_setup["rayjob_api"] + mock_api_instance.submit_job.return_value = True + + mock_k8s_instance = auto_mock_setup["k8s_api"] + + # Create test file + test_file = tmp_path / "test.py" + test_file.write_text("print('Hello from the test file!')") + + cluster_config = ManagedClusterConfig() + + original_cwd = os.getcwd() + os.chdir(tmp_path) + + try: + rayjob = RayJob( + job_name="test-job", + cluster_config=cluster_config, + entrypoint="python test.py", + namespace="test-namespace", + ) + + # Submit should detect files and handle them + result = rayjob.submit() + + assert result == "test-job" + + mock_k8s_instance.create_namespaced_secret.assert_called_once() + + assert len(cluster_config.volumes) == 0 + assert len(cluster_config.volume_mounts) == 0 + # Entrypoint should be adjusted to use just the filename + assert rayjob.entrypoint == "python test.py" + + finally: + os.chdir(original_cwd) + + +def test_create_secret_api_error_non_409(auto_mock_setup): + """ + Test create_secret_from_spec handles non-409 API errors. 
+ """ + mock_api_instance = auto_mock_setup["k8s_api"] + + # Configure to raise 500 error + mock_api_instance.create_namespaced_secret.side_effect = ApiException(status=500) + + rayjob = RayJob( + job_name="test-job", + cluster_name="existing-cluster", + entrypoint="python test.py", + namespace="test-namespace", + ) + + secret_spec = { + "apiVersion": "v1", + "kind": "Secret", + "type": "Opaque", + "metadata": {"name": "test-files", "namespace": "test-namespace"}, + "data": {"test.py": "print('test')"}, + } + + with pytest.raises(RuntimeError, match="Failed to create Secret"): + create_secret_from_spec(rayjob, secret_spec) + + +def test_add_file_volumes_existing_volume_skip(): + """ + Test add_file_volumes skips when volume already exists (missing coverage). + """ + from kubernetes.client import V1SecretVolumeSource + + config = ManagedClusterConfig() + + # Pre-add a volume with same name + existing_volume = V1Volume( + name="ray-job-files", + secret=V1SecretVolumeSource(secret_name="existing-files"), + ) + config.volumes.append(existing_volume) + + config.add_file_volumes(secret_name="new-files") + assert len(config.volumes) == 1 + assert len(config.volume_mounts) == 0 # Mount not added due to volume skip + + +def test_add_file_volumes_existing_mount_skip(): + """ + Test add_file_volumes skips when mount already exists (missing coverage). + """ + config = ManagedClusterConfig() + + # Pre-add a mount with same name + existing_mount = V1VolumeMount(name="ray-job-files", mount_path="/existing/path") + config.volume_mounts.append(existing_mount) + + config.add_file_volumes(secret_name="new-files") + assert len(config.volumes) == 0 # Volume not added due to mount skip + assert len(config.volume_mounts) == 1 + + +def test_zip_directory_functionality(tmp_path): + """ + Test _zip_directory with real directories and files. 
+ """ + from codeflare_sdk.ray.rayjobs.runtime_env import _zip_directory + + # Create test directory structure + test_dir = tmp_path / "working_dir" + test_dir.mkdir() + + # Create some test files + (test_dir / "main.py").write_text("print('main script')") + (test_dir / "utils.py").write_text("def helper(): pass") + + # Create subdirectory with file + sub_dir = test_dir / "subdir" + sub_dir.mkdir() + (sub_dir / "nested.py").write_text("print('nested file')") + + # Test zipping + zip_data = _zip_directory(str(test_dir)) + + assert zip_data is not None + assert len(zip_data) > 0 + assert isinstance(zip_data, bytes) + + +def test_zip_directory_excludes_jupyter_notebooks(tmp_path, caplog): + """ + Test that Jupyter notebook files (.ipynb) are excluded from zip. + """ + from codeflare_sdk.ray.rayjobs.runtime_env import _zip_directory + import zipfile + + # Create test directory with mixed file types + test_dir = tmp_path / "working_dir" + test_dir.mkdir() + + # Create Python files (should be included) + (test_dir / "main.py").write_text("print('main script')") + (test_dir / "utils.py").write_text("def helper(): pass") + + # Create Jupyter notebook files (should be excluded) + (test_dir / "analysis.ipynb").write_text('{"cells": [], "metadata": {}}') + (test_dir / "experiment.IPYNB").write_text( + '{"cells": [], "metadata": {}}' + ) # Test case insensitive + + # Create subdirectory with mixed files + sub_dir = test_dir / "notebooks" + sub_dir.mkdir() + (sub_dir / "data_exploration.ipynb").write_text('{"cells": [], "metadata": {}}') + (sub_dir / "helper.py").write_text("print('nested file')") + + # Test zipping + with caplog.at_level("INFO"): + zip_data = _zip_directory(str(test_dir)) + + assert zip_data is not None + assert len(zip_data) > 0 + + # Verify log message includes exclusion count + assert "Excluded 3 Jupyter notebook files" in caplog.text + + # Verify excluded files are not in the zip + zip_buffer = io.BytesIO(zip_data) + with zipfile.ZipFile(zip_buffer, "r") 
as zipf: + zip_contents = zipf.namelist() + + # Python files should be present + assert "main.py" in zip_contents + assert "utils.py" in zip_contents + assert "notebooks/helper.py" in zip_contents + + # Jupyter notebooks should be excluded + assert "analysis.ipynb" not in zip_contents + assert "experiment.IPYNB" not in zip_contents + assert "notebooks/data_exploration.ipynb" not in zip_contents + + +def test_zip_directory_no_exclusions_when_no_notebooks(tmp_path, caplog): + """ + Test that no exclusion message is logged when no notebook files exist. + """ + from codeflare_sdk.ray.rayjobs.runtime_env import _zip_directory + + # Create test directory with only Python files + test_dir = tmp_path / "working_dir" + test_dir.mkdir() + (test_dir / "main.py").write_text("print('main script')") + (test_dir / "utils.py").write_text("def helper(): pass") + + # Test zipping + with caplog.at_level("INFO"): + zip_data = _zip_directory(str(test_dir)) + + assert zip_data is not None + + # Verify log message does NOT mention exclusions + assert "Excluded" not in caplog.text + assert "Jupyter notebook files" not in caplog.text + + +def test_should_exclude_file_function(): + """ + Test the _should_exclude_file helper function directly. 
+ """ + from codeflare_sdk.ray.rayjobs.runtime_env import _should_exclude_file + + # Should exclude .ipynb files (case insensitive) + assert _should_exclude_file("notebook.ipynb") is True + assert _should_exclude_file("analysis.IPYNB") is True + assert _should_exclude_file("data/exploration.ipynb") is True + assert _should_exclude_file("subdir/nested.Ipynb") is True + + # Should NOT exclude other files + assert _should_exclude_file("script.py") is False + assert _should_exclude_file("data.json") is False + assert _should_exclude_file("requirements.txt") is False + assert _should_exclude_file("README.md") is False + assert _should_exclude_file("model.pkl") is False + + +def test_zip_directory_error_handling(): + """ + Test _zip_directory error handling for IO errors during zipping. + """ + from codeflare_sdk.ray.rayjobs.runtime_env import _zip_directory + + # Mock os.walk to raise an OSError + with patch("os.walk", side_effect=OSError("Permission denied")): + zip_data = _zip_directory("/some/path") + assert zip_data is None + + +def test_extract_all_local_files_with_working_dir(tmp_path): + """ + Test extract_all_local_files with local working directory. 
+ """ + # Create test working directory + working_dir = tmp_path / "working_dir" + working_dir.mkdir() + (working_dir / "script.py").write_text("print('working dir script')") + + runtime_env = RuntimeEnv(working_dir=str(working_dir)) + + rayjob = RayJob( + job_name="test-job", + entrypoint="python script.py", + runtime_env=runtime_env, + namespace="test-namespace", + cluster_name="test-cluster", + ) + + files = extract_all_local_files(rayjob) + + assert files is not None + assert "working_dir.zip" in files + assert isinstance(files["working_dir.zip"], str) # base64 encoded + + # Verify it's valid base64 + import base64 + + try: + decoded = base64.b64decode(files["working_dir.zip"]) + assert len(decoded) > 0 + except Exception: + pytest.fail("Invalid base64 encoding") + + +def test_extract_all_local_files_excludes_notebooks(tmp_path, caplog): + """ + Test that extract_all_local_files excludes Jupyter notebooks when zipping working directory. + """ + import zipfile + import base64 + + # Create test working directory with mixed files + working_dir = tmp_path / "working_dir" + working_dir.mkdir() + + # Python files that should be included + (working_dir / "main.py").write_text("print('main script')") + (working_dir / "helper.py").write_text("def helper_function(): pass") + + # Jupyter notebooks that should be excluded + (working_dir / "analysis.ipynb").write_text( + '{"cells": [{"cell_type": "code", "source": ["print(\'hello\')"]}]}' + ) + (working_dir / "data.ipynb").write_text('{"cells": [], "metadata": {}}') + + runtime_env = RuntimeEnv(working_dir=str(working_dir)) + + rayjob = RayJob( + job_name="test-job", + entrypoint="python main.py", + runtime_env=runtime_env, + namespace="test-namespace", + cluster_name="test-cluster", + ) + + # This should zip the directory and exclude notebooks + with caplog.at_level("INFO"): + files = extract_all_local_files(rayjob) + + assert files is not None + assert "working_dir.zip" in files + + # Verify exclusion was logged + assert 
"Excluded 2 Jupyter notebook files" in caplog.text + + # Decode and verify zip contents + zip_data = base64.b64decode(files["working_dir.zip"]) + zip_buffer = io.BytesIO(zip_data) + + with zipfile.ZipFile(zip_buffer, "r") as zipf: + zip_contents = zipf.namelist() + + # Python files should be present + assert "main.py" in zip_contents + assert "helper.py" in zip_contents + + # Jupyter notebooks should be excluded + assert "analysis.ipynb" not in zip_contents + assert "data.ipynb" not in zip_contents + + +def test_extract_single_entrypoint_file_error_handling(tmp_path): + """ + Test _extract_single_entrypoint_file with file read errors. + """ + from codeflare_sdk.ray.rayjobs.runtime_env import _extract_single_entrypoint_file + + # Create a file that exists but make it unreadable + test_file = tmp_path / "unreadable.py" + test_file.write_text("print('test')") + + rayjob = RayJob( + job_name="test-job", + entrypoint=f"python {test_file}", + namespace="test-namespace", + cluster_name="test-cluster", + ) + + # Mock open to raise IOError + with patch("builtins.open", side_effect=IOError("Permission denied")): + result = _extract_single_entrypoint_file(rayjob) + assert result is None + + +def test_extract_single_entrypoint_file_no_match(): + """ + Test _extract_single_entrypoint_file with no Python file matches. + """ + from codeflare_sdk.ray.rayjobs.runtime_env import _extract_single_entrypoint_file + + rayjob = RayJob( + job_name="test-job", + entrypoint="echo 'no python files here'", + namespace="test-namespace", + cluster_name="test-cluster", + ) + + result = _extract_single_entrypoint_file(rayjob) + assert result is None + + +def test_parse_requirements_file_valid(tmp_path): + """ + Test parse_requirements_file with valid requirements.txt. 
+ """ + from codeflare_sdk.ray.rayjobs.runtime_env import parse_requirements_file + + # Create test requirements file + req_file = tmp_path / "requirements.txt" + req_file.write_text( + """# This is a comment +numpy==1.21.0 +pandas>=1.3.0 + +# Another comment +scikit-learn +""" + ) + + result = parse_requirements_file(str(req_file)) + + assert result is not None + assert len(result) == 3 + assert "numpy==1.21.0" in result + assert "pandas>=1.3.0" in result + assert "scikit-learn" in result + # Comments and empty lines should be filtered out + assert "# This is a comment" not in result + + +def test_parse_requirements_file_missing(): + """ + Test parse_requirements_file with non-existent file. + """ + from codeflare_sdk.ray.rayjobs.runtime_env import parse_requirements_file + + result = parse_requirements_file("/non/existent/requirements.txt") + assert result is None + + +def test_parse_requirements_file_read_error(tmp_path): + """ + Test parse_requirements_file with file read error. + """ + from codeflare_sdk.ray.rayjobs.runtime_env import parse_requirements_file + + # Create a file + req_file = tmp_path / "requirements.txt" + req_file.write_text("numpy==1.21.0") + + # Mock open to raise IOError + with patch("builtins.open", side_effect=IOError("Permission denied")): + result = parse_requirements_file(str(req_file)) + assert result is None + + +def test_process_pip_dependencies_list(): + """ + Test process_pip_dependencies with list input. + """ + from codeflare_sdk.ray.rayjobs.runtime_env import process_pip_dependencies + + rayjob = RayJob( + job_name="test-job", + entrypoint="python test.py", + namespace="test-namespace", + cluster_name="test-cluster", + ) + + pip_list = ["numpy", "pandas", "scikit-learn"] + result = process_pip_dependencies(rayjob, pip_list) + + assert result == pip_list + + +def test_process_pip_dependencies_requirements_file(tmp_path): + """ + Test process_pip_dependencies with requirements.txt path. 
+ """ + from codeflare_sdk.ray.rayjobs.runtime_env import process_pip_dependencies + + # Create test requirements file + req_file = tmp_path / "requirements.txt" + req_file.write_text("numpy==1.21.0\npandas>=1.3.0") + + rayjob = RayJob( + job_name="test-job", + entrypoint="python test.py", + namespace="test-namespace", + cluster_name="test-cluster", + ) + + result = process_pip_dependencies(rayjob, str(req_file)) + + assert result is not None + assert len(result) == 2 + assert "numpy==1.21.0" in result + assert "pandas>=1.3.0" in result + + +def test_process_pip_dependencies_dict_format(): + """ + Test process_pip_dependencies with dict format containing packages. + """ + from codeflare_sdk.ray.rayjobs.runtime_env import process_pip_dependencies + + rayjob = RayJob( + job_name="test-job", + entrypoint="python test.py", + namespace="test-namespace", + cluster_name="test-cluster", + ) + + pip_dict = {"packages": ["numpy", "pandas"], "pip_check": False} + result = process_pip_dependencies(rayjob, pip_dict) + + assert result == ["numpy", "pandas"] + + +def test_process_pip_dependencies_unsupported_format(): + """ + Test process_pip_dependencies with unsupported format. + """ + from codeflare_sdk.ray.rayjobs.runtime_env import process_pip_dependencies + + rayjob = RayJob( + job_name="test-job", + entrypoint="python test.py", + namespace="test-namespace", + cluster_name="test-cluster", + ) + + # Test with unsupported format (int) + result = process_pip_dependencies(rayjob, 12345) + assert result is None + + +def test_process_runtime_env_local_working_dir(tmp_path): + """ + Test process_runtime_env with local working directory. 
+ """ + from codeflare_sdk.ray.rayjobs.runtime_env import process_runtime_env, UNZIP_PATH + + # Create test working directory + working_dir = tmp_path / "working_dir" + working_dir.mkdir() + (working_dir / "script.py").write_text("print('test')") + + runtime_env = RuntimeEnv( + working_dir=str(working_dir), + env_vars={"TEST_VAR": "test_value"}, + ) + + rayjob = RayJob( + job_name="test-job", + entrypoint="python script.py", + runtime_env=runtime_env, + namespace="test-namespace", + cluster_name="test-cluster", + ) + + result = process_runtime_env(rayjob) + + assert result is not None + assert f"working_dir: {UNZIP_PATH}" in result + assert "env_vars:" in result + assert "TEST_VAR: test_value" in result + + +def test_process_runtime_env_single_file_case(tmp_path): + """ + Test process_runtime_env with single file case (no working_dir in runtime_env). + """ + from codeflare_sdk.ray.rayjobs.runtime_env import process_runtime_env + from codeflare_sdk.common.utils.constants import MOUNT_PATH + + rayjob = RayJob( + job_name="test-job", + entrypoint="python test.py", + namespace="test-namespace", + cluster_name="test-cluster", + ) + + # Files dict without working_dir.zip (single file case) + files = {"test.py": "print('test')"} + + result = process_runtime_env(rayjob, files) + + assert result is not None + assert f"working_dir: {MOUNT_PATH}" in result + + +def test_process_runtime_env_no_processing_needed(): + """ + Test process_runtime_env returns None when no processing needed. + """ + from codeflare_sdk.ray.rayjobs.runtime_env import process_runtime_env + + rayjob = RayJob( + job_name="test-job", + entrypoint="python test.py", + namespace="test-namespace", + cluster_name="test-cluster", + ) + + # No runtime_env and no files + result = process_runtime_env(rayjob) + assert result is None + + +def test_normalize_runtime_env_with_none(): + """ + Test _normalize_runtime_env with None input. 
+ """ + from codeflare_sdk.ray.rayjobs.runtime_env import _normalize_runtime_env + + result = _normalize_runtime_env(None) + assert result is None + + +def test_extract_single_entrypoint_file_no_entrypoint(): + """ + Test _extract_single_entrypoint_file with no entrypoint. + """ + from codeflare_sdk.ray.rayjobs.runtime_env import _extract_single_entrypoint_file + + rayjob = RayJob( + job_name="test-job", + entrypoint=None, # No entrypoint + namespace="test-namespace", + cluster_name="test-cluster", + ) + + result = _extract_single_entrypoint_file(rayjob) + assert result is None + + +def test_create_file_secret_filters_metadata_keys(auto_mock_setup, tmp_path): + """ + Test create_file_secret filters out metadata keys from files dict. + """ + from codeflare_sdk.ray.rayjobs.runtime_env import create_file_secret + + rayjob = RayJob( + job_name="test-job", + cluster_name="existing-cluster", + entrypoint="python test.py", + namespace="test-namespace", + ) + + # Files dict with metadata key that should be filtered out + files = { + "__entrypoint_path__": "some/path/test.py", # Should be filtered + "test.py": "print('test')", # Should remain + } + + rayjob_result = { + "metadata": { + "name": "test-job", + "namespace": "test-namespace", + "uid": "test-uid-12345", + } + } + + # This should not raise an error and should filter out metadata keys + create_file_secret(rayjob, files, rayjob_result) + + # Verify the Secret was created (mocked) + mock_api_instance = auto_mock_setup["k8s_api"] + mock_api_instance.create_namespaced_secret.assert_called_once() + + # The call should have filtered data (only test.py, not __entrypoint_path__) + call_args = mock_api_instance.create_namespaced_secret.call_args + secret_data = call_args[1]["body"].string_data # Changed from data to string_data + assert "test.py" in secret_data + assert "__entrypoint_path__" not in secret_data diff --git a/src/codeflare_sdk/ray/rayjobs/test_status.py b/src/codeflare_sdk/ray/rayjobs/test/test_status.py 
similarity index 100% rename from src/codeflare_sdk/ray/rayjobs/test_status.py rename to src/codeflare_sdk/ray/rayjobs/test/test_status.py diff --git a/src/codeflare_sdk/ray/rayjobs/test_rayjob.py b/src/codeflare_sdk/ray/rayjobs/test_rayjob.py deleted file mode 100644 index 829265d6..00000000 --- a/src/codeflare_sdk/ray/rayjobs/test_rayjob.py +++ /dev/null @@ -1,2196 +0,0 @@ -# Copyright 2022-2025 IBM, Red Hat -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest -import os -from unittest.mock import MagicMock, patch -from codeflare_sdk.common.utils.constants import MOUNT_PATH, RAY_VERSION - -from codeflare_sdk.ray.rayjobs.rayjob import RayJob -from codeflare_sdk.ray.cluster.config import ClusterConfiguration -from codeflare_sdk.ray.rayjobs.config import ManagedClusterConfig -from kubernetes.client import ( - V1Volume, - V1VolumeMount, - V1Toleration, - V1ConfigMapVolumeSource, - ApiException, -) - - -# Global test setup that runs automatically for ALL tests -@pytest.fixture(autouse=True) -def auto_mock_setup(mocker): - """Automatically mock common dependencies for all tests.""" - mocker.patch("kubernetes.config.load_kube_config") - - # Always mock get_default_kueue_name to prevent K8s API calls - mocker.patch( - "codeflare_sdk.ray.rayjobs.rayjob.get_default_kueue_name", - return_value="default-queue", - ) - - mock_get_ns = mocker.patch( - "codeflare_sdk.ray.rayjobs.rayjob.get_current_namespace", - return_value="test-namespace", - ) - - mock_rayjob_api = 
mocker.patch("codeflare_sdk.ray.rayjobs.rayjob.RayjobApi") - mock_rayjob_instance = MagicMock() - mock_rayjob_api.return_value = mock_rayjob_instance - - mock_cluster_api = mocker.patch("codeflare_sdk.ray.rayjobs.rayjob.RayClusterApi") - mock_cluster_instance = MagicMock() - mock_cluster_api.return_value = mock_cluster_instance - - mock_k8s_api = mocker.patch("kubernetes.client.CoreV1Api") - mock_k8s_instance = MagicMock() - mock_k8s_api.return_value = mock_k8s_instance - - mocker.patch("codeflare_sdk.ray.rayjobs.rayjob.get_api_client") - - # Return the mocked instances so tests can configure them as needed - return { - "rayjob_api": mock_rayjob_instance, - "cluster_api": mock_cluster_instance, - "k8s_api": mock_k8s_instance, - "get_current_namespace": mock_get_ns, - } - - -def test_rayjob_submit_success(auto_mock_setup): - """Test successful RayJob submission.""" - mock_api_instance = auto_mock_setup["rayjob_api"] - - mock_api_instance.submit.return_value = {"metadata": {"name": "test-rayjob"}} - - rayjob = RayJob( - job_name="test-rayjob", - cluster_name="test-ray-cluster", - namespace="test-namespace", - entrypoint="python -c 'print(\"hello world\")'", - runtime_env={"pip": ["requests"]}, - ) - - job_id = rayjob.submit() - - assert job_id == "test-rayjob" - - mock_api_instance.submit_job.assert_called_once() - call_args = mock_api_instance.submit_job.call_args - - assert call_args.kwargs["k8s_namespace"] == "test-namespace" - - job_cr = call_args.kwargs["job"] - assert job_cr["metadata"]["name"] == "test-rayjob" - assert job_cr["metadata"]["namespace"] == "test-namespace" - assert job_cr["spec"]["entrypoint"] == "python -c 'print(\"hello world\")'" - assert job_cr["spec"]["clusterSelector"]["ray.io/cluster"] == "test-ray-cluster" - assert job_cr["spec"]["runtimeEnvYAML"] == "{'pip': ['requests']}" - - -def test_rayjob_submit_failure(auto_mock_setup): - """Test RayJob submission failure.""" - mock_api_instance = auto_mock_setup["rayjob_api"] - - 
mock_api_instance.submit_job.return_value = None - - rayjob = RayJob( - job_name="test-rayjob", - cluster_name="test-ray-cluster", - namespace="default", - entrypoint="python script.py", - runtime_env={"pip": ["numpy"]}, - ) - - with pytest.raises(RuntimeError, match="Failed to submit RayJob test-rayjob"): - rayjob.submit() - - -def test_rayjob_init_validation_both_provided(auto_mock_setup): - """Test that providing both cluster_name and cluster_config raises error.""" - cluster_config = ClusterConfiguration(name="test-cluster", namespace="test") - - with pytest.raises( - ValueError, - match="❌ Configuration Error: You cannot specify both 'cluster_name' and 'cluster_config'", - ): - RayJob( - job_name="test-job", - cluster_name="existing-cluster", - cluster_config=cluster_config, - entrypoint="python script.py", - ) - - -def test_rayjob_init_validation_neither_provided(auto_mock_setup): - """Test that providing neither cluster_name nor cluster_config raises error.""" - with pytest.raises( - ValueError, - match="❌ Configuration Error: You must provide either 'cluster_name'", - ): - RayJob(job_name="test-job", entrypoint="python script.py") - - -def test_rayjob_init_with_cluster_config(auto_mock_setup): - """Test RayJob initialization with cluster configuration for auto-creation.""" - cluster_config = ClusterConfiguration( - name="auto-cluster", namespace="test-namespace", num_workers=2 - ) - - rayjob = RayJob( - job_name="test-job", - cluster_config=cluster_config, - entrypoint="python script.py", - namespace="test-namespace", - ) - - assert rayjob.name == "test-job" - assert rayjob.cluster_name == "test-job-cluster" # Generated from job name - assert rayjob._cluster_config == cluster_config - assert rayjob._cluster_name is None - - -def test_rayjob_cluster_name_generation(auto_mock_setup): - """Test that cluster names are generated when config has empty name.""" - cluster_config = ClusterConfiguration( - name="", # Empty name should trigger generation - 
namespace="test-namespace", - num_workers=1, - ) - - rayjob = RayJob( - job_name="my-job", - cluster_config=cluster_config, - entrypoint="python script.py", - namespace="test-namespace", - ) - - assert rayjob.cluster_name == "my-job-cluster" - - -def test_rayjob_cluster_config_namespace_none(auto_mock_setup): - """Test that cluster config namespace is set when None.""" - cluster_config = ClusterConfiguration( - name="test-cluster", - namespace=None, # This should be set to job namespace - num_workers=1, - ) - - rayjob = RayJob( - job_name="test-job", - cluster_config=cluster_config, - namespace="job-namespace", - entrypoint="python script.py", - ) - - assert rayjob.namespace == "job-namespace" - - -def test_rayjob_with_active_deadline_seconds(auto_mock_setup): - """Test RayJob CR generation with active deadline seconds.""" - rayjob = RayJob( - job_name="test-job", - cluster_name="test-cluster", - namespace="test-namespace", - entrypoint="python main.py", - active_deadline_seconds=30, - ) - - rayjob_cr = rayjob._build_rayjob_cr() - - assert rayjob_cr["spec"]["activeDeadlineSeconds"] == 30 - - -def test_build_ray_cluster_spec_no_config_error(auto_mock_setup): - """Test _build_ray_cluster_spec raises error when no cluster config.""" - rayjob = RayJob( - job_name="test-job", - cluster_name="existing-cluster", - entrypoint="python script.py", - namespace="test-namespace", - ) - - rayjob_cr = rayjob._build_rayjob_cr() - - assert rayjob_cr["spec"]["clusterSelector"]["ray.io/cluster"] == "existing-cluster" - assert "rayClusterSpec" not in rayjob_cr["spec"] - - -def test_build_ray_cluster_spec(mocker, auto_mock_setup): - """Test _build_ray_cluster_spec method.""" - - mock_ray_cluster = { - "apiVersion": "ray.io/v1", - "kind": "RayCluster", - "metadata": {"name": "test-cluster", "namespace": "test"}, - "spec": { - "rayVersion": RAY_VERSION, - "headGroupSpec": {"replicas": 1}, - "workerGroupSpecs": [{"replicas": 2}], - }, - } - cluster_config = 
ManagedClusterConfig(num_workers=2) - mocker.patch.object( - cluster_config, "build_ray_cluster_spec", return_value=mock_ray_cluster["spec"] - ) - - rayjob = RayJob( - job_name="test-job", - cluster_config=cluster_config, - entrypoint="python script.py", - namespace="test-namespace", - ) - - rayjob_cr = rayjob._build_rayjob_cr() - - assert "rayClusterSpec" in rayjob_cr["spec"] - cluster_config.build_ray_cluster_spec.assert_called_once_with( - cluster_name="test-job-cluster" - ) - - -def test_build_rayjob_cr_with_existing_cluster(auto_mock_setup): - """Test _build_rayjob_cr method with existing cluster.""" - - rayjob = RayJob( - job_name="test-job", - cluster_name="existing-cluster", - namespace="test-namespace", - entrypoint="python main.py", - ttl_seconds_after_finished=300, - ) - - rayjob_cr = rayjob._build_rayjob_cr() - - assert rayjob_cr["apiVersion"] == "ray.io/v1" - assert rayjob_cr["kind"] == "RayJob" - assert rayjob_cr["metadata"]["name"] == "test-job" - spec = rayjob_cr["spec"] - assert spec["entrypoint"] == "python main.py" - assert spec["shutdownAfterJobFinishes"] is False - assert spec["ttlSecondsAfterFinished"] == 300 - - assert spec["clusterSelector"]["ray.io/cluster"] == "existing-cluster" - assert "rayClusterSpec" not in spec - - -def test_build_rayjob_cr_with_auto_cluster(mocker, auto_mock_setup): - """Test _build_rayjob_cr method with auto-created cluster.""" - mock_ray_cluster = { - "apiVersion": "ray.io/v1", - "kind": "RayCluster", - "metadata": {"name": "auto-cluster", "namespace": "test"}, - "spec": { - "rayVersion": RAY_VERSION, - "headGroupSpec": {"replicas": 1}, - "workerGroupSpecs": [{"replicas": 2}], - }, - } - cluster_config = ManagedClusterConfig(num_workers=2) - - mocker.patch.object( - cluster_config, "build_ray_cluster_spec", return_value=mock_ray_cluster["spec"] - ) - - rayjob = RayJob( - job_name="test-job", - cluster_config=cluster_config, - entrypoint="python main.py", - namespace="test-namespace", - ) - - rayjob_cr = 
rayjob._build_rayjob_cr() - assert rayjob_cr["spec"]["rayClusterSpec"] == mock_ray_cluster["spec"] - assert "clusterSelector" not in rayjob_cr["spec"] - - -def test_submit_validation_no_entrypoint(auto_mock_setup): - """Test that submit() raises error when entrypoint is None.""" - rayjob = RayJob( - job_name="test-job", - cluster_name="test-cluster", - entrypoint=None, # No entrypoint provided - namespace="test-namespace", - ) - - with pytest.raises( - ValueError, match="Entrypoint must be provided to submit a RayJob" - ): - rayjob.submit() - - -def test_submit_with_auto_cluster(mocker, auto_mock_setup): - """Test successful submission with auto-created cluster.""" - mock_api_instance = auto_mock_setup["rayjob_api"] - - mock_ray_cluster = { - "apiVersion": "ray.io/v1", - "kind": "RayCluster", - "spec": { - "rayVersion": RAY_VERSION, - "headGroupSpec": {"replicas": 1}, - "workerGroupSpecs": [{"replicas": 1}], - }, - } - mock_api_instance.submit_job.return_value = True - - cluster_config = ManagedClusterConfig(num_workers=1) - mocker.patch.object( - cluster_config, "build_ray_cluster_spec", return_value=mock_ray_cluster["spec"] - ) - - rayjob = RayJob( - job_name="test-job", - cluster_config=cluster_config, - entrypoint="python script.py", - namespace="test-namespace", - ) - - result = rayjob.submit() - - assert result == "test-job" - - mock_api_instance.submit_job.assert_called_once() - call_args = mock_api_instance.submit_job.call_args - - job_cr = call_args.kwargs["job"] - assert "rayClusterSpec" in job_cr["spec"] - assert job_cr["spec"]["rayClusterSpec"] == mock_ray_cluster["spec"] - - -def test_namespace_auto_detection_success(auto_mock_setup): - """Test successful namespace auto-detection.""" - auto_mock_setup["get_current_namespace"].return_value = "detected-ns" - - rayjob = RayJob( - job_name="test-job", entrypoint="python script.py", cluster_name="test-cluster" - ) - - assert rayjob.namespace == "detected-ns" - - -def 
test_namespace_auto_detection_fallback(auto_mock_setup): - """Test that namespace auto-detection failure raises an error.""" - auto_mock_setup["get_current_namespace"].return_value = None - - with pytest.raises(ValueError, match="Could not auto-detect Kubernetes namespace"): - RayJob( - job_name="test-job", - entrypoint="python script.py", - cluster_name="test-cluster", - ) - - -def test_namespace_explicit_override(auto_mock_setup): - """Test that explicit namespace overrides auto-detection.""" - auto_mock_setup["get_current_namespace"].return_value = "detected-ns" - - rayjob = RayJob( - job_name="test-job", - entrypoint="python script.py", - cluster_name="test-cluster", - namespace="explicit-ns", - ) - - assert rayjob.namespace == "explicit-ns" - - -def test_rayjob_with_rayjob_cluster_config(auto_mock_setup): - """Test RayJob with the new ManagedClusterConfig.""" - cluster_config = ManagedClusterConfig( - num_workers=2, - head_cpu_requests="500m", - head_memory_requests="512Mi", - ) - - rayjob = RayJob( - job_name="test-job", - entrypoint="python script.py", - cluster_config=cluster_config, - namespace="test-namespace", - ) - - assert rayjob._cluster_config == cluster_config - assert rayjob.cluster_name == "test-job-cluster" # Generated from job name - - -def test_rayjob_cluster_config_validation(auto_mock_setup): - """Test validation of ManagedClusterConfig parameters.""" - cluster_config = ManagedClusterConfig() - - rayjob = RayJob( - job_name="test-job", - entrypoint="python script.py", - cluster_config=cluster_config, - namespace="test-namespace", - ) - - assert rayjob._cluster_config is not None - - -def test_rayjob_missing_entrypoint_validation(auto_mock_setup): - """Test that RayJob requires entrypoint for submission.""" - with pytest.raises( - TypeError, match="missing 1 required positional argument: 'entrypoint'" - ): - RayJob( - job_name="test-job", - cluster_name="test-cluster", - ) - - -def test_build_ray_cluster_spec_integration(mocker, 
auto_mock_setup): - """Test integration with the new build_ray_cluster_spec method.""" - cluster_config = ManagedClusterConfig() - mock_spec = {"spec": "test-spec"} - mocker.patch.object( - cluster_config, "build_ray_cluster_spec", return_value=mock_spec - ) - - rayjob = RayJob( - job_name="test-job", - entrypoint="python script.py", - cluster_config=cluster_config, - namespace="test-namespace", - ) - - rayjob_cr = rayjob._build_rayjob_cr() - - cluster_config.build_ray_cluster_spec.assert_called_once_with( - cluster_name="test-job-cluster" - ) - assert "rayClusterSpec" in rayjob_cr["spec"] - assert rayjob_cr["spec"]["rayClusterSpec"] == mock_spec - - -def test_rayjob_with_runtime_env(auto_mock_setup): - """Test RayJob with runtime environment configuration.""" - runtime_env = {"pip": ["numpy", "pandas"]} - - rayjob = RayJob( - job_name="test-job", - entrypoint="python script.py", - cluster_name="test-cluster", - runtime_env=runtime_env, - namespace="test-namespace", - ) - - assert rayjob.runtime_env == runtime_env - - rayjob_cr = rayjob._build_rayjob_cr() - assert rayjob_cr["spec"]["runtimeEnvYAML"] == str(runtime_env) - - -def test_rayjob_with_active_deadline_and_ttl(auto_mock_setup): - """Test RayJob with both active deadline and TTL settings.""" - - rayjob = RayJob( - job_name="test-job", - entrypoint="python script.py", - cluster_name="test-cluster", - active_deadline_seconds=300, - ttl_seconds_after_finished=600, - namespace="test-namespace", - ) - - assert rayjob.active_deadline_seconds == 300 - assert rayjob.ttl_seconds_after_finished == 600 - - rayjob_cr = rayjob._build_rayjob_cr() - assert rayjob_cr["spec"]["activeDeadlineSeconds"] == 300 - assert rayjob_cr["spec"]["ttlSecondsAfterFinished"] == 600 - - -def test_rayjob_cluster_name_generation_with_config(auto_mock_setup): - """Test cluster name generation when using cluster_config.""" - - cluster_config = ManagedClusterConfig() - - rayjob = RayJob( - job_name="my-job", - entrypoint="python script.py", - 
cluster_config=cluster_config, - namespace="test-namespace", # Explicitly specify namespace - ) - - assert rayjob.cluster_name == "my-job-cluster" - - -def test_rayjob_namespace_propagation_to_cluster_config(auto_mock_setup): - """Test that job namespace is propagated to cluster config when None.""" - auto_mock_setup["get_current_namespace"].return_value = "detected-ns" - - cluster_config = ManagedClusterConfig() - - rayjob = RayJob( - job_name="test-job", - entrypoint="python script.py", - cluster_config=cluster_config, - ) - - assert rayjob.namespace == "detected-ns" - - -def test_rayjob_error_handling_invalid_cluster_config(auto_mock_setup): - """Test error handling with invalid cluster configuration.""" - - with pytest.raises(ValueError): - RayJob( - job_name="test-job", - entrypoint="python script.py", - ) - - -def test_rayjob_constructor_parameter_validation(auto_mock_setup): - """Test constructor parameter validation.""" - rayjob = RayJob( - job_name="test-job", - entrypoint="python script.py", - cluster_name="test-cluster", - namespace="test-ns", - runtime_env={"pip": ["numpy"]}, - ttl_seconds_after_finished=300, - active_deadline_seconds=600, - ) - - assert rayjob.name == "test-job" - assert rayjob.entrypoint == "python script.py" - assert rayjob.cluster_name == "test-cluster" - assert rayjob.namespace == "test-ns" - assert rayjob.runtime_env == {"pip": ["numpy"]} - assert rayjob.ttl_seconds_after_finished == 300 - assert rayjob.active_deadline_seconds == 600 - - -def test_build_ray_cluster_spec_function(): - """Test the build_ray_cluster_spec method directly.""" - cluster_config = ManagedClusterConfig( - num_workers=2, - head_cpu_requests="500m", - head_memory_requests="512Mi", - worker_cpu_requests="250m", - worker_memory_requests="256Mi", - ) - - spec = cluster_config.build_ray_cluster_spec("test-cluster") - assert "rayVersion" in spec - assert "headGroupSpec" in spec - assert "workerGroupSpecs" in spec - - head_spec = spec["headGroupSpec"] - assert 
head_spec["serviceType"] == "ClusterIP" - assert head_spec["enableIngress"] is False - assert "rayStartParams" in head_spec - assert "template" in head_spec - worker_specs = spec["workerGroupSpecs"] - assert len(worker_specs) == 1 - worker_spec = worker_specs[0] - assert worker_spec["replicas"] == 2 - assert worker_spec["minReplicas"] == 2 - assert worker_spec["maxReplicas"] == 2 - assert worker_spec["groupName"] == "worker-group-test-cluster" - - -def test_build_ray_cluster_spec_with_accelerators(): - """Test build_ray_cluster_spec with GPU accelerators.""" - cluster_config = ManagedClusterConfig( - head_accelerators={"nvidia.com/gpu": 1}, - worker_accelerators={"nvidia.com/gpu": 2}, - ) - - spec = cluster_config.build_ray_cluster_spec("test-cluster") - head_spec = spec["headGroupSpec"] - head_params = head_spec["rayStartParams"] - assert "num-gpus" in head_params - assert head_params["num-gpus"] == "1" - - worker_specs = spec["workerGroupSpecs"] - worker_spec = worker_specs[0] - worker_params = worker_spec["rayStartParams"] - assert "num-gpus" in worker_params - assert worker_params["num-gpus"] == "2" - - -def test_build_ray_cluster_spec_with_custom_volumes(): - """Test build_ray_cluster_spec with custom volumes and volume mounts.""" - custom_volume = V1Volume(name="custom-data", empty_dir={}) - custom_volume_mount = V1VolumeMount(name="custom-data", mount_path="/data") - cluster_config = ManagedClusterConfig( - volumes=[custom_volume], - volume_mounts=[custom_volume_mount], - ) - - spec = cluster_config.build_ray_cluster_spec("test-cluster") - head_spec = spec["headGroupSpec"] - head_pod_spec = head_spec["template"].spec - assert len(head_pod_spec.volumes) > 0 - - head_container = head_pod_spec.containers[0] - assert len(head_container.volume_mounts) > 0 - - -def test_build_ray_cluster_spec_with_environment_variables(): - """Test build_ray_cluster_spec with environment variables.""" - cluster_config = ManagedClusterConfig( - envs={"CUDA_VISIBLE_DEVICES": "0", 
"RAY_DISABLE_IMPORT_WARNING": "1"}, - ) - - spec = cluster_config.build_ray_cluster_spec("test-cluster") - - head_spec = spec["headGroupSpec"] - head_pod_spec = head_spec["template"].spec - head_container = head_pod_spec.containers[0] - assert hasattr(head_container, "env") - env_vars = {env.name: env.value for env in head_container.env} - assert env_vars["CUDA_VISIBLE_DEVICES"] == "0" - assert env_vars["RAY_DISABLE_IMPORT_WARNING"] == "1" - worker_specs = spec["workerGroupSpecs"] - worker_spec = worker_specs[0] - worker_pod_spec = worker_spec["template"].spec - worker_container = worker_pod_spec.containers[0] - - assert hasattr(worker_container, "env") - worker_env_vars = {env.name: env.value for env in worker_container.env} - assert worker_env_vars["CUDA_VISIBLE_DEVICES"] == "0" - assert worker_env_vars["RAY_DISABLE_IMPORT_WARNING"] == "1" - - -def test_build_ray_cluster_spec_with_tolerations(): - """Test build_ray_cluster_spec with tolerations.""" - head_toleration = V1Toleration( - key="node-role.kubernetes.io/master", operator="Exists", effect="NoSchedule" - ) - worker_toleration = V1Toleration( - key="nvidia.com/gpu", operator="Exists", effect="NoSchedule" - ) - - cluster_config = ManagedClusterConfig( - head_tolerations=[head_toleration], - worker_tolerations=[worker_toleration], - ) - - spec = cluster_config.build_ray_cluster_spec("test-cluster") - head_spec = spec["headGroupSpec"] - head_pod_spec = head_spec["template"].spec - assert hasattr(head_pod_spec, "tolerations") - assert len(head_pod_spec.tolerations) == 1 - assert head_pod_spec.tolerations[0].key == "node-role.kubernetes.io/master" - - worker_specs = spec["workerGroupSpecs"] - worker_spec = worker_specs[0] - worker_pod_spec = worker_spec["template"].spec - assert hasattr(worker_pod_spec, "tolerations") - assert len(worker_pod_spec.tolerations) == 1 - assert worker_pod_spec.tolerations[0].key == "nvidia.com/gpu" - - -def test_build_ray_cluster_spec_with_image_pull_secrets(): - """Test 
build_ray_cluster_spec with image pull secrets.""" - cluster_config = ManagedClusterConfig( - image_pull_secrets=["my-registry-secret", "another-secret"] - ) - - spec = cluster_config.build_ray_cluster_spec("test-cluster") - - head_spec = spec["headGroupSpec"] - head_pod_spec = head_spec["template"].spec - assert hasattr(head_pod_spec, "image_pull_secrets") - - head_secrets = head_pod_spec.image_pull_secrets - assert len(head_secrets) == 2 - assert head_secrets[0].name == "my-registry-secret" - assert head_secrets[1].name == "another-secret" - - worker_specs = spec["workerGroupSpecs"] - worker_spec = worker_specs[0] - worker_pod_spec = worker_spec["template"].spec - assert hasattr(worker_pod_spec, "image_pull_secrets") - - worker_secrets = worker_pod_spec.image_pull_secrets - assert len(worker_secrets) == 2 - assert worker_secrets[0].name == "my-registry-secret" - assert worker_secrets[1].name == "another-secret" - - -class TestRayVersionValidation: - """Test Ray version validation in RayJob.""" - - def test_submit_with_cluster_config_compatible_image_passes(self, auto_mock_setup): - """Test that submission passes with compatible cluster_config image.""" - mock_api_instance = auto_mock_setup["rayjob_api"] - mock_api_instance.submit_job.return_value = True - - cluster_config = ManagedClusterConfig(image=f"ray:{RAY_VERSION}") - - rayjob = RayJob( - job_name="test-job", - cluster_config=cluster_config, - namespace="test-namespace", - entrypoint="python script.py", - ) - - result = rayjob.submit() - assert result == "test-job" - - def test_submit_with_cluster_config_incompatible_image_fails(self, auto_mock_setup): - """Test that submission fails with incompatible cluster_config image.""" - # - - cluster_config = ManagedClusterConfig(image="ray:2.8.0") # Different version - - rayjob = RayJob( - job_name="test-job", - cluster_config=cluster_config, - namespace="test-namespace", - entrypoint="python script.py", - ) - - with pytest.raises( - ValueError, match="Cluster 
config image: Ray version mismatch detected" - ): - rayjob.submit() - - def test_validate_ray_version_compatibility_method(self, auto_mock_setup): - """Test the _validate_ray_version_compatibility method directly.""" - # - - rayjob = RayJob( - job_name="test-job", - cluster_name="test-cluster", - namespace="test-namespace", - entrypoint="python script.py", - ) - - rayjob._validate_ray_version_compatibility() - rayjob._cluster_config = ManagedClusterConfig(image=f"ray:{RAY_VERSION}") - rayjob._validate_ray_version_compatibility() - rayjob._cluster_config = ManagedClusterConfig(image="ray:2.8.0") - with pytest.raises( - ValueError, match="Cluster config image: Ray version mismatch detected" - ): - rayjob._validate_ray_version_compatibility() - - rayjob._cluster_config = ManagedClusterConfig(image="custom-image:latest") - with pytest.warns( - UserWarning, match="Cluster config image: Cannot determine Ray version" - ): - rayjob._validate_ray_version_compatibility() - - def test_validate_cluster_config_image_method(self, auto_mock_setup): - """Test the _validate_cluster_config_image method directly.""" - # - - rayjob = RayJob( - job_name="test-job", - cluster_config=ManagedClusterConfig(), - namespace="test-namespace", - entrypoint="python script.py", - ) - - rayjob._validate_cluster_config_image() - rayjob._cluster_config.image = f"ray:{RAY_VERSION}" - rayjob._validate_cluster_config_image() - rayjob._cluster_config.image = "ray:2.8.0" - with pytest.raises( - ValueError, match="Cluster config image: Ray version mismatch detected" - ): - rayjob._validate_cluster_config_image() - - rayjob._cluster_config.image = "custom-image:latest" - with pytest.warns( - UserWarning, match="Cluster config image: Cannot determine Ray version" - ): - rayjob._validate_cluster_config_image() - - def test_validate_cluster_config_image_edge_cases(self, auto_mock_setup): - """Test edge cases in _validate_cluster_config_image method.""" - - rayjob = RayJob( - job_name="test-job", - 
cluster_config=ManagedClusterConfig(), - namespace="test-namespace", - entrypoint="python script.py", - ) - - rayjob._cluster_config.image = None - rayjob._validate_cluster_config_image() - rayjob._cluster_config.image = "" - rayjob._validate_cluster_config_image() - rayjob._cluster_config.image = 123 - rayjob._validate_cluster_config_image() - - class MockClusterConfig: - pass - - rayjob._cluster_config = MockClusterConfig() - rayjob._validate_cluster_config_image() - - -def test_extract_script_files_from_entrypoint_single_script(auto_mock_setup, tmp_path): - """Test extracting a single script file from entrypoint.""" - - # Create a test script - test_script = tmp_path / "test_script.py" - test_script.write_text("print('Hello World!')") - - # Change to temp directory for test - original_cwd = os.getcwd() - os.chdir(tmp_path) - - try: - rayjob = RayJob( - job_name="test-job", - cluster_name="existing-cluster", - entrypoint=f"python {test_script.name}", - namespace="test-namespace", - ) - - scripts = rayjob._extract_script_files_from_entrypoint() - - assert scripts is not None - assert test_script.name in scripts - assert scripts[test_script.name] == "print('Hello World!')" - assert f"{MOUNT_PATH}/{test_script.name}" in rayjob.entrypoint - finally: - os.chdir(original_cwd) - - -def test_extract_script_files_with_dependencies(auto_mock_setup, tmp_path): - """Test extracting script files with local dependencies.""" - - # Create main script and dependency - main_script = tmp_path / "main.py" - main_script.write_text( - """ -import helper -from utils import calculate - -def main(): - helper.do_something() - result = calculate(42) - print(f"Result: {result}") - -if __name__ == "__main__": - main() -""" - ) - - helper_script = tmp_path / "helper.py" - helper_script.write_text( - """ -def do_something(): - print("Doing something...") -""" - ) - - utils_script = tmp_path / "utils.py" - utils_script.write_text( - """ -def calculate(x): - return x * 2 -""" - ) - - # Change to 
temp directory for test - original_cwd = os.getcwd() - os.chdir(tmp_path) - - try: - rayjob = RayJob( - job_name="test-job", - cluster_name="existing-cluster", - entrypoint="python main.py", - namespace="test-namespace", - ) - - scripts = rayjob._extract_script_files_from_entrypoint() - - assert scripts is not None - assert len(scripts) == 3 - assert "main.py" in scripts - assert "helper.py" in scripts - assert "utils.py" in scripts - - assert "import helper" in scripts["main.py"] - assert "def do_something" in scripts["helper.py"] - assert "def calculate" in scripts["utils.py"] - - finally: - os.chdir(original_cwd) - - -def test_extract_script_files_no_local_scripts(auto_mock_setup): - """Test entrypoint with no local script files.""" - - rayjob = RayJob( - job_name="test-job", - cluster_name="existing-cluster", - entrypoint="python -c 'print(\"hello world\")'", - namespace="test-namespace", - ) - - scripts = rayjob._extract_script_files_from_entrypoint() - - assert scripts is None - - -def test_extract_script_files_nonexistent_script(auto_mock_setup): - """Test entrypoint referencing non-existent script.""" - - rayjob = RayJob( - job_name="test-job", - cluster_name="existing-cluster", - entrypoint="python nonexistent.py", - namespace="test-namespace", - ) - - scripts = rayjob._extract_script_files_from_entrypoint() - - assert scripts is None - - -def test_build_script_configmap_spec(): - """Test building ConfigMap specification for scripts.""" - config = ManagedClusterConfig() - scripts = {"main.py": "print('main')", "helper.py": "print('helper')"} - - spec = config.build_script_configmap_spec( - job_name="test-job", namespace="test-namespace", scripts=scripts - ) - - assert spec["apiVersion"] == "v1" - assert spec["kind"] == "ConfigMap" - assert spec["metadata"]["name"] == "test-job-scripts" - assert spec["metadata"]["namespace"] == "test-namespace" - assert spec["data"] == scripts - - -def test_build_script_volume_specs(): - """Test building volume and mount 
specifications for scripts.""" - config = ManagedClusterConfig() - - volume_spec, mount_spec = config.build_script_volume_specs( - configmap_name="test-scripts", mount_path="/custom/path" - ) - - assert volume_spec["name"] == "ray-job-scripts" - assert volume_spec["configMap"]["name"] == "test-scripts" - - assert mount_spec["name"] == "ray-job-scripts" - assert mount_spec["mountPath"] == "/custom/path" - - -def test_add_script_volumes(): - """Test adding script volumes to cluster configuration.""" - config = ManagedClusterConfig() - - # Initially no volumes - assert len(config.volumes) == 0 - assert len(config.volume_mounts) == 0 - - config.add_script_volumes(configmap_name="test-scripts") - - assert len(config.volumes) == 1 - assert len(config.volume_mounts) == 1 - - volume = config.volumes[0] - mount = config.volume_mounts[0] - - assert volume.name == "ray-job-scripts" - assert volume.config_map.name == "test-scripts" - - assert mount.name == "ray-job-scripts" - assert mount.mount_path == MOUNT_PATH - - -def test_add_script_volumes_duplicate_prevention(): - """Test that adding script volumes twice doesn't create duplicates.""" - config = ManagedClusterConfig() - - # Add volumes twice - config.add_script_volumes(configmap_name="test-scripts") - config.add_script_volumes(configmap_name="test-scripts") - - assert len(config.volumes) == 1 - assert len(config.volume_mounts) == 1 - - -def test_create_configmap_from_spec(auto_mock_setup): - """Test creating ConfigMap via Kubernetes API.""" - mock_api_instance = auto_mock_setup["k8s_api"] - - rayjob = RayJob( - job_name="test-job", - cluster_name="existing-cluster", - entrypoint="python test.py", - namespace="test-namespace", - ) - - configmap_spec = { - "apiVersion": "v1", - "kind": "ConfigMap", - "metadata": {"name": "test-scripts", "namespace": "test-namespace"}, - "data": {"test.py": "print('test')"}, - } - - result = rayjob._create_configmap_from_spec(configmap_spec) - - assert result == "test-scripts" - 
mock_api_instance.create_namespaced_config_map.assert_called_once() - - -def test_create_configmap_already_exists(auto_mock_setup): - """Test creating ConfigMap when it already exists (409 conflict).""" - mock_api_instance = auto_mock_setup["k8s_api"] - - mock_api_instance.create_namespaced_config_map.side_effect = ApiException( - status=409 - ) - - rayjob = RayJob( - job_name="test-job", - cluster_name="existing-cluster", - entrypoint="python test.py", - namespace="test-namespace", - ) - - configmap_spec = { - "apiVersion": "v1", - "kind": "ConfigMap", - "metadata": {"name": "test-scripts", "namespace": "test-namespace"}, - "data": {"test.py": "print('test')"}, - } - - result = rayjob._create_configmap_from_spec(configmap_spec) - - assert result == "test-scripts" - mock_api_instance.create_namespaced_config_map.assert_called_once() - mock_api_instance.replace_namespaced_config_map.assert_called_once() - - -def test_create_configmap_with_owner_reference_basic(mocker, auto_mock_setup, caplog): - """Test creating ConfigMap with owner reference from valid RayJob result.""" - mock_api_instance = auto_mock_setup["k8s_api"] - - # Mock client.V1ObjectMeta and V1ConfigMap - mock_v1_metadata = mocker.patch("kubernetes.client.V1ObjectMeta") - mock_metadata_instance = MagicMock() - mock_v1_metadata.return_value = mock_metadata_instance - - rayjob = RayJob( - job_name="test-job", - cluster_name="existing-cluster", - entrypoint="python test.py", - namespace="test-namespace", - ) - - configmap_spec = { - "apiVersion": "v1", - "kind": "ConfigMap", - "metadata": { - "name": "test-scripts", - "namespace": "test-namespace", - "labels": { - "ray.io/job-name": "test-job", - "app.kubernetes.io/managed-by": "codeflare-sdk", - "app.kubernetes.io/component": "rayjob-scripts", - }, - }, - "data": {"test.py": "print('test')"}, - } - - # Valid RayJob result with UID - rayjob_result = { - "metadata": { - "name": "test-job", - "namespace": "test-namespace", - "uid": 
"a4dd4c5a-ab61-411d-b4d1-4abb5177422a", - } - } - - with caplog.at_level("INFO"): - result = rayjob._create_configmap_from_spec(configmap_spec, rayjob_result) - - assert result == "test-scripts" - - # Verify owner reference was set - expected_owner_ref = mocker.ANY # We'll check via the logs - assert ( - "Adding owner reference to ConfigMap 'test-scripts' with RayJob UID: a4dd4c5a-ab61-411d-b4d1-4abb5177422a" - in caplog.text - ) - - assert mock_metadata_instance.owner_references is not None - mock_api_instance.create_namespaced_config_map.assert_called_once() - - -def test_create_configmap_without_owner_reference_no_uid( - mocker, auto_mock_setup, caplog -): - """Test creating ConfigMap without owner reference when RayJob has no UID.""" - mock_api_instance = auto_mock_setup["k8s_api"] - - mock_v1_metadata = mocker.patch("kubernetes.client.V1ObjectMeta") - mock_metadata_instance = MagicMock() - mock_v1_metadata.return_value = mock_metadata_instance - - rayjob = RayJob( - job_name="test-job", - cluster_name="existing-cluster", - entrypoint="python test.py", - namespace="test-namespace", - ) - - configmap_spec = { - "apiVersion": "v1", - "kind": "ConfigMap", - "metadata": {"name": "test-scripts", "namespace": "test-namespace"}, - "data": {"test.py": "print('test')"}, - } - - # RayJob result without UID - rayjob_result = { - "metadata": { - "name": "test-job", - "namespace": "test-namespace", - # No UID field - } - } - - with caplog.at_level("WARNING"): - result = rayjob._create_configmap_from_spec(configmap_spec, rayjob_result) - - assert result == "test-scripts" - - # Verify warning was logged and no owner reference was set - assert ( - "No valid RayJob result with UID found, ConfigMap 'test-scripts' will not have owner reference" - in caplog.text - ) - - # The important part is that the warning was logged, indicating no owner reference was set - mock_api_instance.create_namespaced_config_map.assert_called_once() - - -def 
test_create_configmap_with_invalid_rayjob_result(auto_mock_setup, caplog): - """Test creating ConfigMap with None or invalid rayjob_result.""" - mock_api_instance = auto_mock_setup["k8s_api"] - - rayjob = RayJob( - job_name="test-job", - cluster_name="existing-cluster", - entrypoint="python test.py", - namespace="test-namespace", - ) - - configmap_spec = { - "apiVersion": "v1", - "kind": "ConfigMap", - "metadata": {"name": "test-scripts", "namespace": "test-namespace"}, - "data": {"test.py": "print('test')"}, - } - - # Test with None - with caplog.at_level("WARNING"): - result = rayjob._create_configmap_from_spec(configmap_spec, None) - - assert result == "test-scripts" - assert "No valid RayJob result with UID found" in caplog.text - - # Test with string instead of dict - caplog.clear() - with caplog.at_level("WARNING"): - result = rayjob._create_configmap_from_spec(configmap_spec, "not-a-dict") - - assert result == "test-scripts" - assert "No valid RayJob result with UID found" in caplog.text - - -def test_handle_script_volumes_for_new_cluster(mocker, auto_mock_setup, tmp_path): - """Test handling script volumes for new cluster creation.""" - # auto_mock_setup handles kubernetes and API mocking - - mock_create = mocker.patch.object(RayJob, "_create_configmap_from_spec") - mock_create.return_value = "test-job-scripts" - - test_script = tmp_path / "test.py" - test_script.write_text("print('test')") - - cluster_config = ManagedClusterConfig() - - original_cwd = os.getcwd() - os.chdir(tmp_path) - - try: - rayjob = RayJob( - job_name="test-job", - cluster_config=cluster_config, - entrypoint="python test.py", - namespace="test-namespace", - ) - - scripts = {"test.py": "print('test')"} - rayjob._handle_script_volumes_for_new_cluster(scripts) - - mock_create.assert_called_once() - - assert len(cluster_config.volumes) == 1 - assert len(cluster_config.volume_mounts) == 1 - - finally: - os.chdir(original_cwd) - - -def test_ast_parsing_import_detection(auto_mock_setup, 
tmp_path): - """Test AST parsing correctly detects import statements.""" - - main_script = tmp_path / "main.py" - main_script.write_text( - """# Different import patterns -import helper -from utils import func1, func2 -from local_module import MyClass -import os # Standard library - should be ignored -import non_existent # Non-local - should be ignored -""" - ) - - helper_script = tmp_path / "helper.py" - helper_script.write_text("def helper_func(): pass") - - utils_script = tmp_path / "utils.py" - utils_script.write_text( - """def func1(): pass -def func2(): pass -""" - ) - - local_module_script = tmp_path / "local_module.py" - local_module_script.write_text("class MyClass: pass") - - original_cwd = os.getcwd() - os.chdir(tmp_path) - - try: - rayjob = RayJob( - job_name="test-job", - cluster_name="existing-cluster", - entrypoint="python main.py", - namespace="test-namespace", - ) - - scripts = rayjob._extract_script_files_from_entrypoint() - - assert scripts is not None - assert len(scripts) == 4 # main + 3 dependencies - assert "main.py" in scripts - assert "helper.py" in scripts - assert "utils.py" in scripts - assert "local_module.py" in scripts - - finally: - os.chdir(original_cwd) - - -def test_script_handling_timing_after_rayjob_submission( - mocker, auto_mock_setup, tmp_path -): - """Test that script handling happens after RayJob is submitted (not before).""" - mock_api_instance = auto_mock_setup["rayjob_api"] - - submit_result = { - "metadata": { - "name": "test-job", - "namespace": "test-namespace", - "uid": "test-uid-12345", - } - } - mock_api_instance.submit_job.return_value = submit_result - - mock_handle_new = mocker.patch.object( - RayJob, "_handle_script_volumes_for_new_cluster" - ) - - # RayClusterApi is already mocked by auto_mock_setup - - test_script = tmp_path / "test.py" - test_script.write_text("print('test')") - - call_order = [] - - def track_submit(*args, **kwargs): - call_order.append("submit_job") - return submit_result - - def 
track_handle_scripts(*args, **kwargs): - call_order.append("handle_scripts") - assert len(args) >= 2 - assert args[1] == submit_result # rayjob_result should be second arg - - mock_api_instance.submit_job.side_effect = track_submit - mock_handle_new.side_effect = track_handle_scripts - - original_cwd = os.getcwd() - try: - os.chdir(tmp_path) - - cluster_config = ManagedClusterConfig() - - rayjob = RayJob( - job_name="test-job", - cluster_config=cluster_config, - entrypoint="python test.py", - namespace="test-namespace", - ) - - rayjob.submit() - finally: - os.chdir(original_cwd) - - assert call_order == ["submit_job", "handle_scripts"] - - mock_api_instance.submit_job.assert_called_once() - mock_handle_new.assert_called_once() - - mock_handle_new.assert_called_with({"test.py": "print('test')"}, submit_result) - - -def test_rayjob_submit_with_scripts_new_cluster(auto_mock_setup, tmp_path): - """Test RayJob submission with script detection for new cluster.""" - mock_api_instance = auto_mock_setup["rayjob_api"] - mock_api_instance.submit_job.return_value = True - - mock_k8s_instance = auto_mock_setup["k8s_api"] - - # Create test script - test_script = tmp_path / "test.py" - test_script.write_text("print('Hello from script!')") - - cluster_config = ManagedClusterConfig() - - original_cwd = os.getcwd() - os.chdir(tmp_path) - - try: - rayjob = RayJob( - job_name="test-job", - cluster_config=cluster_config, - entrypoint="python test.py", - namespace="test-namespace", - ) - - # Submit should detect scripts and handle them - result = rayjob.submit() - - assert result == "test-job" - - mock_k8s_instance.create_namespaced_config_map.assert_called_once() - - assert len(cluster_config.volumes) == 1 - assert len(cluster_config.volume_mounts) == 1 - assert f"{MOUNT_PATH}/test.py" in rayjob.entrypoint - - finally: - os.chdir(original_cwd) - - -def test_process_script_and_imports_io_error(mocker, auto_mock_setup, tmp_path): - """Test _process_script_and_imports handles IO errors 
gracefully.""" - - rayjob = RayJob( - job_name="test-job", - cluster_name="existing-cluster", - entrypoint="python test.py", - namespace="test-namespace", - ) - - scripts = {} - processed_files = set() - - # Mock os.path.isfile to return True but open() to raise IOError - mocker.patch("os.path.isfile", return_value=True) - mocker.patch("builtins.open", side_effect=IOError("Permission denied")) - - rayjob._process_script_and_imports("test.py", scripts, MOUNT_PATH, processed_files) - assert "test.py" in processed_files - assert len(scripts) == 0 - - -def test_process_script_and_imports_container_path_skip(auto_mock_setup): - """Test that scripts already in container paths are skipped.""" - rayjob = RayJob( - job_name="test-job", - cluster_name="existing-cluster", - entrypoint="python test.py", - namespace="test-namespace", - ) - - scripts = {} - processed_files = set() - - # Test script path already in container - rayjob._process_script_and_imports( - f"{MOUNT_PATH}/test.py", scripts, MOUNT_PATH, processed_files - ) - - assert len(scripts) == 0 - assert len(processed_files) == 0 - - -def test_process_script_and_imports_already_processed(auto_mock_setup, tmp_path): - """Test that already processed scripts are skipped (infinite loop prevention).""" - rayjob = RayJob( - job_name="test-job", - cluster_name="existing-cluster", - entrypoint="python test.py", - namespace="test-namespace", - ) - - scripts = {} - processed_files = {"test.py"} # Already processed - - rayjob._process_script_and_imports("test.py", scripts, MOUNT_PATH, processed_files) - - assert len(scripts) == 0 - assert processed_files == {"test.py"} - - -def test_submit_with_scripts_owner_reference_integration( - mocker, auto_mock_setup, tmp_path, caplog -): - """Integration test for submit() with local scripts to verify end-to-end owner reference flow.""" - mock_api_instance = auto_mock_setup["rayjob_api"] - mock_k8s_instance = auto_mock_setup["k8s_api"] - - # RayJob submission returns result with UID - 
submit_result = { - "metadata": { - "name": "test-job", - "namespace": "test-namespace", - "uid": "unique-rayjob-uid-12345", - } - } - mock_api_instance.submit_job.return_value = submit_result - - # Capture the ConfigMap that gets created - created_configmap = None - - def capture_configmap(namespace, body): - nonlocal created_configmap - created_configmap = body - return body - - mock_k8s_instance.create_namespaced_config_map.side_effect = capture_configmap - - # Create test scripts - test_script = tmp_path / "main.py" - test_script.write_text("import helper\nprint('main')") - - helper_script = tmp_path / "helper.py" - helper_script.write_text("def help(): print('helper')") - - # Change to temp directory for script detection - original_cwd = os.getcwd() - try: - os.chdir(tmp_path) - - cluster_config = ManagedClusterConfig() - - rayjob = RayJob( - job_name="test-job", - cluster_config=cluster_config, - entrypoint="python main.py", - namespace="test-namespace", - ) - - with caplog.at_level("INFO"): - result = rayjob.submit() - - assert result == "test-job" - - mock_api_instance.submit_job.assert_called_once() - mock_k8s_instance.create_namespaced_config_map.assert_called_once() - assert created_configmap is not None - - # Verify owner reference was set correctly - assert hasattr(created_configmap.metadata, "owner_references") - assert created_configmap.metadata.owner_references is not None - assert len(created_configmap.metadata.owner_references) == 1 - - owner_ref = created_configmap.metadata.owner_references[0] - assert owner_ref.api_version == "ray.io/v1" - assert owner_ref.kind == "RayJob" - assert owner_ref.name == "test-job" - assert owner_ref.uid == "unique-rayjob-uid-12345" - assert owner_ref.controller is True - assert owner_ref.block_owner_deletion is True - - # Verify labels were set - assert created_configmap.metadata.labels["ray.io/job-name"] == "test-job" - assert ( - created_configmap.metadata.labels["app.kubernetes.io/managed-by"] - == 
"codeflare-sdk" - ) - assert ( - created_configmap.metadata.labels["app.kubernetes.io/component"] - == "rayjob-scripts" - ) - - assert "main.py" in created_configmap.data - assert "helper.py" in created_configmap.data - assert ( - "Adding owner reference to ConfigMap 'test-job-scripts' with RayJob UID: unique-rayjob-uid-12345" - in caplog.text - ) - - finally: - os.chdir(original_cwd) - - -def test_find_local_imports_syntax_error(mocker, auto_mock_setup): - """Test _find_local_imports handles syntax errors gracefully.""" - rayjob = RayJob( - job_name="test-job", - cluster_name="existing-cluster", - entrypoint="python test.py", - namespace="test-namespace", - ) - - # Invalid Python syntax - invalid_script_content = "import helper\ndef invalid_syntax(" - - mock_callback = mocker.Mock() - - rayjob._find_local_imports(invalid_script_content, "test.py", mock_callback) - mock_callback.assert_not_called() - - -def test_create_configmap_api_error_non_409(auto_mock_setup): - """Test _create_configmap_from_spec handles non-409 API errors.""" - mock_api_instance = auto_mock_setup["k8s_api"] - - # Configure to raise 500 error - mock_api_instance.create_namespaced_config_map.side_effect = ApiException( - status=500 - ) - - rayjob = RayJob( - job_name="test-job", - cluster_name="existing-cluster", - entrypoint="python test.py", - namespace="test-namespace", - ) - - configmap_spec = { - "apiVersion": "v1", - "kind": "ConfigMap", - "metadata": {"name": "test-scripts", "namespace": "test-namespace"}, - "data": {"test.py": "print('test')"}, - } - - with pytest.raises(RuntimeError, match="Failed to create ConfigMap"): - rayjob._create_configmap_from_spec(configmap_spec) - - -def test_update_existing_cluster_get_cluster_error(mocker, auto_mock_setup): - """Test _update_existing_cluster_for_scripts handles get cluster errors.""" - mock_cluster_api_instance = auto_mock_setup["cluster_api"] - - # Configure it to raise an error - mock_cluster_api_instance.get_ray_cluster.side_effect = 
ApiException(status=404) - - config_builder = ManagedClusterConfig() - - rayjob = RayJob( - job_name="test-job", - cluster_name="existing-cluster", - entrypoint="python test.py", - namespace="test-namespace", - ) - - with pytest.raises(RuntimeError, match="Failed to get RayCluster"): - rayjob._update_existing_cluster_for_scripts("test-scripts", config_builder) - - -def test_update_existing_cluster_patch_error(mocker, auto_mock_setup): - """Test _update_existing_cluster_for_scripts handles patch errors.""" - mock_cluster_api_instance = auto_mock_setup["cluster_api"] - - # Mock successful get but failed patch - mock_cluster_api_instance.get_ray_cluster.return_value = { - "spec": { - "headGroupSpec": { - "template": { - "spec": {"volumes": [], "containers": [{"volumeMounts": []}]} - } - }, - "workerGroupSpecs": [ - { - "template": { - "spec": {"volumes": [], "containers": [{"volumeMounts": []}]} - } - } - ], - } - } - - mock_cluster_api_instance.patch_ray_cluster.side_effect = ApiException(status=500) - - config_builder = ManagedClusterConfig() - - rayjob = RayJob( - job_name="test-job", - cluster_name="existing-cluster", - entrypoint="python test.py", - namespace="test-namespace", - ) - - with pytest.raises(RuntimeError, match="Failed to update RayCluster"): - rayjob._update_existing_cluster_for_scripts("test-scripts", config_builder) - - -def test_extract_script_files_empty_entrypoint(auto_mock_setup): - """Test script extraction with empty entrypoint.""" - rayjob = RayJob( - job_name="test-job", - cluster_name="existing-cluster", - entrypoint="", # Empty entrypoint - namespace="test-namespace", - ) - - scripts = rayjob._extract_script_files_from_entrypoint() - - assert scripts is None - - -def test_add_script_volumes_existing_volume_skip(): - """Test add_script_volumes skips when volume already exists (missing coverage).""" - config = ManagedClusterConfig() - - # Pre-add a volume with same name - existing_volume = V1Volume( - name="ray-job-scripts", - 
config_map=V1ConfigMapVolumeSource(name="existing-scripts"), - ) - config.volumes.append(existing_volume) - - config.add_script_volumes(configmap_name="new-scripts") - assert len(config.volumes) == 1 - assert len(config.volume_mounts) == 0 # Mount not added due to volume skip - - -def test_add_script_volumes_existing_mount_skip(): - """Test add_script_volumes skips when mount already exists (missing coverage).""" - config = ManagedClusterConfig() - - # Pre-add a mount with same name - existing_mount = V1VolumeMount(name="ray-job-scripts", mount_path="/existing/path") - config.volume_mounts.append(existing_mount) - - config.add_script_volumes(configmap_name="new-scripts") - assert len(config.volumes) == 0 # Volume not added due to mount skip - assert len(config.volume_mounts) == 1 - - -def test_rayjob_stop_success(auto_mock_setup, caplog): - """Test successful RayJob stop operation.""" - mock_api_instance = auto_mock_setup["rayjob_api"] - - mock_api_instance.suspend_job.return_value = { - "metadata": {"name": "test-rayjob"}, - "spec": {"suspend": True}, - } - - rayjob = RayJob( - job_name="test-rayjob", - cluster_name="test-cluster", - namespace="test-namespace", - entrypoint="python script.py", - ) - - with caplog.at_level("INFO"): - result = rayjob.stop() - - assert result is True - - mock_api_instance.suspend_job.assert_called_once_with( - name="test-rayjob", k8s_namespace="test-namespace" - ) - - # Verify success message was logged - assert "Successfully stopped the RayJob test-rayjob" in caplog.text - - -def test_rayjob_stop_failure(auto_mock_setup): - """Test RayJob stop operation when API call fails.""" - mock_api_instance = auto_mock_setup["rayjob_api"] - - mock_api_instance.suspend_job.return_value = None - - rayjob = RayJob( - job_name="test-rayjob", - cluster_name="test-cluster", - namespace="test-namespace", - entrypoint="python script.py", - ) - - with pytest.raises(RuntimeError, match="Failed to stop the RayJob test-rayjob"): - rayjob.stop() - - 
mock_api_instance.suspend_job.assert_called_once_with( - name="test-rayjob", k8s_namespace="test-namespace" - ) - - -def test_rayjob_resubmit_success(auto_mock_setup): - """Test successful RayJob resubmit operation.""" - mock_api_instance = auto_mock_setup["rayjob_api"] - - mock_api_instance.resubmit_job.return_value = { - "metadata": {"name": "test-rayjob"}, - "spec": {"suspend": False}, - } - - rayjob = RayJob( - job_name="test-rayjob", - cluster_name="test-cluster", - namespace="test-namespace", - entrypoint="python script.py", - ) - - result = rayjob.resubmit() - - assert result is True - - mock_api_instance.resubmit_job.assert_called_once_with( - name="test-rayjob", k8s_namespace="test-namespace" - ) - - -def test_rayjob_resubmit_failure(auto_mock_setup): - """Test RayJob resubmit operation when API call fails.""" - mock_api_instance = auto_mock_setup["rayjob_api"] - - mock_api_instance.resubmit_job.return_value = None - - rayjob = RayJob( - job_name="test-rayjob", - cluster_name="test-cluster", - namespace="test-namespace", - entrypoint="python script.py", - ) - - with pytest.raises(RuntimeError, match="Failed to resubmit the RayJob test-rayjob"): - rayjob.resubmit() - - mock_api_instance.resubmit_job.assert_called_once_with( - name="test-rayjob", k8s_namespace="test-namespace" - ) - - -def test_rayjob_delete_success(auto_mock_setup): - """Test successful RayJob deletion.""" - mock_api_instance = auto_mock_setup["rayjob_api"] - - rayjob = RayJob( - job_name="test-rayjob", - entrypoint="python script.py", - cluster_name="test-cluster", - ) - - mock_api_instance.delete_job.return_value = True - - result = rayjob.delete() - - assert result is True - mock_api_instance.delete_job.assert_called_once_with( - name="test-rayjob", k8s_namespace="test-namespace" - ) - - -def test_rayjob_delete_failure(auto_mock_setup): - """Test failed RayJob deletion.""" - mock_api_instance = auto_mock_setup["rayjob_api"] - - rayjob = RayJob( - job_name="test-rayjob", - 
entrypoint="python script.py", - cluster_name="test-cluster", - ) - - mock_api_instance.delete_job.return_value = False - - with pytest.raises(RuntimeError, match="Failed to delete the RayJob test-rayjob"): - rayjob.delete() - - mock_api_instance.delete_job.assert_called_once_with( - name="test-rayjob", k8s_namespace="test-namespace" - ) - - -def test_rayjob_init_both_none_error(auto_mock_setup): - """Test RayJob initialization error when both cluster_name and cluster_config are None.""" - with pytest.raises( - ValueError, - match="Configuration Error: You must provide either 'cluster_name' .* or 'cluster_config'", - ): - RayJob( - job_name="test-job", - entrypoint="python script.py", - cluster_name=None, - cluster_config=None, - ) - - -def test_rayjob_init_missing_cluster_name_with_no_config(auto_mock_setup): - """Test RayJob initialization error when cluster_name is None without cluster_config.""" - with pytest.raises( - ValueError, - match="Configuration Error: a 'cluster_name' is required when not providing 'cluster_config'", - ): - rayjob = RayJob.__new__(RayJob) - rayjob.name = "test-job" - rayjob.entrypoint = "python script.py" - rayjob.runtime_env = None - rayjob.ttl_seconds_after_finished = 0 - rayjob.active_deadline_seconds = None - rayjob.shutdown_after_job_finishes = True - rayjob.namespace = "test-namespace" - rayjob._cluster_name = None - rayjob._cluster_config = None - if rayjob._cluster_config is None and rayjob._cluster_name is None: - raise ValueError( - "❌ Configuration Error: a 'cluster_name' is required when not providing 'cluster_config'" - ) - - -def test_handle_script_volumes_for_existing_cluster_direct_call(auto_mock_setup): - """Test _handle_script_volumes_for_existing_cluster method directly.""" - mock_api_instance = auto_mock_setup["rayjob_api"] - mock_cluster_api = auto_mock_setup["cluster_api"] - mock_k8s_api = auto_mock_setup["k8s_api"] - - # Mock existing cluster - mock_cluster = { - "spec": { - "headGroupSpec": { - "template": { - 
"spec": {"containers": [{"volumeMounts": []}], "volumes": []} - } - }, - "workerGroupSpecs": [ - { - "template": { - "spec": {"containers": [{"volumeMounts": []}], "volumes": []} - } - } - ], - } - } - mock_cluster_api.get_ray_cluster.return_value = mock_cluster - - rayjob = RayJob( - job_name="test-job", - entrypoint="python script.py", - cluster_name="existing-cluster", - ) - - scripts = {"test_script.py": "print('Hello World')"} - rayjob._handle_script_volumes_for_existing_cluster( - scripts, {"metadata": {"uid": "test-uid"}} - ) - - mock_k8s_api.create_namespaced_config_map.assert_called_once() - created_configmap = mock_k8s_api.create_namespaced_config_map.call_args[1]["body"] - assert "test_script.py" in created_configmap.data - - mock_cluster_api.patch_ray_cluster.assert_called_once_with( - name="existing-cluster", ray_patch=mock_cluster, k8s_namespace="test-namespace" - ) - - -def test_handle_script_volumes_for_existing_cluster_no_volumes_init(auto_mock_setup): - """Test _handle_script_volumes_for_existing_cluster when volumes/mounts don't exist initially.""" - mock_api_instance = auto_mock_setup["rayjob_api"] - mock_cluster_api = auto_mock_setup["cluster_api"] - mock_k8s_api = auto_mock_setup["k8s_api"] - - # Mock existing cluster WITHOUT volumes/volumeMounts (to test initialization) - mock_cluster = { - "spec": { - "headGroupSpec": {"template": {"spec": {"containers": [{}]}}}, - "workerGroupSpecs": [{"template": {"spec": {"containers": [{}]}}}], - } - } - mock_cluster_api.get_ray_cluster.return_value = mock_cluster - - # Create RayJob with existing cluster - rayjob = RayJob( - job_name="test-job", - entrypoint="python script.py", - cluster_name="existing-cluster", - ) - - # Call the method directly with test scripts - scripts = {"test_script.py": "print('Hello World')"} - rayjob._handle_script_volumes_for_existing_cluster( - scripts, {"metadata": {"uid": "test-uid"}} - ) - - # Verify volumes and volumeMounts were initialized - patched_cluster = 
mock_cluster_api.patch_ray_cluster.call_args[1]["ray_patch"] - - # Check head group - head_spec = patched_cluster["spec"]["headGroupSpec"]["template"]["spec"] - assert "volumes" in head_spec - assert len(head_spec["volumes"]) == 1 - assert "volumeMounts" in head_spec["containers"][0] - assert len(head_spec["containers"][0]["volumeMounts"]) == 1 - - # Check worker group - worker_spec = patched_cluster["spec"]["workerGroupSpecs"][0]["template"]["spec"] - assert "volumes" in worker_spec - assert len(worker_spec["volumes"]) == 1 - assert "volumeMounts" in worker_spec["containers"][0] - assert len(worker_spec["containers"][0]["volumeMounts"]) == 1 - - -def test_update_existing_cluster_for_scripts_api_errors(mocker, auto_mock_setup): - """Test _update_existing_cluster_for_scripts error handling.""" - mock_api_instance = auto_mock_setup["rayjob_api"] - mock_cluster_api = auto_mock_setup["cluster_api"] - - # Mock config builder - mock_config_builder = mocker.MagicMock() - mocker.patch( - "codeflare_sdk.ray.rayjobs.rayjob.ManagedClusterConfig", - return_value=mock_config_builder, - ) - - # Set up config builder to return valid specs - mock_config_builder.build_script_volume_specs.return_value = ( - {"name": "script-volume", "configMap": {"name": "test-configmap"}}, - {"name": "script-volume", "mountPath": "/home/ray/scripts"}, - ) - - # Mock cluster API to raise error - mock_cluster_api.get_ray_cluster.side_effect = ApiException( - status=404, reason="Not Found" - ) - - # Create RayJob - rayjob = RayJob( - job_name="test-job", - entrypoint="python script.py", - cluster_name="existing-cluster", - ) - - # Call the method directly - with pytest.raises( - RuntimeError, match="Failed to get RayCluster 'existing-cluster'" - ): - rayjob._update_existing_cluster_for_scripts( - "test-configmap", mock_config_builder - ) - - -def test_rayjob_kueue_label_no_default_queue(auto_mock_setup, mocker, caplog): - """Test RayJob falls back to 'default' queue when no default queue exists.""" - 
mocker.patch( - "codeflare_sdk.ray.rayjobs.rayjob.get_default_kueue_name", - return_value=None, - ) - - mock_api_instance = auto_mock_setup["rayjob_api"] - mock_api_instance.submit_job.return_value = {"metadata": {"name": "test-job"}} - - cluster_config = ManagedClusterConfig() - rayjob = RayJob( - job_name="test-job", - cluster_config=cluster_config, - entrypoint="python script.py", - ) - - with caplog.at_level("WARNING"): - rayjob.submit() - - # Verify the submitted job has the fallback label - call_args = mock_api_instance.submit_job.call_args - submitted_job = call_args.kwargs["job"] - assert submitted_job["metadata"]["labels"]["kueue.x-k8s.io/queue-name"] == "default" - - # Verify warning was logged - assert "No default Kueue LocalQueue found" in caplog.text - - -def test_rayjob_kueue_explicit_local_queue(auto_mock_setup): - """Test RayJob uses explicitly specified local queue.""" - mock_api_instance = auto_mock_setup["rayjob_api"] - mock_api_instance.submit_job.return_value = {"metadata": {"name": "test-job"}} - - cluster_config = ManagedClusterConfig() - rayjob = RayJob( - job_name="test-job", - cluster_config=cluster_config, - entrypoint="python script.py", - local_queue="custom-queue", - ) - - rayjob.submit() - - # Verify the submitted job has the explicit queue label - call_args = mock_api_instance.submit_job.call_args - submitted_job = call_args.kwargs["job"] - assert ( - submitted_job["metadata"]["labels"]["kueue.x-k8s.io/queue-name"] - == "custom-queue" - ) - - -def test_rayjob_no_kueue_label_for_existing_cluster(auto_mock_setup): - """Test RayJob doesn't add Kueue label for existing clusters.""" - mock_api_instance = auto_mock_setup["rayjob_api"] - mock_api_instance.submit_job.return_value = {"metadata": {"name": "test-job"}} - - # Using existing cluster (no cluster_config) - rayjob = RayJob( - job_name="test-job", - cluster_name="existing-cluster", - entrypoint="python script.py", - ) - - rayjob.submit() - - # Verify no Kueue label was added - 
call_args = mock_api_instance.submit_job.call_args - submitted_job = call_args.kwargs["job"] - assert "kueue.x-k8s.io/queue-name" not in submitted_job["metadata"]["labels"] - - -def test_rayjob_with_ttl_and_deadline(auto_mock_setup): - """Test RayJob with TTL and active deadline seconds.""" - mock_api_instance = auto_mock_setup["rayjob_api"] - mock_api_instance.submit_job.return_value = {"metadata": {"name": "test-job"}} - - cluster_config = ManagedClusterConfig() - rayjob = RayJob( - job_name="test-job", - cluster_config=cluster_config, - entrypoint="python script.py", - ttl_seconds_after_finished=300, - active_deadline_seconds=600, - ) - - rayjob.submit() - - # Verify TTL and deadline were set - call_args = mock_api_instance.submit_job.call_args - submitted_job = call_args.kwargs["job"] - assert submitted_job["spec"]["ttlSecondsAfterFinished"] == 300 - assert submitted_job["spec"]["activeDeadlineSeconds"] == 600 - - -def test_rayjob_shutdown_after_job_finishes(auto_mock_setup): - """Test RayJob sets shutdownAfterJobFinishes correctly.""" - mock_api_instance = auto_mock_setup["rayjob_api"] - mock_api_instance.submit_job.return_value = {"metadata": {"name": "test-job"}} - - # Test with managed cluster (should shutdown) - cluster_config = ManagedClusterConfig() - rayjob = RayJob( - job_name="test-job", - cluster_config=cluster_config, - entrypoint="python script.py", - ) - - rayjob.submit() - - call_args = mock_api_instance.submit_job.call_args - submitted_job = call_args.kwargs["job"] - assert submitted_job["spec"]["shutdownAfterJobFinishes"] is True - - # Test with existing cluster (should not shutdown) - rayjob2 = RayJob( - job_name="test-job2", - cluster_name="existing-cluster", - entrypoint="python script.py", - ) - - rayjob2.submit() - - call_args2 = mock_api_instance.submit_job.call_args - submitted_job2 = call_args2.kwargs["job"] - assert submitted_job2["spec"]["shutdownAfterJobFinishes"] is False - - -def 
test_rayjob_stop_delete_resubmit_logging(auto_mock_setup, caplog): - """Test logging for stop, delete, and resubmit operations.""" - mock_api_instance = auto_mock_setup["rayjob_api"] - - # Test stop with logging - mock_api_instance.suspend_job.return_value = { - "metadata": {"name": "test-rayjob"}, - "spec": {"suspend": True}, - } - - rayjob = RayJob( - job_name="test-rayjob", - cluster_name="test-cluster", - namespace="test-namespace", - entrypoint="python script.py", - ) - - with caplog.at_level("INFO"): - result = rayjob.stop() - - assert result is True - assert "Successfully stopped the RayJob test-rayjob" in caplog.text - - # Test delete with logging - caplog.clear() - mock_api_instance.delete_job.return_value = True - - with caplog.at_level("INFO"): - result = rayjob.delete() - - assert result is True - assert "Successfully deleted the RayJob test-rayjob" in caplog.text - - # Test resubmit with logging - caplog.clear() - mock_api_instance.resubmit_job.return_value = { - "metadata": {"name": "test-rayjob"}, - "spec": {"suspend": False}, - } - - with caplog.at_level("INFO"): - result = rayjob.resubmit() - - assert result is True - assert "Successfully resubmitted the RayJob test-rayjob" in caplog.text - - -def test_rayjob_initialization_logging(auto_mock_setup, caplog): - """Test RayJob initialization logging.""" - with caplog.at_level("INFO"): - cluster_config = ManagedClusterConfig() - rayjob = RayJob( - job_name="test-job", - cluster_config=cluster_config, - entrypoint="python script.py", - ) - - assert "Creating new cluster: test-job-cluster" in caplog.text - assert "Initialized RayJob: test-job in namespace: test-namespace" in caplog.text diff --git a/tests/e2e/rayjob/rayjob_existing_cluster_test.py b/tests/e2e/rayjob/rayjob_existing_cluster_test.py index b62ea1ef..82858d28 100644 --- a/tests/e2e/rayjob/rayjob_existing_cluster_test.py +++ b/tests/e2e/rayjob/rayjob_existing_cluster_test.py @@ -41,18 +41,22 @@ def test_existing_kueue_cluster(self): ) 
auth.login() + resources = get_platform_appropriate_resources() + cluster = Cluster( ClusterConfiguration( name=cluster_name, namespace=self.namespace, num_workers=1, - head_cpu_requests="500m", - head_cpu_limits="500m", - worker_cpu_requests=1, - worker_cpu_limits=1, - worker_memory_requests=1, - worker_memory_limits=4, - image=get_ray_image(), + head_cpu_requests=resources["head_cpu_requests"], + head_cpu_limits=resources["head_cpu_limits"], + head_memory_requests=resources["head_memory_requests"], + head_memory_limits=resources["head_memory_limits"], + worker_cpu_requests=resources["worker_cpu_requests"], + worker_cpu_limits=resources["worker_cpu_limits"], + worker_memory_requests=resources["worker_memory_requests"], + worker_memory_limits=resources["worker_memory_limits"], + image=constants.CUDA_PY312_RUNTIME_IMAGE, local_queue=self.local_queues[0], write_to_file=True, verify_tls=False, @@ -60,7 +64,11 @@ def test_existing_kueue_cluster(self): ) cluster.apply() - sleep(20) + + # Wait for cluster to be ready (with Kueue admission) + print(f"Waiting for cluster '{cluster_name}' to be ready...") + cluster.wait_ready(timeout=600) + print(f"✓ Cluster '{cluster_name}' is ready") # RayJob with explicit local_queue rayjob_explicit = RayJob( diff --git a/tests/e2e/rayjob/rayjob_lifecycled_cluster_test.py b/tests/e2e/rayjob/rayjob_lifecycled_cluster_test.py index 51c72df6..10390011 100644 --- a/tests/e2e/rayjob/rayjob_lifecycled_cluster_test.py +++ b/tests/e2e/rayjob/rayjob_lifecycled_cluster_test.py @@ -2,11 +2,14 @@ import sys import os from time import sleep +import tempfile sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) from support import * from codeflare_sdk import RayJob, ManagedClusterConfig + +from kubernetes import client from python_client.kuberay_job_api import RayjobApi from python_client.kuberay_cluster_api import RayClusterApi @@ -22,7 +25,7 @@ def teardown_method(self): delete_kueue_resources(self) def 
test_lifecycled_kueue_managed(self): - """Test RayJob with Kueue-managed lifecycled cluster.""" + """Test RayJob with Kueue-managed lifecycled cluster with Secret validation.""" self.setup_method() create_namespace(self) create_kueue_resources(self) @@ -46,17 +49,36 @@ def test_lifecycled_kueue_managed(self): worker_memory_limits=resources["worker_memory_limits"], ) - rayjob = RayJob( - job_name=job_name, - namespace=self.namespace, - cluster_config=cluster_config, - entrypoint="python -c \"import ray; ray.init(); print('Kueue job done')\"", - runtime_env={"env_vars": get_setup_env_variables(ACCELERATOR="cpu")}, - local_queue=self.local_queues[0], - ) + # Create a temporary script file to test Secret functionality + with tempfile.NamedTemporaryFile( + mode="w", suffix=".py", delete=False, dir=os.getcwd() + ) as script_file: + script_file.write( + """ + import ray + ray.init() + print('Kueue job with Secret done') + ray.shutdown() + """ + ) + script_file.flush() + script_filename = os.path.basename(script_file.name) try: + rayjob = RayJob( + job_name=job_name, + namespace=self.namespace, + cluster_config=cluster_config, + entrypoint=f"python {script_filename}", + runtime_env={"env_vars": get_setup_env_variables(ACCELERATOR="cpu")}, + local_queue=self.local_queues[0], + ) + assert rayjob.submit() == job_name + + # Verify Secret was created with owner reference + self.verify_secret_with_owner_reference(rayjob) + assert self.job_api.wait_until_job_running( name=rayjob.name, k8s_namespace=rayjob.namespace, timeout=600 ) @@ -70,6 +92,12 @@ def test_lifecycled_kueue_managed(self): except Exception: pass # Job might already be deleted verify_rayjob_cluster_cleanup(cluster_api, rayjob.name, rayjob.namespace) + # Clean up the temporary script file + if "script_filename" in locals(): + try: + os.remove(script_filename) + except: + pass def test_lifecycled_kueue_resource_queueing(self): """Test Kueue resource queueing with lifecycled clusters.""" @@ -161,3 +189,61 @@ def 
test_lifecycled_kueue_resource_queueing(self): ) except: pass + + def verify_secret_with_owner_reference(self, rayjob: RayJob): + """Verify that the Secret was created with proper owner reference to the RayJob.""" + v1 = client.CoreV1Api() + secret_name = f"{rayjob.name}-files" + + try: + # Get the Secret + secret = v1.read_namespaced_secret( + name=secret_name, namespace=rayjob.namespace + ) + + # Verify Secret exists + assert secret is not None, f"Secret {secret_name} not found" + + # Verify it contains the script + assert secret.data is not None, "Secret has no data" + assert len(secret.data) > 0, "Secret data is empty" + + # Verify owner reference + assert ( + secret.metadata.owner_references is not None + ), "Secret has no owner references" + assert ( + len(secret.metadata.owner_references) > 0 + ), "Secret owner references list is empty" + + owner_ref = secret.metadata.owner_references[0] + assert ( + owner_ref.api_version == "ray.io/v1" + ), f"Wrong API version: {owner_ref.api_version}" + assert owner_ref.kind == "RayJob", f"Wrong kind: {owner_ref.kind}" + assert owner_ref.name == rayjob.name, f"Wrong owner name: {owner_ref.name}" + assert ( + owner_ref.controller is True + ), "Owner reference controller not set to true" + assert ( + owner_ref.block_owner_deletion is True + ), "Owner reference blockOwnerDeletion not set to true" + + # Verify labels + assert secret.metadata.labels.get("ray.io/job-name") == rayjob.name + assert ( + secret.metadata.labels.get("app.kubernetes.io/managed-by") + == "codeflare-sdk" + ) + assert ( + secret.metadata.labels.get("app.kubernetes.io/component") + == "rayjob-files" + ) + + print(f"✓ Secret {secret_name} verified with proper owner reference") + + except client.rest.ApiException as e: + if e.status == 404: + raise AssertionError(f"Secret {secret_name} not found") + else: + raise e