Commit 82dfbfa

kueue-tests
1 parent 538d345 commit 82dfbfa

File tree

5 files changed: +638 -22 lines changed


src/codeflare_sdk/ray/rayjobs/config.py

Lines changed: 3 additions & 13 deletions
@@ -131,10 +131,8 @@ class ManagedClusterConfig:
         accelerator_configs:
             A dictionary of custom resource mappings to map extended resource requests to RayCluster resource names.
             Defaults to DEFAULT_ACCELERATORS but can be overridden with custom mappings.
-        local_queue:
-            The name of the queue to use for the cluster.
         annotations:
-            A dictionary of annotations to apply to the cluster.
+            A dictionary of annotations to apply to the Job.
         volumes:
             A list of V1Volume objects to add to the Cluster
         volume_mounts:
@@ -161,7 +159,6 @@ class ManagedClusterConfig:
     accelerator_configs: Dict[str, str] = field(
         default_factory=lambda: DEFAULT_ACCELERATORS.copy()
     )
-    local_queue: Optional[str] = None
     annotations: Dict[str, str] = field(default_factory=dict)
     volumes: list[V1Volume] = field(default_factory=list)
     volume_mounts: list[V1VolumeMount] = field(default_factory=list)
@@ -248,7 +245,6 @@ def build_ray_cluster_spec(self, cluster_name: str) -> Dict[str, Any]:
         """
         ray_cluster_spec = {
             "rayVersion": RAY_VERSION,
-            "enableInTreeAutoscaling": False,
             "headGroupSpec": self._build_head_group_spec(),
             "workerGroupSpecs": [self._build_worker_group_spec(cluster_name)],
         }
@@ -346,12 +342,9 @@ def _build_head_container(self) -> V1Container:
                 self.head_accelerators,
             ),
             volume_mounts=self._generate_volume_mounts(),
+            env=self._build_env_vars() if hasattr(self, "envs") and self.envs else None,
         )

-        # Add environment variables if specified
-        if hasattr(self, "envs") and self.envs:
-            container.env = self._build_env_vars()
-
         return container

     def _build_worker_container(self) -> V1Container:
@@ -373,12 +366,9 @@ def _build_worker_container(self) -> V1Container:
                 self.worker_accelerators,
             ),
             volume_mounts=self._generate_volume_mounts(),
+            env=self._build_env_vars() if hasattr(self, "envs") and self.envs else None,
         )

-        # Add environment variables if specified
-        if hasattr(self, "envs") and self.envs:
-            container.env = self._build_env_vars()
-
         return container

     def _build_resource_requirements(

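The env handling in both container builders is now a single inline conditional instead of a post-construction assignment. A minimal sketch of the equivalent logic, assuming envs is an optional dict of variable names to values (the standalone helper below is illustrative only, not part of the diff):

    from typing import Dict, List, Optional
    from kubernetes.client import V1Container, V1EnvVar

    def build_env(envs: Optional[Dict[str, str]]) -> Optional[List[V1EnvVar]]:
        # Same effect as the inlined expression: None when no envs are set,
        # otherwise a list of V1EnvVar objects for the container spec.
        if not envs:
            return None
        return [V1EnvVar(name=k, value=v) for k, v in envs.items()]

    # Hypothetical usage: env is resolved at construction time.
    container = V1Container(name="ray-head", image="example-image", env=build_env({"FOO": "bar"}))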
src/codeflare_sdk/ray/rayjobs/rayjob.py

Lines changed: 14 additions & 0 deletions
@@ -22,6 +22,7 @@
 import re
 import ast
 from typing import Dict, Any, Optional, Tuple
+from codeflare_sdk.common.kueue.kueue import get_default_kueue_name
 from codeflare_sdk.common.utils.constants import MOUNT_PATH
 from kubernetes import client
 from ...common.kubernetes_cluster.auth import get_api_client
@@ -62,6 +63,7 @@ def __init__(
         shutdown_after_job_finishes: Optional[bool] = None,
         ttl_seconds_after_finished: int = 0,
         active_deadline_seconds: Optional[int] = None,
+        local_queue: Optional[str] = None,
     ):
         """
         Initialize a RayJob instance.
@@ -108,6 +110,7 @@ def __init__(
         self.runtime_env = runtime_env
         self.ttl_seconds_after_finished = ttl_seconds_after_finished
         self.active_deadline_seconds = active_deadline_seconds
+        self.local_queue = local_queue

         # Auto-set shutdown_after_job_finishes based on cluster_config presence
         # If cluster_config is provided, we want to clean up the cluster after job finishes
@@ -232,9 +235,20 @@ def _build_rayjob_cr(self) -> Dict[str, Any]:
                 "entrypoint": self.entrypoint,
                 "shutdownAfterJobFinishes": self.shutdown_after_job_finishes,
                 "ttlSecondsAfterFinished": self.ttl_seconds_after_finished,
+                "enableInTreeAutoscaling": False,
             },
         }

+        # Configure Kueue label
+        if self.local_queue:
+            rayjob_cr["metadata"]["labels"] = {
+                "kueue.x-k8s.io/queue-name": self.local_queue
+            }
+        else:
+            rayjob_cr["metadata"]["labels"] = {
+                "kueue.x-k8s.io/queue-name": get_default_kueue_name(self.namespace)
+            }
+
         # Add active deadline if specified
         if self.active_deadline_seconds:
             rayjob_cr["spec"]["activeDeadlineSeconds"] = self.active_deadline_seconds
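With local_queue now living on RayJob rather than ManagedClusterConfig, a submission against a specific Kueue LocalQueue might look like the sketch below; the queue and namespace names are placeholders, and omitting local_queue falls back to get_default_kueue_name(namespace) when the label is built:

    from codeflare_sdk import RayJob, ManagedClusterConfig

    # Placeholder names for illustration only.
    job = RayJob(
        job_name="kueue-example",
        namespace="demo-ns",
        cluster_config=ManagedClusterConfig(num_workers=1),
        entrypoint="python -c \"import ray; ray.init(); print('done')\"",
        local_queue="team-a-queue",  # sets the kueue.x-k8s.io/queue-name label on the RayJob CR
    )
    job.submit()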
Lines changed: 295 additions & 0 deletions
@@ -0,0 +1,295 @@

import pytest
import sys
import os
from time import sleep

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from support import *

from codeflare_sdk import RayJob, ManagedClusterConfig
import kubernetes.client.rest
from python_client.kuberay_job_api import RayjobApi
from python_client.kuberay_cluster_api import RayClusterApi


@pytest.mark.openshift
class TestRayJobLifecycledClusterKueue:
    """Test RayJob with Kueue integration and auto-created cluster lifecycle management."""

    def setup_method(self):
        initialize_kubernetes_client(self)

    def teardown_method(self):
        delete_namespace(self)
        delete_kueue_resources(self)

    def test_rayjob_with_kueue_integration(self):
        """
        Test RayJob submission with Kueue queue management, including:
        1. Job submission to Kueue queue
        2. Waiting for Kueue admission
        3. Job execution and completion
        4. Automatic cluster cleanup after job deletion

        Note: This test does NOT test manual suspend/resume as that conflicts
        with Kueue's queue management.
        """
        self.setup_method()
        create_namespace(self)
        create_kueue_resources(self)

        ray_image = get_ray_image()
        self.job_api = RayjobApi()
        job_name = "kueue-managed-job"

        cluster_config = ManagedClusterConfig(
            head_cpu_requests="500m",
            head_cpu_limits="500m",
            head_memory_requests=4,
            head_memory_limits=6,
            num_workers=1,
            worker_cpu_requests="500m",
            worker_cpu_limits="500m",
            worker_memory_requests=4,
            worker_memory_limits=5,
            image=ray_image,
        )

        rayjob = RayJob(
            job_name=job_name,
            namespace=self.namespace,
            cluster_config=cluster_config,
            entrypoint="python -c \"import ray; ray.init(); print('Kueue-managed RayJob completed successfully')\"",
            runtime_env={"env_vars": get_setup_env_variables(ACCELERATOR="cpu")},
            shutdown_after_job_finishes=True,
            local_queue=self.local_queues[0],
        )

        try:
            # 1. Submit job to Kueue queue
            print(f"Submitting RayJob to Kueue queue: {self.local_queues[0]}")
            assert rayjob.submit() == job_name

            # 2. Check if job is suspended or immediately admitted by Kueue
            job_cr = self.job_api.get_job(
                name=rayjob.name, k8s_namespace=rayjob.namespace
            )
            is_suspended = job_cr.get("spec", {}).get("suspend", False)

            if is_suspended:
                print("✓ Job is queued and suspended by Kueue (waiting for resources)")
                # 3. Wait for Kueue to admit the job
                print("Waiting for Kueue to admit the job...")
                admitted = wait_for_kueue_admission(
                    self, self.job_api, rayjob.name, rayjob.namespace, timeout=120
                )
                assert admitted, "Job was not admitted by Kueue within timeout"
                print("✓ Job admitted by Kueue")
            else:
                print("✓ Job was immediately admitted by Kueue (resources available)")

            # 4. Wait for job to reach running state
            print("Waiting for job to start running...")
            assert self.job_api.wait_until_job_running(
                name=rayjob.name, k8s_namespace=rayjob.namespace, timeout=300
            ), "Job did not reach running state after Kueue admission"
            print("✓ Job is running")

            # 5. Verify RayCluster was created (KubeRay adds random suffix to cluster name)
            cluster_api = RayClusterApi()
            # List all RayClusters in the namespace since KubeRay adds a suffix
            clusters = cluster_api.list_ray_clusters(
                k8s_namespace=rayjob.namespace, async_req=False
            )

            # Find the cluster that starts with our job name
            found_cluster = None
            for cluster in clusters.get("items", []):
                cluster_name = cluster.get("metadata", {}).get("name", "")
                if cluster_name.startswith(f"{rayjob.name}-raycluster"):
                    found_cluster = cluster
                    break

            assert (
                found_cluster is not None
            ), f"RayCluster not found for RayJob {rayjob.name}"
            print(
                f"✓ RayCluster created successfully: {found_cluster['metadata']['name']}"
            )

            # 6. Wait for job completion
            print("Waiting for job to complete...")
            assert self.job_api.wait_until_job_finished(
                name=rayjob.name, k8s_namespace=rayjob.namespace, timeout=300
            ), "Job did not complete"

            # Verify final job status
            final_status = self.job_api.get_job_status(
                name=rayjob.name, k8s_namespace=rayjob.namespace
            )
            print(
                f"✓ Job completed with status: {final_status.get('jobDeploymentStatus')}"
            )

        finally:
            # 7. Delete the job and verify cleanup
            print("Cleaning up...")
            assert rayjob.delete()
            self.verify_cluster_cleanup(rayjob)
            print("✓ Cleanup complete")

    def test_rayjob_kueue_with_preemption(self):
        """
        Test RayJob behavior when using Kueue with potential preemption scenarios.
        This tests that manual suspend/resume still works even with Kueue management.
        """
        self.setup_method()
        create_namespace(self)
        # Create Kueue resources with limited quota to force suspension
        create_limited_kueue_resources(self)

        ray_image = get_ray_image()
        self.job_api = RayjobApi()
        job_name = "kueue-job"

        cluster_config = ManagedClusterConfig(
            head_cpu_requests="500m",
            head_cpu_limits="500m",
            head_memory_requests=4,
            head_memory_limits=6,
            num_workers=1,
            worker_cpu_requests="500m",
            worker_cpu_limits="500m",
            worker_memory_requests=4,
            worker_memory_limits=6,
            image=ray_image,
        )

        rayjob = RayJob(
            job_name=job_name,
            namespace=self.namespace,
            cluster_config=cluster_config,
            entrypoint="python -c \"import ray; import time; ray.init(); time.sleep(30); print('Job completed')\"",
            runtime_env={"env_vars": get_setup_env_variables(ACCELERATOR="cpu")},
            shutdown_after_job_finishes=True,
            local_queue=self.local_queues[0],
        )

        try:
            # 1. Submit job
            assert rayjob.submit() == job_name

            # 2. Check if job is suspended or immediately admitted by Kueue
            job_cr = self.job_api.get_job(
                name=rayjob.name, k8s_namespace=rayjob.namespace
            )
            is_suspended = job_cr.get("spec", {}).get("suspend", False)

            if is_suspended:
                print("Job is queued and suspended by Kueue (waiting for resources)")
                assert wait_for_kueue_admission(
                    self, self.job_api, rayjob.name, rayjob.namespace, timeout=120
                ), "Job was not admitted by Kueue"
                print("✓ Job admitted by Kueue")
            else:
                print("✓ Job was immediately admitted by Kueue (resources available)")

            # 3. Wait for job to be running
            assert self.job_api.wait_until_job_running(
                name=rayjob.name, k8s_namespace=rayjob.namespace, timeout=300
            ), "Job did not reach running state"
            print("✓ Job is running")

            # 4. Manually suspend the job (simulating preemption or manual intervention)
            print("Manually suspending the job...")
            assert rayjob.stop(), "Job stop failed"

            # Verify suspension
            job_cr = self.job_api.get_job(
                name=rayjob.name, k8s_namespace=rayjob.namespace
            )
            assert job_cr["spec"]["suspend"] is True, "Job suspend not set to true"

            # Wait for suspended state
            assert self._wait_for_job_status(
                rayjob, "Suspended", timeout=30
            ), "Job did not reach Suspended state"
            print("✓ Job manually suspended")

            # 5. Resume the job
            print("Resuming the job...")
            assert rayjob.resubmit(), "Job resubmit failed"

            # Note: With Kueue, the job might go back to the queue and need re-admission
            # Check if Kueue re-queued it
            job_cr = self.job_api.get_job(
                name=rayjob.name, k8s_namespace=rayjob.namespace
            )
            if job_cr.get("spec", {}).get("suspend", False):
                print("Job re-queued by Kueue, waiting for re-admission...")
                assert wait_for_kueue_admission(
                    self, self.job_api, rayjob.name, rayjob.namespace, timeout=120
                ), "Job was not re-admitted by Kueue"

            # 6. Wait for job to complete
            assert self.job_api.wait_until_job_finished(
                name=rayjob.name, k8s_namespace=rayjob.namespace, timeout=300
            ), "Job did not complete after resume"
            print("✓ Job completed after manual suspend/resume")

        finally:
            assert rayjob.delete()
            self.verify_cluster_cleanup(rayjob)

    def _wait_for_job_status(
        self,
        rayjob: RayJob,
        expected_status: str,
        timeout: int = 30,
    ) -> bool:
        """Wait for a job to reach a specific deployment status."""
        elapsed_time = 0
        check_interval = 2

        while elapsed_time < timeout:
            status = self.job_api.get_job_status(
                name=rayjob.name, k8s_namespace=rayjob.namespace
            )
            if status and status.get("jobDeploymentStatus") == expected_status:
                return True

            sleep(check_interval)
            elapsed_time += check_interval

        return False

    def verify_cluster_cleanup(self, rayjob: RayJob, timeout: int = 60):
        """Verify that the cluster created by the RayJob has been cleaned up."""
        elapsed_time = 0
        check_interval = 5
        cluster_api = RayClusterApi()

        while elapsed_time < timeout:
            # List all RayClusters in the namespace
            clusters = cluster_api.list_ray_clusters(
                k8s_namespace=rayjob.namespace, async_req=False
            )

            # Check if any cluster exists that starts with our job name
            found = False
            for cluster in clusters.get("items", []):
                cluster_name = cluster.get("metadata", {}).get("name", "")
                if cluster_name.startswith(f"{rayjob.name}-raycluster"):
                    found = True
                    break

            if not found:
                # No cluster found, cleanup successful
                return

            sleep(check_interval)
            elapsed_time += check_interval

        raise TimeoutError(
            f"RayCluster for job '{rayjob.name}' was not cleaned up within {timeout} seconds"
        )
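The tests rely on wait_for_kueue_admission from the shared support module, whose implementation is not part of this commit. As a rough illustration of one way such a check can work (Kueue lifts spec.suspend once it admits the RayJob), a hypothetical poller might look like:

    from time import sleep

    def poll_for_admission(job_api, name, namespace, timeout=120, interval=5):
        """Illustrative only; not the support module's implementation."""
        elapsed = 0
        while elapsed < timeout:
            cr = job_api.get_job(name=name, k8s_namespace=namespace)
            # Kueue admission unsuspends the RayJob, so suspend flips to False.
            if not cr.get("spec", {}).get("suspend", False):
                return True
            sleep(interval)
            elapsed += interval
        return False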
