Skip to content

Commit 7979429

Browse files
committed
task(RHOAIENG-26590): Report RayJob status via SDK
Signed-off-by: Pat O'Connor <[email protected]>
1 parent 77bc95a commit 7979429

File tree

7 files changed

+708
-3
lines changed

7 files changed

+708
-3
lines changed

src/codeflare_sdk/ray/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66

77
from .rayjobs import (
88
RayJob,
9+
RayJobDeploymentStatus,
10+
CodeflareRayJobStatus,
11+
RayJobInfo,
912
)
1013

1114
from .cluster import (
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
from .rayjob import RayJob
2+
from .status import RayJobDeploymentStatus, CodeflareRayJobStatus, RayJobInfo
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
# Copyright 2025 IBM, Red Hat
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""
16+
This sub-module exists primarily to be used internally by the RayJob object
17+
(in the rayjob sub-module) for pretty-printing job status and details.
18+
"""
19+
20+
from rich.console import Console
21+
from rich.table import Table
22+
from rich.panel import Panel
23+
from typing import Tuple, Optional
24+
25+
from .status import RayJobDeploymentStatus, RayJobInfo
26+
27+
28+
def print_job_status(job_info: RayJobInfo):
    """
    Render the details of a Ray job as a panel on the console.

    The header colour and status badge are chosen from the job's deployment
    status. The start time and the failed-attempt count are optional rows
    that only appear when they carry a value.

    Args:
        job_info: Details of the job to display.
    """
    status_display, header_color = _get_status_display(job_info.status)
    table = _create_info_table(header_color, job_info.name, status_display)

    # Rows that are shown for every job.
    always_shown = (
        f"[bold]Job ID:[/bold] {job_info.job_id}",
        f"[bold]Status:[/bold] {job_info.status.value}",
        f"[bold]RayCluster:[/bold] {job_info.cluster_name}",
        f"[bold]Namespace:[/bold] {job_info.namespace}",
    )
    for row in always_shown:
        table.add_row(row)

    # Timing is only reported once a start time has been recorded.
    if job_info.start_time:
        table.add_row(f"[bold]Started:[/bold] {job_info.start_time}")

    # The failure counter is noise unless at least one attempt failed.
    if job_info.failed_attempts > 0:
        table.add_row(f"[bold]Failed Attempts:[/bold] {job_info.failed_attempts}")

    _print_table_in_panel(table)
50+
51+
52+
def print_no_job_found(job_name: str, namespace: str):
    """
    Print a red "No RayJob found" panel for a job that could not be located.

    Args:
        job_name: Name of the job that was looked up.
        namespace: Namespace the lookup was performed in.
    """
    table = _create_info_table(
        "[white on red][bold]Name",
        job_name,
        "[bold red]No RayJob found",
    )

    # An empty tuple yields a blank spacer row around the hint text.
    body_rows = (
        (),
        ("Have you run rayjob.submit() yet?",),
        (),
        (f"[bold]Namespace:[/bold] {namespace}",),
    )
    for cells in body_rows:
        table.add_row(*cells)

    _print_table_in_panel(table)
64+
65+
66+
def _get_status_display(status: RayJobDeploymentStatus) -> Tuple[str, str]:
    """
    Look up how a deployment status should be rendered.

    Args:
        status: Deployment status of the job.

    Returns:
        Tuple of (status_display, header_color). Statuses without a
        dedicated entry fall back to the "Unknown" badge on a red header.
    """
    fallback = ("Unknown :question:", "[white on red][bold]Name")
    known_displays = {
        RayJobDeploymentStatus.COMPLETE: (
            "Complete :white_heavy_check_mark:",
            "[white on green][bold]Name",
        ),
        RayJobDeploymentStatus.RUNNING: (
            "Running :gear:",
            "[white on blue][bold]Name",
        ),
        RayJobDeploymentStatus.FAILED: (
            "Failed :x:",
            "[white on red][bold]Name",
        ),
        RayJobDeploymentStatus.SUSPENDED: (
            "Suspended :pause_button:",
            "[white on yellow][bold]Name",
        ),
    }

    if status in known_displays:
        return known_displays[status]
    return fallback
81+
82+
83+
def _create_info_table(header_color: str, name: str, status_display: str) -> Table:
    """
    Build the borderless, header-less table shared by the job panels.

    Args:
        header_color: Markup string placed alone on the first row.
        name: Job name, rendered bold and underlined.
        status_display: Status badge shown next to the name.

    Returns:
        Table holding the header row, the name/status row and a trailing
        blank separator row.
    """
    info = Table(box=None, show_header=False)
    name_cell = "[bold underline]" + name

    info.add_row(header_color)
    info.add_row(name_cell, status_display)
    info.add_row()  # Blank line between the heading and the detail rows.
    return info
95+
96+
97+
def _print_table_in_panel(table: Table):
    """
    Wrap a table in a fitted panel under the RayJob status title and print it.

    Args:
        table: The already populated table to display.
    """
    framed = Panel.fit(table)

    # The outer borderless table exists only to carry the title above the panel.
    outer = Table(box=None, title="[bold] :package: CodeFlare RayJob Status :package:")
    outer.add_row(framed)

    Console().print(outer)

src/codeflare_sdk/ray/rayjobs/rayjob.py

Lines changed: 73 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,16 @@
33
"""
44

55
import logging
6-
from typing import Dict, Any, Optional
6+
from typing import Dict, Any, Optional, Tuple
77
from odh_kuberay_client.kuberay_job_api import RayjobApi
88

9+
from .status import (
10+
RayJobDeploymentStatus,
11+
CodeflareRayJobStatus,
12+
RayJobInfo,
13+
)
14+
from . import pretty_print
15+
916
# Set up logging
1017
logger = logging.getLogger(__name__)
1118

@@ -15,7 +22,7 @@ class RayJob:
1522
A client for managing Ray jobs using the KubeRay operator.
1623
1724
This class provides a simplified interface for submitting and managing
18-
Ray jobs in a Kubernetes cluster with the KubeRay operator installed.
25+
Ray jobs via the Codeflare SDK (using the KubeRay RayJob python client).
1926
"""
2027

2128
def __init__(
@@ -40,7 +47,7 @@ def __init__(
4047
self.cluster_name = cluster_name
4148
self.entrypoint = entrypoint
4249
self.runtime_env = runtime_env
43-
50+
4451
# Initialize the KubeRay job API client
4552
self._api = RayjobApi()
4653

@@ -109,3 +116,66 @@ def _build_rayjob_cr(
109116
rayjob_cr["spec"]["runtimeEnvYAML"] = str(runtime_env)
110117

111118
return rayjob_cr
119+
120+
def status(self, print_to_console: bool = True) -> Tuple[CodeflareRayJobStatus, bool]:
    """
    Fetch the current status of this Ray job and optionally print it.

    Args:
        print_to_console: Whether to print formatted status to console (default: True)

    Returns:
        Tuple of (CodeflareRayJobStatus, ready: bool). ``ready`` is True only
        when the job is complete. When the API returns no status data the
        result is ``(CodeflareRayJobStatus.UNKNOWN, False)``.
    """
    raw = self._api.get_job_status(name=self.name, k8s_namespace=self.namespace)

    # Nothing came back for this job: report it and bail out early.
    if not raw:
        if print_to_console:
            pretty_print.print_no_job_found(self.name, self.namespace)
        return CodeflareRayJobStatus.UNKNOWN, False

    # Values the enum does not define are folded into UNKNOWN.
    reported = raw.get("jobDeploymentStatus", "Unknown")
    try:
        deployment_status = RayJobDeploymentStatus(reported)
    except ValueError:
        deployment_status = RayJobDeploymentStatus.UNKNOWN

    job_info = RayJobInfo(
        name=self.name,
        job_id=raw.get("jobId", ""),
        status=deployment_status,
        namespace=self.namespace,
        cluster_name=self.cluster_name,
        start_time=raw.get("startTime"),
        end_time=raw.get("endTime"),
        failed_attempts=raw.get("failed", 0),
        succeeded_attempts=raw.get("succeeded", 0),
        dashboard_url=raw.get("dashboardURL"),
    )

    result = self._map_to_codeflare_status(deployment_status)

    if print_to_console:
        pretty_print.print_job_status(job_info)

    return result
166+
167+
def _map_to_codeflare_status(self, deployment_status: RayJobDeploymentStatus) -> Tuple[CodeflareRayJobStatus, bool]:
    """
    Translate a deployment status into the CodeFlare status and a ready flag.

    Args:
        deployment_status: Deployment status to translate.

    Returns:
        Tuple of (CodeflareRayJobStatus, ready: bool). ``ready`` is True only
        for COMPLETE; statuses without an entry map to (UNKNOWN, False).
    """
    translations = {
        RayJobDeploymentStatus.COMPLETE: (CodeflareRayJobStatus.COMPLETE, True),
        RayJobDeploymentStatus.RUNNING: (CodeflareRayJobStatus.RUNNING, False),
        RayJobDeploymentStatus.FAILED: (CodeflareRayJobStatus.FAILED, False),
        RayJobDeploymentStatus.SUSPENDED: (CodeflareRayJobStatus.SUSPENDED, False),
    }

    try:
        return translations[deployment_status]
    except KeyError:
        return (CodeflareRayJobStatus.UNKNOWN, False)
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# Copyright 2025 IBM, Red Hat
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""
16+
The status sub-module defines Enums containing information for Ray job
17+
deployment states and CodeFlare job states, as well as
18+
dataclasses to store information for Ray jobs.
19+
"""
20+
21+
from dataclasses import dataclass
22+
from enum import Enum
23+
from typing import Optional
24+
25+
26+
class RayJobDeploymentStatus(Enum):
    """
    Deployment states of a Ray job, keyed by the status string they stand for.

    Each member's value is the string that is looked up when constructing the
    enum from a reported status, e.g. ``RayJobDeploymentStatus("Running")``.
    """

    # Terminal states.
    COMPLETE = "Complete"
    FAILED = "Failed"

    # Non-terminal states.
    RUNNING = "Running"
    SUSPENDED = "Suspended"

    # Placeholder for anything not listed above.
    UNKNOWN = "Unknown"
35+
36+
37+
class CodeflareRayJobStatus(Enum):
    """
    States of a Ray job as reported to CodeFlare SDK users.

    The integer values are plain identifiers for the members.
    """

    COMPLETE = 1
    RUNNING = 2
    FAILED = 3
    SUSPENDED = 4
    UNKNOWN = 5  # Reported when the state cannot be determined.
46+
47+
48+
@dataclass
class RayJobInfo:
    """
    Snapshot of a Ray job's identity and reported status fields.

    The first five fields are required; the remaining ones default to
    ``None`` or ``0`` when no value is available.
    """

    # Identity and placement.
    name: str
    job_id: str
    status: RayJobDeploymentStatus
    namespace: str
    cluster_name: str

    # Timing, as strings; None when not available.
    start_time: Optional[str] = None
    end_time: Optional[str] = None

    # Attempt counters.
    failed_attempts: int = 0
    succeeded_attempts: int = 0

    # Dashboard address; None when not available.
    dashboard_url: Optional[str] = None

0 commit comments

Comments
 (0)