Skip to content

Commit 02f90e3

Browse files
feat: Enhance healthiness and interruption reason for "PLX dashboard" (#1191)
Add additional metrics to the XLML metadata logging for JobSet observability. - `jobset_name`: add the jobset name. - `pod_name_pattern`: a regex pattern matching the names of pods associated with the JobSet (the jobset-name prefix is stable across pod recreations, making a pattern more reliable than an exact pod-name list captured before the interruption event). Co-authored-by: Chris Liao <388chris@gmail.com>
1 parent 68f4a34 commit 02f90e3

File tree

5 files changed

+76
-28
lines changed

5 files changed

+76
-28
lines changed

dags/tpu_observability/jobset_ttr_kill_process.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -156,19 +156,25 @@ def kill_tpu_pod_workload(info: node_pool.Info, pod_name: str) -> None:
156156
workload_type=Workload.JAX_TPU_BENCHMARK,
157157
)
158158

159-
pod_names = jobset.list_pod_names.override(task_id="list_pod_names")(
159+
running_pods = jobset.wait_for_all_pods_running.override(
160+
task_id="ensure_all_pods_running"
161+
)(
160162
node_pool=cluster_info,
161163
jobset_config=jobset_config,
162164
)
163165

164166
wait_for_job_start = jobset.wait_for_jobset_started.override(
165167
task_id="wait_for_job_start"
166-
)(cluster_info, pod_name_list=pod_names, job_apply_time=apply_time)
168+
)(
169+
cluster_info,
170+
pod_name_list=running_pods,
171+
job_apply_time=apply_time,
172+
)
167173

168174
kill_tasks = (
169175
kill_tpu_pod_workload.override(task_id="kill_tpu_pod_workload")
170176
.partial(info=cluster_info)
171-
.expand(pod_name=pod_names)
177+
.expand(pod_name=running_pods)
172178
)
173179

174180
wait_for_metric_upload = jobset.wait_for_jobset_ttr_to_be_found.override(
@@ -199,7 +205,7 @@ def kill_tpu_pod_workload(info: node_pool.Info, pod_name: str) -> None:
199205
cluster_info,
200206
create_node_pool,
201207
apply_time,
202-
pod_names,
208+
running_pods,
203209
wait_for_job_start,
204210
kill_tasks,
205211
wait_for_metric_upload,

dags/tpu_observability/jobset_ttr_pod_delete.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,8 @@
5656
],
5757
description=(
5858
"This DAG tests the JobSet time-to-recover metric by deleting a random "
59-
"pod to trigger a recovery, then polls the metric to check if it is updated."
59+
"pod to trigger a recovery, then polls the metric to check if it is"
60+
" updated."
6061
),
6162
doc_md="""
6263
# JobSet Time-To-Recover (TTR) Test Using Random Pod Deletion

dags/tpu_observability/tpu_info_format_validation_dags.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -406,23 +406,25 @@ def generate_second_node_pool_name(
406406
workload_type=Workload.JAX_TPU_BENCHMARK,
407407
)
408408

409-
pod_names = jobset.list_pod_names.override(
410-
task_id="list_pod_names",
411-
retries=5,
412-
retry_delay=datetime.timedelta(seconds=10),
409+
running_pods = jobset.wait_for_all_pods_running.override(
410+
task_id="ensure_all_pods_running"
413411
)(
414412
node_pool=cluster_info,
415413
jobset_config=jobset_config,
416414
)
417415

418416
wait_for_job_start = jobset.wait_for_jobset_started.override(
419417
task_id="wait_for_job_start"
420-
)(cluster_info, pod_name_list=pod_names, job_apply_time=apply_time)
418+
)(
419+
cluster_info,
420+
pod_name_list=running_pods,
421+
job_apply_time=apply_time,
422+
)
421423

422424
outputs_of_tpu_info = (
423425
get_tpu_info_from_pod.override(task_id="get_tpu_info")
424426
.partial(info=cluster_info)
425-
.expand(pod_name=pod_names)
427+
.expand(pod_name=running_pods)
426428
)
427429

428430
output_of_tpu_info = (
@@ -521,7 +523,7 @@ def generate_second_node_pool_name(
521523
cluster_info_2,
522524
create_node_pool,
523525
apply_time,
524-
pod_names,
526+
running_pods,
525527
wait_for_job_start,
526528
outputs_of_tpu_info,
527529
output_of_tpu_info,

dags/tpu_observability/tpu_sdk_monitoring_validation_dag.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -162,23 +162,25 @@ def validate_monitoring_sdk(info: node_pool.Info, pod_name: str) -> None:
162162
workload_type=Workload.JAX_TPU_BENCHMARK,
163163
)
164164

165-
pod_names = jobset.list_pod_names.override(task_id="list_pod_names")(
165+
running_pods = jobset.wait_for_all_pods_running.override(
166+
task_id="ensure_all_pods_running"
167+
)(
166168
node_pool=cluster_info,
167169
jobset_config=jobset_config,
168170
)
169171

170172
wait_for_jobset_started = jobset.wait_for_jobset_started.override(
171173
task_id="wait_for_jobset_started"
172174
)(
173-
node_pool=cluster_info,
174-
pod_name_list=pod_names,
175+
cluster_info,
176+
pod_name_list=running_pods,
175177
job_apply_time=apply_time,
176178
)
177179

178180
sdk_validation = (
179181
validate_monitoring_sdk.override(task_id="sdk_validation")
180182
.partial(info=cluster_info)
181-
.expand(pod_name=pod_names)
183+
.expand(pod_name=running_pods)
182184
)
183185

184186
cleanup_workload = jobset.end_workload.override(
@@ -202,7 +204,7 @@ def validate_monitoring_sdk(info: node_pool.Info, pod_name: str) -> None:
202204
cluster_info,
203205
create_node_pool,
204206
apply_time,
205-
pod_names,
207+
running_pods,
206208
wait_for_jobset_started,
207209
sdk_validation,
208210
cleanup_workload,

dags/tpu_observability/utils/jobset_util.py

Lines changed: 48 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828

2929
from airflow.decorators import task
3030
from airflow.exceptions import AirflowFailException
31+
from airflow.sensors.base import PokeReturnValue
3132
from google.cloud.monitoring_v3 import types
3233
import kubernetes
3334

@@ -38,6 +39,7 @@
3839
from dags.tpu_observability.utils.node_pool_util import NODE_POOL_SELECTOR_KEY
3940
from dags.tpu_observability.utils.time_util import TimeUtil
4041
from xlml.apis import gcs
42+
from xlml.utils import composer
4143
from xlml.utils import gke
4244

4345

@@ -507,6 +509,24 @@ def run_workload(
507509

508510
subprocess.run_exec(cmd, env=env)
509511

512+
# Log metadata for XLML dashboard
513+
# Pod names follow the pattern:
514+
# {jobset_name}-{replicated_job_name}-{job-index}-{pod-index}-{random}
515+
# The jobset_name prefix is stable across pod recreations, so a regex
516+
# pattern is more reliable than an exact pod name list.
517+
pod_name_pattern = f"{jobset_config.jobset_name}.*"
518+
jobset_metadata = {
519+
"project_id": node_pool.project_id,
520+
"cluster_name": node_pool.cluster_name,
521+
"node_pool_name": node_pool.node_pool_name,
522+
"jobset_name": jobset_config.jobset_name,
523+
"pod_name_pattern": pod_name_pattern,
524+
}
525+
composer.log_metadata_for_xlml_dashboard(jobset_metadata)
526+
logging.info(
527+
"Logged JobSet metadata to XLML dashboard: %s", jobset_metadata
528+
)
529+
510530
current_time_utc = datetime.datetime.now(datetime.timezone.utc)
511531
return TimeUtil.from_datetime(current_time_utc)
512532

@@ -724,7 +744,8 @@ def wait_for_jobset_ttr_to_be_found(
724744
725745
Args:
726746
node_pool (Info): An instance of the Info class containing GKE metadata.
727-
jobset_config: An instance of the JobSet class representing the jobset configuration.
747+
jobset_config: An instance of the JobSet class representing the jobset
748+
configuration.
728749
start_time (TimeUtil, optional): The UTC timestamp to start polling from.
729750
If not provided, defaults to 60 minutes before the current time.
730751
@@ -749,23 +770,39 @@ def wait_for_jobset_ttr_to_be_found(
749770
end_time=TimeUtil.from_datetime(now),
750771
)
751772

752-
# This function checks whether the TTR metric is present;
753-
# it does not assess its value.
754773
logging.info("Time series: %s", time_series)
755774
return len(time_series) > 0
756775

757776

758777
@task.sensor(poke_interval=30, timeout=600, mode="poke")
759-
def wait_for_all_pods_running(node_pool: node_pool_info, jobset_config: JobSet):
760-
num_running = len(
761-
get_running_pods(
762-
node_pool=node_pool,
763-
jobset_name=jobset_config.jobset_name,
764-
namespace="default",
765-
)
778+
def wait_for_all_pods_running(
779+
node_pool: node_pool_info, jobset_config: JobSet
780+
) -> PokeReturnValue:
781+
"""Waits for all pods to be running and returns the pod names.
782+
783+
Args:
784+
node_pool: The Info object containing the cluster information.
785+
jobset_config: The JobSet configuration.
786+
787+
Returns:
788+
PokeReturnValue with is_done=True and pod names when all pods are running,
789+
or is_done=False to continue polling.
790+
"""
791+
running_pods = get_running_pods(
792+
node_pool=node_pool,
793+
jobset_name=jobset_config.jobset_name,
794+
namespace="default",
766795
)
767796
num_pods = jobset_config.replicas * jobset_config.parallelism
768-
return num_running == num_pods
797+
if len(running_pods) == num_pods:
798+
logging.info(
799+
"All %d pods are running for JobSet '%s': %s",
800+
num_pods,
801+
jobset_config.jobset_name,
802+
running_pods,
803+
)
804+
return PokeReturnValue(is_done=True, xcom_value=running_pods)
805+
return PokeReturnValue(is_done=False)
769806

770807

771808
def query_uptime_metrics(

0 commit comments

Comments
 (0)