From 0f9e0ac9ab9da1d412f97ac5e92779708e80cdcc Mon Sep 17 00:00:00 2001 From: "yihao.woon" Date: Wed, 28 Jan 2026 16:50:17 +0800 Subject: [PATCH] fix: Reschedule TPU observability DAGs to avoid cluster resources conflict (#197) This change reschedules the TPU observability DAGs to the following times to avoid cluster resource conflicts: 1. `node_pool_status`: UTC 18:00 2. `interruption_validation_dag`: UTC 18:30 3. `jobset_ttr_pod_delete`: UTC 19:00 4. `multi_host_nodepool_rollback_dag`: UTC 19:30 5. `tpu_info_format_validation_dags`: UTC 20:00 6. `update_node_pool_label`: UTC 20:30 7. `node_pool_ttr_disk_size`: UTC 21:00 8. `node_pool_ttr_update_label`: UTC 21:30 9. `tpu_sdk_monitoring_validation_dag`: UTC 22:00 10. `jobset_ttr_rollback`: UTC 22:30 Note: the diff below does not modify the files for entries 1 and 2; only entries 3-10 are changed by this patch. --- dags/tpu_observability/jobset_ttr_pod_delete.py | 2 +- dags/tpu_observability/jobset_ttr_rollback.py | 2 +- dags/tpu_observability/multi_host_nodepool_rollback_dag.py | 2 +- dags/tpu_observability/node_pool_ttr_disk_size.py | 2 +- dags/tpu_observability/node_pool_ttr_update_label.py | 2 +- dags/tpu_observability/tpu_info_format_validation_dags.py | 2 +- dags/tpu_observability/tpu_sdk_monitoring_validation_dag.py | 2 +- dags/tpu_observability/update_node_pool_label.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/dags/tpu_observability/jobset_ttr_pod_delete.py b/dags/tpu_observability/jobset_ttr_pod_delete.py index 299e1f0ac..111ca23bd 100644 --- a/dags/tpu_observability/jobset_ttr_pod_delete.py +++ b/dags/tpu_observability/jobset_ttr_pod_delete.py @@ -32,7 +32,7 @@ with models.DAG( # pylint: disable=unexpected-keyword-arg dag_id="jobset_ttr_pod_delete", start_date=datetime.datetime(2026, 1, 8), - schedule="0 18 * * *" if composer_env.is_prod_env() else None, + schedule="0 19 * * *" if composer_env.is_prod_env() else None, catchup=False, tags=[ "cloud-ml-auto-solutions", diff --git a/dags/tpu_observability/jobset_ttr_rollback.py b/dags/tpu_observability/jobset_ttr_rollback.py index 
1e68af605..59c1d921e 100644 --- a/dags/tpu_observability/jobset_ttr_rollback.py +++ b/dags/tpu_observability/jobset_ttr_rollback.py @@ -32,7 +32,7 @@ with models.DAG( # pylint: disable=unexpected-keyword-arg dag_id="jobset_rollback_ttr", start_date=datetime.datetime(2025, 8, 10), - schedule="0 18 * * *" if composer_env.is_prod_env() else None, + schedule="30 22 * * *" if composer_env.is_prod_env() else None, catchup=False, tags=[ "cloud-ml-auto-solutions", diff --git a/dags/tpu_observability/multi_host_nodepool_rollback_dag.py b/dags/tpu_observability/multi_host_nodepool_rollback_dag.py index b9e8483e7..0b6b431eb 100644 --- a/dags/tpu_observability/multi_host_nodepool_rollback_dag.py +++ b/dags/tpu_observability/multi_host_nodepool_rollback_dag.py @@ -35,7 +35,7 @@ with models.DAG( # pylint: disable=unexpected-keyword-arg dag_id="multi-host-availability-rollback", start_date=datetime.datetime(2025, 8, 10), - schedule="30 18 * * *" if composer_env.is_prod_env() else None, + schedule="30 19 * * *" if composer_env.is_prod_env() else None, catchup=False, tags=[ "cloud-ml-auto-solutions", diff --git a/dags/tpu_observability/node_pool_ttr_disk_size.py b/dags/tpu_observability/node_pool_ttr_disk_size.py index 2b70f55d9..56db07502 100644 --- a/dags/tpu_observability/node_pool_ttr_disk_size.py +++ b/dags/tpu_observability/node_pool_ttr_disk_size.py @@ -32,7 +32,7 @@ with models.DAG( dag_id="node_pool_ttr_disk_size", start_date=datetime.datetime(2025, 6, 26), - schedule="00 20 * * *" if composer_env.is_prod_env() else None, + schedule="0 21 * * *" if composer_env.is_prod_env() else None, catchup=False, tags=[ "gke", diff --git a/dags/tpu_observability/node_pool_ttr_update_label.py b/dags/tpu_observability/node_pool_ttr_update_label.py index 81bcb6b57..8ab2b6761 100644 --- a/dags/tpu_observability/node_pool_ttr_update_label.py +++ b/dags/tpu_observability/node_pool_ttr_update_label.py @@ -29,7 +29,7 @@ with models.DAG( dag_id="node_pool_ttr_update_label", 
start_date=datetime.datetime(2025, 9, 30), - schedule="0 20 * * *" if composer_env.is_prod_env() else None, + schedule="30 21 * * *" if composer_env.is_prod_env() else None, catchup=False, tags=[ "gke", diff --git a/dags/tpu_observability/tpu_info_format_validation_dags.py b/dags/tpu_observability/tpu_info_format_validation_dags.py index 6882738ef..1d88c6560 100644 --- a/dags/tpu_observability/tpu_info_format_validation_dags.py +++ b/dags/tpu_observability/tpu_info_format_validation_dags.py @@ -292,7 +292,7 @@ def validate_latency_table(tpu_info_output: list[tpu_info.Table]): dag_id="tpu_info_format_validation_dag", start_date=datetime.datetime(2025, 8, 15), default_args={"retries": 0}, - schedule="0 19 * * *" if composer_env.is_prod_env() else None, + schedule="0 20 * * *" if composer_env.is_prod_env() else None, catchup=False, tags=["gke", "tpu-observability", "tpu-info", "TPU", "v6e-16"], description=( diff --git a/dags/tpu_observability/tpu_sdk_monitoring_validation_dag.py b/dags/tpu_observability/tpu_sdk_monitoring_validation_dag.py index fd180fdb4..677b95571 100644 --- a/dags/tpu_observability/tpu_sdk_monitoring_validation_dag.py +++ b/dags/tpu_observability/tpu_sdk_monitoring_validation_dag.py @@ -80,7 +80,7 @@ def validate_monitoring_sdk(info: node_pool.Info, pod_name: str) -> None: with models.DAG( dag_id="tpu_sdk_monitoring_validation", start_date=datetime.datetime(2026, 1, 13), - schedule="0 18 * * *" if composer_env.is_prod_env() else None, + schedule="0 22 * * *" if composer_env.is_prod_env() else None, catchup=False, tags=[ "cloud-ml-auto-solutions", diff --git a/dags/tpu_observability/update_node_pool_label.py b/dags/tpu_observability/update_node_pool_label.py index b2c540dc5..c10228b14 100644 --- a/dags/tpu_observability/update_node_pool_label.py +++ b/dags/tpu_observability/update_node_pool_label.py @@ -33,7 +33,7 @@ with models.DAG( # pylint: disable=unexpected-keyword-arg dag_id="gke_node_pool_label_update", start_date=datetime.datetime(2025, 8, 
1), - schedule="30 19 * * *" if composer_env.is_prod_env() else None, + schedule="30 20 * * *" if composer_env.is_prod_env() else None, catchup=False, tags=["gke", "tpu-observability", "node-pool-status", "TPU", "v6e-16"], description=(