Skip to content

Commit 0f9e0ac

Browse files
YiHao990416 authored and alfredyu-cienet committed
fix: Reschedule TPU observability DAGs to avoid cluster resources conflict (#197)
This change reschedules all TPU observability DAGs to the following times to avoid cluster resource conflicts:
1. `node_pool_status`: UTC 18:00
2. `interruption_validation_dag`: UTC 18:30
3. `jobset_ttr_pod_delete`: UTC 19:00
4. `multi_host_nodepool_rollback_dag`: UTC 19:30
5. `tpu_info_format_validation_dags`: UTC 20:00
6. `update_node_pool_label`: UTC 20:30
7. `node_pool_ttr_disk_size`: UTC 21:00
8. `node_pool_ttr_update_label`: UTC 21:30
9. `tpu_sdk_monitoring_validation_dag`: UTC 22:00
10. `jobset_ttr_rollback`: UTC 22:30
1 parent bc9a60b commit 0f9e0ac

8 files changed

+8
-8
lines changed

dags/tpu_observability/jobset_ttr_pod_delete.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@
3232
with models.DAG( # pylint: disable=unexpected-keyword-arg
3333
dag_id="jobset_ttr_pod_delete",
3434
start_date=datetime.datetime(2026, 1, 8),
35-
schedule="0 18 * * *" if composer_env.is_prod_env() else None,
35+
schedule="0 19 * * *" if composer_env.is_prod_env() else None,
3636
catchup=False,
3737
tags=[
3838
"cloud-ml-auto-solutions",

dags/tpu_observability/jobset_ttr_rollback.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@
3232
with models.DAG( # pylint: disable=unexpected-keyword-arg
3333
dag_id="jobset_rollback_ttr",
3434
start_date=datetime.datetime(2025, 8, 10),
35-
schedule="0 18 * * *" if composer_env.is_prod_env() else None,
35+
schedule="30 22 * * *" if composer_env.is_prod_env() else None,
3636
catchup=False,
3737
tags=[
3838
"cloud-ml-auto-solutions",

dags/tpu_observability/multi_host_nodepool_rollback_dag.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@
3535
with models.DAG( # pylint: disable=unexpected-keyword-arg
3636
dag_id="multi-host-availability-rollback",
3737
start_date=datetime.datetime(2025, 8, 10),
38-
schedule="30 18 * * *" if composer_env.is_prod_env() else None,
38+
schedule="30 19 * * *" if composer_env.is_prod_env() else None,
3939
catchup=False,
4040
tags=[
4141
"cloud-ml-auto-solutions",

dags/tpu_observability/node_pool_ttr_disk_size.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@
3232
with models.DAG(
3333
dag_id="node_pool_ttr_disk_size",
3434
start_date=datetime.datetime(2025, 6, 26),
35-
schedule="00 20 * * *" if composer_env.is_prod_env() else None,
35+
schedule="0 21 * * *" if composer_env.is_prod_env() else None,
3636
catchup=False,
3737
tags=[
3838
"gke",

dags/tpu_observability/node_pool_ttr_update_label.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@
2929
with models.DAG(
3030
dag_id="node_pool_ttr_update_label",
3131
start_date=datetime.datetime(2025, 9, 30),
32-
schedule="0 20 * * *" if composer_env.is_prod_env() else None,
32+
schedule="30 21 * * *" if composer_env.is_prod_env() else None,
3333
catchup=False,
3434
tags=[
3535
"gke",

dags/tpu_observability/tpu_info_format_validation_dags.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -292,7 +292,7 @@ def validate_latency_table(tpu_info_output: list[tpu_info.Table]):
292292
dag_id="tpu_info_format_validation_dag",
293293
start_date=datetime.datetime(2025, 8, 15),
294294
default_args={"retries": 0},
295-
schedule="0 19 * * *" if composer_env.is_prod_env() else None,
295+
schedule="0 20 * * *" if composer_env.is_prod_env() else None,
296296
catchup=False,
297297
tags=["gke", "tpu-observability", "tpu-info", "TPU", "v6e-16"],
298298
description=(

dags/tpu_observability/tpu_sdk_monitoring_validation_dag.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -80,7 +80,7 @@ def validate_monitoring_sdk(info: node_pool.Info, pod_name: str) -> None:
8080
with models.DAG(
8181
dag_id="tpu_sdk_monitoring_validation",
8282
start_date=datetime.datetime(2026, 1, 13),
83-
schedule="0 18 * * *" if composer_env.is_prod_env() else None,
83+
schedule="0 22 * * *" if composer_env.is_prod_env() else None,
8484
catchup=False,
8585
tags=[
8686
"cloud-ml-auto-solutions",

dags/tpu_observability/update_node_pool_label.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@
3333
with models.DAG( # pylint: disable=unexpected-keyword-arg
3434
dag_id="gke_node_pool_label_update",
3535
start_date=datetime.datetime(2025, 8, 1),
36-
schedule="30 19 * * *" if composer_env.is_prod_env() else None,
36+
schedule="30 20 * * *" if composer_env.is_prod_env() else None,
3737
catchup=False,
3838
tags=["gke", "tpu-observability", "node-pool-status", "TPU", "v6e-16"],
3939
description=(

0 commit comments

Comments
 (0)