From 7eea8af0335a8667d7ec2b1ae1d2003d26f112cc Mon Sep 17 00:00:00 2001 From: RUEI4341 <82101769+RUEI4341@users.noreply.github.com> Date: Tue, 10 Feb 2026 07:31:50 +0000 Subject: [PATCH] fix: Extend the timeout for `maxtext_convergence` The convergence test using the grain dataset (`maxtext-convergence-grain`) frequently exceeds the previous 5-hour limit. Observations show that this specific task often requires more than 5 hours to reach completion. This change extends the timeout for that test only to 6 hours (360 minutes) so that the DAG can finish successfully without manual intervention; all other convergence tests keep the existing 5-hour (300-minute) timeout. --- dags/multipod/maxtext_convergence.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dags/multipod/maxtext_convergence.py b/dags/multipod/maxtext_convergence.py index 5f7f57951..c02ada322 100644 --- a/dags/multipod/maxtext_convergence.py +++ b/dags/multipod/maxtext_convergence.py @@ -84,9 +84,12 @@ sequential_tests = [] for test_name, run_command in convergence_tests.items(): + # The grain dataset takes longer to run, so we give it a longer timeout. The other tests are expected to complete within 5 hours. + timeout_in_min = 360 if test_name == "maxtext-convergence-grain" else 300 + test_task = gke_config.get_gke_config( cluster=XpkClusters.TPU_V6E_256_MLPERF_CLUSTER, - time_out_in_min=300, + time_out_in_min=timeout_in_min, test_name=test_name, run_model_cmds=run_command, docker_image=DockerImage.MAXTEXT_TPU_JAX_STABLE.value,