Skip to content

Commit 60b16f4

Browse files
authored
[forge] add debug for cluster list (#17679)
1 parent f21d371 commit 60b16f4

File tree

2 files changed

+25
-4
lines changed

2 files changed

+25
-4
lines changed

testsuite/forge.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1735,6 +1735,11 @@ def test(
17351735

17361736
try:
17371737
forge_runner = forge_runner_mapping[forge_runner_mode]()
1738+
1739+
# Debug aid: if forge_runner_duration_secs is "0", exit immediately without running the test.
1740+
if forge_runner_duration_secs == "0":
1741+
raise SystemExit(0)
1742+
17381743
result = forge_runner.run(forge_context)
17391744

17401745
outputs = []

testsuite/test_framework/cluster.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,11 @@
66
from enum import Enum
77
import json
88
import os
9+
import time
910
from typing import Dict, List, Optional, TypedDict
1011

1112
from .shell import Shell
13+
from .logging import log
1214

1315

1416
class Cloud(Enum):
@@ -213,11 +215,25 @@ def list_eks_clusters(shell: Shell) -> Dict[str, ForgeCluster]:
213215

214216

215217
def list_gke_clusters(shell: Shell) -> Dict[str, ForgeCluster]:
216-
cluster_json = shell.run(
217-
["gcloud", "container", "clusters", "list", "--format=json"]
218-
).unwrap()
218+
# HACK: Retry the GKE cluster listing a few times in case of transient failures
219+
max_retries = 10
220+
cluster_json_str = "[]"
221+
for attempt in range(max_retries):
222+
try:
223+
cluster_json = shell.run(
224+
["gcloud", "container", "clusters", "list", "--format=json(name, location)"]
225+
).unwrap()
226+
cluster_json_str = cluster_json.decode()
227+
log.info(f"GKE clusters list (attempt {attempt + 1}): {cluster_json_str}")
228+
break # Success, exit retry loop
229+
except Exception as e:
230+
if attempt == max_retries - 1:
231+
# Last attempt failed, re-raise the exception
232+
raise e
233+
log.warning(f"GKE cluster listing attempt {attempt + 1} failed: {e}. Retrying...")
234+
time.sleep(10)
219235
try:
220-
cluster_result = json.loads(cluster_json.decode())
236+
cluster_result = json.loads(cluster_json_str)
221237
clusters: Dict[str, ForgeCluster] = {}
222238
for cluster_config in cluster_result:
223239
cluster_name = cluster_config["name"]

0 commit comments

Comments
 (0)