|
6 | 6 | from enum import Enum |
7 | 7 | import json |
8 | 8 | import os |
| 9 | +import time |
9 | 10 | from typing import Dict, List, Optional, TypedDict |
10 | 11 |
|
11 | 12 | from .shell import Shell |
| 13 | +from .logging import log |
12 | 14 |
|
13 | 15 |
|
14 | 16 | class Cloud(Enum): |
@@ -213,11 +215,25 @@ def list_eks_clusters(shell: Shell) -> Dict[str, ForgeCluster]: |
213 | 215 |
|
214 | 216 |
|
215 | 217 | def list_gke_clusters(shell: Shell) -> Dict[str, ForgeCluster]: |
216 | | - cluster_json = shell.run( |
217 | | - ["gcloud", "container", "clusters", "list", "--format=json"] |
218 | | - ).unwrap() |
| 218 | + # hack: Retry GKE cluster listing a few times in case of transient failures |
| 219 | + max_retries = 10 |
| 220 | + cluster_json_str = "[]" |
| 221 | + for attempt in range(max_retries): |
| 222 | + try: |
| 223 | + cluster_json = shell.run( |
| 224 | + ["gcloud", "container", "clusters", "list", "--format=json(name, location)"] |
| 225 | + ).unwrap() |
| 226 | + cluster_json_str = cluster_json.decode() |
| 227 | + log.info(f"GKE clusters list (attempt {attempt + 1}): {cluster_json_str}") |
| 228 | + break # Success, exit retry loop |
| 229 | + except Exception as e: |
| 230 | + if attempt == max_retries - 1: |
| 231 | + # Last attempt failed, re-raise the exception |
| 232 | + raise e |
| 233 | + log.warning(f"GKE cluster listing attempt {attempt + 1} failed: {e}. Retrying...") |
| 234 | + time.sleep(10) |
219 | 235 | try: |
220 | | - cluster_result = json.loads(cluster_json.decode()) |
| 236 | + cluster_result = json.loads(cluster_json_str) |
221 | 237 | clusters: Dict[str, ForgeCluster] = {} |
222 | 238 | for cluster_config in cluster_result: |
223 | 239 | cluster_name = cluster_config["name"] |
|
0 commit comments