@@ -1463,14 +1463,16 @@ def install_diagon_prerequisites():
14631463 """
14641464 deployment_name = 'kueue-controller-manager'
14651465 namespace_name = 'kueue-system'
1466+ cert_webhook_deployment_name = 'cert-manager-webhook'
1467+ cert_webhook_namespace_name = 'cert-manager'
14661468 # is_running = wait_for_cluster_running(args)
14671469 is_running = wait_for_deployment_ready (deployment_name , namespace_name )
14681470 if is_running :
14691471 return_code = install_cert_manager ()
14701472 if return_code != 0 :
14711473 return return_code
14721474
1473- cert_webhook_ready = check_cert_manager_webhook_status ( )
1475+ cert_webhook_ready = wait_for_deployment_ready ( cert_webhook_deployment_name , cert_webhook_namespace_name )
14741476 if cert_webhook_ready :
14751477
14761478 webhook_package = "mldiagnostics-injection-webhook"
@@ -1553,96 +1555,3 @@ def wait_for_deployment_ready(deployment_name: str, namespace: str, timeout_seco
15531555 xpk_print (f"\n Unexpected API request error while checking deployment status: { e } " )
15541556 time .sleep (10 )
15551557 return False
1556-
1557-
def wait_for_cluster_running(args, timeout_minutes: int = 30) -> bool:
    """Poll the GKE cluster status with the gcloud CLI until it is RUNNING.

    Args:
        args: user provided arguments for running the command. The
            ``cluster``, ``zone`` and ``project`` attributes are read.
        timeout_minutes: Timeout duration in minutes.

    Returns:
        bool: True once the status reported by gcloud contains RUNNING.
        False if gcloud reports the cluster as "Not found", if the status
        contains ERROR or DEGRADED, or if the timeout elapses first.
    """
    timeout_seconds = timeout_minutes * 60
    # A monotonic clock keeps the timeout correct even if the wall clock
    # is adjusted while we are polling.
    start_time = time.monotonic()

    # Construct gcloud command to describe the cluster status
    command = (
        'gcloud container clusters describe'
        f' {args.cluster} --region={zone_to_region(args.zone)} --project={args.project} '
        " --format='value(status)'"
    )

    xpk_print(
        f"Waiting for cluster {args.cluster} ({args.zone} ) to enter RUNNING state (using gcloud CLI)..."
    )

    while time.monotonic() - start_time < timeout_seconds:
        try:
            return_code, return_output = run_command_for_value(
                command, 'Get the status of cluster...'
            )
            # A non-zero exit code means the gcloud command itself failed.
            if return_code != 0:
                # "Not found" in the output: the cluster does not exist,
                # so retrying cannot help.
                if "Not found" in return_output:
                    xpk_print(
                        f"\n Error: Cluster {args.cluster} does not exist in {args.zone} ."
                    )
                    return False

                # Other execution errors: wait briefly and retry.
                xpk_print(f"\n Error: gcloud command failed. {return_output} ")
                time.sleep(10)
                continue

            # Interpret the cluster status returned by gcloud.
            if "RUNNING" in return_output:
                xpk_print(f"Success: Cluster {args.cluster} status is RUNNING.")
                return True
            elif "ERROR" in return_output or "DEGRADED" in return_output:
                xpk_print(
                    f"Error: Cluster status is {return_output} , creation failed."
                )
                return False
            else:
                elapsed_time = int(time.monotonic() - start_time)
                xpk_print(
                    f"Current status: {return_output} . Elapsed time: {elapsed_time} seconds. Checking again..."
                )
        except Exception as e:
            xpk_print(f"\n Unexpected API request error: {e} ")
            time.sleep(10)  # Wait longer on unexpected errors

        # Poll interval
        time.sleep(30)

    xpk_print(
        f"\n Timeout Error: Cluster did not reach RUNNING state within {timeout_minutes} minutes."
    )
    return False
1619-
def check_cert_manager_webhook_status(timeout_seconds: int = 300) -> bool:
    """Check whether the cert-manager-webhook deployment has rolled out.

    Runs ``kubectl rollout status`` for ``deployment/cert-manager-webhook``
    in the ``cert-manager`` namespace and inspects the command output.

    Args:
        timeout_seconds (int): The maximum time to wait for the rollout to
            complete; passed to kubectl as ``--timeout=<n>s``.

    Returns:
        bool: True if the command output contains "successfully rolled out".
        False otherwise, including when running the command raises.
    """
    # Build the kubectl command. The unit must be attached to the number
    # ("300s"); a detached "s" would be passed as a separate argument.
    kubectl_command = (
        'kubectl rollout status deployment/cert-manager-webhook -n cert-manager'
        f' --timeout={timeout_seconds}s'
    )

    xpk_print(f"Running command to check deployment status: {kubectl_command} ")

    try:
        # Success is judged from the output text, so the exit code is unused.
        _, return_output = run_command_for_value(
            kubectl_command, 'check cert manager...'
        )
        if "successfully rolled out" in return_output:
            xpk_print("SUCCESS: Deployment cert-manager-webhook rollout completed.")
            return True
    except Exception as e:
        xpk_print(f"\n Unexpected error during kubectl execution: {e} ")

    # Every non-success path reports failure explicitly instead of
    # falling off the end of the function.
    return False
0 commit comments