@@ -1464,14 +1464,16 @@ def install_diagon_prerequisites():
14641464 """
14651465 deployment_name = 'kueue-controller-manager'
14661466 namespace_name = 'kueue-system'
1467+ cert_webhook_deployment_name = 'cert-manager-webhook'
1468+ cert_webhook_namespace_name = 'cert-manager'
14671469 # is_running = wait_for_cluster_running(args)
14681470 is_running = wait_for_deployment_ready (deployment_name , namespace_name )
14691471 if is_running :
14701472 return_code = install_cert_manager ()
14711473 if return_code != 0 :
14721474 return return_code
14731475
1474- cert_webhook_ready = check_cert_manager_webhook_status ( )
1476+ cert_webhook_ready = wait_for_deployment_ready ( cert_webhook_deployment_name , cert_webhook_namespace_name )
14751477 if cert_webhook_ready :
14761478
14771479 webhook_package = "mldiagnostics-injection-webhook"
@@ -1554,96 +1556,3 @@ def wait_for_deployment_ready(deployment_name: str, namespace: str, timeout_seco
15541556 xpk_print (f"\n Unexpected API request error while checking deployment status: { e } " )
15551557 time .sleep (10 )
15561558 return False
1557-
1558-
def wait_for_cluster_running(args, timeout_minutes: int = 30) -> bool:
    """Poll the GKE cluster status with the gcloud CLI until it is RUNNING.

    Args:
      args: user provided arguments for running the command. This function
        reads ``args.cluster``, ``args.zone`` and ``args.project``.
      timeout_minutes: Timeout duration in minutes.

    Returns:
      bool: True if the cluster reports RUNNING before the timeout. False if
      gcloud reports "Not found", if the reported status contains ERROR or
      DEGRADED, or if the timeout elapses first.
    """
    timeout_seconds = timeout_minutes * 60
    start_time = time.time()

    # gcloud command that prints only the cluster's status field.
    command = (
        'gcloud container clusters describe'
        f' {args.cluster} --region={zone_to_region(args.zone)} --project={args.project}'
        " --format='value(status)'"
    )

    # Use xpk_print like every other message in this function (the original
    # used a bare print here).
    xpk_print(
        f"Waiting for cluster {args.cluster} ({args.zone}) to enter RUNNING state (using gcloud CLI)..."
    )

    while time.time() - start_time < timeout_seconds:
        try:
            return_code, return_output = run_command_for_value(
                command, 'Get the status of cluster...'
            )

            # A non-zero exit code means gcloud itself failed.
            if return_code != 0:
                # "Not found" is treated as permanent: stop polling.
                if "Not found" in return_output:
                    xpk_print(f"\n Error: Cluster {args.cluster} does not exist in {args.zone}.")
                    return False

                # Any other gcloud failure: short back-off, then retry
                # (skipping the regular poll interval below).
                xpk_print(f"\n Error: gcloud command failed. {return_output}")
                time.sleep(10)
                continue

            # Trim surrounding whitespace so log lines stay on one line.
            status = return_output.strip()

            if "RUNNING" in status:
                xpk_print(f"Success: Cluster {args.cluster} status is RUNNING.")
                return True

            if "ERROR" in status or "DEGRADED" in status:
                xpk_print(f"Error: Cluster status is {status}, creation failed.")
                return False

            elapsed_time = int(time.time() - start_time)
            xpk_print(
                f"Current status: {status}. Elapsed time: {elapsed_time} seconds. Checking again..."
            )
        except Exception as e:
            # Deliberately broad: polling is best-effort, so log and retry.
            xpk_print(f"\n Unexpected API request error: {e}")
            time.sleep(10)  # Wait longer on unexpected errors

        # Poll interval
        time.sleep(30)

    xpk_print(
        f"\n Timeout Error: Cluster did not reach RUNNING state within {timeout_minutes} minutes."
    )
    return False
1620-
def check_cert_manager_webhook_status(timeout_seconds: int = 300) -> bool:
    """Run ``kubectl rollout status`` for the cert-manager-webhook deployment.

    Success is decided by the command output containing the text
    "successfully rolled out"; the exit code is only reported in the failure
    message, it is not the success criterion.

    Args:
      timeout_seconds (int): The maximum time to wait for the rollout to
        complete, passed to kubectl as ``--timeout=<n>s``.

    Returns:
      bool: True if the output reports a successful rollout, False otherwise
      (including when running the command raises).
    """
    # Build the kubectl command
    kubectl_command = (
        'kubectl rollout status deployment/cert-manager-webhook -n cert-manager'
        f' --timeout={timeout_seconds}s'
    )

    xpk_print(f"Running command to check deployment status: {kubectl_command}")

    try:
        return_code, return_output = run_command_for_value(
            kubectl_command, 'check cert manager...'
        )
        if "successfully rolled out" in return_output:
            xpk_print("SUCCESS: Deployment cert-manager-webhook rollout completed.")
            return True

        # Previously this path was silent; report why the check failed.
        xpk_print(
            "Deployment cert-manager-webhook rollout did not complete"
            f" (exit code {return_code})."
        )
    except Exception as e:
        # Deliberately broad: any failure to run kubectl counts as "not ready".
        xpk_print(f"\n Unexpected error during kubectl execution: {e}")

    # Every non-success path ends here with an explicit boolean.
    return False
0 commit comments