Add wait_for_deployment_ready()

DannyLiCom · DannyLiCom · commit a7167bac4547 · 2025-11-06T03:40:56.000Z
diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py
@@ -81,6 +81,7 @@
 from ..utils.templates import get_templates_absolute_path
 import shutil
 import os
+import time
 
 CLUSTER_PREHEAT_JINJA_FILE = 'cluster_preheat.yaml.j2'
 
@@ -308,8 +309,6 @@ def cluster_create(args) -> None:
   if update_coredns_command_code != 0:
     xpk_exit(update_cluster_command_code)
 
-  install_diagon_prerequisites()
-
   if not is_dry_run():
     k8s_client = setup_k8s_env(args)
     install_storage_crd(k8s_client)
@@ -409,6 +408,8 @@ def cluster_create(args) -> None:
       # pylint: disable=line-too-long
       f' https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
   )
+  if args.managed_mldiagnostics:
+    install_diagon_prerequisites()
   xpk_exit(0)
 
 
@@ -1404,10 +1405,10 @@ def install_mldiagnostics_yaml(artifact_filename: str):
     0 if successful and 1 otherwise.
   """
   
-  command = f'kubectl apply -f {artifact_filename}'
+  command = f'kubectl apply -f {artifact_filename} -n gke-diagon'
   
   return_code = run_command_with_updates(
-      command, f'Starting kubectl apply -f {artifact_filename} ...'
+      command, f'Starting kubectl apply -f {artifact_filename} -n gke-diagon...'
   )
   
   if return_code != 0:
@@ -1460,44 +1461,188 @@ def install_diagon_prerequisites():
   Returns:
     0 if successful and 1 otherwise.
   """
+  deployment_name = 'kueue-controller-manager'
+  namespace_name = 'kueue-system'
+  # is_running = wait_for_cluster_running(args)
+  is_running = wait_for_deployment_ready(deployment_name, namespace_name)
+  if is_running:
+    return_code = install_cert_manager()
+    if return_code != 0:
+      return return_code
 
-  return_code = install_cert_manager()
-  if return_code != 0:
-    return return_code
-  
+    cert_webhook_ready = check_cert_manager_webhook_status()
+    if cert_webhook_ready:    
 
-  webhook_package = "mldiagnostics-injection-webhook"
-  webhook_version = "v0.3.0"
-  webhook_filename = f"{webhook_package}-{webhook_version}.yaml"
+      webhook_package = "mldiagnostics-injection-webhook"
+      webhook_version = "v0.3.0"
+      webhook_filename = f"{webhook_package}-{webhook_version}.yaml"
 
-  return_code = download_mldiagnostics_yaml(package_name=webhook_package, version=webhook_version)
-  if return_code != 0:
-    return return_code
-  
-  return_code = create_mldiagnostics_namespace()
-  if return_code != 0:
-    return return_code
+      return_code = download_mldiagnostics_yaml(package_name=webhook_package, version=webhook_version)
+      if return_code != 0:
+        return return_code
+      
+      return_code = create_mldiagnostics_namespace()
+      if return_code != 0:
+        return return_code
 
-  return_code = install_mldiagnostics_yaml(artifact_filename=webhook_filename)
-  if return_code != 0:
-    return return_code
+      return_code = install_mldiagnostics_yaml(artifact_filename=webhook_filename)
+      if return_code != 0:
+        return return_code
+      
+      return_code = label_default_namespace_mldiagnostics()
+      if return_code != 0:
+        return return_code
+
+      # --- Install Operator ---
+      operator_package = "mldiagnostics-connection-operator"
+      operator_version = "v0.3.0"
+      operator_filename = f"{operator_package}-{operator_version}.yaml"
+      
+      return_code = download_mldiagnostics_yaml(package_name=operator_package, version=operator_version)
+      if return_code != 0:
+        return return_code
+        
+      return_code = install_mldiagnostics_yaml(artifact_filename=operator_filename)
+      if return_code != 0:
+        return return_code
+      
+      xpk_print("All diagon installation and setup steps have been successfully completed!")
+      return return_code
+    else:
+      xpk_print("The cert-manager-webhook installation failed.")
+      xpk_exit(1)
+  else:
+    xpk_print(f"Application {deployment_name} failed to become ready within the timeout.")
+    xpk_exit(1)
+
+def wait_for_deployment_ready(deployment_name: str, namespace: str, timeout_seconds: int = 300) -> bool:
+  """
+  Blocks on a single kubectl rollout status call until the Deployment has
+  successfully rolled out (all replicas are ready) or kubectl times out.
+
+  Args:
+      deployment_name: The name of the Kubernetes Deployment (e.g., 'kueue-controller-manager').
+      namespace: The namespace where the Deployment is located (e.g., 'kueue-system').
+      timeout_seconds: Timeout duration in seconds (default is 300s / 5 minutes).
+
+  Returns:
+      bool: True if the Deployment successfully rolled out, False otherwise (timeout or error).
+  """
   
-  return_code = label_default_namespace_mldiagnostics()
-  if return_code != 0:
-    return return_code
+  command = (
+      f'kubectl rollout status deployment/{deployment_name} -n {namespace}'
+      f' --timeout={timeout_seconds}s'
+  )
+
+  xpk_print(f"Waiting for deployment {deployment_name} in namespace {namespace} to successfully roll out...")
+
+  try:
+    return_code, return_output = run_command_for_value(
+        command, f'Checking status of deployment {deployment_name}...'
+    )
+
+    if return_code != 0:
+      xpk_print(f"\nError: Deployment {deployment_name} failed to roll out.")
+      xpk_print(f"kubectl output: {return_output}")
+      return False
 
-  # --- Install Operator ---
-  operator_package = "mldiagnostics-connection-operator"
-  operator_version = "v0.3.0"
-  operator_filename = f"{operator_package}-{operator_version}.yaml"
+    xpk_print(f"Success: Deployment {deployment_name} successfully rolled out.")
+    return True
+
+  except Exception as e:
+    # kubectl already enforces --timeout and nothing is retried here, so fail fast.
+    xpk_print(f"\nUnexpected API request error while checking deployment status: {e}")
+    return False
+
+
+def wait_for_cluster_running(args, timeout_minutes: int = 30) -> bool:
+  """
+  Polls the GKE Cluster status using gcloud CLI until it enters the RUNNING state.
+
+  Args:
+      args: user provided arguments; args.cluster, args.zone and args.project are read.
+      timeout_minutes: Polling budget in minutes; only checked at the top of each loop pass.
+
+  Returns:
+      bool: True once the status contains RUNNING; False on "Not found", ERROR/DEGRADED or timeout.
+  """
+  timeout_seconds = timeout_minutes * 60
+  start_time = time.time()
   
-  return_code = download_mldiagnostics_yaml(package_name=operator_package, version=operator_version)
-  if return_code != 0:
-    return return_code
+  # Describe the cluster in the region derived from args.zone, printing only its status field
+  command = (
+      'gcloud container clusters describe'
+      f' {args.cluster} --region={zone_to_region(args.zone)} --project={args.project}'
+      " --format='value(status)'"
+  )
+  
+  print(f"Waiting for cluster {args.cluster} ({args.zone}) to enter RUNNING state (using gcloud CLI)...")
+
+  while time.time() - start_time < timeout_seconds:
+    try:
+      # Run gcloud once per pass; a non-zero exit code is handled right below
+      return_code, return_output = run_command_for_value(
+          command, f'Get the status of cluster...'
+      )
+      # Check if gcloud command itself returned an error
+      if return_code != 0:
+        # "Not found" in the output means the cluster does not exist: stop polling
+        if "Not found" in return_output:
+          xpk_print(f"\nError: Cluster {args.cluster} does not exist in {args.zone}.")
+          return False
+        
+        # Any other failure: retry after 10s (continue skips the 30s poll interval)
+        xpk_print(f"\nError: gcloud command failed. {return_output}")
+        time.sleep(10)
+        continue
+
+      # Substring checks on the status text returned by gcloud
+      if "RUNNING" in return_output:
+        xpk_print(f"Success: Cluster {args.cluster} status is RUNNING.")
+        return True
+      
+      elif "ERROR" in return_output or "DEGRADED" in return_output:
+        xpk_print(f"Error: Cluster status is {return_output}, creation failed.")
+        return False                
+      else:
+        elapsed_time = int(time.time() - start_time)
+        xpk_print(f"Current status: {return_output}. Elapsed time: {elapsed_time} seconds. Checking again...")
+    except Exception as e:
+      xpk_print(f"\nUnexpected API request error: {e}")
+      time.sleep(10) # Extra 10s on top of the 30s poll interval below
     
-  return_code = install_mldiagnostics_yaml(artifact_filename=operator_filename)
-  if return_code != 0:
-    return return_code
+    # Poll interval (also reached after the except path above)
+    time.sleep(30) 
+
+  xpk_print(f"\nTimeout Error: Cluster did not reach RUNNING state within {timeout_minutes} minutes.")
+  return False
+
+def check_cert_manager_webhook_status(timeout_seconds: int = 300) -> bool:
+  """
+  Runs kubectl rollout status for the cert-manager-webhook deployment and reports the result.
+
+  Args:
+      timeout_seconds (int): The maximum time to wait for the rollout to complete.
+
+  Returns:
+      bool: True only if kubectl exits with 0 and reports a successful rollout, False otherwise.
+  """
+  kubectl_command = (
+      'kubectl rollout status deployment/cert-manager-webhook -n cert-manager'
+      f' --timeout={timeout_seconds}s'
+  )
   
-  xpk_print("All diagon installation and setup steps have been successfully completed!")
-  return return_code
+  xpk_print(f"Running command to check deployment status: {kubectl_command}")
+
+  try:
+    return_code, return_output = run_command_for_value(
+        kubectl_command, 'check cert manager...'
+    )
+    if return_code == 0 and "successfully rolled out" in return_output:
+      xpk_print("SUCCESS: Deployment cert-manager-webhook rollout completed.")
+      return True
+    xpk_print(f"cert-manager-webhook rollout check failed: {return_output}")
+    return False  # explicit: never fall through and return None
+  except Exception as e:
+    xpk_print(f"\nUnexpected error during kubectl execution: {e}")
+    return False
diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py
@@ -111,7 +111,6 @@
   labels:
     kueue.x-k8s.io/queue-name: {local_queue_name}  # Name of the LocalQueue
     xpk.google.com/workload: {args.workload}
-    {mldiagnostics_labels}
   annotations:
     alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool # 1:1 job replica to node pool assignment
 spec:
@@ -399,12 +398,6 @@ def workload_create(args) -> None:
     if return_code != 0:
       xpk_exit(return_code)
 
-  mldiagnostics_labels = ''
-
-  if args.managed_mldiagnostics:
-    mldiagnostics_labels = """diagon-enabled: "true" """
-    xpk_print('Managed ML Diagnostics injection enabled. Adding Pod Label.')
-
   service_account = ''
   all_storages = []
   # Currently storage customization is not supported for Pathways workloads. b/408468941
@@ -606,7 +599,6 @@ def workload_create(args) -> None:
         """ if system.accelerator_type == AcceleratorType.TPU else '',
         failure_policy_rules=failure_policy_rules,
         pod_failure_policy=pod_failure_policy,
-        mldiagnostics_labels=mldiagnostics_labels,
     )
   tmp = write_tmp_file(yml_string)
   command = f'kubectl apply -f {str(tmp)}'
diff --git a/src/xpk/parser/cluster.py b/src/xpk/parser/cluster.py
@@ -150,6 +150,18 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
           ' enable cluster to accept Pathways workloads.'
       ),
   )
+
+  cluster_create_optional_arguments.add_argument(
+      '--managed-mldiagnostics',
+      action='store_true',
+      default=False,
+      help=(
+          '[Optional] Enables the installation of required ML Diagnostics components: '
+          'cert-manager, injection-webhook, and connection-operator. '
+          'This feature is OFF by default.'
+      ),
+  )
+
   if FeatureFlags.SUB_SLICING_ENABLED:
     cluster_create_optional_arguments.add_argument(
         '--sub-slicing',
diff --git a/src/xpk/parser/workload.py b/src/xpk/parser/workload.py
@@ -224,15 +224,6 @@ def set_workload_create_parser(workload_create_parser: ArgumentParser):
           ' conditions.'
       ),
   )
-  workload_create_parser_optional_arguments.add_argument(
-      '--managed-mldiagnostics',
-      action='store_true',
-      default=False,
-      help=(
-          '[Optional] Enables injection of GKE managed ML Diagnostics'
-          '  webhook metadata.'
-      ),
-  )
 
   add_shared_workload_create_required_arguments([
       workload_create_parser_required_arguments,