|
81 | 81 | from ..utils.templates import get_templates_absolute_path |
82 | 82 | import shutil |
83 | 83 | import os |
| 84 | +import time |
84 | 85 |
|
85 | 86 | CLUSTER_PREHEAT_JINJA_FILE = 'cluster_preheat.yaml.j2' |
86 | 87 |
|
@@ -308,8 +309,6 @@ def cluster_create(args) -> None: |
308 | 309 | if update_coredns_command_code != 0: |
309 | 310 | xpk_exit(update_cluster_command_code) |
310 | 311 |
|
311 | | - install_diagon_prerequisites() |
312 | | - |
313 | 312 | if not is_dry_run(): |
314 | 313 | k8s_client = setup_k8s_env(args) |
315 | 314 | install_storage_crd(k8s_client) |
@@ -409,6 +408,8 @@ def cluster_create(args) -> None: |
409 | 408 | # pylint: disable=line-too-long |
410 | 409 | f' https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}' |
411 | 410 | ) |
| 411 | + if args.managed_mldiagnostics: |
| 412 | + install_diagon_prerequisites() |
412 | 413 | xpk_exit(0) |
413 | 414 |
|
414 | 415 |
|
@@ -1405,10 +1406,10 @@ def install_mldiagnostics_yaml(artifact_filename: str): |
1405 | 1406 | 0 if successful and 1 otherwise. |
1406 | 1407 | """ |
1407 | 1408 |
|
1408 | | - command = f'kubectl apply -f {artifact_filename}' |
| 1409 | + command = f'kubectl apply -f {artifact_filename} -n gke-diagon' |
1409 | 1410 |
|
1410 | 1411 | return_code = run_command_with_updates( |
1411 | | - command, f'Starting kubectl apply -f {artifact_filename} ...' |
| 1412 | + command, f'Starting kubectl apply -f {artifact_filename} -n gke-diagon...' |
1412 | 1413 | ) |
1413 | 1414 |
|
1414 | 1415 | if return_code != 0: |
@@ -1461,44 +1462,188 @@ def install_diagon_prerequisites(): |
1461 | 1462 | Returns: |
1462 | 1463 | 0 if successful and 1 otherwise. |
1463 | 1464 | """ |
| 1465 | + deployment_name = 'kueue-controller-manager' |
| 1466 | + namespace_name = 'kueue-system' |
| 1467 | + # is_running = wait_for_cluster_running(args) |
| 1468 | + is_running = wait_for_deployment_ready(deployment_name, namespace_name) |
| 1469 | + if is_running: |
| 1470 | + return_code = install_cert_manager() |
| 1471 | + if return_code != 0: |
| 1472 | + return return_code |
1464 | 1473 |
|
1465 | | - return_code = install_cert_manager() |
1466 | | - if return_code != 0: |
1467 | | - return return_code |
1468 | | - |
| 1474 | + cert_webhook_ready = check_cert_manager_webhook_status() |
| 1475 | + if cert_webhook_ready: |
1469 | 1476 |
|
1470 | | - webhook_package = "mldiagnostics-injection-webhook" |
1471 | | - webhook_version = "v0.3.0" |
1472 | | - webhook_filename = f"{webhook_package}-{webhook_version}.yaml" |
| 1477 | + webhook_package = "mldiagnostics-injection-webhook" |
| 1478 | + webhook_version = "v0.3.0" |
| 1479 | + webhook_filename = f"{webhook_package}-{webhook_version}.yaml" |
1473 | 1480 |
|
1474 | | - return_code = download_mldiagnostics_yaml(package_name=webhook_package, version=webhook_version) |
1475 | | - if return_code != 0: |
1476 | | - return return_code |
1477 | | - |
1478 | | - return_code = create_mldiagnostics_namespace() |
1479 | | - if return_code != 0: |
1480 | | - return return_code |
| 1481 | + return_code = download_mldiagnostics_yaml(package_name=webhook_package, version=webhook_version) |
| 1482 | + if return_code != 0: |
| 1483 | + return return_code |
| 1484 | + |
| 1485 | + return_code = create_mldiagnostics_namespace() |
| 1486 | + if return_code != 0: |
| 1487 | + return return_code |
1481 | 1488 |
|
1482 | | - return_code = install_mldiagnostics_yaml(artifact_filename=webhook_filename) |
1483 | | - if return_code != 0: |
1484 | | - return return_code |
| 1489 | + return_code = install_mldiagnostics_yaml(artifact_filename=webhook_filename) |
| 1490 | + if return_code != 0: |
| 1491 | + return return_code |
| 1492 | + |
| 1493 | + return_code = label_default_namespace_mldiagnostics() |
| 1494 | + if return_code != 0: |
| 1495 | + return return_code |
| 1496 | + |
| 1497 | + # --- Install Operator --- |
| 1498 | + operator_package = "mldiagnostics-connection-operator" |
| 1499 | + operator_version = "v0.3.0" |
| 1500 | + operator_filename = f"{operator_package}-{operator_version}.yaml" |
| 1501 | + |
| 1502 | + return_code = download_mldiagnostics_yaml(package_name=operator_package, version=operator_version) |
| 1503 | + if return_code != 0: |
| 1504 | + return return_code |
| 1505 | + |
| 1506 | + return_code = install_mldiagnostics_yaml(artifact_filename=operator_filename) |
| 1507 | + if return_code != 0: |
| 1508 | + return return_code |
| 1509 | + |
| 1510 | + xpk_print("All diagon installation and setup steps have been successfully completed!") |
| 1511 | + return return_code |
| 1512 | + else: |
| 1513 | + xpk_print("The cert-manager-webhook installation failed.") |
| 1514 | + xpk_exit(1) |
| 1515 | + else: |
| 1516 | + xpk_print(f"Application {deployment_name} failed to become ready within the timeout.") |
| 1517 | + xpk_exit(1) |
| 1518 | + |
def wait_for_deployment_ready(
    deployment_name: str, namespace: str, timeout_seconds: int = 300
) -> bool:
  """Block until a Kubernetes Deployment has rolled out or the timeout expires.

  Issues a single `kubectl rollout status` call. The waiting is delegated to
  kubectl through its --timeout flag, so this function does not poll itself.

  Args:
    deployment_name: The name of the Kubernetes Deployment
      (e.g., 'kueue-controller-manager').
    namespace: The namespace where the Deployment is located
      (e.g., 'kueue-system').
    timeout_seconds: Timeout in seconds passed to kubectl (default 300s).

  Returns:
    True if kubectl exited with code 0, False if it exited with a non-zero
    code or running the command raised an exception.
  """
  command = (
      f'kubectl rollout status deployment/{deployment_name} -n {namespace}'
      f' --timeout={timeout_seconds}s'
  )

  xpk_print(
      f"Waiting for deployment {deployment_name} in namespace {namespace} to"
      " successfully roll out..."
  )

  # Keep the try body limited to the call that can actually raise.
  try:
    return_code, return_output = run_command_for_value(
        command, f'Checking status of deployment {deployment_name}...'
    )
  except Exception as e:  # pylint: disable=broad-exception-caught
    # There is no retry after this point, so fail immediately instead of
    # sleeping before reporting the failure to the caller.
    xpk_print(
        f"\nUnexpected API request error while checking deployment status: {e}"
    )
    return False

  if return_code != 0:
    xpk_print(f"\nError: Deployment {deployment_name} failed to roll out.")
    xpk_print(f"kubectl output: {return_output}")
    return False

  xpk_print(f"Success: Deployment {deployment_name} successfully rolled out.")
  return True
| 1557 | + |
| 1558 | + |
def wait_for_cluster_running(args, timeout_minutes: int = 30) -> bool:
  """Poll the GKE cluster status via gcloud until it reports RUNNING.

  Runs `gcloud container clusters describe ... --format='value(status)'`
  repeatedly, sleeping 30 seconds between attempts, until the output contains
  RUNNING, contains ERROR or DEGRADED, the cluster is reported as not found,
  or the timeout elapses.

  Args:
    args: user provided arguments for running the command. Reads
      args.cluster, args.zone and args.project.
    timeout_minutes: Timeout duration in minutes.

  Returns:
    True if the cluster status contains RUNNING before the timeout, False if
    the cluster does not exist, reports ERROR/DEGRADED, or the timeout expires.
  """
  timeout_seconds = timeout_minutes * 60
  # Monotonic clock: elapsed-time measurement must not be affected by
  # wall-clock adjustments while we are waiting.
  start_time = time.monotonic()

  # Construct gcloud command to describe the cluster status
  command = (
      'gcloud container clusters describe'
      f' {args.cluster} --region={zone_to_region(args.zone)} --project={args.project}'
      " --format='value(status)'"
  )

  xpk_print(
      f"Waiting for cluster {args.cluster} ({args.zone}) to enter RUNNING state"
      " (using gcloud CLI)..."
  )

  while time.monotonic() - start_time < timeout_seconds:
    try:
      return_code, return_output = run_command_for_value(
          command, 'Get the status of cluster...'
      )
    except Exception as e:  # pylint: disable=broad-exception-caught
      xpk_print(f"\nUnexpected API request error: {e}")
      # Extra back-off on unexpected errors, on top of the poll interval below.
      time.sleep(10)
    else:
      if return_code != 0:
        # A "Not found" error means the cluster does not exist; retrying
        # cannot help, so give up immediately.
        if "Not found" in return_output:
          xpk_print(
              f"\nError: Cluster {args.cluster} does not exist in {args.zone}."
          )
          return False

        # Other execution errors: wait briefly and retry without the full
        # poll interval.
        xpk_print(f"\nError: gcloud command failed. {return_output}")
        time.sleep(10)
        continue

      # Trim surrounding whitespace so log messages stay on one line.
      status = return_output.strip()

      if "RUNNING" in status:
        xpk_print(f"Success: Cluster {args.cluster} status is RUNNING.")
        return True

      if "ERROR" in status or "DEGRADED" in status:
        xpk_print(f"Error: Cluster status is {status}, creation failed.")
        return False

      elapsed_time = int(time.monotonic() - start_time)
      xpk_print(
          f"Current status: {status}. Elapsed time: {elapsed_time} seconds."
          " Checking again..."
      )

    # Poll interval
    time.sleep(30)

  xpk_print(
      "\nTimeout Error: Cluster did not reach RUNNING state within"
      f" {timeout_minutes} minutes."
  )
  return False
| 1620 | + |
def check_cert_manager_webhook_status(timeout_seconds: int = 300) -> bool:
  """Wait for the cert-manager-webhook deployment rollout to complete.

  Runs `kubectl rollout status deployment/cert-manager-webhook -n cert-manager`
  with the given timeout and inspects both the exit code and the output.

  Args:
    timeout_seconds (int): The maximum time to wait for the rollout to
      complete; passed to kubectl's --timeout flag.

  Returns:
    bool: True if kubectl exits with code 0 and its output contains
    "successfully rolled out"; False on a non-zero exit code, any other
    output, or an exception raised while running the command.
  """
  # Build the kubectl command
  kubectl_command = (
      'kubectl rollout status deployment/cert-manager-webhook -n cert-manager'
      f' --timeout={timeout_seconds}s'
  )

  xpk_print(f"Running command to check deployment status: {kubectl_command}")

  try:
    return_code, return_output = run_command_for_value(
        kubectl_command, 'check cert manager...'
    )
  except Exception as e:  # pylint: disable=broad-exception-caught
    xpk_print(f"\nUnexpected error during kubectl execution: {e}")
    return False

  # Require a clean exit code as well as the success phrase, so a failed
  # command is never mistaken for a completed rollout.
  if return_code == 0 and "successfully rolled out" in return_output:
    xpk_print("SUCCESS: Deployment cert-manager-webhook rollout completed.")
    return True

  # Always return an explicit bool and surface kubectl's output so the
  # failure can be diagnosed.
  xpk_print(
      "Error: Deployment cert-manager-webhook rollout did not complete"
      f" (exit code {return_code})."
  )
  xpk_print(f"kubectl output: {return_output}")
  return False
0 commit comments