|
81 | 81 | from ..utils.templates import get_templates_absolute_path |
82 | 82 | import shutil |
83 | 83 | import os |
| 84 | +import time |
84 | 85 |
|
85 | 86 | CLUSTER_PREHEAT_JINJA_FILE = 'cluster_preheat.yaml.j2' |
86 | 87 |
|
@@ -407,6 +408,8 @@ def cluster_create(args) -> None: |
407 | 408 | # pylint: disable=line-too-long |
408 | 409 | f' https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}' |
409 | 410 | ) |
| 411 | + if args.managed_mldiagnostics: |
| 412 | + install_diagon_prerequisites() |
410 | 413 | xpk_exit(0) |
411 | 414 |
|
412 | 415 |
|
@@ -1319,3 +1322,269 @@ def prepare_gpus(system: SystemCharacteristics): |
1319 | 1322 | err_code = disable_mglru_on_cluster() |
1320 | 1323 | if err_code > 0: |
1321 | 1324 | xpk_exit(err_code) |
| 1325 | + |
| 1326 | + |
| 1327 | +def install_cert_manager(version: str = 'v1.13.0'): |
| 1328 | + """ |
| 1329 | + Apply the cert-manager manifest. |
| 1330 | +
|
| 1331 | + Returns: |
| 1332 | + 0 if successful and 1 otherwise. |
| 1333 | + """ |
| 1334 | + |
| 1335 | + command = ( |
| 1336 | + 'kubectl apply -f' |
| 1337 | + ' https://github.com/cert-manager/cert-manager/releases/download/' |
| 1338 | + f'{version}/cert-manager.yaml' |
| 1339 | + ) |
| 1340 | + |
| 1341 | + return_code = run_command_with_updates( |
| 1342 | + command, f'Applying cert-manager {version} manifest...' |
| 1343 | + ) |
| 1344 | + |
| 1345 | + if return_code != 0: |
| 1346 | + xpk_print(f'Applying cert-manager returned with ERROR {return_code}.\n') |
| 1347 | + |
| 1348 | + return return_code |
| 1349 | + |
| 1350 | + |
| 1351 | +def download_mldiagnostics_yaml(package_name: str, version: str): |
| 1352 | + """ |
| 1353 | + Downloads the mldiagnostics injection webhook YAML from Artifact Registry. |
| 1354 | +
|
| 1355 | + Returns: |
| 1356 | + 0 if successful and 1 otherwise. |
| 1357 | + """ |
| 1358 | + |
| 1359 | + command = ( |
| 1360 | + 'gcloud artifacts generic download' |
| 1361 | + ' --repository=mldiagnostics-webhook-and-operator-yaml --location=us' |
| 1362 | + f' --package={package_name} --version={version} --destination=./' |
| 1363 | + ' --project=ai-on-gke' |
| 1364 | + ) |
| 1365 | + |
| 1366 | + return_code, return_output = run_command_for_value( |
| 1367 | + command, |
| 1368 | + f'Starting gcloud artifacts download for {package_name} {version}...', |
| 1369 | + ) |
| 1370 | + |
| 1371 | + if return_code != 0: |
| 1372 | + if 'already exists' in return_output: |
| 1373 | + xpk_print( |
| 1374 | + f'Artifact file for {package_name} {version} already exists locally.' |
| 1375 | + ' Skipping download.' |
| 1376 | + ) |
| 1377 | + return 0 |
| 1378 | + xpk_print(f'gcloud download returned with ERROR {return_code}.\n') |
| 1379 | + xpk_exit(return_code) |
| 1380 | + |
| 1381 | + xpk_print('Artifact download completed successfully.') |
| 1382 | + return return_code |
| 1383 | + |
| 1384 | + |
| 1385 | +def create_mldiagnostics_namespace(): |
| 1386 | + """ |
| 1387 | + Creates the 'gke-diagon' namespace. |
| 1388 | +
|
| 1389 | + Returns: |
| 1390 | + 0 if successful and 1 otherwise. |
| 1391 | + """ |
| 1392 | + |
| 1393 | + command = 'kubectl create namespace gke-diagon' |
| 1394 | + |
| 1395 | + return_code, return_output = run_command_for_value( |
| 1396 | + command, 'Starting kubectl create namespace...' |
| 1397 | + ) |
| 1398 | + |
| 1399 | + if return_code != 0: |
| 1400 | + if 'already exists' in return_output: |
| 1401 | + xpk_print('Namespace already exists locally. Skipping creation.') |
| 1402 | + return 0 |
| 1403 | + xpk_print(f'Namespace creation returned with ERROR {return_code}.\n') |
| 1404 | + xpk_exit(return_code) |
| 1405 | + |
| 1406 | + xpk_print('gke-diagon Namespace created or already exists.') |
| 1407 | + return return_code |
| 1408 | + |
| 1409 | + |
| 1410 | +def install_mldiagnostics_yaml(artifact_filename: str): |
| 1411 | + """ |
| 1412 | + Applies the mldiagnostics injection webhook YAML manifest. |
| 1413 | +
|
| 1414 | + Returns: |
| 1415 | + 0 if successful and 1 otherwise. |
| 1416 | + """ |
| 1417 | + |
| 1418 | + command = f'kubectl apply -f {artifact_filename} -n gke-diagon' |
| 1419 | + |
| 1420 | + return_code = run_command_with_updates( |
| 1421 | + command, f'Starting kubectl apply -f {artifact_filename} -n gke-diagon...' |
| 1422 | + ) |
| 1423 | + |
| 1424 | + if return_code != 0: |
| 1425 | + xpk_print(f'kubectl apply returned with ERROR {return_code}.\n') |
| 1426 | + xpk_exit(return_code) |
| 1427 | + |
| 1428 | + xpk_print(f'{artifact_filename} applied successfully.') |
| 1429 | + |
| 1430 | + if os.path.exists(artifact_filename): |
| 1431 | + try: |
| 1432 | + os.remove(artifact_filename) |
| 1433 | + xpk_print(f'Successfully deleted local file: {artifact_filename}') |
| 1434 | + |
| 1435 | + except PermissionError: |
| 1436 | + xpk_print( |
| 1437 | + f'Failed to delete file {artifact_filename} due to Permission Error.' |
| 1438 | + ) |
| 1439 | + |
| 1440 | + else: |
| 1441 | + xpk_print( |
| 1442 | + f'File {artifact_filename} does not exist locally. Skipping deletion' |
| 1443 | + ' (Cleanup assumed).' |
| 1444 | + ) |
| 1445 | + |
| 1446 | + return return_code |
| 1447 | + |
| 1448 | + |
| 1449 | +def label_default_namespace_mldiagnostics(): |
| 1450 | + """ |
| 1451 | + Labels the 'default' namespace with 'diagon-enabled=true'. |
| 1452 | +
|
| 1453 | + Returns: |
| 1454 | + 0 if successful and 1 otherwise. |
| 1455 | + """ |
| 1456 | + |
| 1457 | + command = 'kubectl label namespace default diagon-enabled=true' |
| 1458 | + |
| 1459 | + return_code = run_command_with_updates( |
| 1460 | + command, |
| 1461 | + 'Starting kubectl label namespace default with diagon-enabled=true...', |
| 1462 | + ) |
| 1463 | + |
| 1464 | + if return_code != 0: |
| 1465 | + xpk_print(f'Namespace labeling returned with ERROR {return_code}.\n') |
| 1466 | + xpk_exit(return_code) |
| 1467 | + |
| 1468 | + xpk_print('default Namespace successfully labeled.') |
| 1469 | + return return_code |
| 1470 | + |
| 1471 | + |
| 1472 | +def install_diagon_prerequisites(): |
| 1473 | + """ |
| 1474 | + Diagon installation requirements. |
| 1475 | +
|
| 1476 | + Returns: |
| 1477 | + 0 if successful and 1 otherwise. |
| 1478 | + """ |
| 1479 | + deployment_name = 'kueue-controller-manager' |
| 1480 | + namespace_name = 'kueue-system' |
| 1481 | + cert_webhook_deployment_name = 'cert-manager-webhook' |
| 1482 | + cert_webhook_namespace_name = 'cert-manager' |
| 1483 | + # is_running = wait_for_cluster_running(args) |
| 1484 | + is_running = wait_for_deployment_ready(deployment_name, namespace_name) |
| 1485 | + time.sleep(30) |
| 1486 | + if is_running: |
| 1487 | + return_code = install_cert_manager() |
| 1488 | + if return_code != 0: |
| 1489 | + return return_code |
| 1490 | + |
| 1491 | + cert_webhook_ready = wait_for_deployment_ready( |
| 1492 | + cert_webhook_deployment_name, cert_webhook_namespace_name |
| 1493 | + ) |
| 1494 | + time.sleep(30) |
| 1495 | + if cert_webhook_ready: |
| 1496 | + |
| 1497 | + webhook_package = 'mldiagnostics-injection-webhook' |
| 1498 | + webhook_version = 'v0.3.0' |
| 1499 | + webhook_filename = f'{webhook_package}-{webhook_version}.yaml' |
| 1500 | + |
| 1501 | + return_code = download_mldiagnostics_yaml( |
| 1502 | + package_name=webhook_package, version=webhook_version |
| 1503 | + ) |
| 1504 | + if return_code != 0: |
| 1505 | + return return_code |
| 1506 | + |
| 1507 | + return_code = create_mldiagnostics_namespace() |
| 1508 | + if return_code != 0: |
| 1509 | + return return_code |
| 1510 | + |
| 1511 | + return_code = install_mldiagnostics_yaml( |
| 1512 | + artifact_filename=webhook_filename |
| 1513 | + ) |
| 1514 | + if return_code != 0: |
| 1515 | + return return_code |
| 1516 | + |
| 1517 | + return_code = label_default_namespace_mldiagnostics() |
| 1518 | + if return_code != 0: |
| 1519 | + return return_code |
| 1520 | + |
| 1521 | + # --- Install Operator --- |
| 1522 | + operator_package = 'mldiagnostics-connection-operator' |
| 1523 | + operator_version = 'v0.3.0' |
| 1524 | + operator_filename = f'{operator_package}-{operator_version}.yaml' |
| 1525 | + |
| 1526 | + return_code = download_mldiagnostics_yaml( |
| 1527 | + package_name=operator_package, version=operator_version |
| 1528 | + ) |
| 1529 | + if return_code != 0: |
| 1530 | + return return_code |
| 1531 | + |
| 1532 | + return_code = install_mldiagnostics_yaml( |
| 1533 | + artifact_filename=operator_filename |
| 1534 | + ) |
| 1535 | + if return_code != 0: |
| 1536 | + return return_code |
| 1537 | + |
| 1538 | + xpk_print( |
| 1539 | + 'All diagon installation and setup steps have been successfully' |
| 1540 | + ' completed!' |
| 1541 | + ) |
| 1542 | + return return_code |
| 1543 | + else: |
| 1544 | + xpk_print('The cert-manager-webhook installation failed.') |
| 1545 | + xpk_exit(1) |
| 1546 | + else: |
| 1547 | + xpk_print( |
| 1548 | + f'Application {deployment_name} failed to become ready within the' |
| 1549 | + ' timeout.' |
| 1550 | + ) |
| 1551 | + xpk_exit(1) |
| 1552 | + |
| 1553 | + |
| 1554 | +def wait_for_deployment_ready( |
| 1555 | + deployment_name: str, namespace: str, timeout_seconds: int = 300 |
| 1556 | +) -> bool: |
| 1557 | + """ |
| 1558 | + Polls the Kubernetes Deployment status using kubectl rollout status |
| 1559 | + until it successfully rolls out (all replicas are ready) or times out. |
| 1560 | +
|
| 1561 | + Args: |
| 1562 | + deployment_name: The name of the Kubernetes Deployment (e.g., 'kueue-controller-manager'). |
| 1563 | + namespace: The namespace where the Deployment is located (e.g., 'kueue-system'). |
| 1564 | + timeout_seconds: Timeout duration in seconds (default is 300s / 5 minutes). |
| 1565 | +
|
| 1566 | + Returns: |
| 1567 | + bool: True if the Deployment successfully rolled out, False otherwise (timeout or error). |
| 1568 | + """ |
| 1569 | + |
| 1570 | + command = ( |
| 1571 | + f'kubectl rollout status deployment/{deployment_name} -n {namespace}' |
| 1572 | + f' --timeout={timeout_seconds}s' |
| 1573 | + ) |
| 1574 | + |
| 1575 | + print( |
| 1576 | + f'Waiting for deployment {deployment_name} in namespace {namespace} to' |
| 1577 | + ' successfully roll out...' |
| 1578 | + ) |
| 1579 | + |
| 1580 | + return_code, return_output = run_command_for_value( |
| 1581 | + command, f'Checking status of deployment {deployment_name}...' |
| 1582 | + ) |
| 1583 | + |
| 1584 | + if return_code != 0: |
| 1585 | + xpk_print(f'\nError: Deployment {deployment_name} failed to roll out.') |
| 1586 | + xpk_print(f'kubectl output: {return_output}') |
| 1587 | + return False |
| 1588 | + |
| 1589 | + xpk_print(f'Success: Deployment {deployment_name} successfully rolled out.') |
| 1590 | + return True |
0 commit comments