@@ -308,6 +308,8 @@ def cluster_create(args) -> None:
308308 if update_coredns_command_code != 0 :
309309 xpk_exit (update_cluster_command_code )
310310
311+ install_diagon_prerequisites ()
312+
311313 if not is_dry_run ():
312314 k8s_client = setup_k8s_env (args )
313315 install_storage_crd (k8s_client )
@@ -1319,3 +1321,183 @@ def prepare_gpus(system: SystemCharacteristics):
13191321 err_code = disable_mglru_on_cluster ()
13201322 if err_code > 0 :
13211323 xpk_exit (err_code )
1324+
1325+ def install_cert_manager (version : str = 'v1.13.0' ):
1326+ """
1327+ Apply the cert-manager manifest.
1328+
1329+ Returns:
1330+ 0 if successful and 1 otherwise.
1331+ """
1332+
1333+ command = (
1334+ f'kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/'
1335+ f'{ version } /cert-manager.yaml'
1336+ )
1337+
1338+ return_code = run_command_with_updates (
1339+ command , f'Applying cert-manager { version } manifest...'
1340+ )
1341+
1342+ if return_code != 0 :
1343+ xpk_print (f'Applying cert-manager returned with ERROR { return_code } .\n ' )
1344+
1345+ return return_code
1346+
1347+ def download_mldiagnostics_yaml (package_name : str , version : str ):
1348+ """
1349+ Downloads the mldiagnostics injection webhook YAML from Artifact Registry.
1350+
1351+ Returns:
1352+ 0 if successful and 1 otherwise.
1353+ """
1354+
1355+ command = (
1356+ f'gcloud artifacts generic download --repository=mldiagnostics-webhook-and-operator-yaml --location=us '
1357+ f'--package={ package_name } --version={ version } --destination=./ '
1358+ f'--project=ai-on-gke'
1359+ )
1360+
1361+ return_code , return_output = run_command_for_value (
1362+ command , f'Starting gcloud artifacts download for { package_name } { version } ...'
1363+ )
1364+
1365+ if return_code != 0 :
1366+ if 'already exists' in return_output :
1367+ xpk_print (f'Artifact file for { package_name } { version } already exists locally. Skipping download.' )
1368+ return 0
1369+ xpk_print (f'gcloud download returned with ERROR { return_code } .\n ' )
1370+ xpk_exit (return_code )
1371+
1372+ xpk_print (f'Artifact download completed successfully.' )
1373+ return return_code
1374+
1375+ def create_mldiagnostics_namespace ():
1376+ """
1377+ Creates the 'gke-diagon' namespace.
1378+
1379+ Returns:
1380+ 0 if successful and 1 otherwise.
1381+ """
1382+
1383+ command = f'kubectl create namespace gke-diagon'
1384+
1385+ return_code , return_output = run_command_for_value (
1386+ command , f'Starting kubectl create namespace...'
1387+ )
1388+
1389+ if return_code != 0 :
1390+ if 'already exists' in return_output :
1391+ xpk_print ('Namespace already exists locally. Skipping creation.' )
1392+ return 0
1393+ xpk_print (f'Namespace creation returned with ERROR { return_code } .\n ' )
1394+ xpk_exit (return_code )
1395+
1396+ xpk_print (f'gke-diagon Namespace created or already exists.' )
1397+ return return_code
1398+
1399+ def install_mldiagnostics_yaml (artifact_filename : str ):
1400+ """
1401+ Applies the mldiagnostics injection webhook YAML manifest.
1402+
1403+ Returns:
1404+ 0 if successful and 1 otherwise.
1405+ """
1406+
1407+ command = f'kubectl apply -f { artifact_filename } '
1408+
1409+ return_code = run_command_with_updates (
1410+ command , f'Starting kubectl apply -f { artifact_filename } ...'
1411+ )
1412+
1413+ if return_code != 0 :
1414+ xpk_print (f'kubectl apply returned with ERROR { return_code } .\n ' )
1415+ xpk_exit (return_code )
1416+
1417+ xpk_print (f'{ artifact_filename } applied successfully.' )
1418+
1419+ if os .path .exists (artifact_filename ):
1420+ try :
1421+ os .remove (artifact_filename )
1422+ xpk_print (f'Successfully deleted local file: { artifact_filename } ' )
1423+
1424+ except PermissionError :
1425+ xpk_print (f'Failed to delete file { artifact_filename } due to Permission Error.' )
1426+
1427+ except Exception as e :
1428+ xpk_print (f'Failed to delete file { artifact_filename } . Unexpected error: { e } ' )
1429+
1430+ else :
1431+ xpk_print (f'File { artifact_filename } does not exist locally. Skipping deletion (Cleanup assumed).' )
1432+
1433+ return return_code
1434+
1435+ def label_default_namespace_mldiagnostics ():
1436+ """
1437+ Labels the 'default' namespace with 'diagon-enabled=true'.
1438+
1439+ Returns:
1440+ 0 if successful and 1 otherwise.
1441+ """
1442+
1443+ command = f'kubectl label namespace default diagon-enabled=true'
1444+
1445+ return_code = run_command_with_updates (
1446+ command , f"Starting kubectl label namespace default with diagon-enabled=true..."
1447+ )
1448+
1449+ if return_code != 0 :
1450+ xpk_print (f'Namespace labeling returned with ERROR { return_code } .\n ' )
1451+ xpk_exit (return_code )
1452+
1453+ xpk_print ('default Namespace successfully labeled.' )
1454+ return return_code
1455+
1456+ def install_diagon_prerequisites ():
1457+ """
1458+ Diagon installation requirements.
1459+
1460+ Returns:
1461+ 0 if successful and 1 otherwise.
1462+ """
1463+
1464+ return_code = install_cert_manager ()
1465+ if return_code != 0 :
1466+ return return_code
1467+
1468+
1469+ webhook_package = "mldiagnostics-injection-webhook"
1470+ webhook_version = "v0.3.0"
1471+ webhook_filename = f"{ webhook_package } -{ webhook_version } .yaml"
1472+
1473+ return_code = download_mldiagnostics_yaml (package_name = webhook_package , version = webhook_version )
1474+ if return_code != 0 :
1475+ return return_code
1476+
1477+ return_code = create_mldiagnostics_namespace ()
1478+ if return_code != 0 :
1479+ return return_code
1480+
1481+ return_code = install_mldiagnostics_yaml (artifact_filename = webhook_filename )
1482+ if return_code != 0 :
1483+ return return_code
1484+
1485+ return_code = label_default_namespace_mldiagnostics ()
1486+ if return_code != 0 :
1487+ return return_code
1488+
1489+ # --- Install Operator ---
1490+ operator_package = "mldiagnostics-connection-operator"
1491+ operator_version = "v0.3.0"
1492+ operator_filename = f"{ operator_package } -{ operator_version } .yaml"
1493+
1494+ return_code = download_mldiagnostics_yaml (package_name = operator_package , version = operator_version )
1495+ if return_code != 0 :
1496+ return return_code
1497+
1498+ return_code = install_mldiagnostics_yaml (artifact_filename = operator_filename )
1499+ if return_code != 0 :
1500+ return return_code
1501+
1502+ xpk_print ("All diagon installation and setup steps have been successfully completed!" )
1503+ return return_code
0 commit comments