@@ -306,6 +306,8 @@ def cluster_create(args) -> None:
306306 if update_coredns_command_code != 0 :
307307 xpk_exit (update_cluster_command_code )
308308
309+ install_diagon_prerequisites ()
310+
309311 if not is_dry_run ():
310312 k8s_client = setup_k8s_env (args )
311313 install_storage_crd (k8s_client )
@@ -1317,3 +1319,183 @@ def prepare_gpus(system: SystemCharacteristics):
13171319 err_code = disable_mglru_on_cluster ()
13181320 if err_code > 0 :
13191321 xpk_exit (err_code )
1322+
1323+ def install_cert_manager (version : str = 'v1.13.0' ):
1324+ """
1325+ Apply the cert-manager manifest.
1326+
1327+ Returns:
1328+ 0 if successful and 1 otherwise.
1329+ """
1330+
1331+ command = (
1332+ f'kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/'
1333+ f'{ version } /cert-manager.yaml'
1334+ )
1335+
1336+ return_code = run_command_with_updates (
1337+ command , f'Applying cert-manager { version } manifest...'
1338+ )
1339+
1340+ if return_code != 0 :
1341+ xpk_print (f'Applying cert-manager returned with ERROR { return_code } .\n ' )
1342+
1343+ return return_code
1344+
1345+ def download_mldiagnostics_yaml (package_name : str , version : str ):
1346+ """
1347+ Downloads the mldiagnostics injection webhook YAML from Artifact Registry.
1348+
1349+ Returns:
1350+ 0 if successful and 1 otherwise.
1351+ """
1352+
1353+ command = (
1354+ f'gcloud artifacts generic download --repository=mldiagnostics-webhook-and-operator-yaml --location=us '
1355+ f'--package={ package_name } --version={ version } --destination=./ '
1356+ f'--project=ai-on-gke'
1357+ )
1358+
1359+ return_code , return_output = run_command_for_value (
1360+ command , f'Starting gcloud artifacts download for { package_name } { version } ...'
1361+ )
1362+
1363+ if return_code != 0 :
1364+ if 'already exists' in return_output :
1365+ xpk_print (f'Artifact file for { package_name } { version } already exists locally. Skipping download.' )
1366+ return 0
1367+ xpk_print (f'gcloud download returned with ERROR { return_code } .\n ' )
1368+ xpk_exit (return_code )
1369+
1370+ xpk_print (f'Artifact download completed successfully.' )
1371+ return return_code
1372+
1373+ def create_mldiagnostics_namespace ():
1374+ """
1375+ Creates the 'gke-diagon' namespace.
1376+
1377+ Returns:
1378+ 0 if successful and 1 otherwise.
1379+ """
1380+
1381+ command = f'kubectl create namespace gke-diagon'
1382+
1383+ return_code , return_output = run_command_for_value (
1384+ command , f'Starting kubectl create namespace...'
1385+ )
1386+
1387+ if return_code != 0 :
1388+ if 'already exists' in return_output :
1389+ xpk_print ('Namespace already exists locally. Skipping creation.' )
1390+ return 0
1391+ xpk_print (f'Namespace creation returned with ERROR { return_code } .\n ' )
1392+ xpk_exit (return_code )
1393+
1394+ xpk_print (f'gke-diagon Namespace created or already exists.' )
1395+ return return_code
1396+
1397+ def install_mldiagnostics_yaml (artifact_filename : str ):
1398+ """
1399+ Applies the mldiagnostics injection webhook YAML manifest.
1400+
1401+ Returns:
1402+ 0 if successful and 1 otherwise.
1403+ """
1404+
1405+ command = f'kubectl apply -f { artifact_filename } '
1406+
1407+ return_code = run_command_with_updates (
1408+ command , f'Starting kubectl apply -f { artifact_filename } ...'
1409+ )
1410+
1411+ if return_code != 0 :
1412+ xpk_print (f'kubectl apply returned with ERROR { return_code } .\n ' )
1413+ xpk_exit (return_code )
1414+
1415+ xpk_print (f'{ artifact_filename } applied successfully.' )
1416+
1417+ if os .path .exists (artifact_filename ):
1418+ try :
1419+ os .remove (artifact_filename )
1420+ xpk_print (f'Successfully deleted local file: { artifact_filename } ' )
1421+
1422+ except PermissionError :
1423+ xpk_print (f'Failed to delete file { artifact_filename } due to Permission Error.' )
1424+
1425+ except Exception as e :
1426+ xpk_print (f'Failed to delete file { artifact_filename } . Unexpected error: { e } ' )
1427+
1428+ else :
1429+ xpk_print (f'File { artifact_filename } does not exist locally. Skipping deletion (Cleanup assumed).' )
1430+
1431+ return return_code
1432+
1433+ def label_default_namespace_mldiagnostics ():
1434+ """
1435+ Labels the 'default' namespace with 'diagon-enabled=true'.
1436+
1437+ Returns:
1438+ 0 if successful and 1 otherwise.
1439+ """
1440+
1441+ command = f'kubectl label namespace default diagon-enabled=true'
1442+
1443+ return_code = run_command_with_updates (
1444+ command , f"Starting kubectl label namespace default with diagon-enabled=true..."
1445+ )
1446+
1447+ if return_code != 0 :
1448+ xpk_print (f'Namespace labeling returned with ERROR { return_code } .\n ' )
1449+ xpk_exit (return_code )
1450+
1451+ xpk_print ('default Namespace successfully labeled.' )
1452+ return return_code
1453+
1454+ def install_diagon_prerequisites ():
1455+ """
1456+ Diagon installation requirements.
1457+
1458+ Returns:
1459+ 0 if successful and 1 otherwise.
1460+ """
1461+
1462+ return_code = install_cert_manager ()
1463+ if return_code != 0 :
1464+ return return_code
1465+
1466+
1467+ webhook_package = "mldiagnostics-injection-webhook"
1468+ webhook_version = "v0.3.0"
1469+ webhook_filename = f"{ webhook_package } -{ webhook_version } .yaml"
1470+
1471+ return_code = download_mldiagnostics_yaml (package_name = webhook_package , version = webhook_version )
1472+ if return_code != 0 :
1473+ return return_code
1474+
1475+ return_code = create_mldiagnostics_namespace ()
1476+ if return_code != 0 :
1477+ return return_code
1478+
1479+ return_code = install_mldiagnostics_yaml (artifact_filename = webhook_filename )
1480+ if return_code != 0 :
1481+ return return_code
1482+
1483+ return_code = label_default_namespace_mldiagnostics ()
1484+ if return_code != 0 :
1485+ return return_code
1486+
1487+ # --- Install Operator ---
1488+ operator_package = "mldiagnostics-connection-operator"
1489+ operator_version = "v0.3.0"
1490+ operator_filename = f"{ operator_package } -{ operator_version } .yaml"
1491+
1492+ return_code = download_mldiagnostics_yaml (package_name = operator_package , version = operator_version )
1493+ if return_code != 0 :
1494+ return return_code
1495+
1496+ return_code = install_mldiagnostics_yaml (artifact_filename = operator_filename )
1497+ if return_code != 0 :
1498+ return return_code
1499+
1500+ xpk_print ("All diagon installation and setup steps have been successfully completed!" )
1501+ return return_code
0 commit comments