@@ -308,6 +308,8 @@ def cluster_create(args) -> None:
308308 if update_coredns_command_code != 0 :
309309 xpk_exit (update_cluster_command_code )
310310
311+ install_diagon_prerequisites ()
312+
311313 if not is_dry_run ():
312314 k8s_client = setup_k8s_env (args )
313315 install_storage_crd (k8s_client )
@@ -1320,3 +1322,183 @@ def prepare_gpus(system: SystemCharacteristics):
13201322 err_code = disable_mglru_on_cluster ()
13211323 if err_code > 0 :
13221324 xpk_exit (err_code )
1325+
1326+ def install_cert_manager (version : str = 'v1.13.0' ):
1327+ """
1328+ Apply the cert-manager manifest.
1329+
1330+ Returns:
1331+ 0 if successful and 1 otherwise.
1332+ """
1333+
1334+ command = (
1335+ f'kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/'
1336+ f'{ version } /cert-manager.yaml'
1337+ )
1338+
1339+ return_code = run_command_with_updates (
1340+ command , f'Applying cert-manager { version } manifest...'
1341+ )
1342+
1343+ if return_code != 0 :
1344+ xpk_print (f'Applying cert-manager returned with ERROR { return_code } .\n ' )
1345+
1346+ return return_code
1347+
1348+ def download_mldiagnostics_yaml (package_name : str , version : str ):
1349+ """
1350+ Downloads the mldiagnostics injection webhook YAML from Artifact Registry.
1351+
1352+ Returns:
1353+ 0 if successful and 1 otherwise.
1354+ """
1355+
1356+ command = (
1357+ f'gcloud artifacts generic download --repository=mldiagnostics-webhook-and-operator-yaml --location=us '
1358+ f'--package={ package_name } --version={ version } --destination=./ '
1359+ f'--project=ai-on-gke'
1360+ )
1361+
1362+ return_code , return_output = run_command_for_value (
1363+ command , f'Starting gcloud artifacts download for { package_name } { version } ...'
1364+ )
1365+
1366+ if return_code != 0 :
1367+ if 'already exists' in return_output :
1368+ xpk_print (f'Artifact file for { package_name } { version } already exists locally. Skipping download.' )
1369+ return 0
1370+ xpk_print (f'gcloud download returned with ERROR { return_code } .\n ' )
1371+ xpk_exit (return_code )
1372+
1373+ xpk_print (f'Artifact download completed successfully.' )
1374+ return return_code
1375+
1376+ def create_mldiagnostics_namespace ():
1377+ """
1378+ Creates the 'gke-diagon' namespace.
1379+
1380+ Returns:
1381+ 0 if successful and 1 otherwise.
1382+ """
1383+
1384+ command = f'kubectl create namespace gke-diagon'
1385+
1386+ return_code , return_output = run_command_for_value (
1387+ command , f'Starting kubectl create namespace...'
1388+ )
1389+
1390+ if return_code != 0 :
1391+ if 'already exists' in return_output :
1392+ xpk_print ('Namespace already exists locally. Skipping creation.' )
1393+ return 0
1394+ xpk_print (f'Namespace creation returned with ERROR { return_code } .\n ' )
1395+ xpk_exit (return_code )
1396+
1397+ xpk_print (f'gke-diagon Namespace created or already exists.' )
1398+ return return_code
1399+
1400+ def install_mldiagnostics_yaml (artifact_filename : str ):
1401+ """
1402+ Applies the mldiagnostics injection webhook YAML manifest.
1403+
1404+ Returns:
1405+ 0 if successful and 1 otherwise.
1406+ """
1407+
1408+ command = f'kubectl apply -f { artifact_filename } '
1409+
1410+ return_code = run_command_with_updates (
1411+ command , f'Starting kubectl apply -f { artifact_filename } ...'
1412+ )
1413+
1414+ if return_code != 0 :
1415+ xpk_print (f'kubectl apply returned with ERROR { return_code } .\n ' )
1416+ xpk_exit (return_code )
1417+
1418+ xpk_print (f'{ artifact_filename } applied successfully.' )
1419+
1420+ if os .path .exists (artifact_filename ):
1421+ try :
1422+ os .remove (artifact_filename )
1423+ xpk_print (f'Successfully deleted local file: { artifact_filename } ' )
1424+
1425+ except PermissionError :
1426+ xpk_print (f'Failed to delete file { artifact_filename } due to Permission Error.' )
1427+
1428+ except Exception as e :
1429+ xpk_print (f'Failed to delete file { artifact_filename } . Unexpected error: { e } ' )
1430+
1431+ else :
1432+ xpk_print (f'File { artifact_filename } does not exist locally. Skipping deletion (Cleanup assumed).' )
1433+
1434+ return return_code
1435+
1436+ def label_default_namespace_mldiagnostics ():
1437+ """
1438+ Labels the 'default' namespace with 'diagon-enabled=true'.
1439+
1440+ Returns:
1441+ 0 if successful and 1 otherwise.
1442+ """
1443+
1444+ command = f'kubectl label namespace default diagon-enabled=true'
1445+
1446+ return_code = run_command_with_updates (
1447+ command , f"Starting kubectl label namespace default with diagon-enabled=true..."
1448+ )
1449+
1450+ if return_code != 0 :
1451+ xpk_print (f'Namespace labeling returned with ERROR { return_code } .\n ' )
1452+ xpk_exit (return_code )
1453+
1454+ xpk_print ('default Namespace successfully labeled.' )
1455+ return return_code
1456+
1457+ def install_diagon_prerequisites ():
1458+ """
1459+ Diagon installation requirements.
1460+
1461+ Returns:
1462+ 0 if successful and 1 otherwise.
1463+ """
1464+
1465+ return_code = install_cert_manager ()
1466+ if return_code != 0 :
1467+ return return_code
1468+
1469+
1470+ webhook_package = "mldiagnostics-injection-webhook"
1471+ webhook_version = "v0.3.0"
1472+ webhook_filename = f"{ webhook_package } -{ webhook_version } .yaml"
1473+
1474+ return_code = download_mldiagnostics_yaml (package_name = webhook_package , version = webhook_version )
1475+ if return_code != 0 :
1476+ return return_code
1477+
1478+ return_code = create_mldiagnostics_namespace ()
1479+ if return_code != 0 :
1480+ return return_code
1481+
1482+ return_code = install_mldiagnostics_yaml (artifact_filename = webhook_filename )
1483+ if return_code != 0 :
1484+ return return_code
1485+
1486+ return_code = label_default_namespace_mldiagnostics ()
1487+ if return_code != 0 :
1488+ return return_code
1489+
1490+ # --- Install Operator ---
1491+ operator_package = "mldiagnostics-connection-operator"
1492+ operator_version = "v0.3.0"
1493+ operator_filename = f"{ operator_package } -{ operator_version } .yaml"
1494+
1495+ return_code = download_mldiagnostics_yaml (package_name = operator_package , version = operator_version )
1496+ if return_code != 0 :
1497+ return return_code
1498+
1499+ return_code = install_mldiagnostics_yaml (artifact_filename = operator_filename )
1500+ if return_code != 0 :
1501+ return return_code
1502+
1503+ xpk_print ("All diagon installation and setup steps have been successfully completed!" )
1504+ return return_code
0 commit comments