Skip to content

Commit 681f902

Browse files
committed
Add Diagon installation during cluster creation and modify the workload.py
1 parent 4f66276 commit 681f902

File tree

3 files changed

+199
-0
lines changed

3 files changed

+199
-0
lines changed

src/xpk/commands/cluster.py

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,8 @@ def cluster_create(args) -> None:
306306
if update_coredns_command_code != 0:
307307
xpk_exit(update_cluster_command_code)
308308

309+
install_diagon_prerequisites()
310+
309311
if not is_dry_run():
310312
k8s_client = setup_k8s_env(args)
311313
install_storage_crd(k8s_client)
@@ -1317,3 +1319,183 @@ def prepare_gpus(system: SystemCharacteristics):
13171319
err_code = disable_mglru_on_cluster()
13181320
if err_code > 0:
13191321
xpk_exit(err_code)
1322+
1323+
def install_cert_manager(version: str = 'v1.13.0'):
1324+
"""
1325+
Apply the cert-manager manifest.
1326+
1327+
Returns:
1328+
0 if successful and 1 otherwise.
1329+
"""
1330+
1331+
command = (
1332+
f'kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/'
1333+
f'{version}/cert-manager.yaml'
1334+
)
1335+
1336+
return_code = run_command_with_updates(
1337+
command, f'Applying cert-manager {version} manifest...'
1338+
)
1339+
1340+
if return_code != 0:
1341+
xpk_print(f'Applying cert-manager returned with ERROR {return_code}.\n')
1342+
1343+
return return_code
1344+
1345+
def download_mldiagnostics_yaml(package_name: str, version: str):
1346+
"""
1347+
Downloads the mldiagnostics injection webhook YAML from Artifact Registry.
1348+
1349+
Returns:
1350+
0 if successful and 1 otherwise.
1351+
"""
1352+
1353+
command = (
1354+
f'gcloud artifacts generic download --repository=mldiagnostics-webhook-and-operator-yaml --location=us '
1355+
f'--package={package_name} --version={version} --destination=./ '
1356+
f'--project=ai-on-gke'
1357+
)
1358+
1359+
return_code, return_output = run_command_for_value(
1360+
command, f'Starting gcloud artifacts download for {package_name} {version}...'
1361+
)
1362+
1363+
if return_code != 0:
1364+
if 'already exists' in return_output:
1365+
xpk_print(f'Artifact file for {package_name} {version} already exists locally. Skipping download.')
1366+
return 0
1367+
xpk_print(f'gcloud download returned with ERROR {return_code}.\n')
1368+
xpk_exit(return_code)
1369+
1370+
xpk_print(f'Artifact download completed successfully.')
1371+
return return_code
1372+
1373+
def create_mldiagnostics_namespace():
1374+
"""
1375+
Creates the 'gke-diagon' namespace.
1376+
1377+
Returns:
1378+
0 if successful and 1 otherwise.
1379+
"""
1380+
1381+
command = f'kubectl create namespace gke-diagon'
1382+
1383+
return_code, return_output = run_command_for_value(
1384+
command, f'Starting kubectl create namespace...'
1385+
)
1386+
1387+
if return_code != 0:
1388+
if 'already exists' in return_output:
1389+
xpk_print('Namespace already exists locally. Skipping creation.')
1390+
return 0
1391+
xpk_print(f'Namespace creation returned with ERROR {return_code}.\n')
1392+
xpk_exit(return_code)
1393+
1394+
xpk_print(f'gke-diagon Namespace created or already exists.')
1395+
return return_code
1396+
1397+
def install_mldiagnostics_yaml(artifact_filename: str):
1398+
"""
1399+
Applies the mldiagnostics injection webhook YAML manifest.
1400+
1401+
Returns:
1402+
0 if successful and 1 otherwise.
1403+
"""
1404+
1405+
command = f'kubectl apply -f {artifact_filename}'
1406+
1407+
return_code = run_command_with_updates(
1408+
command, f'Starting kubectl apply -f {artifact_filename} ...'
1409+
)
1410+
1411+
if return_code != 0:
1412+
xpk_print(f'kubectl apply returned with ERROR {return_code}.\n')
1413+
xpk_exit(return_code)
1414+
1415+
xpk_print(f'{artifact_filename} applied successfully.')
1416+
1417+
if os.path.exists(artifact_filename):
1418+
try:
1419+
os.remove(artifact_filename)
1420+
xpk_print(f'Successfully deleted local file: {artifact_filename}')
1421+
1422+
except PermissionError:
1423+
xpk_print(f'Failed to delete file {artifact_filename} due to Permission Error.')
1424+
1425+
except Exception as e:
1426+
xpk_print(f'Failed to delete file {artifact_filename}. Unexpected error: {e}')
1427+
1428+
else:
1429+
xpk_print(f'File {artifact_filename} does not exist locally. Skipping deletion (Cleanup assumed).')
1430+
1431+
return return_code
1432+
1433+
def label_default_namespace_mldiagnostics():
1434+
"""
1435+
Labels the 'default' namespace with 'diagon-enabled=true'.
1436+
1437+
Returns:
1438+
0 if successful and 1 otherwise.
1439+
"""
1440+
1441+
command = f'kubectl label namespace default diagon-enabled=true'
1442+
1443+
return_code = run_command_with_updates(
1444+
command, f"Starting kubectl label namespace default with diagon-enabled=true..."
1445+
)
1446+
1447+
if return_code != 0:
1448+
xpk_print(f'Namespace labeling returned with ERROR {return_code}.\n')
1449+
xpk_exit(return_code)
1450+
1451+
xpk_print('default Namespace successfully labeled.')
1452+
return return_code
1453+
1454+
def install_diagon_prerequisites():
1455+
"""
1456+
Diagon installation requirements.
1457+
1458+
Returns:
1459+
0 if successful and 1 otherwise.
1460+
"""
1461+
1462+
return_code = install_cert_manager()
1463+
if return_code != 0:
1464+
return return_code
1465+
1466+
1467+
webhook_package = "mldiagnostics-injection-webhook"
1468+
webhook_version = "v0.3.0"
1469+
webhook_filename = f"{webhook_package}-{webhook_version}.yaml"
1470+
1471+
return_code = download_mldiagnostics_yaml(package_name=webhook_package, version=webhook_version)
1472+
if return_code != 0:
1473+
return return_code
1474+
1475+
return_code = create_mldiagnostics_namespace()
1476+
if return_code != 0:
1477+
return return_code
1478+
1479+
return_code = install_mldiagnostics_yaml(artifact_filename=webhook_filename)
1480+
if return_code != 0:
1481+
return return_code
1482+
1483+
return_code = label_default_namespace_mldiagnostics()
1484+
if return_code != 0:
1485+
return return_code
1486+
1487+
# --- Install Operator ---
1488+
operator_package = "mldiagnostics-connection-operator"
1489+
operator_version = "v0.3.0"
1490+
operator_filename = f"{operator_package}-{operator_version}.yaml"
1491+
1492+
return_code = download_mldiagnostics_yaml(package_name=operator_package, version=operator_version)
1493+
if return_code != 0:
1494+
return return_code
1495+
1496+
return_code = install_mldiagnostics_yaml(artifact_filename=operator_filename)
1497+
if return_code != 0:
1498+
return return_code
1499+
1500+
xpk_print("All diagon installation and setup steps have been successfully completed!")
1501+
return return_code

src/xpk/commands/workload.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@
112112
labels:
113113
kueue.x-k8s.io/queue-name: {local_queue_name} # Name of the LocalQueue
114114
xpk.google.com/workload: {args.workload}
115+
{mldiagnostics_labels}
115116
annotations:
116117
alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool # 1:1 job replica to node pool assignment
117118
spec:
@@ -400,6 +401,12 @@ def workload_create(args) -> None:
400401
if return_code != 0:
401402
xpk_exit(return_code)
402403

404+
mldiagnostics_labels = ''
405+
406+
if args.managed_mldiagnostics:
407+
mldiagnostics_labels = """diagon-enabled: "true" """
408+
xpk_print('Managed ML Diagnostics injection enabled. Adding Pod Label.')
409+
403410
service_account = ''
404411
all_storages = []
405412
# Currently storage customization is not supported for Pathways workloads. b/408468941
@@ -601,6 +608,7 @@ def workload_create(args) -> None:
601608
""" if system.accelerator_type == AcceleratorType['TPU'] else '',
602609
failure_policy_rules=failure_policy_rules,
603610
pod_failure_policy=pod_failure_policy,
611+
mldiagnostics_labels=mldiagnostics_labels,
604612
)
605613
tmp = write_tmp_file(yml_string)
606614
command = f'kubectl apply -f {str(tmp)}'

src/xpk/parser/workload.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,15 @@ def set_workload_create_parser(workload_create_parser: ArgumentParser):
216216
' conditions.'
217217
),
218218
)
219+
workload_create_parser_optional_arguments.add_argument(
220+
'--managed-mldiagnostics',
221+
action='store_true',
222+
default=False,
223+
help=(
224+
'[Optional] Enables injection of GKE managed ML Diagnostics'
225+
' webhook metadata.'
226+
),
227+
)
219228

220229
add_shared_workload_create_required_arguments([
221230
workload_create_parser_required_arguments,

0 commit comments

Comments
 (0)