Skip to content

Commit 7df858a

Browse files
committed
Add Diagon installation during cluster creation and modify the workload.py
1 parent 68dd6a6 commit 7df858a

File tree

3 files changed

+199
-0
lines changed

3 files changed

+199
-0
lines changed

src/xpk/commands/cluster.py

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,8 @@ def cluster_create(args) -> None:
308308
if update_coredns_command_code != 0:
309309
xpk_exit(update_cluster_command_code)
310310

311+
install_diagon_prerequisites()
312+
311313
if not is_dry_run():
312314
k8s_client = setup_k8s_env(args)
313315
install_storage_crd(k8s_client)
@@ -1319,3 +1321,183 @@ def prepare_gpus(system: SystemCharacteristics):
13191321
err_code = disable_mglru_on_cluster()
13201322
if err_code > 0:
13211323
xpk_exit(err_code)
1324+
1325+
def install_cert_manager(version: str = 'v1.13.0'):
1326+
"""
1327+
Apply the cert-manager manifest.
1328+
1329+
Returns:
1330+
0 if successful and 1 otherwise.
1331+
"""
1332+
1333+
command = (
1334+
f'kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/'
1335+
f'{version}/cert-manager.yaml'
1336+
)
1337+
1338+
return_code = run_command_with_updates(
1339+
command, f'Applying cert-manager {version} manifest...'
1340+
)
1341+
1342+
if return_code != 0:
1343+
xpk_print(f'Applying cert-manager returned with ERROR {return_code}.\n')
1344+
1345+
return return_code
1346+
1347+
def download_mldiagnostics_yaml(package_name: str, version: str):
1348+
"""
1349+
Downloads the mldiagnostics injection webhook YAML from Artifact Registry.
1350+
1351+
Returns:
1352+
0 if successful and 1 otherwise.
1353+
"""
1354+
1355+
command = (
1356+
f'gcloud artifacts generic download --repository=mldiagnostics-webhook-and-operator-yaml --location=us '
1357+
f'--package={package_name} --version={version} --destination=./ '
1358+
f'--project=ai-on-gke'
1359+
)
1360+
1361+
return_code, return_output = run_command_for_value(
1362+
command, f'Starting gcloud artifacts download for {package_name} {version}...'
1363+
)
1364+
1365+
if return_code != 0:
1366+
if 'already exists' in return_output:
1367+
xpk_print(f'Artifact file for {package_name} {version} already exists locally. Skipping download.')
1368+
return 0
1369+
xpk_print(f'gcloud download returned with ERROR {return_code}.\n')
1370+
xpk_exit(return_code)
1371+
1372+
xpk_print(f'Artifact download completed successfully.')
1373+
return return_code
1374+
1375+
def create_mldiagnostics_namespace():
1376+
"""
1377+
Creates the 'gke-diagon' namespace.
1378+
1379+
Returns:
1380+
0 if successful and 1 otherwise.
1381+
"""
1382+
1383+
command = f'kubectl create namespace gke-diagon'
1384+
1385+
return_code, return_output = run_command_for_value(
1386+
command, f'Starting kubectl create namespace...'
1387+
)
1388+
1389+
if return_code != 0:
1390+
if 'already exists' in return_output:
1391+
xpk_print('Namespace already exists locally. Skipping creation.')
1392+
return 0
1393+
xpk_print(f'Namespace creation returned with ERROR {return_code}.\n')
1394+
xpk_exit(return_code)
1395+
1396+
xpk_print(f'gke-diagon Namespace created or already exists.')
1397+
return return_code
1398+
1399+
def install_mldiagnostics_yaml(artifact_filename: str):
1400+
"""
1401+
Applies the mldiagnostics injection webhook YAML manifest.
1402+
1403+
Returns:
1404+
0 if successful and 1 otherwise.
1405+
"""
1406+
1407+
command = f'kubectl apply -f {artifact_filename}'
1408+
1409+
return_code = run_command_with_updates(
1410+
command, f'Starting kubectl apply -f {artifact_filename} ...'
1411+
)
1412+
1413+
if return_code != 0:
1414+
xpk_print(f'kubectl apply returned with ERROR {return_code}.\n')
1415+
xpk_exit(return_code)
1416+
1417+
xpk_print(f'{artifact_filename} applied successfully.')
1418+
1419+
if os.path.exists(artifact_filename):
1420+
try:
1421+
os.remove(artifact_filename)
1422+
xpk_print(f'Successfully deleted local file: {artifact_filename}')
1423+
1424+
except PermissionError:
1425+
xpk_print(f'Failed to delete file {artifact_filename} due to Permission Error.')
1426+
1427+
except Exception as e:
1428+
xpk_print(f'Failed to delete file {artifact_filename}. Unexpected error: {e}')
1429+
1430+
else:
1431+
xpk_print(f'File {artifact_filename} does not exist locally. Skipping deletion (Cleanup assumed).')
1432+
1433+
return return_code
1434+
1435+
def label_default_namespace_mldiagnostics():
1436+
"""
1437+
Labels the 'default' namespace with 'diagon-enabled=true'.
1438+
1439+
Returns:
1440+
0 if successful and 1 otherwise.
1441+
"""
1442+
1443+
command = f'kubectl label namespace default diagon-enabled=true'
1444+
1445+
return_code = run_command_with_updates(
1446+
command, f"Starting kubectl label namespace default with diagon-enabled=true..."
1447+
)
1448+
1449+
if return_code != 0:
1450+
xpk_print(f'Namespace labeling returned with ERROR {return_code}.\n')
1451+
xpk_exit(return_code)
1452+
1453+
xpk_print('default Namespace successfully labeled.')
1454+
return return_code
1455+
1456+
def install_diagon_prerequisites():
1457+
"""
1458+
Diagon installation requirements.
1459+
1460+
Returns:
1461+
0 if successful and 1 otherwise.
1462+
"""
1463+
1464+
return_code = install_cert_manager()
1465+
if return_code != 0:
1466+
return return_code
1467+
1468+
1469+
webhook_package = "mldiagnostics-injection-webhook"
1470+
webhook_version = "v0.3.0"
1471+
webhook_filename = f"{webhook_package}-{webhook_version}.yaml"
1472+
1473+
return_code = download_mldiagnostics_yaml(package_name=webhook_package, version=webhook_version)
1474+
if return_code != 0:
1475+
return return_code
1476+
1477+
return_code = create_mldiagnostics_namespace()
1478+
if return_code != 0:
1479+
return return_code
1480+
1481+
return_code = install_mldiagnostics_yaml(artifact_filename=webhook_filename)
1482+
if return_code != 0:
1483+
return return_code
1484+
1485+
return_code = label_default_namespace_mldiagnostics()
1486+
if return_code != 0:
1487+
return return_code
1488+
1489+
# --- Install Operator ---
1490+
operator_package = "mldiagnostics-connection-operator"
1491+
operator_version = "v0.3.0"
1492+
operator_filename = f"{operator_package}-{operator_version}.yaml"
1493+
1494+
return_code = download_mldiagnostics_yaml(package_name=operator_package, version=operator_version)
1495+
if return_code != 0:
1496+
return return_code
1497+
1498+
return_code = install_mldiagnostics_yaml(artifact_filename=operator_filename)
1499+
if return_code != 0:
1500+
return return_code
1501+
1502+
xpk_print("All diagon installation and setup steps have been successfully completed!")
1503+
return return_code

src/xpk/commands/workload.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@
111111
labels:
112112
kueue.x-k8s.io/queue-name: {local_queue_name} # Name of the LocalQueue
113113
xpk.google.com/workload: {args.workload}
114+
{mldiagnostics_labels}
114115
annotations:
115116
alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool # 1:1 job replica to node pool assignment
116117
spec:
@@ -398,6 +399,12 @@ def workload_create(args) -> None:
398399
if return_code != 0:
399400
xpk_exit(return_code)
400401

402+
mldiagnostics_labels = ''
403+
404+
if args.managed_mldiagnostics:
405+
mldiagnostics_labels = """diagon-enabled: "true" """
406+
xpk_print('Managed ML Diagnostics injection enabled. Adding Pod Label.')
407+
401408
service_account = ''
402409
all_storages = []
403410
# Currently storage customization is not supported for Pathways workloads. b/408468941
@@ -599,6 +606,7 @@ def workload_create(args) -> None:
599606
""" if system.accelerator_type == AcceleratorType.TPU else '',
600607
failure_policy_rules=failure_policy_rules,
601608
pod_failure_policy=pod_failure_policy,
609+
mldiagnostics_labels=mldiagnostics_labels,
602610
)
603611
tmp = write_tmp_file(yml_string)
604612
command = f'kubectl apply -f {str(tmp)}'

src/xpk/parser/workload.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,15 @@ def set_workload_create_parser(workload_create_parser: ArgumentParser):
224224
' conditions.'
225225
),
226226
)
227+
workload_create_parser_optional_arguments.add_argument(
228+
'--managed-mldiagnostics',
229+
action='store_true',
230+
default=False,
231+
help=(
232+
'[Optional] Enables injection of GKE managed ML Diagnostics'
233+
' webhook metadata.'
234+
),
235+
)
227236

228237
add_shared_workload_create_required_arguments([
229238
workload_create_parser_required_arguments,

0 commit comments

Comments
 (0)