Skip to content

Commit 3c46565

Browse files
committed
Add Diagon installation during cluster creation and modify the workload.py
1 parent 7ce4537 commit 3c46565

File tree

3 files changed

+199
-0
lines changed

3 files changed

+199
-0
lines changed

src/xpk/commands/cluster.py

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,8 @@ def cluster_create(args) -> None:
308308
if update_coredns_command_code != 0:
309309
xpk_exit(update_cluster_command_code)
310310

311+
install_diagon_prerequisites()
312+
311313
if not is_dry_run():
312314
k8s_client = setup_k8s_env(args)
313315
install_storage_crd(k8s_client)
@@ -1320,3 +1322,183 @@ def prepare_gpus(system: SystemCharacteristics):
13201322
err_code = disable_mglru_on_cluster()
13211323
if err_code > 0:
13221324
xpk_exit(err_code)
1325+
1326+
def install_cert_manager(version: str = 'v1.13.0'):
1327+
"""
1328+
Apply the cert-manager manifest.
1329+
1330+
Returns:
1331+
0 if successful and 1 otherwise.
1332+
"""
1333+
1334+
command = (
1335+
f'kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/'
1336+
f'{version}/cert-manager.yaml'
1337+
)
1338+
1339+
return_code = run_command_with_updates(
1340+
command, f'Applying cert-manager {version} manifest...'
1341+
)
1342+
1343+
if return_code != 0:
1344+
xpk_print(f'Applying cert-manager returned with ERROR {return_code}.\n')
1345+
1346+
return return_code
1347+
1348+
def download_mldiagnostics_yaml(package_name: str, version: str):
1349+
"""
1350+
Downloads the mldiagnostics injection webhook YAML from Artifact Registry.
1351+
1352+
Returns:
1353+
0 if successful and 1 otherwise.
1354+
"""
1355+
1356+
command = (
1357+
f'gcloud artifacts generic download --repository=mldiagnostics-webhook-and-operator-yaml --location=us '
1358+
f'--package={package_name} --version={version} --destination=./ '
1359+
f'--project=ai-on-gke'
1360+
)
1361+
1362+
return_code, return_output = run_command_for_value(
1363+
command, f'Starting gcloud artifacts download for {package_name} {version}...'
1364+
)
1365+
1366+
if return_code != 0:
1367+
if 'already exists' in return_output:
1368+
xpk_print(f'Artifact file for {package_name} {version} already exists locally. Skipping download.')
1369+
return 0
1370+
xpk_print(f'gcloud download returned with ERROR {return_code}.\n')
1371+
xpk_exit(return_code)
1372+
1373+
xpk_print(f'Artifact download completed successfully.')
1374+
return return_code
1375+
1376+
def create_mldiagnostics_namespace():
1377+
"""
1378+
Creates the 'gke-diagon' namespace.
1379+
1380+
Returns:
1381+
0 if successful and 1 otherwise.
1382+
"""
1383+
1384+
command = f'kubectl create namespace gke-diagon'
1385+
1386+
return_code, return_output = run_command_for_value(
1387+
command, f'Starting kubectl create namespace...'
1388+
)
1389+
1390+
if return_code != 0:
1391+
if 'already exists' in return_output:
1392+
xpk_print('Namespace already exists locally. Skipping creation.')
1393+
return 0
1394+
xpk_print(f'Namespace creation returned with ERROR {return_code}.\n')
1395+
xpk_exit(return_code)
1396+
1397+
xpk_print(f'gke-diagon Namespace created or already exists.')
1398+
return return_code
1399+
1400+
def install_mldiagnostics_yaml(artifact_filename: str):
1401+
"""
1402+
Applies the mldiagnostics injection webhook YAML manifest.
1403+
1404+
Returns:
1405+
0 if successful and 1 otherwise.
1406+
"""
1407+
1408+
command = f'kubectl apply -f {artifact_filename}'
1409+
1410+
return_code = run_command_with_updates(
1411+
command, f'Starting kubectl apply -f {artifact_filename} ...'
1412+
)
1413+
1414+
if return_code != 0:
1415+
xpk_print(f'kubectl apply returned with ERROR {return_code}.\n')
1416+
xpk_exit(return_code)
1417+
1418+
xpk_print(f'{artifact_filename} applied successfully.')
1419+
1420+
if os.path.exists(artifact_filename):
1421+
try:
1422+
os.remove(artifact_filename)
1423+
xpk_print(f'Successfully deleted local file: {artifact_filename}')
1424+
1425+
except PermissionError:
1426+
xpk_print(f'Failed to delete file {artifact_filename} due to Permission Error.')
1427+
1428+
except Exception as e:
1429+
xpk_print(f'Failed to delete file {artifact_filename}. Unexpected error: {e}')
1430+
1431+
else:
1432+
xpk_print(f'File {artifact_filename} does not exist locally. Skipping deletion (Cleanup assumed).')
1433+
1434+
return return_code
1435+
1436+
def label_default_namespace_mldiagnostics():
1437+
"""
1438+
Labels the 'default' namespace with 'diagon-enabled=true'.
1439+
1440+
Returns:
1441+
0 if successful and 1 otherwise.
1442+
"""
1443+
1444+
command = f'kubectl label namespace default diagon-enabled=true'
1445+
1446+
return_code = run_command_with_updates(
1447+
command, f"Starting kubectl label namespace default with diagon-enabled=true..."
1448+
)
1449+
1450+
if return_code != 0:
1451+
xpk_print(f'Namespace labeling returned with ERROR {return_code}.\n')
1452+
xpk_exit(return_code)
1453+
1454+
xpk_print('default Namespace successfully labeled.')
1455+
return return_code
1456+
1457+
def install_diagon_prerequisites():
1458+
"""
1459+
Diagon installation requirements.
1460+
1461+
Returns:
1462+
0 if successful and 1 otherwise.
1463+
"""
1464+
1465+
return_code = install_cert_manager()
1466+
if return_code != 0:
1467+
return return_code
1468+
1469+
1470+
webhook_package = "mldiagnostics-injection-webhook"
1471+
webhook_version = "v0.3.0"
1472+
webhook_filename = f"{webhook_package}-{webhook_version}.yaml"
1473+
1474+
return_code = download_mldiagnostics_yaml(package_name=webhook_package, version=webhook_version)
1475+
if return_code != 0:
1476+
return return_code
1477+
1478+
return_code = create_mldiagnostics_namespace()
1479+
if return_code != 0:
1480+
return return_code
1481+
1482+
return_code = install_mldiagnostics_yaml(artifact_filename=webhook_filename)
1483+
if return_code != 0:
1484+
return return_code
1485+
1486+
return_code = label_default_namespace_mldiagnostics()
1487+
if return_code != 0:
1488+
return return_code
1489+
1490+
# --- Install Operator ---
1491+
operator_package = "mldiagnostics-connection-operator"
1492+
operator_version = "v0.3.0"
1493+
operator_filename = f"{operator_package}-{operator_version}.yaml"
1494+
1495+
return_code = download_mldiagnostics_yaml(package_name=operator_package, version=operator_version)
1496+
if return_code != 0:
1497+
return return_code
1498+
1499+
return_code = install_mldiagnostics_yaml(artifact_filename=operator_filename)
1500+
if return_code != 0:
1501+
return return_code
1502+
1503+
xpk_print("All diagon installation and setup steps have been successfully completed!")
1504+
return return_code

src/xpk/commands/workload.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@
112112
labels:
113113
kueue.x-k8s.io/queue-name: {local_queue_name} # Name of the LocalQueue
114114
xpk.google.com/workload: {args.workload}
115+
{mldiagnostics_labels}
115116
annotations:
116117
alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool # 1:1 job replica to node pool assignment
117118
spec:
@@ -399,6 +400,12 @@ def workload_create(args) -> None:
399400
if return_code != 0:
400401
xpk_exit(return_code)
401402

403+
mldiagnostics_labels = ''
404+
405+
if args.managed_mldiagnostics:
406+
mldiagnostics_labels = """diagon-enabled: "true" """
407+
xpk_print('Managed ML Diagnostics injection enabled. Adding Pod Label.')
408+
402409
service_account = ''
403410
all_storages = []
404411
# Currently storage customization is not supported for Pathways workloads. b/408468941
@@ -600,6 +607,7 @@ def workload_create(args) -> None:
600607
""" if system.accelerator_type == AcceleratorType.TPU else '',
601608
failure_policy_rules=failure_policy_rules,
602609
pod_failure_policy=pod_failure_policy,
610+
mldiagnostics_labels=mldiagnostics_labels,
603611
)
604612
tmp = write_tmp_file(yml_string)
605613
command = f'kubectl apply -f {str(tmp)}'

src/xpk/parser/workload.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,15 @@ def set_workload_create_parser(workload_create_parser: ArgumentParser):
224224
' conditions.'
225225
),
226226
)
227+
workload_create_parser_optional_arguments.add_argument(
228+
'--managed-mldiagnostics',
229+
action='store_true',
230+
default=False,
231+
help=(
232+
'[Optional] Enables injection of GKE managed ML Diagnostics'
233+
' webhook metadata.'
234+
),
235+
)
227236

228237
add_shared_workload_create_required_arguments([
229238
workload_create_parser_required_arguments,

0 commit comments

Comments
 (0)