Skip to content

Commit 7a1eecb

Browse files
committed
Add Diagon installation during cluster creation and modify the workload.py
Add wait_for_deployment_ready() Added unit test update goldens.yaml update goldens.yaml update goldens.yaml Fixed parser/cluster.py update goldens.yaml fixed linter fixed linter pyink Test unit test
1 parent e5cf2f3 commit 7a1eecb

File tree

4 files changed

+369
-1
lines changed

4 files changed

+369
-1
lines changed

src/xpk/commands/cluster.py

Lines changed: 271 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@
8181
from ..utils.templates import get_templates_absolute_path
8282
import shutil
8383
import os
84+
import time
8485

8586
CLUSTER_PREHEAT_JINJA_FILE = 'cluster_preheat.yaml.j2'
8687

@@ -407,6 +408,8 @@ def cluster_create(args) -> None:
407408
# pylint: disable=line-too-long
408409
f' https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
409410
)
411+
if args.managed_mldiagnostics:
412+
install_mldiagnostics_prerequisites()
410413
xpk_exit(0)
411414

412415

@@ -1319,3 +1322,271 @@ def prepare_gpus(system: SystemCharacteristics):
13191322
err_code = disable_mglru_on_cluster()
13201323
if err_code > 0:
13211324
xpk_exit(err_code)
1325+
1326+
1327+
def install_cert_manager(version: str = 'v1.13.0'):
1328+
"""
1329+
Apply the cert-manager manifest.
1330+
1331+
Returns:
1332+
0 if successful and 1 otherwise.
1333+
"""
1334+
1335+
command = (
1336+
'kubectl apply -f'
1337+
' https://github.com/cert-manager/cert-manager/releases/download/'
1338+
f'{version}/cert-manager.yaml'
1339+
)
1340+
1341+
return_code = run_command_with_updates(
1342+
command, f'Applying cert-manager {version} manifest...'
1343+
)
1344+
1345+
if return_code != 0:
1346+
xpk_print(f'Applying cert-manager returned with ERROR {return_code}.\n')
1347+
1348+
return return_code
1349+
1350+
1351+
def download_mldiagnostics_yaml(package_name: str, version: str):
1352+
"""
1353+
Downloads the mldiagnostics injection webhook YAML from Artifact Registry.
1354+
1355+
Returns:
1356+
0 if successful and 1 otherwise.
1357+
"""
1358+
1359+
command = (
1360+
'gcloud artifacts generic download'
1361+
' --repository=mldiagnostics-webhook-and-operator-yaml --location=us'
1362+
f' --package={package_name} --version={version} --destination=./'
1363+
' --project=ai-on-gke'
1364+
)
1365+
1366+
return_code, return_output = run_command_for_value(
1367+
command,
1368+
f'Starting gcloud artifacts download for {package_name} {version}...',
1369+
)
1370+
1371+
if return_code != 0:
1372+
if 'already exists' in return_output:
1373+
xpk_print(
1374+
f'Artifact file for {package_name} {version} already exists locally.'
1375+
' Skipping download.'
1376+
)
1377+
return 0
1378+
xpk_print(f'gcloud download returned with ERROR {return_code}.\n')
1379+
xpk_exit(return_code)
1380+
1381+
xpk_print('Artifact download completed successfully.')
1382+
return return_code
1383+
1384+
1385+
def create_mldiagnostics_namespace():
1386+
"""
1387+
Creates the 'gke-mldiagnostics' namespace.
1388+
1389+
Returns:
1390+
0 if successful and 1 otherwise.
1391+
"""
1392+
1393+
command = 'kubectl create namespace gke-mldiagnostics'
1394+
1395+
return_code, return_output = run_command_for_value(
1396+
command, 'Starting kubectl create namespace...'
1397+
)
1398+
1399+
if return_code != 0:
1400+
if 'already exists' in return_output:
1401+
xpk_print('Namespace already exists locally. Skipping creation.')
1402+
return 0
1403+
xpk_print(f'Namespace creation returned with ERROR {return_code}.\n')
1404+
xpk_exit(return_code)
1405+
1406+
xpk_print('gke-mldiagnostics Namespace created or already exists.')
1407+
return return_code
1408+
1409+
1410+
def install_mldiagnostics_yaml(artifact_filename: str):
1411+
"""
1412+
Applies the mldiagnostics injection webhook YAML manifest.
1413+
1414+
Returns:
1415+
0 if successful and 1 otherwise.
1416+
"""
1417+
1418+
command = f'kubectl apply -f {artifact_filename} -n gke-mldiagnostics'
1419+
1420+
return_code = run_command_with_updates(
1421+
command,
1422+
f'Starting kubectl apply -f {artifact_filename} -n gke-mldiagnostics...',
1423+
)
1424+
1425+
if return_code != 0:
1426+
xpk_print(f'kubectl apply returned with ERROR {return_code}.\n')
1427+
xpk_exit(return_code)
1428+
1429+
xpk_print(f'{artifact_filename} applied successfully.')
1430+
1431+
if os.path.exists(artifact_filename):
1432+
try:
1433+
os.remove(artifact_filename)
1434+
xpk_print(f'Successfully deleted local file: {artifact_filename}')
1435+
1436+
except PermissionError:
1437+
xpk_print(
1438+
f'Failed to delete file {artifact_filename} due to Permission Error.'
1439+
)
1440+
1441+
else:
1442+
xpk_print(
1443+
f'File {artifact_filename} does not exist locally. Skipping deletion'
1444+
' (Cleanup assumed).'
1445+
)
1446+
1447+
return return_code
1448+
1449+
1450+
def label_default_namespace_mldiagnostics():
1451+
"""
1452+
Labels the 'default' namespace with 'managed-mldiagnostics-gke=true'.
1453+
1454+
Returns:
1455+
0 if successful and 1 otherwise.
1456+
"""
1457+
1458+
command = 'kubectl label namespace default managed-mldiagnostics-gke=true'
1459+
1460+
return_code = run_command_with_updates(
1461+
command,
1462+
'Starting kubectl label namespace default with'
1463+
' managed-mldiagnostics-gke=true...',
1464+
)
1465+
1466+
if return_code != 0:
1467+
xpk_print(f'Namespace labeling returned with ERROR {return_code}.\n')
1468+
xpk_exit(return_code)
1469+
1470+
xpk_print('default Namespace successfully labeled.')
1471+
return return_code
1472+
1473+
1474+
def install_mldiagnostics_prerequisites():
1475+
"""
1476+
Mldiagnostics installation requirements.
1477+
1478+
Returns:
1479+
0 if successful and 1 otherwise.
1480+
"""
1481+
deployment_name = 'kueue-controller-manager'
1482+
namespace_name = 'kueue-system'
1483+
cert_webhook_deployment_name = 'cert-manager-webhook'
1484+
cert_webhook_namespace_name = 'cert-manager'
1485+
# is_running = wait_for_cluster_running(args)
1486+
is_running = wait_for_deployment_ready(deployment_name, namespace_name)
1487+
time.sleep(30)
1488+
if is_running:
1489+
return_code = install_cert_manager()
1490+
if return_code != 0:
1491+
return return_code
1492+
1493+
cert_webhook_ready = wait_for_deployment_ready(
1494+
cert_webhook_deployment_name, cert_webhook_namespace_name
1495+
)
1496+
time.sleep(30)
1497+
if cert_webhook_ready:
1498+
1499+
webhook_package = 'mldiagnostics-injection-webhook'
1500+
webhook_version = 'v0.5.0'
1501+
webhook_filename = f'{webhook_package}-{webhook_version}.yaml'
1502+
1503+
return_code = download_mldiagnostics_yaml(
1504+
package_name=webhook_package, version=webhook_version
1505+
)
1506+
if return_code != 0:
1507+
return return_code
1508+
1509+
return_code = create_mldiagnostics_namespace()
1510+
if return_code != 0:
1511+
return return_code
1512+
1513+
return_code = install_mldiagnostics_yaml(
1514+
artifact_filename=webhook_filename
1515+
)
1516+
if return_code != 0:
1517+
return return_code
1518+
1519+
return_code = label_default_namespace_mldiagnostics()
1520+
if return_code != 0:
1521+
return return_code
1522+
1523+
# --- Install Operator ---
1524+
operator_package = 'mldiagnostics-connection-operator'
1525+
operator_version = 'v0.5.0'
1526+
operator_filename = f'{operator_package}-{operator_version}.yaml'
1527+
1528+
return_code = download_mldiagnostics_yaml(
1529+
package_name=operator_package, version=operator_version
1530+
)
1531+
if return_code != 0:
1532+
return return_code
1533+
1534+
return_code = install_mldiagnostics_yaml(
1535+
artifact_filename=operator_filename
1536+
)
1537+
if return_code != 0:
1538+
return return_code
1539+
1540+
xpk_print(
1541+
'All mldiagnostics installation and setup steps have been'
1542+
' successfully completed!'
1543+
)
1544+
return return_code
1545+
else:
1546+
xpk_print('The cert-manager-webhook installation failed.')
1547+
xpk_exit(1)
1548+
else:
1549+
xpk_print(
1550+
f'Application {deployment_name} failed to become ready within the'
1551+
' timeout.'
1552+
)
1553+
xpk_exit(1)
1554+
1555+
1556+
def wait_for_deployment_ready(
1557+
deployment_name: str, namespace: str, timeout_seconds: int = 300
1558+
) -> bool:
1559+
"""
1560+
Polls the Kubernetes Deployment status using kubectl rollout status
1561+
until it successfully rolls out (all replicas are ready) or times out.
1562+
1563+
Args:
1564+
deployment_name: The name of the Kubernetes Deployment (e.g., 'kueue-controller-manager').
1565+
namespace: The namespace where the Deployment is located (e.g., 'kueue-system').
1566+
timeout_seconds: Timeout duration in seconds (default is 300s / 5 minutes).
1567+
1568+
Returns:
1569+
bool: True if the Deployment successfully rolled out, False otherwise (timeout or error).
1570+
"""
1571+
1572+
command = (
1573+
f'kubectl rollout status deployment/{deployment_name} -n {namespace}'
1574+
f' --timeout={timeout_seconds}s'
1575+
)
1576+
1577+
print(
1578+
f'Waiting for deployment {deployment_name} in namespace {namespace} to'
1579+
' successfully roll out...'
1580+
)
1581+
1582+
return_code, return_output = run_command_for_value(
1583+
command, f'Checking status of deployment {deployment_name}...'
1584+
)
1585+
1586+
if return_code != 0:
1587+
xpk_print(f'\nError: Deployment {deployment_name} failed to roll out.')
1588+
xpk_print(f'kubectl output: {return_output}')
1589+
return False
1590+
1591+
xpk_print(f'Success: Deployment {deployment_name} successfully rolled out.')
1592+
return True

src/xpk/commands/cluster_test.py

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from unittest.mock import MagicMock, patch
2121
import pytest
2222

23-
from xpk.commands.cluster import _install_kueue, _validate_cluster_create_args, run_gke_cluster_create_command
23+
from xpk.commands.cluster import _install_kueue, _validate_cluster_create_args, run_gke_cluster_create_command, install_mldiagnostics_prerequisites
2424
from xpk.core.system_characteristics import SystemCharacteristics, UserFacingNameToSystemCharacteristics
2525
from xpk.core.testing.commands_tester import CommandsTester
2626
from xpk.utils.feature_flags import FeatureFlags
@@ -56,6 +56,9 @@ def mocks(mocker) -> _Mocks:
5656
run_command_with_updates_path=(
5757
'xpk.commands.cluster.run_command_with_updates'
5858
),
59+
run_command_for_value_path=(
60+
'xpk.commands.cluster.run_command_for_value'
61+
),
5962
),
6063
)
6164

@@ -87,6 +90,7 @@ def construct_args(**kwargs: Any) -> Namespace:
8790
memory_limit='100Gi',
8891
cpu_limit=100,
8992
cluster_cpu_machine_type='',
93+
managed_mldiagnostics=False,
9094
)
9195
args_dict.update(kwargs)
9296
return Namespace(**args_dict)
@@ -247,3 +251,58 @@ def test_run_gke_cluster_create_command_with_gke_version_has_no_autoupgrade_flag
247251
mocks.commands_tester.assert_command_run(
248252
'clusters create', ' --no-enable-autoupgrade'
249253
)
254+
255+
256+
def test_install_mldiagnostics_prerequisites_commands_executed(
257+
mocks: _Mocks,
258+
mocker,
259+
):
260+
mock_sleep = mocker.patch('time.sleep', return_value=None)
261+
262+
mock_wait_ready = mocker.patch(
263+
'xpk.commands.cluster.wait_for_deployment_ready', return_value=True
264+
)
265+
mock_install_cert = mocker.patch(
266+
'xpk.commands.cluster.install_cert_manager', return_value=0
267+
)
268+
mock_download = mocker.patch(
269+
'xpk.commands.cluster.download_mldiagnostics_yaml', return_value=0
270+
)
271+
mock_create_ns = mocker.patch(
272+
'xpk.commands.cluster.create_mldiagnostics_namespace', return_value=0
273+
)
274+
mock_install_yaml = mocker.patch(
275+
'xpk.commands.cluster.install_mldiagnostics_yaml', return_value=0
276+
)
277+
mock_label_ns = mocker.patch(
278+
'xpk.commands.cluster.label_default_namespace_mldiagnostics',
279+
return_value=0,
280+
)
281+
282+
mocker.patch('os.path.exists', return_value=True)
283+
mocker.patch('os.remove')
284+
285+
install_mldiagnostics_prerequisites()
286+
287+
mock_wait_ready.assert_any_call('kueue-controller-manager', 'kueue-system')
288+
289+
assert mock_sleep.call_count == 2
290+
mock_sleep.assert_any_call(30)
291+
292+
mock_install_cert.assert_called_once()
293+
294+
mock_wait_ready.assert_any_call('cert-manager-webhook', 'cert-manager')
295+
296+
assert mock_download.call_count == 2
297+
mock_download.assert_any_call(
298+
package_name='mldiagnostics-injection-webhook', version='v0.5.0'
299+
)
300+
mock_download.assert_any_call(
301+
package_name='mldiagnostics-connection-operator', version='v0.5.0'
302+
)
303+
304+
mock_create_ns.assert_called_once()
305+
306+
assert mock_install_yaml.call_count == 2
307+
308+
mock_label_ns.assert_called_once()

0 commit comments

Comments
 (0)