Skip to content

Commit 0596553

Browse files
committed
Add Diagon installation during cluster creation and modify the workload.py
Add wait_for_deployment_ready() Added unit test update goldens.yaml update goldens.yaml update goldens.yaml Fixed parser/cluster.py update goldens.yaml fixed linter fixed linter pyink Test unit test
1 parent ff371b9 commit 0596553

File tree

4 files changed

+367
-1
lines changed

4 files changed

+367
-1
lines changed

src/xpk/commands/cluster.py

Lines changed: 269 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@
8181
from ..utils.templates import get_templates_absolute_path
8282
import shutil
8383
import os
84+
import time
8485

8586
CLUSTER_PREHEAT_JINJA_FILE = 'cluster_preheat.yaml.j2'
8687

@@ -407,6 +408,8 @@ def cluster_create(args) -> None:
407408
# pylint: disable=line-too-long
408409
f' https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
409410
)
411+
if args.managed_mldiagnostics:
412+
install_diagon_prerequisites()
410413
xpk_exit(0)
411414

412415

@@ -1319,3 +1322,269 @@ def prepare_gpus(system: SystemCharacteristics):
13191322
err_code = disable_mglru_on_cluster()
13201323
if err_code > 0:
13211324
xpk_exit(err_code)
1325+
1326+
1327+
def install_cert_manager(version: str = 'v1.13.0'):
1328+
"""
1329+
Apply the cert-manager manifest.
1330+
1331+
Returns:
1332+
0 if successful and 1 otherwise.
1333+
"""
1334+
1335+
command = (
1336+
'kubectl apply -f'
1337+
' https://github.com/cert-manager/cert-manager/releases/download/'
1338+
f'{version}/cert-manager.yaml'
1339+
)
1340+
1341+
return_code = run_command_with_updates(
1342+
command, f'Applying cert-manager {version} manifest...'
1343+
)
1344+
1345+
if return_code != 0:
1346+
xpk_print(f'Applying cert-manager returned with ERROR {return_code}.\n')
1347+
1348+
return return_code
1349+
1350+
1351+
def download_mldiagnostics_yaml(package_name: str, version: str):
1352+
"""
1353+
Downloads the mldiagnostics injection webhook YAML from Artifact Registry.
1354+
1355+
Returns:
1356+
0 if successful and 1 otherwise.
1357+
"""
1358+
1359+
command = (
1360+
'gcloud artifacts generic download'
1361+
' --repository=mldiagnostics-webhook-and-operator-yaml --location=us'
1362+
f' --package={package_name} --version={version} --destination=./'
1363+
' --project=ai-on-gke'
1364+
)
1365+
1366+
return_code, return_output = run_command_for_value(
1367+
command,
1368+
f'Starting gcloud artifacts download for {package_name} {version}...',
1369+
)
1370+
1371+
if return_code != 0:
1372+
if 'already exists' in return_output:
1373+
xpk_print(
1374+
f'Artifact file for {package_name} {version} already exists locally.'
1375+
' Skipping download.'
1376+
)
1377+
return 0
1378+
xpk_print(f'gcloud download returned with ERROR {return_code}.\n')
1379+
xpk_exit(return_code)
1380+
1381+
xpk_print('Artifact download completed successfully.')
1382+
return return_code
1383+
1384+
1385+
def create_mldiagnostics_namespace():
1386+
"""
1387+
Creates the 'gke-diagon' namespace.
1388+
1389+
Returns:
1390+
0 if successful and 1 otherwise.
1391+
"""
1392+
1393+
command = 'kubectl create namespace gke-diagon'
1394+
1395+
return_code, return_output = run_command_for_value(
1396+
command, 'Starting kubectl create namespace...'
1397+
)
1398+
1399+
if return_code != 0:
1400+
if 'already exists' in return_output:
1401+
xpk_print('Namespace already exists locally. Skipping creation.')
1402+
return 0
1403+
xpk_print(f'Namespace creation returned with ERROR {return_code}.\n')
1404+
xpk_exit(return_code)
1405+
1406+
xpk_print('gke-diagon Namespace created or already exists.')
1407+
return return_code
1408+
1409+
1410+
def install_mldiagnostics_yaml(artifact_filename: str):
1411+
"""
1412+
Applies the mldiagnostics injection webhook YAML manifest.
1413+
1414+
Returns:
1415+
0 if successful and 1 otherwise.
1416+
"""
1417+
1418+
command = f'kubectl apply -f {artifact_filename} -n gke-diagon'
1419+
1420+
return_code = run_command_with_updates(
1421+
command, f'Starting kubectl apply -f {artifact_filename} -n gke-diagon...'
1422+
)
1423+
1424+
if return_code != 0:
1425+
xpk_print(f'kubectl apply returned with ERROR {return_code}.\n')
1426+
xpk_exit(return_code)
1427+
1428+
xpk_print(f'{artifact_filename} applied successfully.')
1429+
1430+
if os.path.exists(artifact_filename):
1431+
try:
1432+
os.remove(artifact_filename)
1433+
xpk_print(f'Successfully deleted local file: {artifact_filename}')
1434+
1435+
except PermissionError:
1436+
xpk_print(
1437+
f'Failed to delete file {artifact_filename} due to Permission Error.'
1438+
)
1439+
1440+
else:
1441+
xpk_print(
1442+
f'File {artifact_filename} does not exist locally. Skipping deletion'
1443+
' (Cleanup assumed).'
1444+
)
1445+
1446+
return return_code
1447+
1448+
1449+
def label_default_namespace_mldiagnostics():
1450+
"""
1451+
Labels the 'default' namespace with 'diagon-enabled=true'.
1452+
1453+
Returns:
1454+
0 if successful and 1 otherwise.
1455+
"""
1456+
1457+
command = 'kubectl label namespace default diagon-enabled=true'
1458+
1459+
return_code = run_command_with_updates(
1460+
command,
1461+
'Starting kubectl label namespace default with diagon-enabled=true...',
1462+
)
1463+
1464+
if return_code != 0:
1465+
xpk_print(f'Namespace labeling returned with ERROR {return_code}.\n')
1466+
xpk_exit(return_code)
1467+
1468+
xpk_print('default Namespace successfully labeled.')
1469+
return return_code
1470+
1471+
1472+
def install_diagon_prerequisites():
1473+
"""
1474+
Diagon installation requirements.
1475+
1476+
Returns:
1477+
0 if successful and 1 otherwise.
1478+
"""
1479+
deployment_name = 'kueue-controller-manager'
1480+
namespace_name = 'kueue-system'
1481+
cert_webhook_deployment_name = 'cert-manager-webhook'
1482+
cert_webhook_namespace_name = 'cert-manager'
1483+
# is_running = wait_for_cluster_running(args)
1484+
is_running = wait_for_deployment_ready(deployment_name, namespace_name)
1485+
time.sleep(30)
1486+
if is_running:
1487+
return_code = install_cert_manager()
1488+
if return_code != 0:
1489+
return return_code
1490+
1491+
cert_webhook_ready = wait_for_deployment_ready(
1492+
cert_webhook_deployment_name, cert_webhook_namespace_name
1493+
)
1494+
time.sleep(30)
1495+
if cert_webhook_ready:
1496+
1497+
webhook_package = 'mldiagnostics-injection-webhook'
1498+
webhook_version = 'v0.3.0'
1499+
webhook_filename = f'{webhook_package}-{webhook_version}.yaml'
1500+
1501+
return_code = download_mldiagnostics_yaml(
1502+
package_name=webhook_package, version=webhook_version
1503+
)
1504+
if return_code != 0:
1505+
return return_code
1506+
1507+
return_code = create_mldiagnostics_namespace()
1508+
if return_code != 0:
1509+
return return_code
1510+
1511+
return_code = install_mldiagnostics_yaml(
1512+
artifact_filename=webhook_filename
1513+
)
1514+
if return_code != 0:
1515+
return return_code
1516+
1517+
return_code = label_default_namespace_mldiagnostics()
1518+
if return_code != 0:
1519+
return return_code
1520+
1521+
# --- Install Operator ---
1522+
operator_package = 'mldiagnostics-connection-operator'
1523+
operator_version = 'v0.3.0'
1524+
operator_filename = f'{operator_package}-{operator_version}.yaml'
1525+
1526+
return_code = download_mldiagnostics_yaml(
1527+
package_name=operator_package, version=operator_version
1528+
)
1529+
if return_code != 0:
1530+
return return_code
1531+
1532+
return_code = install_mldiagnostics_yaml(
1533+
artifact_filename=operator_filename
1534+
)
1535+
if return_code != 0:
1536+
return return_code
1537+
1538+
xpk_print(
1539+
'All diagon installation and setup steps have been successfully'
1540+
' completed!'
1541+
)
1542+
return return_code
1543+
else:
1544+
xpk_print('The cert-manager-webhook installation failed.')
1545+
xpk_exit(1)
1546+
else:
1547+
xpk_print(
1548+
f'Application {deployment_name} failed to become ready within the'
1549+
' timeout.'
1550+
)
1551+
xpk_exit(1)
1552+
1553+
1554+
def wait_for_deployment_ready(
1555+
deployment_name: str, namespace: str, timeout_seconds: int = 300
1556+
) -> bool:
1557+
"""
1558+
Polls the Kubernetes Deployment status using kubectl rollout status
1559+
until it successfully rolls out (all replicas are ready) or times out.
1560+
1561+
Args:
1562+
deployment_name: The name of the Kubernetes Deployment (e.g., 'kueue-controller-manager').
1563+
namespace: The namespace where the Deployment is located (e.g., 'kueue-system').
1564+
timeout_seconds: Timeout duration in seconds (default is 300s / 5 minutes).
1565+
1566+
Returns:
1567+
bool: True if the Deployment successfully rolled out, False otherwise (timeout or error).
1568+
"""
1569+
1570+
command = (
1571+
f'kubectl rollout status deployment/{deployment_name} -n {namespace}'
1572+
f' --timeout={timeout_seconds}s'
1573+
)
1574+
1575+
print(
1576+
f'Waiting for deployment {deployment_name} in namespace {namespace} to'
1577+
' successfully roll out...'
1578+
)
1579+
1580+
return_code, return_output = run_command_for_value(
1581+
command, f'Checking status of deployment {deployment_name}...'
1582+
)
1583+
1584+
if return_code != 0:
1585+
xpk_print(f'\nError: Deployment {deployment_name} failed to roll out.')
1586+
xpk_print(f'kubectl output: {return_output}')
1587+
return False
1588+
1589+
xpk_print(f'Success: Deployment {deployment_name} successfully rolled out.')
1590+
return True

src/xpk/commands/cluster_test.py

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from unittest.mock import MagicMock, patch
2121
import pytest
2222

23-
from xpk.commands.cluster import _install_kueue, _validate_cluster_create_args, run_gke_cluster_create_command
23+
from xpk.commands.cluster import _install_kueue, _validate_cluster_create_args, run_gke_cluster_create_command, install_diagon_prerequisites
2424
from xpk.core.system_characteristics import SystemCharacteristics, UserFacingNameToSystemCharacteristics
2525
from xpk.core.testing.commands_tester import CommandsTester
2626
from xpk.utils.feature_flags import FeatureFlags
@@ -56,6 +56,9 @@ def mocks(mocker) -> _Mocks:
5656
run_command_with_updates_path=(
5757
'xpk.commands.cluster.run_command_with_updates'
5858
),
59+
run_command_for_value_path=(
60+
'xpk.commands.cluster.run_command_for_value'
61+
),
5962
),
6063
)
6164

@@ -87,6 +90,7 @@ def construct_args(**kwargs: Any) -> Namespace:
8790
memory_limit='100Gi',
8891
cpu_limit=100,
8992
cluster_cpu_machine_type='',
93+
managed_mldiagnostics=False,
9094
)
9195
args_dict.update(kwargs)
9296
return Namespace(**args_dict)
@@ -247,3 +251,58 @@ def test_run_gke_cluster_create_command_with_gke_version_has_no_autoupgrade_flag
247251
mocks.commands_tester.assert_command_run(
248252
'clusters create', ' --no-enable-autoupgrade'
249253
)
254+
255+
256+
def test_install_diagon_prerequisites_commands_executed(
257+
mocks: _Mocks,
258+
mocker,
259+
):
260+
mock_sleep = mocker.patch('time.sleep', return_value=None)
261+
262+
mock_wait_ready = mocker.patch(
263+
'xpk.commands.cluster.wait_for_deployment_ready', return_value=True
264+
)
265+
mock_install_cert = mocker.patch(
266+
'xpk.commands.cluster.install_cert_manager', return_value=0
267+
)
268+
mock_download = mocker.patch(
269+
'xpk.commands.cluster.download_mldiagnostics_yaml', return_value=0
270+
)
271+
mock_create_ns = mocker.patch(
272+
'xpk.commands.cluster.create_mldiagnostics_namespace', return_value=0
273+
)
274+
mock_install_yaml = mocker.patch(
275+
'xpk.commands.cluster.install_mldiagnostics_yaml', return_value=0
276+
)
277+
mock_label_ns = mocker.patch(
278+
'xpk.commands.cluster.label_default_namespace_mldiagnostics',
279+
return_value=0,
280+
)
281+
282+
mocker.patch('os.path.exists', return_value=True)
283+
mocker.patch('os.remove')
284+
285+
install_diagon_prerequisites()
286+
287+
mock_wait_ready.assert_any_call('kueue-controller-manager', 'kueue-system')
288+
289+
assert mock_sleep.call_count == 2
290+
mock_sleep.assert_any_call(30)
291+
292+
mock_install_cert.assert_called_once()
293+
294+
mock_wait_ready.assert_any_call('cert-manager-webhook', 'cert-manager')
295+
296+
assert mock_download.call_count == 2
297+
mock_download.assert_any_call(
298+
package_name='mldiagnostics-injection-webhook', version='v0.3.0'
299+
)
300+
mock_download.assert_any_call(
301+
package_name='mldiagnostics-connection-operator', version='v0.3.0'
302+
)
303+
304+
mock_create_ns.assert_called_once()
305+
306+
assert mock_install_yaml.call_count == 2
307+
308+
mock_label_ns.assert_called_once()

0 commit comments

Comments
 (0)