Skip to content

Commit 789fa88

Browse files
authored
Merge pull request #509 from AI-Hypercomputer/sa-missing-rbac
Provided the required permissions for JAX to list the pods
2 parents 7f714da + 18a1710 commit 789fa88

File tree

5 files changed

+110
-9
lines changed

5 files changed

+110
-9
lines changed

src/xpk/commands/batch.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from argparse import Namespace
1919

2020
from ..core.cluster import (
21-
create_xpk_k8s_service_account,
21+
setup_k8s_service_accounts,
2222
get_cluster_credentials,
2323
)
2424
from ..core.commands import run_command_for_value
@@ -54,14 +54,14 @@ def batch(args: Namespace) -> None:
5454
err_code = prepare_kjob(args)
5555
if err_code > 0:
5656
xpk_exit(err_code)
57-
create_xpk_k8s_service_account()
57+
setup_k8s_service_accounts()
5858

5959
submit_job(args)
6060

6161

6262
def submit_job(args: Namespace) -> None:
6363

64-
create_xpk_k8s_service_account()
64+
setup_k8s_service_accounts()
6565

6666
cmd = (
6767
'kubectl kjob create slurm'

src/xpk/commands/run.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from argparse import Namespace
1818

1919
from ..core.cluster import (
20-
create_xpk_k8s_service_account,
20+
setup_k8s_service_accounts,
2121
get_cluster_credentials,
2222
)
2323
from ..core.commands import run_command_with_full_controls
@@ -53,7 +53,7 @@ def run(args: Namespace) -> None:
5353
err_code = prepare_kjob(args)
5454
if err_code > 0:
5555
xpk_exit(err_code)
56-
create_xpk_k8s_service_account()
56+
setup_k8s_service_accounts()
5757

5858
submit_job(args)
5959

src/xpk/commands/shell.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
"""
1313

1414
from ..core.commands import run_command_with_full_controls, run_command_for_value, run_command_with_updates
15-
from ..core.cluster import get_cluster_credentials, add_zone_and_project, create_xpk_k8s_service_account
15+
from ..core.cluster import get_cluster_credentials, add_zone_and_project, setup_k8s_service_accounts
1616
from ..utils.console import xpk_exit, xpk_print
1717
from argparse import Namespace
1818

@@ -82,7 +82,7 @@ def connect_to_new_interactive_shell(args: Namespace) -> int:
8282
err_code = prepare_kjob(args)
8383
if err_code > 0:
8484
xpk_exit(err_code)
85-
create_xpk_k8s_service_account()
85+
setup_k8s_service_accounts()
8686

8787
cmd = (
8888
'kubectl-kjob create interactive --profile'

src/xpk/commands/workload.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
)
2323
from ..core.cluster import (
2424
XPK_SA,
25-
create_xpk_k8s_service_account,
25+
setup_k8s_service_accounts,
2626
get_cluster_credentials,
2727
setup_k8s_env,
2828
)
@@ -297,7 +297,7 @@ def workload_create(args) -> None:
297297
0 if successful and 1 otherwise.
298298
"""
299299
k8s_api_client = setup_k8s_env(args)
300-
create_xpk_k8s_service_account()
300+
setup_k8s_service_accounts()
301301

302302
workload_exists = check_if_workload_exists(args)
303303

src/xpk/core/cluster.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,19 @@ def get_gpu_type_from_cluster(args) -> str:
423423
return ''
424424

425425

426+
def setup_k8s_service_accounts() -> None:
  """Set up the service accounts xpk relies on and their pod-reader Role.

  Calls create_xpk_k8s_service_account(), creates the Role returned by
  create_pod_reader_role(), and binds that Role first to the 'default'
  service account and then to XPK_SA.
  """
  create_xpk_k8s_service_account()

  reader_role = create_pod_reader_role()
  # Order matters only for log output: 'default' is bound before XPK_SA.
  for account in ('default', XPK_SA):
    create_role_binding(account, reader_role)
437+
438+
426439
def create_xpk_k8s_service_account() -> None:
427440
k8s_core_client = k8s_client.CoreV1Api()
428441
sa = k8s_client.V1ServiceAccount(
@@ -441,6 +454,94 @@ def create_xpk_k8s_service_account() -> None:
441454
)
442455

443456

457+
def create_pod_reader_role() -> str:
  """Create the 'pod-reader' Role in DEFAULT_NAMESPACE.

  The Role grants get/list/watch on pods and services (core API group) and
  on jobs (batch API group). A 409 (Conflict) response from the API is
  treated as "already exists" and is not an error. Any other ApiException
  is printed and xpk_exit(1) is called.

  Returns:
    The name of the Role ('pod-reader'), both when it was just created and
    when it already existed.
  """
  role_name = 'pod-reader'
  rbac_api = k8s_client.RbacAuthorizationV1Api()

  # Read-only access to core pods/services.
  core_rule = k8s_client.V1PolicyRule(
      api_groups=[''],
      resources=['pods', 'services'],
      verbs=['get', 'list', 'watch'],
  )
  # Read-only access to batch jobs.
  batch_rule = k8s_client.V1PolicyRule(
      api_groups=['batch'],
      resources=['jobs'],
      verbs=['get', 'list', 'watch'],
  )
  role_metadata = k8s_client.V1ObjectMeta(
      name=role_name, namespace=DEFAULT_NAMESPACE
  )
  pod_reader = k8s_client.V1Role(
      metadata=role_metadata, rules=[core_rule, batch_rule]
  )

  xpk_print(
      f'Attempting to create Role: {role_name} in namespace:'
      f' {DEFAULT_NAMESPACE}'
  )
  try:
    rbac_api.create_namespaced_role(DEFAULT_NAMESPACE, pod_reader, pretty=True)
  except ApiException as err:
    if err.status != 409:
      xpk_print(f'Error creating Role {role_name}: {err}')
      xpk_exit(1)
    else:
      # 409 Conflict: the Role is already present, which is fine.
      xpk_print(f'Role: {role_name} already exists. Skipping its creation.')
      return role_name
  else:
    xpk_print(f'Successfully created Role: {role_name}')
    return role_name
497+
498+
499+
def create_role_binding(sa: str, role_name: str) -> None:
  """Bind a Role to a service account in DEFAULT_NAMESPACE.

  Creates a RoleBinding named '<sa>-<role_name>-binding'. Assumes the
  service account and the Role already exist. A 409 (Conflict) response
  from the API means the binding is already present and is not treated as
  an error. Any other ApiException is printed and xpk_exit(1) is called.

  Args:
    sa: Name of the service account to bind.
    role_name: Name of the Role to grant to the service account.
  """
  k8s_rbac_client = k8s_client.RbacAuthorizationV1Api()
  role_binding_name = f'{sa}-{role_name}-binding'

  role_binding = k8s_client.V1RoleBinding(
      metadata=k8s_client.V1ObjectMeta(
          name=role_binding_name, namespace=DEFAULT_NAMESPACE
      ),
      subjects=[
          k8s_client.RbacV1Subject(
              kind='ServiceAccount', name=sa, namespace=DEFAULT_NAMESPACE
          )
      ],
      role_ref=k8s_client.V1RoleRef(
          kind='Role', name=role_name, api_group='rbac.authorization.k8s.io'
      ),
  )

  # Report the account actually being bound (`sa`). The messages previously
  # interpolated XPK_SA, which was wrong whenever another account (e.g.
  # 'default') was passed in.
  xpk_print(
      f'Attempting to create RoleBinding: {role_binding_name} for Service'
      f' Account: {sa} to Role: {role_name} in namespace:'
      f' {DEFAULT_NAMESPACE}'
  )
  try:
    k8s_rbac_client.create_namespaced_role_binding(
        DEFAULT_NAMESPACE, role_binding, pretty=True
    )
    xpk_print(
        f'Successfully created RoleBinding: {role_binding_name} for {sa}'
    )
  except ApiException as e:
    if e.status == 409:  # Conflict, meaning it already exists
      xpk_print(
          f'RoleBinding: {role_binding_name} already exists. Skipping its'
          ' creation.'
      )
    else:
      xpk_print(f'Error creating RoleBinding {role_binding_name}: {e}')
      xpk_exit(1)
543+
544+
444545
def update_gke_cluster_with_clouddns(args) -> int:
445546
"""Run the GKE cluster update command for existing clusters and enable CloudDNS.
446547

0 commit comments

Comments
 (0)